# Parser for C code # Originally by Mark Shannon (mark@hotpy.org) # https://gist.github.com/markshannon/db7ab649440b5af765451bb77c7dba34 import re from dataclasses import dataclass from collections.abc import Iterator def choice(*opts: str) -> str: return "|".join("(%s)" % opt for opt in opts) # Regexes # Longer operators must go before shorter ones. PLUSPLUS = r"\+\+" MINUSMINUS = r"--" # -> ARROW = r"->" ELLIPSIS = r"\.\.\." # Assignment operators TIMESEQUAL = r"\*=" DIVEQUAL = r"/=" MODEQUAL = r"%=" PLUSEQUAL = r"\+=" MINUSEQUAL = r"-=" LSHIFTEQUAL = r"<<=" RSHIFTEQUAL = r">>=" ANDEQUAL = r"&=" OREQUAL = r"\|=" XOREQUAL = r"\^=" # Operators PLUS = r"\+" MINUS = r"-" TIMES = r"\*" DIVIDE = r"/" MOD = r"%" NOT = r"~" XOR = r"\^" LOR = r"\|\|" LAND = r"&&" LSHIFT = r"<<" RSHIFT = r">>" LE = r"<=" GE = r">=" EQ = r"==" NE = r"!=" LT = r"<" GT = r">" LNOT = r"!" OR = r"\|" AND = r"&" EQUALS = r"=" # ? CONDOP = r"\?" # Delimiters LPAREN = r"\(" RPAREN = r"\)" LBRACKET = r"\[" RBRACKET = r"\]" LBRACE = r"\{" RBRACE = r"\}" COMMA = r"," PERIOD = r"\." SEMI = r";" COLON = r":" BACKSLASH = r"\\" operators = {op: pattern for op, pattern in globals().items() if op == op.upper()} for op in operators: globals()[op] = op opmap = {pattern.replace("\\", "") or "\\": op for op, pattern in operators.items()} # Macros macro = r"# *(ifdef|ifndef|undef|define|error|endif|if|else|include|#)" MACRO = "MACRO" id_re = r"[a-zA-Z_][0-9a-zA-Z_]*" IDENTIFIER = "IDENTIFIER" suffix = r"([uU]?[lL]?[lL]?)" octal = r"0[0-7]+" + suffix hex = r"0[xX][0-9a-fA-F]+" decimal_digits = r"(0|[1-9][0-9]*)" decimal = decimal_digits + suffix exponent = r"""([eE][-+]?[0-9]+)""" fraction = r"""([0-9]*\.[0-9]+)|([0-9]+\.)""" float = "((((" + fraction + ")" + exponent + "?)|([0-9]+" + exponent + "))[FfLl]?)" number_re = choice(octal, hex, float, decimal) NUMBER = "NUMBER" simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])""" decimal_escape = r"""(\d+)""" hex_escape = r"""(x[0-9a-fA-F]+)""" escape_sequence = ( r"""(\\(""" + simple_escape + "|" + decimal_escape + "|" + hex_escape + "))" ) string_char = r"""([^"\\\n]|""" + escape_sequence + ")" str_re = '"' + string_char + '*"' STRING = "STRING" char = r"\'.\'" # TODO: escape sequence CHARACTER = "CHARACTER" comment_re = r"//.*|/\*([^*]|\*[^/])*\*/" COMMENT = "COMMENT" newline = r"\n" invalid = ( r"\S" # A single non-space character that's not caught by any of the other patterns ) matcher = re.compile( choice( id_re, number_re, str_re, char, newline, macro, comment_re, *operators.values(), invalid, ) ) letter = re.compile(r"[a-zA-Z_]") kwds = [] AUTO = "AUTO" kwds.append(AUTO) BREAK = "BREAK" kwds.append(BREAK) CASE = "CASE" kwds.append(CASE) CHAR = "CHAR" kwds.append(CHAR) CONST = "CONST" kwds.append(CONST) CONTINUE = "CONTINUE" kwds.append(CONTINUE) DEFAULT = "DEFAULT" kwds.append(DEFAULT) DO = "DO" kwds.append(DO) DOUBLE = "DOUBLE" kwds.append(DOUBLE) ELSE = "ELSE" kwds.append(ELSE) ENUM = "ENUM" kwds.append(ENUM) EXTERN = "EXTERN" kwds.append(EXTERN) FLOAT = "FLOAT" kwds.append(FLOAT) FOR = "FOR" kwds.append(FOR) GOTO = "GOTO" kwds.append(GOTO) IF = "IF" kwds.append(IF) INLINE = "INLINE" kwds.append(INLINE) INT = "INT" kwds.append(INT) LONG = "LONG" kwds.append(LONG) OVERRIDE = "OVERRIDE" kwds.append(OVERRIDE) REGISTER = "REGISTER" kwds.append(REGISTER) OFFSETOF = "OFFSETOF" kwds.append(OFFSETOF) RESTRICT = "RESTRICT" kwds.append(RESTRICT) RETURN = "RETURN" kwds.append(RETURN) SHORT = "SHORT" kwds.append(SHORT) SIGNED = "SIGNED" kwds.append(SIGNED) SIZEOF = "SIZEOF" kwds.append(SIZEOF) STATIC = "STATIC" kwds.append(STATIC) STRUCT = "STRUCT" kwds.append(STRUCT) SWITCH = "SWITCH" kwds.append(SWITCH) TYPEDEF = "TYPEDEF" kwds.append(TYPEDEF) UNION = "UNION" kwds.append(UNION) UNSIGNED = "UNSIGNED" kwds.append(UNSIGNED) VOID = "VOID" kwds.append(VOID) VOLATILE = "VOLATILE" kwds.append(VOLATILE) WHILE = "WHILE" kwds.append(WHILE) keywords = {name.lower(): name for name in kwds} __all__ = [] __all__.extend(kwds) def make_syntax_error( message: str, filename: str | None, line: int, column: int, line_text: str, ) -> SyntaxError: return SyntaxError(message, (filename, line, column, line_text)) @dataclass(slots=True) class Token: kind: str text: str begin: tuple[int, int] end: tuple[int, int] @property def line(self) -> int: return self.begin[0] @property def column(self) -> int: return self.begin[1] @property def end_line(self) -> int: return self.end[0] @property def end_column(self) -> int: return self.end[1] @property def width(self) -> int: return self.end[1] - self.begin[1] def replaceText(self, txt: str) -> "Token": assert isinstance(txt, str) return Token(self.kind, txt, self.begin, self.end) def __repr__(self) -> str: b0, b1 = self.begin e0, e1 = self.end if b0 == e0: return f"{self.kind}({self.text!r}, {b0}:{b1}:{e1})" else: return f"{self.kind}({self.text!r}, {b0}:{b1}, {e0}:{e1})" def tokenize(src: str, line: int = 1, filename: str | None = None) -> Iterator[Token]: linestart = -1 for m in matcher.finditer(src): start, end = m.span() text = m.group(0) if text in keywords: kind = keywords[text] elif letter.match(text): kind = IDENTIFIER elif text == "...": kind = ELLIPSIS elif text == ".": kind = PERIOD elif text[0] in "0123456789.": kind = NUMBER elif text[0] == '"': kind = STRING elif text in opmap: kind = opmap[text] elif text == "\n": linestart = start line += 1 kind = "\n" elif text[0] == "'": kind = CHARACTER elif text[0] == "#": kind = MACRO elif text[0] == "/" and text[1] in "/*": kind = COMMENT else: lineend = src.find("\n", start) if lineend == -1: lineend = len(src) raise make_syntax_error( f"Bad token: {text}", filename, line, start - linestart + 1, src[linestart:lineend], ) if kind == COMMENT: begin = line, start - linestart newlines = text.count("\n") if newlines: linestart = start + text.rfind("\n") line += newlines else: begin = line, start - linestart if kind != "\n": yield Token(kind, text, begin, (line, start - linestart + len(text))) def to_text(tkns: list[Token], dedent: int = 0) -> str: res: list[str] = [] line, col = -1, 1 + dedent for tkn in tkns: if line == -1: line, _ = tkn.begin l, c = tkn.begin # assert(l >= line), (line, txt, start, end) while l > line: line += 1 res.append("\n") col = 1 + dedent res.append(" " * (c - col)) text = tkn.text if dedent != 0 and tkn.kind == "COMMENT" and "\n" in text: if dedent < 0: text = text.replace("\n", "\n" + " " * -dedent) # TODO: dedent > 0 res.append(text) line, col = tkn.end return "".join(res) if __name__ == "__main__": import sys filename = sys.argv[1] if filename == "-c": src = sys.argv[2] else: src = open(filename).read() # print(to_text(tokenize(src))) for tkn in tokenize(src, filename=filename): print(tkn)