# Parser for C code
# Originally by Mark Shannon (mark@hotpy.org)
# https://gist.github.com/markshannon/db7ab649440b5af765451bb77c7dba34

import re
from dataclasses import dataclass


def choice(*opts):
    """Build a regex alternation from *opts*, wrapping each fragment in a group."""
    return "|".join(f"({opt})" for opt in opts)

# Regexes

# Operator/delimiter token patterns.  NOTE: definition order is significant —
# the master matcher tries alternatives in this order, so longer operators
# must be defined before any operator that is a prefix of them
# (e.g. '++' before '+', '<<=' before '<<' before '<').
PLUSPLUS = r'\+\+'
MINUSMINUS = r'--'
# ->
ARROW = r'->'
ELLIPSIS = r'\.\.\.'
# Assignment operators
TIMESEQUAL = r'\*='
DIVEQUAL = r'/='
MODEQUAL = r'%='
PLUSEQUAL = r'\+='
MINUSEQUAL = r'-='
LSHIFTEQUAL = r'<<='
RSHIFTEQUAL = r'>>='
ANDEQUAL = r'&='
OREQUAL = r'\|='
XOREQUAL = r'\^='
# Operators
PLUS = r'\+'
MINUS = r'-'
TIMES = r'\*'
DIVIDE = r'/'
MOD = r'%'
NOT = r'~'
XOR = r'\^'
LOR = r'\|\|'
LAND = r'&&'
LSHIFT = r'<<'
RSHIFT = r'>>'
LE = r'<='
GE = r'>='
EQ = r'=='
NE = r'!='
LT = r'<'
GT = r'>'
LNOT = r'!'
OR = r'\|'
AND = r'&'
EQUALS = r'='
# ?
CONDOP = r'\?'
# Delimiters
LPAREN = r'\('
RPAREN = r'\)'
LBRACKET = r'\['
RBRACKET = r'\]'
LBRACE = r'\{'
RBRACE = r'\}'
COMMA = r','
PERIOD = r'\.'
SEMI = r';'
COLON = r':'
BACKSLASH = r'\\'

# Snapshot every UPPER_CASE global defined so far: exactly the
# operator/delimiter regex patterns above, keyed by token-kind name.
operators = { op: pattern for op, pattern in globals().items() if op == op.upper() }
# Rebind each kind name to itself (e.g. PLUS == 'PLUS') so the names can be
# used as token-kind tags; the regex patterns live on in `operators`.
for op in operators: globals()[op] = op

# Map literal operator text -> token kind by stripping the regex escaping
# from each pattern.  BACKSLASH's pattern r'\\' strips to the empty string,
# so `or '\\'` restores the single backslash as its key.
opmap = { pattern.replace("\\", "") or '\\' : op for op, pattern in operators.items() }

# Macros: preprocessor directives, including '##' (token pasting).
macro = r'# *(ifdef|ifndef|undef|define|error|endif|if|else|include|#)'
MACRO = 'MACRO'

# C identifiers (also matches keywords; those are reclassified in tokenize()).
id_re = r'[a-zA-Z_][0-9a-zA-Z_]*'
IDENTIFIER = 'IDENTIFIER'

# Integer literals with optional unsigned/long suffix combinations.
suffix = r'([uU]?[lL]?[lL]?)'
octal = r'0[0-7]+' + suffix
hex = r'0[xX][0-9a-fA-F]+'  # NOTE(review): shadows builtins.hex (module-internal name)
decimal_digits = r'(0|[1-9][0-9]*)'
decimal = decimal_digits + suffix

# Floating-point literals: a fraction with optional exponent, or
# digits with a mandatory exponent, plus an optional F/L suffix.
exponent = r"""([eE][-+]?[0-9]+)"""
fraction = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
float = '(((('+fraction+')'+exponent+'?)|([0-9]+'+exponent+'))[FfLl]?)'  # NOTE(review): shadows builtins.float

number_re = choice(octal, hex, float, decimal)
NUMBER = 'NUMBER'

# String literals: any run of plain characters or escape sequences
# between double quotes (no embedded raw newlines).
simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
decimal_escape = r"""(\d+)"""
hex_escape = r"""(x[0-9a-fA-F]+)"""
escape_sequence = r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))'
string_char = r"""([^"\\\n]|"""+escape_sequence+')'
str_re = '"'+string_char+'*"'
STRING = 'STRING'
char = r'\'.\'' # TODO: escape sequence
CHARACTER = 'CHARACTER'

# Comments: line comments, or block comments matched without backtracking.
comment_re = r'//.*|/\*([^*]|\*[^/])*\*/'
COMMENT = 'COMMENT'

newline = r"\n"
invalid = r"\S" # A single non-space character that's not caught by any of the other patterns
# Master tokenizer regex.  Alternation order matters: identifiers/numbers/
# strings first, then operators (longest patterns first, see above), and
# `invalid` last as the catch-all that triggers a syntax error.
matcher = re.compile(choice(id_re, number_re, str_re, char, newline, macro, comment_re, *operators.values(), invalid))
letter = re.compile(r'[a-zA-Z_]')

# C keywords (plus a few extensions like OVERRIDE/OFFSETOF used by the
# consuming code).  Each name also becomes a module-level token-kind tag.
kwds = (
    'AUTO', 'BREAK', 'CASE', 'CHAR', 'CONST',
    'CONTINUE', 'DEFAULT', 'DO', 'DOUBLE', 'ELSE', 'ENUM', 'EXTERN',
    'FLOAT', 'FOR', 'GOTO', 'IF', 'INLINE', 'INT', 'LONG',
    'OVERRIDE', 'REGISTER', 'OFFSETOF',
    'RESTRICT', 'RETURN', 'SHORT', 'SIGNED', 'SIZEOF', 'STATIC', 'STRUCT',
    'SWITCH', 'TYPEDEF', 'UNION', 'UNSIGNED', 'VOID', 'VOLATILE', 'WHILE'
)
for name in kwds:
    globals()[name] = name
# Lowercase source spelling -> token-kind tag (e.g. 'while' -> 'WHILE').
keywords = { name.lower() : name for name in kwds }


def make_syntax_error(
    message: str,
    filename: str,
    line: int,
    column: int,
    line_text: str,
) -> SyntaxError:
    """Build a SyntaxError carrying full position info for a nice traceback."""
    return SyntaxError(message, (filename, line, column, line_text))


@dataclass(slots=True)
class Token:
    """A single lexical token with (line, column) begin/end positions."""
    kind: str                 # token-kind tag, e.g. 'IDENTIFIER', 'PLUS', 'WHILE'
    text: str                 # the exact source text matched
    begin: tuple[int, int]    # (line, column) of the first character
    end: tuple[int, int]      # (line, column) just past the last character

    @property
    def line(self) -> int:
        """Line on which the token begins."""
        return self.begin[0]

    @property
    def column(self) -> int:
        """Column at which the token begins."""
        return self.begin[1]

    @property
    def end_line(self) -> int:
        """Line on which the token ends."""
        return self.end[0]

    @property
    def end_column(self) -> int:
        """Column just past the token's last character."""
        return self.end[1]

    @property
    def width(self) -> int:
        """Width in columns; only meaningful for single-line tokens."""
        return self.end[1] - self.begin[1]

    def replaceText(self, txt: str) -> "Token":
        """Return a copy of this token with the same positions but new text."""
        assert isinstance(txt, str)
        return Token(self.kind, txt, self.begin, self.end)

    def __repr__(self):
        b0, b1 = self.begin
        e0, e1 = self.end
        if b0 == e0:
            # Single-line token: compact "line:begin_col:end_col" form.
            return f"{self.kind}({self.text!r}, {b0}:{b1}:{e1})"
        else:
            return f"{self.kind}({self.text!r}, {b0}:{b1}, {e0}:{e1})"


def tokenize(src: str, line: int = 1, filename: str | None = None):
    """Yield Tokens for the C source string *src*.

    *line* is the line number assigned to the first line of *src*;
    *filename* is used only in error messages.  Newline matches update
    position tracking but are not yielded; comments ARE yielded.
    Raises SyntaxError (via make_syntax_error) on an unrecognized character.
    """
    # Index of the most recent newline; -1 so that columns on the first
    # line come out relative to the start of the string.
    linestart = -1
    for m in matcher.finditer(src):
        start, end = m.span()
        text = m.group(0)
        # Classify the match.  Keywords first, since id_re matches them too.
        if text in keywords:
            kind = keywords[text]
        elif letter.match(text):
            kind = IDENTIFIER
        # '...' and '.' must be checked before the NUMBER test below,
        # which would otherwise claim any token starting with '.'.
        elif text == '...':
            kind = ELLIPSIS
        elif text == '.':
            kind = PERIOD
        elif text[0] in '0123456789.':
            kind = NUMBER
        elif text[0] == '"':
            kind = STRING
        elif text in opmap:
            kind = opmap[text]
        elif text == '\n':
            # Track line/column bookkeeping; the token itself is dropped below.
            linestart = start
            line += 1
            kind = '\n'
        elif text[0] == "'":
            kind = CHARACTER
        elif text[0] == '#':
            kind = MACRO
        elif text[0] == '/' and text[1] in '/*':
            kind = COMMENT
        else:
            # `invalid` matched: report the offending line and column.
            lineend = src.find("\n", start)
            if lineend == -1:
                lineend = len(src)
            raise make_syntax_error(f"Bad token: {text}",
                filename, line, start-linestart+1, src[linestart:lineend])
        if kind == COMMENT:
            # Record begin BEFORE advancing past any newlines inside a
            # multi-line /* ... */ comment, so begin points at the '/*'
            # while end lands on the comment's final line.
            begin = line, start-linestart
            newlines = text.count('\n')
            if newlines:
                linestart = start + text.rfind('\n')
                line += newlines
        else:
            begin = line, start-linestart
        if kind != "\n":
            yield Token(kind, text, begin, (line, start-linestart+len(text)))


# Export all UPPER_CASE names (the token-kind tags) from this module.
__all__ = []
__all__.extend([kind for kind in globals() if kind.upper() == kind])


def to_text(tkns: list[Token], dedent: int = 0) -> str:
    """Reconstruct source text from *tkns*, preserving their line/column
    positions.  A negative *dedent* indents everything by -dedent columns
    (including continuation lines of multi-line comments)."""
    res: list[str] = []
    line, col = -1, 1+dedent
    for tkn in tkns:
        if line == -1:
            # Start output at the first token's line, not at line 1.
            line, _ = tkn.begin
        l, c = tkn.begin
        #assert(l >= line), (line, txt, start, end)
        # Emit newlines until we reach the token's line ...
        while l > line:
            line += 1
            res.append('\n')
            col = 1+dedent
        # ... then pad with spaces up to the token's column.
        res.append(' '*(c-col))
        text = tkn.text
        if dedent != 0 and tkn.kind == 'COMMENT' and '\n' in text:
            if dedent < 0:
                text = text.replace('\n', '\n' + ' '*-dedent)
            # TODO: dedent > 0
        res.append(text)
        line, col = tkn.end
    return ''.join(res)


if __name__ == "__main__":
    # CLI: tokenize a file (or a literal string with -c) and dump the tokens.
    import sys
    filename = sys.argv[1]
    if filename == "-c":
        src = sys.argv[2]
    else:
        # NOTE(review): file handle is never closed explicitly; fine for a
        # short-lived script, but a `with` block would be cleaner.
        src = open(filename).read()
    # print(to_text(tokenize(src)))
    for tkn in tokenize(src, filename=filename):
        print(tkn)