# Mirror of https://github.com/python/cpython
# Parser for C code
# Originally by Mark Shannon (mark@hotpy.org)
# https://gist.github.com/markshannon/db7ab649440b5af765451bb77c7dba34
import re
import sys
import collections
from dataclasses import dataclass
def choice(*opts):
    """Build a regex alternation matching any one of *opts*, each parenthesized."""
    wrapped = ["(%s)" % pattern for pattern in opts]
    return "|".join(wrapped)
# Regexes

# Longer operators must go before shorter ones.

PLUSPLUS = r'\+\+'
MINUSMINUS = r'--'

# ->
ARROW = r'->'
ELLIPSIS = r'\.\.\.'

# Assignment operators
TIMESEQUAL = r'\*='
DIVEQUAL = r'/='
MODEQUAL = r'%='
PLUSEQUAL = r'\+='
MINUSEQUAL = r'-='
LSHIFTEQUAL = r'<<='
RSHIFTEQUAL = r'>>='
ANDEQUAL = r'&='
OREQUAL = r'\|='
XOREQUAL = r'\^='

# Operators
PLUS = r'\+'
MINUS = r'-'
TIMES = r'\*'
DIVIDE = r'/'
MOD = r'%'
NOT = r'~'
XOR = r'\^'
LOR = r'\|\|'
LAND = r'&&'
LSHIFT = r'<<'
RSHIFT = r'>>'
LE = r'<='
GE = r'>='
EQ = r'=='
NE = r'!='
LT = r'<'
GT = r'>'
LNOT = r'!'
OR = r'\|'
AND = r'&'
EQUALS = r'='

# ?
CONDOP = r'\?'

# Delimiters
LPAREN = r'\('
RPAREN = r'\)'
LBRACKET = r'\['
RBRACKET = r'\]'
LBRACE = r'\{'
RBRACE = r'\}'
COMMA = r','
PERIOD = r'\.'
SEMI = r';'
COLON = r':'
BACKSLASH = r'\\'

# Collect every ALL-CAPS name defined so far: {token-kind name -> regex}.
# Relies on dict insertion order, so the "longer before shorter" ordering
# above is preserved when the combined matcher regex is built later.
operators = { op: pattern for op, pattern in globals().items() if op == op.upper() }
# Rebind each ALL-CAPS name to its own name (e.g. PLUS == 'PLUS'), so from
# here on they serve as token-kind constants rather than regex patterns.
for op in operators:
    globals()[op] = op
# Map the literal operator text back to its token kind.  Stripping every
# backslash turns the regex into the raw operator text; BACKSLASH itself
# strips to the empty string, so `or '\\'` maps it back to a backslash.
opmap = { pattern.replace("\\", "") or '\\' : op for op, pattern in operators.items() }
# Macros
# A '#' followed by a preprocessor directive name (or '##' for paste).
macro = r'# *(ifdef|ifndef|undef|define|error|endif|if|else|include|#)'
MACRO = 'MACRO'

# C identifiers.
id_re = r'[a-zA-Z_][0-9a-zA-Z_]*'
IDENTIFIER = 'IDENTIFIER'

# Integer literals, with optional unsigned/long suffix characters.
suffix = r'([uU]?[lL]?[lL]?)'
octal = r'0[0-7]+' + suffix
hex = r'0[xX][0-9a-fA-F]+'  # NOTE: shadows the builtin `hex`
decimal_digits = r'(0|[1-9][0-9]*)'
decimal = decimal_digits + suffix


# Floating-point literals: fraction and/or exponent, optional F/L suffix.
exponent = r"""([eE][-+]?[0-9]+)"""
fraction = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
float = '(((('+fraction+')'+exponent+'?)|([0-9]+'+exponent+'))[FfLl]?)'  # NOTE: shadows the builtin `float`

# Order matters: octal and hex must precede decimal so their prefixes win.
number_re = choice(octal, hex, float, decimal)
NUMBER = 'NUMBER'

# Double-quoted string literals, allowing escape sequences.
simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
decimal_escape = r"""(\d+)"""
hex_escape = r"""(x[0-9a-fA-F]+)"""
escape_sequence = r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))'
string_char = r"""([^"\\\n]|"""+escape_sequence+')'
str_re = '"'+string_char+'*"'
STRING = 'STRING'
# Single-character char literal only.
char = r'\'.\'' # TODO: escape sequence
CHARACTER = 'CHARACTER'

# Line comments and (non-backtracking) block comments.
comment_re = r'//.*|/\*([^*]|\*[^/])*\*/'
COMMENT = 'COMMENT'

newline = r"\n"
# The master tokenizer regex: one big alternation over every token pattern.
matcher = re.compile(choice(id_re, number_re, str_re, char, newline, macro, comment_re, *operators.values()))
# Quick test for "starts like an identifier/keyword".
letter = re.compile(r'[a-zA-Z_]')
# C keywords (plus OFFSETOF/INLINE extensions), as token-kind names.
keywords = (
    'AUTO', 'BREAK', 'CASE', 'CHAR', 'CONST',
    'CONTINUE', 'DEFAULT', 'DO', 'DOUBLE', 'ELSE', 'ENUM', 'EXTERN',
    'FLOAT', 'FOR', 'GOTO', 'IF', 'INLINE', 'INT', 'LONG',
    'REGISTER', 'OFFSETOF',
    'RESTRICT', 'RETURN', 'SHORT', 'SIGNED', 'SIZEOF', 'STATIC', 'STRUCT',
    'SWITCH', 'TYPEDEF', 'UNION', 'UNSIGNED', 'VOID',
    'VOLATILE', 'WHILE'
)
# Export each keyword kind as a module-level constant (e.g. IF == 'IF').
for name in keywords:
    globals()[name] = name
# Rebind `keywords` as a lookup table: source text -> token kind.
keywords = { name.lower() : name for name in keywords }
def make_syntax_error(
    message: str, filename: str, line: int, column: int, line_text: str,
) -> SyntaxError:
    """Construct (but do not raise) a SyntaxError at the given source position."""
    location = (filename, line, column, line_text)
    return SyntaxError(message, location)
@dataclass(slots=True)
|
||
|
class Token:
|
||
|
kind: str
|
||
|
text: str
|
||
|
begin: tuple[int, int]
|
||
|
end: tuple[int, int]
|
||
|
|
||
|
@property
|
||
|
def line(self):
|
||
|
return self.begin[0]
|
||
|
|
||
|
@property
|
||
|
def column(self):
|
||
|
return self.begin[1]
|
||
|
|
||
|
@property
|
||
|
def end_line(self):
|
||
|
return self.end[0]
|
||
|
|
||
|
@property
|
||
|
def end_column(self):
|
||
|
return self.end[1]
|
||
|
|
||
|
@property
|
||
|
def width(self):
|
||
|
return self.end[1] - self.begin[1]
|
||
|
|
||
|
def replaceText(self, txt):
|
||
|
assert isinstance(txt, str)
|
||
|
return Token(self.kind, txt, self.begin, self.end)
|
||
|
|
||
|
def __repr__(self):
|
||
|
b0, b1 = self.begin
|
||
|
e0, e1 = self.end
|
||
|
if b0 == e0:
|
||
|
return f"{self.kind}({self.text!r}, {b0}:{b1}:{e1})"
|
||
|
else:
|
||
|
return f"{self.kind}({self.text!r}, {b0}:{b1}, {e0}:{e1})"
|
||
|
|
||
|
|
||
|
def tokenize(src, line=1, filename=None):
    """Generate Token objects from the C source string *src*.

    *line* is the 1-based number of the first line (useful when *src* is a
    fragment of a larger file); *filename* is used only in error reports.
    Newline matches advance position tracking but are never yielded.
    Raises SyntaxError (via make_syntax_error) on an unrecognizable token.
    """
    # Index into src of the most recent newline; columns are start-linestart
    # (-1 so that the first line's columns start at 1... position 0 maps to 1).
    linestart = -1
    # TODO: finditer() skips over unrecognized characters, e.g. '@'
    for m in matcher.finditer(src):
        start, end = m.span()
        text = m.group(0)
        # Classify the match.  Branch order mirrors the alternation in
        # `matcher`: keywords before general identifiers, '...'/'.' before
        # the numeric test (both start with '.').
        if text in keywords:
            kind = keywords[text]
        elif letter.match(text):
            kind = IDENTIFIER
        elif text == '...':
            kind = ELLIPSIS
        elif text == '.':
            kind = PERIOD
        elif text[0] in '0123456789.':
            kind = NUMBER
        elif text[0] == '"':
            kind = STRING
        elif text in opmap:
            kind = opmap[text]
        elif text == '\n':
            # Track line starts so later columns are relative to this line.
            linestart = start
            line += 1
            kind = '\n'
        elif text[0] == "'":
            kind = CHARACTER
        elif text[0] == '#':
            kind = MACRO
        elif text[0] == '/' and text[1] in '/*':
            kind = COMMENT
        else:
            # Unrecognized token: report it with the full offending line.
            lineend = src.find("\n", start)
            if lineend == -1:
                lineend = len(src)
            raise make_syntax_error(f"Bad token: {text}",
                filename, line, start-linestart+1, src[linestart:lineend])
        if kind == COMMENT:
            # Comments may span lines: capture the begin position first, then
            # advance the line/linestart bookkeeping past embedded newlines.
            begin = line, start-linestart
            newlines = text.count('\n')
            if newlines:
                linestart = start + text.rfind('\n')
                line += newlines
        else:
            begin = line, start-linestart
        if kind != "\n":
            # NOTE(review): for a multi-line comment the end column is computed
            # against the *updated* linestart, which can go negative — looks
            # like callers only rely on end columns of single-line tokens;
            # confirm before depending on it.
            yield Token(kind, text, begin, (line, start-linestart+len(text)))
# Public API: every ALL-CAPS name defined above (token kinds, keyword kinds,
# operator kinds).
__all__ = []
__all__.extend(name for name in globals() if name == name.upper())
def to_text(tkns: list[Token], dedent: int = 0) -> str:
    """Reassemble tokens into source text, preserving their line/column layout.

    *dedent* shifts every line left by that many columns.
    """
    parts: list[str] = []
    line, col = -1, 1 + dedent
    for tkn in tkns:
        if line == -1:
            # Start output at the first token's line.
            line, _ = tkn.begin
        tok_line, tok_col = tkn.begin
        # Emit newlines until we reach the token's line; each fresh line
        # resets the current column.
        while tok_line > line:
            line += 1
            parts.append('\n')
            col = 1 + dedent
        # Pad with spaces up to the token's column, then the token itself.
        parts.append(' ' * (tok_col - col))
        parts.append(tkn.text)
        line, col = tkn.end
    return ''.join(parts)
if __name__ == "__main__":
    # CLI: `lexer.py FILE` tokenizes a file; `lexer.py -c SRC` tokenizes the
    # literal argument.  One token is printed per line.
    # (`sys` is already imported at module level; the previous in-block
    # `import sys` was redundant.)
    filename = sys.argv[1]
    if filename == "-c":
        src = sys.argv[2]
    else:
        # Use a context manager so the file handle is closed promptly
        # instead of being left to the garbage collector.
        with open(filename) as f:
            src = f.read()
    for tkn in tokenize(src, filename=filename):
        print(tkn)