2022-11-03 01:31:26 -03:00
|
|
|
# Lexer (tokenizer) for C code
|
|
|
|
# Originally by Mark Shannon (mark@hotpy.org)
|
|
|
|
# https://gist.github.com/markshannon/db7ab649440b5af765451bb77c7dba34
|
|
|
|
|
|
|
|
import re
|
|
|
|
import sys
|
|
|
|
import collections
|
|
|
|
from dataclasses import dataclass
|
|
|
|
|
|
|
|
def choice(*opts):
    """Join *opts* into one regex alternation.

    Every option is wrapped in its own parenthesised group before the
    groups are joined with ``|``, so an option that itself contains ``|``
    cannot leak into its neighbours.  With no options the result is the
    empty string.
    """
    groups = ["(%s)" % opt for opt in opts]
    return "|".join(groups)
|
|
|
|
|
|
|
|
# Regexes

# Every operator/delimiter name below is first bound to the regex source that
# matches it.  The statements at the end of this section then:
#   1. collect every all-uppercase module global into `operators`
#      (name -> regex source), in definition order;
#   2. rebind each of those names to its own name as a string, so that
#      e.g. PLUS == 'PLUS' and can be used as a token kind;
#   3. build `opmap`, mapping the literal operator text to its kind.
#
# Longer operators must go before shorter ones: the definition order is the
# order of the alternatives in the combined token regex, and the first
# alternative that matches wins.

PLUSPLUS = r'\+\+'
MINUSMINUS = r'--'

# ->
ARROW = r'->'
ELLIPSIS = r'\.\.\.'

# Assignment operators
TIMESEQUAL = r'\*='
DIVEQUAL = r'/='
MODEQUAL = r'%='
PLUSEQUAL = r'\+='
MINUSEQUAL = r'-='
LSHIFTEQUAL = r'<<='
RSHIFTEQUAL = r'>>='
ANDEQUAL = r'&='
OREQUAL = r'\|='
XOREQUAL = r'\^='

# Operators
PLUS = r'\+'
MINUS = r'-'
TIMES = r'\*'
DIVIDE = r'/'
MOD = r'%'
NOT = r'~'
XOR = r'\^'
LOR = r'\|\|'
LAND = r'&&'
LSHIFT = r'<<'
RSHIFT = r'>>'
LE = r'<='
GE = r'>='
EQ = r'=='
NE = r'!='
LT = r'<'
GT = r'>'
LNOT = r'!'
OR = r'\|'
AND = r'&'
EQUALS = r'='

# ?
CONDOP = r'\?'

# Delimiters
LPAREN = r'\('
RPAREN = r'\)'
LBRACKET = r'\['
RBRACKET = r'\]'
LBRACE = r'\{'
RBRACE = r'\}'
COMMA = r','
PERIOD = r'\.'
SEMI = r';'
COLON = r':'
BACKSLASH = r'\\'

# Snapshot name -> regex for every all-uppercase global defined so far.
# Dunder globals such as __name__ contain lowercase letters and are skipped.
operators = {
    name: regex for name, regex in globals().items() if name.upper() == name
}

# From here on each operator name evaluates to its own name (the token kind).
for op in operators:
    globals()[op] = op

# Literal operator text -> kind.  Dropping the backslashes from the regex
# source yields the literal text; BACKSLASH itself would become the empty
# string, hence the explicit '\\' fallback.
opmap = {
    (regex.replace("\\", "") or '\\'): name for name, regex in operators.items()
}
|
|
|
|
|
|
|
|
# Macros

# A preprocessor directive: '#', optional spaces, then one of the listed
# directive names.  The trailing '#' alternative matches the '##' operator.
macro = r'# *(ifdef|ifndef|undef|define|error|endif|if|else|include|#)'
MACRO = 'MACRO'

# Identifiers: a letter or underscore followed by letters, digits, underscores.
id_re = r'[a-zA-Z_][0-9a-zA-Z_]*'
IDENTIFIER = 'IDENTIFIER'
|
|
|
|
|
|
|
|
# Integer literals.  `suffix` is the optional unsigned/long marker: an
# optional u/U followed by up to two l/L.
suffix = r'([uU]?[lL]?[lL]?)'
octal = r'0[0-7]+' + suffix
# Hex literals take the same suffix as octal and decimal ones; without it
# `0xFFu` would lex as the number `0xFF` followed by the identifier `u`.
# NOTE: `hex` (and `float` below) shadow the builtins of the same name for
# the rest of this module; the names are kept because they are module
# attributes.
hex = r'0[xX][0-9a-fA-F]+' + suffix
decimal_digits = r'(0|[1-9][0-9]*)'
decimal = decimal_digits + suffix

# Floating-point literals: either a fraction with an optional exponent, or
# plain digits with a mandatory exponent; then an optional f/F/l/L suffix.
exponent = r"""([eE][-+]?[0-9]+)"""
fraction = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
float = '(((('+fraction+')'+exponent+'?)|([0-9]+'+exponent+'))[FfLl]?)'

# Alternatives are tried left to right, so `decimal` (which matches a bare
# `0`) must come after the forms that start with `0`.
number_re = choice(octal, hex, float, decimal)
NUMBER = 'NUMBER'
|
|
|
|
|
|
|
|
# Escape sequences: a backslash followed by a single "simple" character,
# a run of decimal digits, or 'x' plus hex digits.
simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
decimal_escape = r"""(\d+)"""
hex_escape = r"""(x[0-9a-fA-F]+)"""
escape_sequence = r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))'

# String literals: double-quoted, made of escape sequences or any character
# other than a double quote, backslash or newline.
string_char = r"""([^"\\\n]|"""+escape_sequence+')'
str_re = '"'+string_char+'*"'
STRING = 'STRING'

# Character literals: single-quoted, containing exactly one escape sequence
# or one character other than a single quote, backslash or newline.  Sharing
# `escape_sequence` with strings makes '\n', '\'' and '\x41' lex as a single
# CHARACTER token instead of being split at the backslash.
char = r"'([^'\\\n]|" + escape_sequence + r")'"
CHARACTER = 'CHARACTER'
|
|
|
|
|
|
|
|
# Comments: either `//` to the end of the line, or a `/* ... */` block that
# may span lines.  The block form reads: non-stars, then a run of stars; if
# that run is not immediately followed by `/`, consume one non-`/` non-`*`
# character and repeat.  This accepts closers preceded by any number of
# stars (`/***/`, `/* a **/`) and always stops at the first `*/`.
comment_re = r'//.*|/\*[^*]*\*+(?:[^/*][^*]*\*+)*/'
COMMENT = 'COMMENT'
|
|
|
|
|
|
|
|
newline = r"\n"
invalid = r"\S" # A single non-space character that's not caught by any of the other patterns

# The combined token regex.  Alternatives are tried in the order given, so
# identifiers, literals, macros and comments take precedence over the
# operator patterns, and `invalid` only fires when nothing else matched.
matcher = re.compile(
    choice(
        id_re,
        number_re,
        str_re,
        char,
        newline,
        macro,
        comment_re,
        *operators.values(),
        invalid,
    )
)

# Matches the first character of an identifier (or keyword).
letter = re.compile(r'[a-zA-Z_]')
|
|
|
|
|
2022-11-17 21:06:07 -04:00
|
|
|
# Names of the keyword token kinds, upper-cased.
kwds = (
    'AUTO',
    'BREAK',
    'CASE',
    'CHAR',
    'CONST',
    'CONTINUE',
    'DEFAULT',
    'DO',
    'DOUBLE',
    'ELSE',
    'ENUM',
    'EXTERN',
    'FLOAT',
    'FOR',
    'GOTO',
    'IF',
    'INLINE',
    'INT',
    'LONG',
    'REGISTER',
    'OFFSETOF',
    'RESTRICT',
    'RETURN',
    'SHORT',
    'SIGNED',
    'SIZEOF',
    'STATIC',
    'STRUCT',
    'SWITCH',
    'TYPEDEF',
    'UNION',
    'UNSIGNED',
    'VOID',
    'VOLATILE',
    'WHILE',
)

# Bind each keyword kind as a module global whose value is its own name,
# e.g. IF == 'IF'.
for name in kwds:
    globals()[name] = name

# Source spelling (lower case) -> token kind (upper case).
keywords = dict(zip(map(str.lower, kwds), kwds))
|
2022-11-03 01:31:26 -03:00
|
|
|
|
|
|
|
|
|
|
|
def make_syntax_error(
    message: str, filename: str, line: int, column: int, line_text: str,
) -> SyntaxError:
    """Build (but do not raise) a SyntaxError carrying source-location details.

    The location is passed to SyntaxError as the tuple
    ``(filename, line, column, line_text)``.
    """
    location = (filename, line, column, line_text)
    return SyntaxError(message, location)
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass(slots=True)
|
|
|
|
class Token:
|
|
|
|
kind: str
|
|
|
|
text: str
|
|
|
|
begin: tuple[int, int]
|
|
|
|
end: tuple[int, int]
|
|
|
|
|
|
|
|
@property
|
|
|
|
def line(self):
|
|
|
|
return self.begin[0]
|
|
|
|
|
|
|
|
@property
|
|
|
|
def column(self):
|
|
|
|
return self.begin[1]
|
|
|
|
|
|
|
|
@property
|
|
|
|
def end_line(self):
|
|
|
|
return self.end[0]
|
|
|
|
|
|
|
|
@property
|
|
|
|
def end_column(self):
|
|
|
|
return self.end[1]
|
|
|
|
|
|
|
|
@property
|
|
|
|
def width(self):
|
|
|
|
return self.end[1] - self.begin[1]
|
|
|
|
|
|
|
|
def replaceText(self, txt):
|
|
|
|
assert isinstance(txt, str)
|
|
|
|
return Token(self.kind, txt, self.begin, self.end)
|
|
|
|
|
|
|
|
def __repr__(self):
|
|
|
|
b0, b1 = self.begin
|
|
|
|
e0, e1 = self.end
|
|
|
|
if b0 == e0:
|
|
|
|
return f"{self.kind}({self.text!r}, {b0}:{b1}:{e1})"
|
|
|
|
else:
|
|
|
|
return f"{self.kind}({self.text!r}, {b0}:{b1}, {e0}:{e1})"
|
|
|
|
|
|
|
|
|
|
|
|
def tokenize(src: str, line: int = 1, filename: str | None = None):
    """Yield a Token for each lexeme in *src*; newlines are not yielded.

    *line* is the line number given to the first line of *src*.  Columns are
    1-based: a token's begin column is the position of its first character
    and its end column is one past its last character.  *filename* is only
    used in error reports.

    Raises SyntaxError (built by make_syntax_error) with the message
    ``Bad token: <text>`` when a character cannot be classified.
    """
    # Offset in src of the newline that ends the previous line.  Starting at
    # -1 makes `start - linestart` a 1-based column on the first line too.
    linestart = -1
    for m in matcher.finditer(src):
        start = m.start()
        text = m.group(0)
        if text in keywords:
            kind = keywords[text]
        elif letter.match(text):
            kind = IDENTIFIER
        elif text == '...':
            kind = ELLIPSIS
        elif text == '.':
            kind = PERIOD
        elif text[0] in '0123456789.':
            kind = NUMBER
        elif text[0] == '"':
            kind = STRING
        elif text in opmap:
            kind = opmap[text]
        elif text == '\n':
            linestart = start
            line += 1
            kind = '\n'
        elif text[0] == "'":
            kind = CHARACTER
        elif text[0] == '#':
            kind = MACRO
        # A lone '/' is an operator and was handled above, so text[1] exists.
        elif text[0] == '/' and text[1] in '/*':
            kind = COMMENT
        else:
            lineend = src.find("\n", start)
            if lineend == -1:
                lineend = len(src)
            # Report the offending line without the newline that precedes it,
            # and the same 1-based column a Token at this position would get.
            # (Slicing from `linestart` itself would start at index -1 on the
            # first line, i.e. at the end of src.)
            raise make_syntax_error(
                f"Bad token: {text}",
                filename, line, start-linestart, src[linestart+1:lineend],
            )
        begin = line, start-linestart
        if kind == COMMENT:
            # A block comment may span lines: advance the line bookkeeping
            # to the comment's last line before computing the end position.
            newlines = text.count('\n')
            if newlines:
                linestart = start + text.rfind('\n')
                line += newlines
        if kind != "\n":
            yield Token(kind, text, begin, (line, start-linestart+len(text)))
|
|
|
|
|
|
|
|
|
|
|
|
# Export every module global whose name is unchanged by str.upper().
# Dunder names contain lowercase letters and are therefore excluded.
__all__ = [kind for kind in globals() if kind.upper() == kind]
|
|
|
|
|
|
|
|
|
|
|
|
def to_text(tkns: list[Token], dedent: int = 0) -> str:
    """Rebuild source text from *tkns*, keeping their relative layout.

    Tokens are placed according to their recorded begin positions: newlines
    are emitted to reach a token's line and spaces to reach its column.
    Output starts at the first token's line, so no leading blank lines are
    produced.

    *dedent* shifts everything left by that many columns (right if
    negative).  A token that would land left of the current position is
    emitted with no padding.  The continuation lines of a multi-line comment
    are shifted the same way; when dedenting, at most *dedent* leading
    spaces are removed from each of them.
    """
    res: list[str] = []
    line, col = -1, 1+dedent
    for tkn in tkns:
        if line == -1:
            # First token: start counting lines from where it begins.
            line, _ = tkn.begin
        tkn_line, tkn_col = tkn.begin
        while tkn_line > line:
            line += 1
            res.append('\n')
            col = 1+dedent
        # A negative count yields the empty string.
        res.append(' '*(tkn_col-col))
        text = tkn.text
        if dedent != 0 and tkn.kind == 'COMMENT' and '\n' in text:
            if dedent < 0:
                text = text.replace('\n', '\n' + ' '*-dedent)
            else:
                # Remove up to `dedent` leading spaces from every line after
                # the first; the first line was positioned by the padding
                # emitted above.
                first, *rest = text.split('\n')
                rest = [
                    ln[min(dedent, len(ln) - len(ln.lstrip(' '))):]
                    for ln in rest
                ]
                text = '\n'.join([first, *rest])
        res.append(text)
        line, col = tkn.end
    return ''.join(res)
|
|
|
|
|
|
|
|
|
|
|
|
# Command-line entry point: print every token of a C source.
#   <script> FILE        tokenize the contents of FILE
#   <script> -c SOURCE   tokenize the SOURCE string itself
if __name__ == "__main__":
    import sys

    # Exit with a usage message rather than an IndexError traceback when
    # the required arguments are missing.
    if len(sys.argv) < 2 or (sys.argv[1] == "-c" and len(sys.argv) < 3):
        sys.exit(f"Usage: {sys.argv[0]} (FILE | -c SOURCE)")
    filename = sys.argv[1]
    if filename == "-c":
        src = sys.argv[2]
    else:
        # Close the file as soon as its contents have been read.
        with open(filename) as infile:
            src = infile.read()
    # print(to_text(tokenize(src)))
    for tkn in tokenize(src, filename=filename):
        print(tkn)
|