"""Parser for the Python metagrammar""" import io import tokenize # from stdlib from .automata import NFA, NFAState class GrammarParser: """Parser for Python grammar files.""" _translation_table = { tokenize.NAME: "NAME", tokenize.STRING: "STRING", tokenize.NEWLINE: "NEWLINE", tokenize.NL: "NL", tokenize.OP: "OP", tokenize.ENDMARKER: "ENDMARKER", tokenize.COMMENT: "COMMENT", } def __init__(self, grammar): self.grammar = grammar grammar_adaptor = io.StringIO(grammar) self.generator = tokenize.generate_tokens(grammar_adaptor.readline) self._gettoken() # Initialize lookahead self._current_rule_name = None def parse(self): """Turn the grammar into a collection of NFAs""" # grammar: (NEWLINE | rule)* ENDMARKER while self.type != tokenize.ENDMARKER: while self.type == tokenize.NEWLINE: self._gettoken() # rule: NAME ':' rhs NEWLINE self._current_rule_name = self._expect(tokenize.NAME) self._expect(tokenize.OP, ":") a, z = self._parse_rhs() self._expect(tokenize.NEWLINE) yield NFA(a, z) def _parse_rhs(self): # rhs: items ('|' items)* a, z = self._parse_items() if self.value != "|": return a, z else: aa = NFAState(self._current_rule_name) zz = NFAState(self._current_rule_name) while True: # Allow to transit directly to the previous state and connect the end of the # previous state to the end of the current one, effectively allowing to skip # the current state. aa.add_arc(a) z.add_arc(zz) if self.value != "|": break self._gettoken() a, z = self._parse_items() return aa, zz def _parse_items(self): # items: item+ a, b = self._parse_item() while self.type in (tokenize.NAME, tokenize.STRING) or self.value in ("(", "["): c, d = self._parse_item() # Allow a transition between the end of the previous state # and the beginning of the new one, connecting all the items # together. In this way we can only reach the end if we visit # all the items. b.add_arc(c) b = d return a, b def _parse_item(self): # item: '[' rhs ']' | atom ['+' | '*'] if self.value == "[": self._gettoken() a, z = self._parse_rhs() self._expect(tokenize.OP, "]") # Make a transition from the beginning to the end so it is possible to # advance for free to the next state of this item # without consuming # anything from the rhs. a.add_arc(z) return a, z else: a, z = self._parse_atom() value = self.value if value not in ("+", "*"): return a, z self._gettoken() z.add_arc(a) if value == "+": # Create a cycle to the beginning so we go back to the old state in this # item and repeat. return a, z else: # The end state is the same as the beginning, so we can cycle arbitrarily # and end in the beginning if necessary. return a, a def _parse_atom(self): # atom: '(' rhs ')' | NAME | STRING if self.value == "(": self._gettoken() a, z = self._parse_rhs() self._expect(tokenize.OP, ")") return a, z elif self.type in (tokenize.NAME, tokenize.STRING): a = NFAState(self._current_rule_name) z = NFAState(self._current_rule_name) # We can transit to the next state only if we consume the value. a.add_arc(z, self.value) self._gettoken() return a, z else: self._raise_error( "expected (...) 
    def _expect(self, type_, value=None):
        if self.type != type_:
            self._raise_error(
                "expected {}, got {} ({})",
                self._translation_table.get(type_, type_),
                self._translation_table.get(self.type, self.type),
                self.value,
            )
        if value is not None and self.value != value:
            self._raise_error("expected {}, got {}", value, self.value)
        value = self.value
        self._gettoken()
        return value

    def _gettoken(self):
        # Advance to the next significant token, skipping comments and
        # non-logical newlines.
        tup = next(self.generator)
        while tup[0] in (tokenize.COMMENT, tokenize.NL):
            tup = next(self.generator)
        self.type, self.value, self.begin, self.end, self.line = tup

    def _raise_error(self, msg, *args):
        if args:
            try:
                msg = msg.format(*args)
            except Exception:
                msg = " ".join([msg] + list(map(str, args)))
        line = self.grammar.splitlines()[self.begin[0] - 1]
        raise SyntaxError(msg, ("", self.begin[0], self.begin[1], line))
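

# A minimal usage sketch, not part of the module above: parse a tiny one-rule
# grammar and collect the resulting NFAs. The grammar text is hypothetical;
# any text following the metagrammar (rule: NAME ':' rhs NEWLINE) is handled
# the same way. Because of the relative import of .automata, this only runs
# as part of its package (e.g. via `python -m <package>.<module>`).
if __name__ == "__main__":
    example_grammar = "term: NAME ('+' NAME)* NEWLINE\n"
    # parse() yields one NFA per top-level rule, in source order.
    nfas = list(GrammarParser(example_grammar).parse())
    print("parsed {} rule(s)".format(len(nfas)))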