"""Python parser generator This parser generator transforms a Python grammar file into parsing tables that can be consumed by Python's LL(1) parser written in C. Concepts -------- * An LL(1) parser (Left-to-right, Leftmost derivation, 1 token-lookahead) is a top-down parser for a subset of context-free languages. It parses the input from Left to right, performing Leftmost derivation of the sentence, and can only use 1 token of lookahead when parsing a sentence. * A parsing table is a collection of data that a generic implementation of the LL(1) parser consumes to know how to parse a given context-free grammar. In this case the collection of data involves Deterministic Finite Automatons, calculated first sets, keywords and transition labels. * A grammar is defined by production rules (or just 'productions') that specify which symbols may replace which other symbols; these rules may be used to generate strings, or to parse them. Each such rule has a head, or left-hand side, which consists of the string that may be replaced, and a body, or right-hand side, which consists of a string that may replace it. In the Python grammar, rules are written in the form rule_name: rule_description; meaning the rule 'a: b' specifies that a can be replaced by b. A context-free grammar is a grammar in which the left-hand side of each production rule consists of only a single nonterminal symbol. Context-free grammars can always be recognized by a Non-Deterministic Automatons. * Terminal symbols are literal symbols which may appear in the outputs of the production rules of the grammar and which cannot be changed using the rules of the grammar. Applying the rules recursively to a source string of symbols will usually terminate in a final output string consisting only of terminal symbols. * Nonterminal symbols are those symbols which can be replaced. The grammar includes a start symbol a designated member of the set of nonterminals from which all the strings in the language may be derived by successive applications of the production rules. * The language defined by the grammar is defined as the set of terminal strings that can be derived using the production rules. * The first sets of a rule (FIRST(rule)) are defined to be the set of terminals that can appear in the first position of any string derived from the rule. This is useful for LL(1) parsers as the parser is only allowed to look at the next token in the input to know which rule needs to parse. For example, given this grammar: start: '(' A | B ')' A: 'a' '<' B: 'b' '<' and the input '(b<)' the parser can only look at 'b' to know if it needs to parse A o B. Because FIRST(A) = {'a'} and FIRST(B) = {'b'} it knows that needs to continue parsing rule B because only that rule can start with 'b'. Description ----------- The input for the parser generator is a grammar in extended BNF form (using * for repetition, + for at-least-once repetition, [] for optional parts, | for alternatives and () for grouping). Each rule in the grammar file is considered as a regular expression in its own right. It is turned into a Non-deterministic Finite Automaton (NFA), which is then turned into a Deterministic Finite Automaton (DFA), which is then optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3, or similar compiler books (this technique is more often used for lexical analyzers). The DFA's are used by the parser as parsing tables in a special way that's probably unique. 
Before they are usable, the FIRST sets of all non-terminals are computed so
the LL(1) parser consuming the parsing tables can distinguish between
different transitions.

Reference
---------

[Aho&Ullman 77]
    Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
    (first edition)
"""

from ast import literal_eval
import collections

from . import grammar, token
from .automata import DFA
from .metaparser import GrammarParser

import enum


class LabelType(enum.Enum):
    NONTERMINAL = 0
    NAMED_TOKEN = 1
    KEYWORD = 2
    OPERATOR = 3
    NONE = 4


class Label(str):
    def __init__(self, value):
        self.type = self._get_type()

    def _get_type(self):
        if self[0].isalpha():
            if self.upper() == self:
                # NAMED tokens (ASYNC, NAME...) are all uppercase by convention
                return LabelType.NAMED_TOKEN
            else:
                # If it is not uppercase, it must be a nonterminal.
                return LabelType.NONTERMINAL
        else:
            # Keywords and operators are wrapped in quotes
            assert self[0] == self[-1] in ('"', "'"), self
            value = literal_eval(self)
            if value[0].isalpha():
                return LabelType.KEYWORD
            else:
                return LabelType.OPERATOR

    def __repr__(self):
        return "{}({})".format(self.type, super().__repr__())


class ParserGenerator(object):
    def __init__(self, grammar_file, token_file, verbose=False, graph_file=None):
        with open(grammar_file) as f:
            self.grammar = f.read()
        with open(token_file) as tok_file:
            token_lines = tok_file.readlines()
        self.tokens = dict(token.generate_tokens(token_lines))
        self.opmap = dict(token.generate_opmap(token_lines))
        # Manually add <> so it does not collide with !=
        self.opmap["<>"] = "NOTEQUAL"
        self.verbose = verbose
        self.filename = grammar_file
        self.graph_file = graph_file
        self.dfas, self.startsymbol = self.create_dfas()
        self.first = {}  # map from symbol name to set of tokens
        self.calculate_first_sets()

    def create_dfas(self):
        rule_to_dfas = collections.OrderedDict()
        start_nonterminal = None
        for nfa in GrammarParser(self.grammar).parse():
            if self.verbose:
                print("Dump of NFA for", nfa.name)
                nfa.dump()
            if self.graph_file is not None:
                nfa.dump_graph(self.graph_file.write)
            dfa = DFA.from_nfa(nfa)
            if self.verbose:
                print("Dump of DFA for", dfa.name)
                dfa.dump()
            dfa.simplify()
            if self.graph_file is not None:
                dfa.dump_graph(self.graph_file.write)
            rule_to_dfas[dfa.name] = dfa

            if start_nonterminal is None:
                start_nonterminal = dfa.name

        return rule_to_dfas, start_nonterminal

    def make_grammar(self):
        c = grammar.Grammar()
        c.all_labels = set()
        names = list(self.dfas.keys())
        names.remove(self.startsymbol)
        names.insert(0, self.startsymbol)
        for name in names:
            i = 256 + len(c.symbol2number)
            c.symbol2number[Label(name)] = i
            c.number2symbol[i] = Label(name)
            c.all_labels.add(name)
        for name in names:
            self.make_label(c, name)
            dfa = self.dfas[name]
            states = []
            for state in dfa:
                arcs = []
                for label, next in sorted(state.arcs.items()):
                    c.all_labels.add(label)
                    arcs.append((self.make_label(c, label), dfa.states.index(next)))
                if state.is_final:
                    arcs.append((0, dfa.states.index(state)))
                states.append(arcs)
            c.states.append(states)
            c.dfas[c.symbol2number[name]] = (states, self.make_first_sets(c, name))
        c.start = c.symbol2number[self.startsymbol]

        if self.verbose:
            print("")
            print("Grammar summary")
            print("===============")
            print("- {n_labels} labels".format(n_labels=len(c.labels)))
            print("- {n_dfas} dfas".format(n_dfas=len(c.dfas)))
            print("- {n_tokens} tokens".format(n_tokens=len(c.tokens)))
            print("- {n_keywords} keywords".format(n_keywords=len(c.keywords)))
            print(
                "- Start symbol: {start_symbol}".format(
                    start_symbol=c.number2symbol[c.start]
                )
            )
        return c

    def make_first_sets(self, c, name):
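        # Translate the raw FIRST set of grammar labels computed for `name`
        # into the integer label indices that the parsing tables use.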
        rawfirst = self.first[name]
        first = set()
        for label in sorted(rawfirst):
            ilabel = self.make_label(c, label)
            ##assert ilabel not in first # XXX failed on <> ... !=
            first.add(ilabel)
        return first

    def make_label(self, c, label):
        label = Label(label)
        ilabel = len(c.labels)

        if label.type == LabelType.NONTERMINAL:
            if label in c.symbol2label:
                return c.symbol2label[label]
            else:
                c.labels.append((c.symbol2number[label], None))
                c.symbol2label[label] = ilabel
                return ilabel
        elif label.type == LabelType.NAMED_TOKEN:
            # A named token (NAME, NUMBER, STRING)
            itoken = self.tokens.get(label, None)
            assert isinstance(itoken, int), label
            assert itoken in self.tokens.values(), label
            if itoken in c.tokens:
                return c.tokens[itoken]
            else:
                c.labels.append((itoken, None))
                c.tokens[itoken] = ilabel
                return ilabel
        elif label.type == LabelType.KEYWORD:
            # A keyword
            value = literal_eval(label)
            if value in c.keywords:
                return c.keywords[value]
            else:
                c.labels.append((self.tokens["NAME"], value))
                c.keywords[value] = ilabel
                return ilabel
        elif label.type == LabelType.OPERATOR:
            # An operator (any non-numeric token)
            value = literal_eval(label)
            tok_name = self.opmap[value]  # Fails if unknown token
            itoken = self.tokens[tok_name]
            if itoken in c.tokens:
                return c.tokens[itoken]
            else:
                c.labels.append((itoken, None))
                c.tokens[itoken] = ilabel
                return ilabel
        else:
            raise ValueError("Cannot categorize label {}".format(label))

    def calculate_first_sets(self):
        names = list(self.dfas.keys())
        for name in names:
            if name not in self.first:
                self.calculate_first_sets_for_rule(name)

            if self.verbose:
                print("First set for {dfa_name}".format(dfa_name=name))
                for item in self.first[name]:
                    print(" - {terminal}".format(terminal=item))

    def calculate_first_sets_for_rule(self, name):
        dfa = self.dfas[name]
        self.first[name] = None  # dummy to detect left recursion
        state = dfa.states[0]
        totalset = set()
        overlapcheck = {}
        for label, next in state.arcs.items():
            if label in self.dfas:
                if label in self.first:
                    fset = self.first[label]
                    if fset is None:
                        raise ValueError("recursion for rule %r" % name)
                else:
                    self.calculate_first_sets_for_rule(label)
                    fset = self.first[label]
                totalset.update(fset)
                overlapcheck[label] = fset
            else:
                totalset.add(label)
                overlapcheck[label] = {label}

        inverse = {}
        for label, itsfirst in overlapcheck.items():
            for symbol in itsfirst:
                if symbol in inverse:
                    raise ValueError(
                        "rule %s is ambiguous; %s is in the"
                        " first sets of %s as well as %s"
                        % (name, symbol, label, inverse[symbol])
                    )
                inverse[symbol] = label
        self.first[name] = totalset
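

# A minimal usage sketch (illustrative addition, not part of the original
# module). The file paths are assumptions: in CPython this generator is
# normally driven from the package's command-line entry point with the
# Grammar/Grammar and Grammar/Tokens files, and the resulting tables are then
# emitted as graminit.h / graminit.c. Because this module uses relative
# imports, the sketch only runs when invoked through its package
# (e.g. `python -m <package>.pgen Grammar/Grammar Grammar/Tokens`).
if __name__ == "__main__":
    import sys

    grammar_file, tokens_file = sys.argv[1], sys.argv[2]
    generator = ParserGenerator(grammar_file, tokens_file, verbose=True)
    tables = generator.make_grammar()
    print("Start symbol:", tables.number2symbol[tables.start])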