cpython/Parser/pgen/pgen.py

"""Python parser generator


This parser generator transforms a Python grammar file into parsing tables
that can be consumed by Python's LL(1) parser written in C.

Concepts
--------

* An LL(1) parser (Left-to-right, Leftmost derivation, 1 token-lookahead) is a
  top-down parser for a subset of context-free languages. It parses the input
  from Left to right, performing Leftmost derivation of the sentence, and can
  only use 1 token of lookahead when parsing a sentence.

* A parsing table is a collection of data that a generic implementation of the
  LL(1) parser consumes to know how to parse a given context-free grammar. In
  this case the collection of data involves Deterministic Finite Automata,
  calculated first sets, keywords and transition labels.

* A grammar is defined by production rules (or just 'productions') that specify
  which symbols may replace which other symbols; these rules may be used to
  generate strings, or to parse them. Each such rule has a head, or left-hand
  side, which consists of the string that may be replaced, and a body, or
  right-hand side, which consists of a string that may replace it. In the
  Python grammar, rules are written in the form

  rule_name: rule_description;

  meaning the rule 'a: b' specifies that a can be replaced by b. A context-free
  grammar is a grammar in which the left-hand side of each production rule
  consists of only a single nonterminal symbol. Context-free grammars can
  always be recognized by a non-deterministic pushdown automaton.

* Terminal symbols are literal symbols which may appear in the outputs of the
  production rules of the grammar and which cannot be changed using the rules
  of the grammar. Applying the rules recursively to a source string of symbols
  will usually terminate in a final output string consisting only of terminal
  symbols.

* Nonterminal symbols are those symbols which can be replaced. The grammar
  includes a start symbol: a designated member of the set of nonterminals
  from which all the strings in the language may be derived by successive
  applications of the production rules.

* The language defined by the grammar is defined as the set of terminal strings
  that can be derived using the production rules.

* The first set of a rule (FIRST(rule)) is defined to be the set of terminals
  that can appear in the first position of any string derived from the rule.
  This is useful for LL(1) parsers as the parser is only allowed to look at the
  next token in the input to know which rule it needs to parse. For example,
  given this grammar:

  start: '(' A | B ')'
  A: 'a' '<'
  B: 'b' '<'

  and the input '(b<)' the parser can only look at 'b' to know if it needs
  to parse A or B. Because FIRST(A) = {'a'} and FIRST(B) = {'b'} it knows
  that it needs to continue parsing rule B because only that rule can start
  with 'b'.

Description
-----------

The input for the parser generator is a grammar in extended BNF form (using *
for repetition, + for at-least-once repetition, [] for optional parts, | for
alternatives and () for grouping).

Each rule in the grammar file is considered as a regular expression in its
own right. It is turned into a Non-deterministic Finite Automaton (NFA),
which is then turned into a Deterministic Finite Automaton (DFA), which is
then optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3,
or similar compiler books (this technique is more often used for lexical
analyzers).

The DFAs are used by the parser as parsing tables in a special way that's
probably unique. Before they are usable, the FIRST sets of all non-terminals
are computed so the LL(1) parser consuming the parsing tables can distinguish
between different transitions.

Reference
---------

[Aho&Ullman 77]
    Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
    (first edition)
"""

from ast import literal_eval
import collections

from . import grammar, token
from .automata import DFA
from .metaparser import GrammarParser

import enum


class LabelType(enum.Enum):
    """Category of a grammar label.

    Members and their values:

    * ``NONTERMINAL`` (0) -- assigned by ``Label._get_type`` to a label that
      starts with a letter and is not entirely uppercase.
    * ``NAMED_TOKEN`` (1) -- assigned to a label that starts with a letter
      and is entirely uppercase.
    * ``KEYWORD`` (2) -- assigned to a quoted literal whose content starts
      with a letter.
    * ``OPERATOR`` (3) -- assigned to a quoted literal whose content does
      not start with a letter.
    * ``NONE`` (4) -- not assigned by ``Label._get_type``.
    """

    # Unpacking assigns the values 0..4 in declaration order.
    NONTERMINAL, NAMED_TOKEN, KEYWORD, OPERATOR, NONE = range(5)


class Label(str):
    """A grammar label: a ``str`` that also records its ``LabelType``.

    The category is worked out from the spelling of the label when the
    instance is initialised and is stored in the ``type`` attribute.
    """

    def __init__(self, value):
        # The string content itself is already in place at this point; only
        # the classification remains to be done.
        self.type = self._get_type()

    def _get_type(self):
        """Return the ``LabelType`` that matches the spelling of this label.

        Raises AssertionError when a label that does not start with a letter
        is not wrapped in a matching pair of single or double quotes.
        """
        if not self[0].isalpha():
            # Keywords and operators are wrapped in quotes
            assert self[0] == self[-1] in ('"', "'"), self
            literal = literal_eval(self)
            if literal[0].isalpha():
                return LabelType.KEYWORD
            return LabelType.OPERATOR

        if self == self.upper():
            # NAMED tokens (ASYNC, NAME...) are all uppercase by convention
            return LabelType.NAMED_TOKEN

        # Anything alphabetic that is not all uppercase must be a
        # non terminal.
        return LabelType.NONTERMINAL

    def __repr__(self):
        # Shown as "<type>(<plain str repr>)".
        return "{}({})".format(self.type, str.__repr__(self))


class ParserGenerator(object):
    def __init__(self, grammar_file, token_file, verbose=False):
        """Read the grammar and token files and prepare the parser data.

        grammar_file -- path of the grammar file; its text is stored in
                        ``self.grammar`` and the path in ``self.filename``.
        token_file   -- path of the token definition file; its lines are
                        fed to ``token.generate_tokens`` and
                        ``token.generate_opmap``.
        verbose      -- when true, the build steps print diagnostics.

        The DFAs, the start symbol and the first sets are all computed
        here, so a constructed instance is ready for ``make_grammar``.
        """
        self.filename = grammar_file
        self.verbose = verbose

        with open(grammar_file) as grammar_stream:
            self.grammar = grammar_stream.read()

        with open(token_file) as token_stream:
            token_lines = token_stream.readlines()
        self.tokens = dict(token.generate_tokens(token_lines))
        operator_names = dict(token.generate_opmap(token_lines))
        # Manually add <> so it does not collide with !=
        operator_names["<>"] = "NOTEQUAL"
        self.opmap = operator_names

        # map from symbol name to set of tokens
        self.first = {}
        self.dfas, self.startsymbol = self.create_dfas()
        self.calculate_first_sets()

    def create_dfas(self):
        """Build one simplified DFA for every rule of the grammar.

        Returns a ``(dfas, start)`` pair.  ``dfas`` is an ordered mapping
        from DFA name to DFA, filled in the order in which the grammar
        parser yields the rules.  ``start`` is the name of the first DFA
        built, or ``None`` if the grammar parser yields nothing.
        """
        dfas_by_name = collections.OrderedDict()
        first_name = None

        for rule_nfa in GrammarParser(self.grammar).parse():
            if self.verbose:
                print("Dump of NFA for", rule_nfa.name)
                rule_nfa.dump()

            rule_dfa = DFA.from_nfa(rule_nfa)
            if self.verbose:
                # This dump happens before simplify() is applied.
                print("Dump of DFA for", rule_dfa.name)
                rule_dfa.dump()

            rule_dfa.simplify()
            dfas_by_name[rule_dfa.name] = rule_dfa

            # Remember the first name seen: it becomes the start symbol.
            if first_name is None:
                first_name = rule_dfa.name

        return dfas_by_name, first_name

    def make_grammar(self):
        """Build and return a ``grammar.Grammar`` filled from the DFAs.

        Every rule gets the number 256 plus the count of symbols already
        registered, with the start symbol handled first.  Each DFA is then
        flattened into a list of states, a state being a list of
        ``(label index, target state index)`` arcs; a final state also
        receives the arc ``(0, its own index)``.  The flattened states are
        appended to ``states`` and stored in ``dfas`` under the rule
        number, paired with the rule's first set.

        With ``self.verbose`` set, a short summary is printed at the end.
        """
        gram = grammar.Grammar()
        gram.all_labels = set()

        # Move the start symbol to the front; the other rules keep their
        # relative order.
        rule_names = list(self.dfas)
        rule_names.remove(self.startsymbol)
        rule_names.insert(0, self.startsymbol)

        # First pass: give every rule its symbol number.
        for rule_name in rule_names:
            number = 256 + len(gram.symbol2number)
            gram.symbol2number[Label(rule_name)] = number
            gram.number2symbol[number] = Label(rule_name)
            gram.all_labels.add(rule_name)

        # Second pass: flatten each DFA.  The order of the make_label calls
        # determines the label indices, so it must not be changed.
        for rule_name in rule_names:
            self.make_label(gram, rule_name)
            rule_dfa = self.dfas[rule_name]
            flat_states = []
            for state in rule_dfa:
                transitions = []
                for arc_label, target in sorted(state.arcs.items()):
                    gram.all_labels.add(arc_label)
                    label_index = self.make_label(gram, arc_label)
                    transitions.append((label_index, rule_dfa.states.index(target)))
                if state.is_final:
                    # Label 0 pointing back at the state marks it as final.
                    transitions.append((0, rule_dfa.states.index(state)))
                flat_states.append(transitions)
            gram.states.append(flat_states)
            first_set = self.make_first_sets(gram, rule_name)
            gram.dfas[gram.symbol2number[rule_name]] = (flat_states, first_set)

        gram.start = gram.symbol2number[self.startsymbol]

        if not self.verbose:
            return gram

        print("")
        print("Grammar summary")
        print("===============")

        print("- {n_labels} labels".format(n_labels=len(gram.labels)))
        print("- {n_dfas} dfas".format(n_dfas=len(gram.dfas)))
        print("- {n_tokens} tokens".format(n_tokens=len(gram.tokens)))
        print("- {n_keywords} keywords".format(n_keywords=len(gram.keywords)))
        start_symbol = gram.number2symbol[gram.start]
        print("- Start symbol: {start_symbol}".format(start_symbol=start_symbol))
        return gram

    def make_first_sets(self, c, name):
        """Return the first set of rule ``name`` as a set of label indices.

        Each raw label in ``self.first[name]`` is translated with
        ``make_label``, visiting the labels in sorted order so that new
        label indices are handed out in a stable sequence.
        """
        # Two different raw labels may translate to the same index, in
        # which case they are merged silently: an assertion on uniqueness
        # was tried here and failed on <> ... !=
        return {
            self.make_label(c, raw_label)
            for raw_label in sorted(self.first[name])
        }

    def make_label(self, c, label):
        """Return the index of ``label`` in ``c.labels``, adding it if new.

        The label is classified through ``Label`` and looked up in the
        registry that matches its type:

        * nonterminals in ``c.symbol2label``, stored as
          ``(symbol number, None)``;
        * keywords in ``c.keywords``, stored as ``(NAME token, keyword)``;
        * named tokens and operators in ``c.tokens``, stored as
          ``(token number, None)``.

        Raises AssertionError for a named token missing from
        ``self.tokens``, KeyError for an operator missing from
        ``self.opmap`` and ValueError for a label type that is not handled.
        """
        label = Label(label)
        # Index the label receives if it turns out to be new.
        fresh_index = len(c.labels)
        kind = label.type

        if kind == LabelType.NONTERMINAL:
            if label not in c.symbol2label:
                c.labels.append((c.symbol2number[label], None))
                c.symbol2label[label] = fresh_index
            return c.symbol2label[label]

        if kind == LabelType.KEYWORD:
            # Keywords are stored as NAME tokens carrying the keyword text.
            keyword = literal_eval(label)
            if keyword not in c.keywords:
                c.labels.append((self.tokens["NAME"], keyword))
                c.keywords[keyword] = fresh_index
            return c.keywords[keyword]

        # Named tokens and operators both end up in c.tokens, keyed by
        # token number; they only differ in how that number is found.
        if kind == LabelType.NAMED_TOKEN:
            # A named token (NAME, NUMBER, STRING)
            itoken = self.tokens.get(label, None)
            assert isinstance(itoken, int), label
            assert itoken in self.tokens.values(), label
        elif kind == LabelType.OPERATOR:
            # An operator (any non-numeric token)
            tok_name = self.opmap[literal_eval(label)]  # Fails if unknown token
            itoken = self.tokens[tok_name]
        else:
            raise ValueError("Cannot categorize label {}".format(label))

        if itoken not in c.tokens:
            c.labels.append((itoken, None))
            c.tokens[itoken] = fresh_index
        return c.tokens[itoken]

    def calculate_first_sets(self):
        """Compute the first set of every rule that does not have one yet.

        The results are stored in ``self.first``, which maps a rule name to
        the set of labels that can start it.  Rules already present in
        ``self.first`` are not recomputed.

        With ``self.verbose`` set, the first set of each rule is printed,
        one label per line and in sorted order.

        Raises ValueError (from ``calculate_first_sets_for_rule``) when a
        rule is left-recursive or ambiguous.
        """
        for name in list(self.dfas):
            if name not in self.first:
                self.calculate_first_sets_for_rule(name)

            if self.verbose:
                print("First set for {dfa_name}".format(dfa_name=name))
                # A set of strings iterates in an order that can change
                # from one interpreter run to the next (hash
                # randomization).  Sorting keeps the dump reproducible,
                # the same way make_first_sets sorts its labels.
                for item in sorted(self.first[name]):
                    print("    - {terminal}".format(terminal=item))

    def calculate_first_sets_for_rule(self, name):
        """Compute the first set of rule ``name`` and store it in ``self.first``.

        Only the arcs leaving the first state of the rule's DFA are
        examined.  An arc labelled with another rule contributes that
        rule's first set (computed recursively when needed); any other
        label contributes itself.

        Raises ValueError when the rule is left-recursive, or when the same
        symbol can start two different arcs of the first state.
        """
        dfa = self.dfas[name]
        # None marks the rule as "being computed"; meeting that marker
        # again further down means the rule is left-recursive.
        self.first[name] = None
        initial_state = dfa.states[0]

        combined = set()
        first_by_label = {}
        for label in initial_state.arcs:
            if label not in self.dfas:
                # Not a rule name: the label stands for itself.
                label_first = {label}
            else:
                if label not in self.first:
                    self.calculate_first_sets_for_rule(label)
                label_first = self.first[label]
                if label_first is None:
                    raise ValueError("recursion for rule %r" % name)
            combined.update(label_first)
            first_by_label[label] = label_first

        # Each symbol may start only one arc, otherwise a single token of
        # lookahead cannot select the arc to follow.
        claimed_by = {}
        for label, label_first in first_by_label.items():
            for symbol in label_first:
                if symbol in claimed_by:
                    raise ValueError(
                        "rule %s is ambiguous; %s is in the"
                        " first sets of %s as well as %s"
                        % (name, symbol, label, claimed_by[symbol])
                    )
                claimed_by[symbol] = label

        self.first[name] = combined