cpython/Parser/pgen/pgen.py

311 lines
12 KiB
Python
Raw Normal View History

"""Python parser generator
This parser generator transforms a Python grammar file into parsing tables
that can be consumed by Python's LL(1) parser written in C.
Concepts
--------
* An LL(1) parser (Left-to-right, Leftmost derivation, 1 token-lookahead) is a
top-down parser for a subset of context-free languages. It parses the input
from Left to right, performing Leftmost derivation of the sentence, and can
only use 1 token of lookahead when parsing a sentence.
* A parsing table is a collection of data that a generic implementation of the
LL(1) parser consumes to know how to parse a given context-free grammar. In
this case the collection of data involves Deterministic Finite Automatons,
calculated first sets, keywords and transition labels.
* A grammar is defined by production rules (or just 'productions') that specify
which symbols may replace which other symbols; these rules may be used to
generate strings, or to parse them. Each such rule has a head, or left-hand
side, which consists of the string that may be replaced, and a body, or
right-hand side, which consists of a string that may replace it. In the
Python grammar, rules are written in the form
rule_name: rule_description;
meaning the rule 'a: b' specifies that a can be replaced by b. A context-free
grammar is a grammar in which the left-hand side of each production rule
consists of only a single nonterminal symbol. Context-free grammars can
always be recognized by a Non-Deterministic Automatons.
* Terminal symbols are literal symbols which may appear in the outputs of the
production rules of the grammar and which cannot be changed using the rules
of the grammar. Applying the rules recursively to a source string of symbols
will usually terminate in a final output string consisting only of terminal
symbols.
* Nonterminal symbols are those symbols which can be replaced. The grammar
includes a start symbol a designated member of the set of nonterminals from
which all the strings in the language may be derived by successive
applications of the production rules.
* The language defined by the grammar is defined as the set of terminal strings
that can be derived using the production rules.
* The first sets of a rule (FIRST(rule)) are defined to be the set of terminals
that can appear in the first position of any string derived from the rule.
This is useful for LL(1) parsers as the parser is only allowed to look at the
next token in the input to know which rule needs to parse. For example, given
this grammar:
start: '(' A | B ')'
A: 'a' '<'
B: 'b' '<'
and the input '(b<)' the parser can only look at 'b' to know if it needs
to parse A o B. Because FIRST(A) = {'a'} and FIRST(B) = {'b'} it knows
that needs to continue parsing rule B because only that rule can start
with 'b'.
Description
-----------
The input for the parser generator is a grammar in extended BNF form (using *
for repetition, + for at-least-once repetition, [] for optional parts, | for
alternatives and () for grouping).
Each rule in the grammar file is considered as a regular expression in its
own right. It is turned into a Non-deterministic Finite Automaton (NFA),
which is then turned into a Deterministic Finite Automaton (DFA), which is
then optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3,
or similar compiler books (this technique is more often used for lexical
analyzers).
The DFA's are used by the parser as parsing tables in a special way that's
probably unique. Before they are usable, the FIRST sets of all non-terminals
are computed so the LL(1) parser consuming the parsing tables can distinguish
between different transitions.
Reference
---------
[Aho&Ullman 77]
Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
(first edition)
"""
from ast import literal_eval
import collections
from . import grammar, token
from .automata import DFA
from .metaparser import GrammarParser
import enum
class LabelType(enum.Enum):
NONTERMINAL = 0
NAMED_TOKEN = 1
KEYWORD = 2
OPERATOR = 3
NONE = 4
class Label(str):
def __init__(self, value):
self.type = self._get_type()
def _get_type(self):
if self[0].isalpha():
if self.upper() == self:
# NAMED tokens (ASYNC, NAME...) are all uppercase by convention
return LabelType.NAMED_TOKEN
else:
# If is not uppercase it must be a non terminal.
return LabelType.NONTERMINAL
else:
# Keywords and operators are wrapped in quotes
assert self[0] == self[-1] in ('"', "'"), self
value = literal_eval(self)
if value[0].isalpha():
return LabelType.KEYWORD
else:
return LabelType.OPERATOR
def __repr__(self):
return "{}({})".format(self.type, super().__repr__())
class ParserGenerator(object):
def __init__(self, grammar_file, token_file, verbose=False, graph_file=None):
with open(grammar_file) as f:
self.grammar = f.read()
with open(token_file) as tok_file:
token_lines = tok_file.readlines()
self.tokens = dict(token.generate_tokens(token_lines))
self.opmap = dict(token.generate_opmap(token_lines))
# Manually add <> so it does not collide with !=
self.opmap["<>"] = "NOTEQUAL"
self.verbose = verbose
self.filename = grammar_file
self.graph_file = graph_file
self.dfas, self.startsymbol = self.create_dfas()
self.first = {} # map from symbol name to set of tokens
self.calculate_first_sets()
def create_dfas(self):
rule_to_dfas = collections.OrderedDict()
start_nonterminal = None
for nfa in GrammarParser(self.grammar).parse():
if self.verbose:
print("Dump of NFA for", nfa.name)
nfa.dump()
if self.graph_file is not None:
nfa.dump_graph(self.graph_file.write)
dfa = DFA.from_nfa(nfa)
if self.verbose:
print("Dump of DFA for", dfa.name)
dfa.dump()
dfa.simplify()
if self.graph_file is not None:
dfa.dump_graph(self.graph_file.write)
rule_to_dfas[dfa.name] = dfa
if start_nonterminal is None:
start_nonterminal = dfa.name
return rule_to_dfas, start_nonterminal
def make_grammar(self):
c = grammar.Grammar()
c.all_labels = set()
names = list(self.dfas.keys())
names.remove(self.startsymbol)
names.insert(0, self.startsymbol)
for name in names:
i = 256 + len(c.symbol2number)
c.symbol2number[Label(name)] = i
c.number2symbol[i] = Label(name)
c.all_labels.add(name)
for name in names:
self.make_label(c, name)
dfa = self.dfas[name]
states = []
for state in dfa:
arcs = []
for label, next in sorted(state.arcs.items()):
c.all_labels.add(label)
arcs.append((self.make_label(c, label), dfa.states.index(next)))
if state.is_final:
arcs.append((0, dfa.states.index(state)))
states.append(arcs)
c.states.append(states)
c.dfas[c.symbol2number[name]] = (states, self.make_first_sets(c, name))
c.start = c.symbol2number[self.startsymbol]
if self.verbose:
print("")
print("Grammar summary")
print("===============")
print("- {n_labels} labels".format(n_labels=len(c.labels)))
print("- {n_dfas} dfas".format(n_dfas=len(c.dfas)))
print("- {n_tokens} tokens".format(n_tokens=len(c.tokens)))
print("- {n_keywords} keywords".format(n_keywords=len(c.keywords)))
print(
"- Start symbol: {start_symbol}".format(
start_symbol=c.number2symbol[c.start]
)
)
return c
def make_first_sets(self, c, name):
rawfirst = self.first[name]
first = set()
for label in sorted(rawfirst):
ilabel = self.make_label(c, label)
##assert ilabel not in first # XXX failed on <> ... !=
first.add(ilabel)
return first
def make_label(self, c, label):
label = Label(label)
ilabel = len(c.labels)
if label.type == LabelType.NONTERMINAL:
if label in c.symbol2label:
return c.symbol2label[label]
else:
c.labels.append((c.symbol2number[label], None))
c.symbol2label[label] = ilabel
return ilabel
elif label.type == LabelType.NAMED_TOKEN:
# A named token (NAME, NUMBER, STRING)
itoken = self.tokens.get(label, None)
assert isinstance(itoken, int), label
assert itoken in self.tokens.values(), label
if itoken in c.tokens:
return c.tokens[itoken]
else:
c.labels.append((itoken, None))
c.tokens[itoken] = ilabel
return ilabel
elif label.type == LabelType.KEYWORD:
# A keyword
value = literal_eval(label)
if value in c.keywords:
return c.keywords[value]
else:
c.labels.append((self.tokens["NAME"], value))
c.keywords[value] = ilabel
return ilabel
elif label.type == LabelType.OPERATOR:
# An operator (any non-numeric token)
value = literal_eval(label)
tok_name = self.opmap[value] # Fails if unknown token
itoken = self.tokens[tok_name]
if itoken in c.tokens:
return c.tokens[itoken]
else:
c.labels.append((itoken, None))
c.tokens[itoken] = ilabel
return ilabel
else:
raise ValueError("Cannot categorize label {}".format(label))
def calculate_first_sets(self):
names = list(self.dfas.keys())
for name in names:
if name not in self.first:
self.calculate_first_sets_for_rule(name)
if self.verbose:
print("First set for {dfa_name}".format(dfa_name=name))
for item in self.first[name]:
print(" - {terminal}".format(terminal=item))
def calculate_first_sets_for_rule(self, name):
dfa = self.dfas[name]
self.first[name] = None # dummy to detect left recursion
state = dfa.states[0]
totalset = set()
overlapcheck = {}
for label, next in state.arcs.items():
if label in self.dfas:
if label in self.first:
fset = self.first[label]
if fset is None:
raise ValueError("recursion for rule %r" % name)
else:
self.calculate_first_sets_for_rule(label)
fset = self.first[label]
totalset.update(fset)
overlapcheck[label] = fset
else:
totalset.add(label)
overlapcheck[label] = {label}
inverse = {}
for label, itsfirst in overlapcheck.items():
for symbol in itsfirst:
if symbol in inverse:
raise ValueError(
"rule %s is ambiguous; %s is in the"
" first sets of %s as well as %s"
% (name, symbol, label, inverse[symbol])
)
inverse[symbol] = label
self.first[name] = totalset