cpython/Parser/pgen/pgen.py

import collections
import tokenize  # from stdlib

from . import grammar, token


class ParserGenerator(object):

    def __init__(self, grammar_file, token_file, stream=None, verbose=False):
        close_stream = None
        if stream is None:
            stream = open(grammar_file)
            close_stream = stream.close
        with open(token_file) as tok_file:
            token_lines = tok_file.readlines()
        self.tokens = dict(token.generate_tokens(token_lines))
        self.opmap = dict(token.generate_opmap(token_lines))
        # Manually add <> so it does not collide with !=
        self.opmap['<>'] = "NOTEQUAL"
        self.verbose = verbose
        self.filename = grammar_file
        self.stream = stream
        self.generator = tokenize.generate_tokens(stream.readline)
        self.gettoken() # Initialize lookahead
        self.dfas, self.startsymbol = self.parse()
        if close_stream is not None:
            close_stream()
        self.first = {} # map from symbol name to set of tokens
        self.addfirstsets()

    def make_grammar(self):
        c = grammar.Grammar()
        names = list(self.dfas.keys())
        names.remove(self.startsymbol)
        names.insert(0, self.startsymbol)
        for name in names:
            i = 256 + len(c.symbol2number)
            c.symbol2number[name] = i
            c.number2symbol[i] = name
        for name in names:
            self.make_label(c, name)
            dfa = self.dfas[name]
            states = []
            for state in dfa:
                arcs = []
                for label, next in sorted(state.arcs.items()):
                    arcs.append((self.make_label(c, label), dfa.index(next)))
                if state.isfinal:
                    arcs.append((0, dfa.index(state)))
                states.append(arcs)
            c.states.append(states)
            c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
        c.start = c.symbol2number[self.startsymbol]

        if self.verbose:
            print("")
            print("Grammar summary")
            print("===============")

            print("- {n_labels} labels".format(n_labels=len(c.labels)))
            print("- {n_dfas} dfas".format(n_dfas=len(c.dfas)))
            print("- {n_tokens} tokens".format(n_tokens=len(c.tokens)))
            print("- {n_keywords} keywords".format(n_keywords=len(c.keywords)))
            print(
                "- Start symbol: {start_symbol}".format(
                    start_symbol=c.number2symbol[c.start]
                )
            )
        return c

    def make_first(self, c, name):
        rawfirst = self.first[name]
        first = set()
        for label in sorted(rawfirst):
            ilabel = self.make_label(c, label)
            ##assert ilabel not in first # XXX failed on <> ... !=
            first.add(ilabel)
        return first

    def make_label(self, c, label):
        # XXX Maybe this should be a method on a subclass of converter?
        ilabel = len(c.labels)
        if label[0].isalpha():
            # Either a symbol name or a named token
            if label in c.symbol2number:
                # A symbol name (a non-terminal)
                if label in c.symbol2label:
                    return c.symbol2label[label]
                else:
                    c.labels.append((c.symbol2number[label], None))
                    c.symbol2label[label] = ilabel
                    return ilabel
            else:
                # A named token (NAME, NUMBER, STRING)
                itoken = self.tokens.get(label, None)
                assert isinstance(itoken, int), label
                assert itoken in self.tokens.values(), label
                if itoken in c.tokens:
                    return c.tokens[itoken]
                else:
                    c.labels.append((itoken, None))
                    c.tokens[itoken] = ilabel
                    return ilabel
        else:
            # Either a keyword or an operator
            assert label[0] in ('"', "'"), label
            value = eval(label)
            if value[0].isalpha():
                # A keyword
                if value in c.keywords:
                    return c.keywords[value]
                else:
                    c.labels.append((self.tokens["NAME"], value))
                    c.keywords[value] = ilabel
                    return ilabel
            else:
                # An operator (any non-numeric token)
                tok_name = self.opmap[value] # Fails if unknown token
                itoken = self.tokens[tok_name]
                if itoken in c.tokens:
                    return c.tokens[itoken]
                else:
                    c.labels.append((itoken, None))
                    c.tokens[itoken] = ilabel
                    return ilabel

    def addfirstsets(self):
        names = list(self.dfas.keys())
        for name in names:
            if name not in self.first:
                self.calcfirst(name)

            if self.verbose:
                print("First set for {dfa_name}".format(dfa_name=name))
                for item in self.first[name]:
                    print("    - {terminal}".format(terminal=item))

    def calcfirst(self, name):
        dfa = self.dfas[name]
        self.first[name] = None # dummy to detect left recursion
        state = dfa[0]
        totalset = set()
        overlapcheck = {}
        for label, next in state.arcs.items():
            if label in self.dfas:
                if label in self.first:
                    fset = self.first[label]
                    if fset is None:
                        raise ValueError("recursion for rule %r" % name)
                else:
                    self.calcfirst(label)
                    fset = self.first[label]
                totalset.update(fset)
                overlapcheck[label] = fset
            else:
                totalset.add(label)
                overlapcheck[label] = {label}
        inverse = {}
        for label, itsfirst in overlapcheck.items():
            for symbol in itsfirst:
                if symbol in inverse:
                    raise ValueError("rule %s is ambiguous; %s is in the"
                                     " first sets of %s as well as %s" %
                                     (name, symbol, label, inverse[symbol]))
                inverse[symbol] = label
        self.first[name] = totalset

    def parse(self):
        dfas = collections.OrderedDict()
        startsymbol = None
        # MSTART: (NEWLINE | RULE)* ENDMARKER
        while self.type != tokenize.ENDMARKER:
            while self.type == tokenize.NEWLINE:
                self.gettoken()
            # RULE: NAME ':' RHS NEWLINE
            name = self.expect(tokenize.NAME)
            if self.verbose:
                print("Processing rule {dfa_name}".format(dfa_name=name))
            self.expect(tokenize.OP, ":")
            a, z = self.parse_rhs()
            self.expect(tokenize.NEWLINE)
            if self.verbose:
                self.dump_nfa(name, a, z)
            dfa = self.make_dfa(a, z)
            if self.verbose:
                self.dump_dfa(name, dfa)
            self.simplify_dfa(dfa)
            dfas[name] = dfa
            if startsymbol is None:
                startsymbol = name
        return dfas, startsymbol

    def make_dfa(self, start, finish):
        # To turn an NFA into a DFA, we define the states of the DFA
        # to correspond to *sets* of states of the NFA.  Then do some
        # state reduction.  Let's represent sets as dicts with 1 for
        # values.
        assert isinstance(start, NFAState)
        assert isinstance(finish, NFAState)
        def closure(state):
            base = set()
            addclosure(state, base)
            return base
        def addclosure(state, base):
            assert isinstance(state, NFAState)
            if state in base:
                return
            base.add(state)
            for label, next in state.arcs:
                if label is None:
                    addclosure(next, base)
        states = [DFAState(closure(start), finish)]
        for state in states: # NB states grows while we're iterating
            arcs = {}
            for nfastate in state.nfaset:
                for label, next in nfastate.arcs:
                    if label is not None:
                        addclosure(next, arcs.setdefault(label, set()))
            for label, nfaset in sorted(arcs.items()):
                for st in states:
                    if st.nfaset == nfaset:
                        break
                else:
                    st = DFAState(nfaset, finish)
                    states.append(st)
                state.addarc(st, label)
        return states # List of DFAState instances; first one is start

    def dump_nfa(self, name, start, finish):
        print("Dump of NFA for", name)
        todo = [start]
        for i, state in enumerate(todo):
            print("  State", i, state is finish and "(final)" or "")
            for label, next in state.arcs:
                if next in todo:
                    j = todo.index(next)
                else:
                    j = len(todo)
                    todo.append(next)
                if label is None:
                    print("    -> %d" % j)
                else:
                    print("    %s -> %d" % (label, j))

    def dump_dfa(self, name, dfa):
        print("Dump of DFA for", name)
        for i, state in enumerate(dfa):
            print("  State", i, state.isfinal and "(final)" or "")
            for label, next in sorted(state.arcs.items()):
                print("    %s -> %d" % (label, dfa.index(next)))

    def simplify_dfa(self, dfa):
        # This is not theoretically optimal, but works well enough.
        # Algorithm: repeatedly look for two states that have the same
        # set of arcs (same labels pointing to the same nodes) and
        # unify them, until things stop changing.

        # dfa is a list of DFAState instances
        changes = True
        while changes:
            changes = False
            for i, state_i in enumerate(dfa):
                for j in range(i+1, len(dfa)):
                    state_j = dfa[j]
                    if state_i == state_j:
                        #print "  unify", i, j
                        del dfa[j]
                        for state in dfa:
                            state.unifystate(state_j, state_i)
                        changes = True
                        break

    def parse_rhs(self):
        # RHS: ALT ('|' ALT)*
        a, z = self.parse_alt()
        if self.value != "|":
            return a, z
        else:
            aa = NFAState()
            zz = NFAState()
            aa.addarc(a)
            z.addarc(zz)
            while self.value == "|":
                self.gettoken()
                a, z = self.parse_alt()
                aa.addarc(a)
                z.addarc(zz)
            return aa, zz

    def parse_alt(self):
        # ALT: ITEM+
        a, b = self.parse_item()
        while (self.value in ("(", "[") or
               self.type in (tokenize.NAME, tokenize.STRING)):
            c, d = self.parse_item()
            b.addarc(c)
            b = d
        return a, b

    def parse_item(self):
        # ITEM: '[' RHS ']' | ATOM ['+' | '*']
        if self.value == "[":
            self.gettoken()
            a, z = self.parse_rhs()
            self.expect(tokenize.OP, "]")
            a.addarc(z)
            return a, z
        else:
            a, z = self.parse_atom()
            value = self.value
            if value not in ("+", "*"):
                return a, z
            self.gettoken()
            z.addarc(a)
            if value == "+":
                return a, z
            else:
                return a, a

    def parse_atom(self):
        # ATOM: '(' RHS ')' | NAME | STRING
        if self.value == "(":
            self.gettoken()
            a, z = self.parse_rhs()
            self.expect(tokenize.OP, ")")
            return a, z
        elif self.type in (tokenize.NAME, tokenize.STRING):
            a = NFAState()
            z = NFAState()
            a.addarc(z, self.value)
            self.gettoken()
            return a, z
        else:
            self.raise_error("expected (...) or NAME or STRING, got %s/%s",
                             self.type, self.value)

    def expect(self, type, value=None):
        if self.type != type or (value is not None and self.value != value):
            self.raise_error("expected %s/%s, got %s/%s",
                             type, value, self.type, self.value)
        value = self.value
        self.gettoken()
        return value

    def gettoken(self):
        tup = next(self.generator)
        while tup[0] in (tokenize.COMMENT, tokenize.NL):
            tup = next(self.generator)
        self.type, self.value, self.begin, self.end, self.line = tup
        # print(getattr(tokenize, 'tok_name')[self.type], repr(self.value))

    def raise_error(self, msg, *args):
        if args:
            try:
                msg = msg % args
            except Exception:
                msg = " ".join([msg] + list(map(str, args)))
        raise SyntaxError(msg, (self.filename, self.end[0],
                                self.end[1], self.line))

class NFAState(object):

    def __init__(self):
        self.arcs = [] # list of (label, NFAState) pairs

    def addarc(self, next, label=None):
        assert label is None or isinstance(label, str)
        assert isinstance(next, NFAState)
        self.arcs.append((label, next))

class DFAState(object):

    def __init__(self, nfaset, final):
        assert isinstance(nfaset, set)
        assert isinstance(next(iter(nfaset)), NFAState)
        assert isinstance(final, NFAState)
        self.nfaset = nfaset
        self.isfinal = final in nfaset
        self.arcs = {} # map from label to DFAState

    def addarc(self, next, label):
        assert isinstance(label, str)
        assert label not in self.arcs
        assert isinstance(next, DFAState)
        self.arcs[label] = next

    def unifystate(self, old, new):
        for label, next in self.arcs.items():
            if next is old:
                self.arcs[label] = new

    def __eq__(self, other):
        # Equality test -- ignore the nfaset instance variable
        assert isinstance(other, DFAState)
        if self.isfinal != other.isfinal:
            return False
        # Can't just return self.arcs == other.arcs, because that
        # would invoke this method recursively, with cycles...
        if len(self.arcs) != len(other.arcs):
            return False
        for label, next in self.arcs.items():
            if next is not other.arcs.get(label):
                return False
        return True

    __hash__ = None # For Py3 compatibility.
bpo-35808: Retire pgen and use pgen2 to generate the parser (GH-11814) Pgen is the oldest piece of technology in the CPython repository, building it requires various #if[n]def PGEN hacks in other parts of the code and it also depends more and more on CPython internals. This commit removes the old pgen C code and replaces it for a new version implemented in pure Python. This is a modified and adapted version of lib2to3/pgen2 that can generate grammar files compatibles with the current parser. This commit also eliminates all the #ifdef and code branches related to pgen, simplifying the code and making it more maintainable. The regen-grammar step now uses $(PYTHON_FOR_REGEN) that can be any version of the interpreter, so the new pgen code maintains compatibility with older versions of the interpreter (this also allows regenerating the grammar with the current CI solution that uses Python3.5). The new pgen Python module also makes use of the Grammar/Tokens file that holds the token specification, so is always kept in sync and avoids having to maintain duplicate token definitions. 2019-03-01 19:34:44 -04:00			`import collections`
			`import tokenize # from stdlib`

			`from . import grammar, token`

Clean implementation of Parser/pgen and fix some style issues (GH-12156) 2019-03-04 03:26:13 -04:00
bpo-35808: Retire pgen and use pgen2 to generate the parser (GH-11814) Pgen is the oldest piece of technology in the CPython repository, building it requires various #if[n]def PGEN hacks in other parts of the code and it also depends more and more on CPython internals. This commit removes the old pgen C code and replaces it for a new version implemented in pure Python. This is a modified and adapted version of lib2to3/pgen2 that can generate grammar files compatibles with the current parser. This commit also eliminates all the #ifdef and code branches related to pgen, simplifying the code and making it more maintainable. The regen-grammar step now uses $(PYTHON_FOR_REGEN) that can be any version of the interpreter, so the new pgen code maintains compatibility with older versions of the interpreter (this also allows regenerating the grammar with the current CI solution that uses Python3.5). The new pgen Python module also makes use of the Grammar/Tokens file that holds the token specification, so is always kept in sync and avoids having to maintain duplicate token definitions. 2019-03-01 19:34:44 -04:00			`class ParserGenerator(object):`

			`def __init__(self, grammar_file, token_file, stream=None, verbose=False):`
			`close_stream = None`
			`if stream is None:`
			`stream = open(grammar_file)`
			`close_stream = stream.close`
			`with open(token_file) as tok_file:`
			`token_lines = tok_file.readlines()`
			`self.tokens = dict(token.generate_tokens(token_lines))`
			`self.opmap = dict(token.generate_opmap(token_lines))`
			`# Manually add <> so it does not collide with !=`
			`self.opmap['<>'] = "NOTEQUAL"`
			`self.verbose = verbose`
			`self.filename = grammar_file`
			`self.stream = stream`
			`self.generator = tokenize.generate_tokens(stream.readline)`
			`self.gettoken() # Initialize lookahead`
			`self.dfas, self.startsymbol = self.parse()`
			`if close_stream is not None:`
			`close_stream()`
			`self.first = {} # map from symbol name to set of tokens`
			`self.addfirstsets()`

			`def make_grammar(self):`
			`c = grammar.Grammar()`
			`names = list(self.dfas.keys())`
			`names.remove(self.startsymbol)`
			`names.insert(0, self.startsymbol)`
			`for name in names:`
			`i = 256 + len(c.symbol2number)`
			`c.symbol2number[name] = i`
			`c.number2symbol[i] = name`
			`for name in names:`
			`self.make_label(c, name)`
			`dfa = self.dfas[name]`
			`states = []`
			`for state in dfa:`
			`arcs = []`
			`for label, next in sorted(state.arcs.items()):`
			`arcs.append((self.make_label(c, label), dfa.index(next)))`
			`if state.isfinal:`
			`arcs.append((0, dfa.index(state)))`
			`states.append(arcs)`
			`c.states.append(states)`
			`c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))`
			`c.start = c.symbol2number[self.startsymbol]`

			`if self.verbose:`
			`print("")`
			`print("Grammar summary")`
			`print("===============")`

			`print("- {n_labels} labels".format(n_labels=len(c.labels)))`
			`print("- {n_dfas} dfas".format(n_dfas=len(c.dfas)))`
			`print("- {n_tokens} tokens".format(n_tokens=len(c.tokens)))`
			`print("- {n_keywords} keywords".format(n_keywords=len(c.keywords)))`
			`print(`
			`"- Start symbol: {start_symbol}".format(`
			`start_symbol=c.number2symbol[c.start]`
			`)`
			`)`
			`return c`

			`def make_first(self, c, name):`
			`rawfirst = self.first[name]`
			`first = set()`
			`for label in sorted(rawfirst):`
			`ilabel = self.make_label(c, label)`
			`##assert ilabel not in first # XXX failed on <> ... !=`
			`first.add(ilabel)`
			`return first`

			`def make_label(self, c, label):`
			`# XXX Maybe this should be a method on a subclass of converter?`
			`ilabel = len(c.labels)`
			`if label[0].isalpha():`
			`# Either a symbol name or a named token`
			`if label in c.symbol2number:`
			`# A symbol name (a non-terminal)`
			`if label in c.symbol2label:`
			`return c.symbol2label[label]`
			`else:`
			`c.labels.append((c.symbol2number[label], None))`
			`c.symbol2label[label] = ilabel`
			`return ilabel`
			`else:`
			`# A named token (NAME, NUMBER, STRING)`
			`itoken = self.tokens.get(label, None)`
			`assert isinstance(itoken, int), label`
			`assert itoken in self.tokens.values(), label`
			`if itoken in c.tokens:`
			`return c.tokens[itoken]`
			`else:`
			`c.labels.append((itoken, None))`
			`c.tokens[itoken] = ilabel`
			`return ilabel`
			`else:`
			`# Either a keyword or an operator`
			`assert label[0] in ('"', "'"), label`
			`value = eval(label)`
			`if value[0].isalpha():`
			`# A keyword`
			`if value in c.keywords:`
			`return c.keywords[value]`
			`else:`
			`c.labels.append((self.tokens["NAME"], value))`
			`c.keywords[value] = ilabel`
			`return ilabel`
			`else:`
			`# An operator (any non-numeric token)`
			`tok_name = self.opmap[value] # Fails if unknown token`
			`itoken = self.tokens[tok_name]`
			`if itoken in c.tokens:`
			`return c.tokens[itoken]`
			`else:`
			`c.labels.append((itoken, None))`
			`c.tokens[itoken] = ilabel`
			`return ilabel`

			`def addfirstsets(self):`
			`names = list(self.dfas.keys())`
			`for name in names:`
			`if name not in self.first:`
			`self.calcfirst(name)`

			`if self.verbose:`
			`print("First set for {dfa_name}".format(dfa_name=name))`
			`for item in self.first[name]:`
			`print(" - {terminal}".format(terminal=item))`

			`def calcfirst(self, name):`
			`dfa = self.dfas[name]`
			`self.first[name] = None # dummy to detect left recursion`
			`state = dfa[0]`
			`totalset = set()`
			`overlapcheck = {}`
			`for label, next in state.arcs.items():`
			`if label in self.dfas:`
			`if label in self.first:`
			`fset = self.first[label]`
			`if fset is None:`
			`raise ValueError("recursion for rule %r" % name)`
			`else:`
			`self.calcfirst(label)`
			`fset = self.first[label]`
			`totalset.update(fset)`
			`overlapcheck[label] = fset`
			`else:`
			`totalset.add(label)`
			`overlapcheck[label] = {label}`
			`inverse = {}`
			`for label, itsfirst in overlapcheck.items():`
			`for symbol in itsfirst:`
			`if symbol in inverse:`
			`raise ValueError("rule %s is ambiguous; %s is in the"`
			`" first sets of %s as well as %s" %`
			`(name, symbol, label, inverse[symbol]))`
			`inverse[symbol] = label`
			`self.first[name] = totalset`

			`def parse(self):`
			`dfas = collections.OrderedDict()`
			`startsymbol = None`
			`# MSTART: (NEWLINE \| RULE)* ENDMARKER`
			`while self.type != tokenize.ENDMARKER:`
			`while self.type == tokenize.NEWLINE:`
			`self.gettoken()`
			`# RULE: NAME ':' RHS NEWLINE`
			`name = self.expect(tokenize.NAME)`
			`if self.verbose:`
			`print("Processing rule {dfa_name}".format(dfa_name=name))`
			`self.expect(tokenize.OP, ":")`
			`a, z = self.parse_rhs()`
			`self.expect(tokenize.NEWLINE)`
			`if self.verbose:`
			`self.dump_nfa(name, a, z)`
			`dfa = self.make_dfa(a, z)`
			`if self.verbose:`
			`self.dump_dfa(name, dfa)`
			`self.simplify_dfa(dfa)`
			`dfas[name] = dfa`
			`if startsymbol is None:`
			`startsymbol = name`
			`return dfas, startsymbol`

			`def make_dfa(self, start, finish):`
			`# To turn an NFA into a DFA, we define the states of the DFA`
			`# to correspond to sets of states of the NFA. Then do some`
			`# state reduction. Let's represent sets as dicts with 1 for`
			`# values.`
			`assert isinstance(start, NFAState)`
			`assert isinstance(finish, NFAState)`
			`def closure(state):`
			`base = set()`
			`addclosure(state, base)`
			`return base`
			`def addclosure(state, base):`
			`assert isinstance(state, NFAState)`
			`if state in base:`
			`return`
			`base.add(state)`
			`for label, next in state.arcs:`
			`if label is None:`
			`addclosure(next, base)`
			`states = [DFAState(closure(start), finish)]`
			`for state in states: # NB states grows while we're iterating`
			`arcs = {}`
			`for nfastate in state.nfaset:`
			`for label, next in nfastate.arcs:`
			`if label is not None:`
			`addclosure(next, arcs.setdefault(label, set()))`
			`for label, nfaset in sorted(arcs.items()):`
			`for st in states:`
			`if st.nfaset == nfaset:`
			`break`
			`else:`
			`st = DFAState(nfaset, finish)`
			`states.append(st)`
			`state.addarc(st, label)`
			`return states # List of DFAState instances; first one is start`

			`def dump_nfa(self, name, start, finish):`
			`print("Dump of NFA for", name)`
			`todo = [start]`
			`for i, state in enumerate(todo):`
			`print(" State", i, state is finish and "(final)" or "")`
			`for label, next in state.arcs:`
			`if next in todo:`
			`j = todo.index(next)`
			`else:`
			`j = len(todo)`
			`todo.append(next)`
			`if label is None:`
			`print(" -> %d" % j)`
			`else:`
			`print(" %s -> %d" % (label, j))`

			`def dump_dfa(self, name, dfa):`
			`print("Dump of DFA for", name)`
			`for i, state in enumerate(dfa):`
			`print(" State", i, state.isfinal and "(final)" or "")`
			`for label, next in sorted(state.arcs.items()):`
			`print(" %s -> %d" % (label, dfa.index(next)))`

			`def simplify_dfa(self, dfa):`
			`# This is not theoretically optimal, but works well enough.`
			`# Algorithm: repeatedly look for two states that have the same`
			`# set of arcs (same labels pointing to the same nodes) and`
			`# unify them, until things stop changing.`

			`# dfa is a list of DFAState instances`
			`changes = True`
			`while changes:`
			`changes = False`
			`for i, state_i in enumerate(dfa):`
			`for j in range(i+1, len(dfa)):`
			`state_j = dfa[j]`
			`if state_i == state_j:`
			`#print " unify", i, j`
			`del dfa[j]`
			`for state in dfa:`
			`state.unifystate(state_j, state_i)`
			`changes = True`
			`break`

			`def parse_rhs(self):`
			`# RHS: ALT ('\|' ALT)*`
			`a, z = self.parse_alt()`
			`if self.value != "\|":`
			`return a, z`
			`else:`
			`aa = NFAState()`
			`zz = NFAState()`
			`aa.addarc(a)`
			`z.addarc(zz)`
			`while self.value == "\|":`
			`self.gettoken()`
			`a, z = self.parse_alt()`
			`aa.addarc(a)`
			`z.addarc(zz)`
			`return aa, zz`

			`def parse_alt(self):`
			`# ALT: ITEM+`
			`a, b = self.parse_item()`
			`while (self.value in ("(", "[") or`
			`self.type in (tokenize.NAME, tokenize.STRING)):`
			`c, d = self.parse_item()`
			`b.addarc(c)`
			`b = d`
			`return a, b`

			`def parse_item(self):`
			`# ITEM: '[' RHS ']' \| ATOM ['+' \| '*']`
			`if self.value == "[":`
			`self.gettoken()`
			`a, z = self.parse_rhs()`
			`self.expect(tokenize.OP, "]")`
			`a.addarc(z)`
			`return a, z`
			`else:`
			`a, z = self.parse_atom()`
			`value = self.value`
			`if value not in ("+", "*"):`
			`return a, z`
			`self.gettoken()`
			`z.addarc(a)`
			`if value == "+":`
			`return a, z`
			`else:`
			`return a, a`

			`def parse_atom(self):`
			`# ATOM: '(' RHS ')' \| NAME \| STRING`
			`if self.value == "(":`
			`self.gettoken()`
			`a, z = self.parse_rhs()`
			`self.expect(tokenize.OP, ")")`
			`return a, z`
			`elif self.type in (tokenize.NAME, tokenize.STRING):`
			`a = NFAState()`
			`z = NFAState()`
			`a.addarc(z, self.value)`
			`self.gettoken()`
			`return a, z`
			`else:`
			`self.raise_error("expected (...) or NAME or STRING, got %s/%s",`
			`self.type, self.value)`

			`def expect(self, type, value=None):`
			`if self.type != type or (value is not None and self.value != value):`
			`self.raise_error("expected %s/%s, got %s/%s",`
			`type, value, self.type, self.value)`
			`value = self.value`
			`self.gettoken()`
			`return value`

			`def gettoken(self):`
			`tup = next(self.generator)`
			`while tup[0] in (tokenize.COMMENT, tokenize.NL):`
			`tup = next(self.generator)`
			`self.type, self.value, self.begin, self.end, self.line = tup`
			`# print(getattr(tokenize, 'tok_name')[self.type], repr(self.value))`

			`def raise_error(self, msg, *args):`
			`if args:`
			`try:`
			`msg = msg % args`
Clean implementation of Parser/pgen and fix some style issues (GH-12156) 2019-03-04 03:26:13 -04:00			`except Exception:`
bpo-35808: Retire pgen and use pgen2 to generate the parser (GH-11814) Pgen is the oldest piece of technology in the CPython repository, building it requires various #if[n]def PGEN hacks in other parts of the code and it also depends more and more on CPython internals. This commit removes the old pgen C code and replaces it for a new version implemented in pure Python. This is a modified and adapted version of lib2to3/pgen2 that can generate grammar files compatibles with the current parser. This commit also eliminates all the #ifdef and code branches related to pgen, simplifying the code and making it more maintainable. The regen-grammar step now uses $(PYTHON_FOR_REGEN) that can be any version of the interpreter, so the new pgen code maintains compatibility with older versions of the interpreter (this also allows regenerating the grammar with the current CI solution that uses Python3.5). The new pgen Python module also makes use of the Grammar/Tokens file that holds the token specification, so is always kept in sync and avoids having to maintain duplicate token definitions. 2019-03-01 19:34:44 -04:00			`msg = " ".join([msg] + list(map(str, args)))`
			`raise SyntaxError(msg, (self.filename, self.end[0],`
			`self.end[1], self.line))`

			`class NFAState(object):`

			`def __init__(self):`
			`self.arcs = [] # list of (label, NFAState) pairs`

			`def addarc(self, next, label=None):`
			`assert label is None or isinstance(label, str)`
			`assert isinstance(next, NFAState)`
			`self.arcs.append((label, next))`

			`class DFAState(object):`

			`def __init__(self, nfaset, final):`
			`assert isinstance(nfaset, set)`
			`assert isinstance(next(iter(nfaset)), NFAState)`
			`assert isinstance(final, NFAState)`
			`self.nfaset = nfaset`
			`self.isfinal = final in nfaset`
			`self.arcs = {} # map from label to DFAState`

			`def addarc(self, next, label):`
			`assert isinstance(label, str)`
			`assert label not in self.arcs`
			`assert isinstance(next, DFAState)`
			`self.arcs[label] = next`

			`def unifystate(self, old, new):`
			`for label, next in self.arcs.items():`
			`if next is old:`
			`self.arcs[label] = new`

			`def __eq__(self, other):`
			`# Equality test -- ignore the nfaset instance variable`
			`assert isinstance(other, DFAState)`
			`if self.isfinal != other.isfinal:`
			`return False`
			`# Can't just return self.arcs == other.arcs, because that`
			`# would invoke this method recursively, with cycles...`
			`if len(self.arcs) != len(other.arcs):`
			`return False`
			`for label, next in self.arcs.items():`
			`if next is not other.arcs.get(label):`
			`return False`
			`return True`

			`__hash__ = None # For Py3 compatibility.`