cpython/Parser/pgen/grammar.py

import collections


class Grammar:
    """Holder for the parsing tables built by pgen, plus C emitters.

    An instance starts out empty (see ``__init__``) and is expected to be
    filled in by the generator before any ``produce_*`` method is called.

    Instance variables:

    symbol2number -- ordered dict from symbol name to symbol number.
                     Symbol numbers are 256 or higher so that they never
                     collide with token numbers, which lie in 0..255
                     (inclusive).

    number2symbol -- ordered dict from symbol number to symbol name;
                     the inverse of symbol2number.

    states        -- list of DFAs.  A DFA is a list of states, a state
                     is a list of arcs, and an arc is an (i, j) pair
                     with i a label number and j a state number.  The
                     index in this list is the DFA number.  (Despite the
                     name, the items are DFAs, not single states.)  A
                     final state carries a special arc (0, j) where j is
                     the state's own number.

    dfas          -- ordered dict from symbol number to a (DFA, first)
                     pair; DFA is one of the items of ``states`` and
                     first is the collection of tokens that may start
                     the corresponding grammar rule.

    labels        -- list of (x, y) pairs; x is a token number or a
                     symbol number, y is None or a keyword string.  The
                     index in this list is the label number used on
                     arcs in the DFAs.

    start         -- number of the grammar's start symbol.

    keywords      -- ordered dict from keyword string to arc label.

    tokens        -- ordered dict from token number to arc label.

    symbol2label  -- ordered dict created empty here; it is not read by
                     any method of this class.

    """

    def __init__(self):
        make_mapping = collections.OrderedDict
        self.symbol2number = make_mapping()
        self.number2symbol = make_mapping()
        self.states = []
        self.dfas = make_mapping()
        # Label 0 is always the EMPTY label.
        self.labels = [(0, "EMPTY")]
        self.keywords = make_mapping()
        self.tokens = make_mapping()
        self.symbol2label = make_mapping()
        self.start = 256

    def produce_graminit_h(self, writer):
        """Emit one ``#define NAME NUMBER`` line per grammar symbol.

        ``writer`` is called once per chunk of text, starting with the
        "generated by" banner.
        """
        writer("/* Generated by Parser/pgen */\n\n")
        define_line = "#define {} {}\n"
        for symbol_number in self.number2symbol:
            symbol_name = self.number2symbol[symbol_number]
            writer(define_line.format(symbol_name, symbol_number))

    def produce_graminit_c(self, writer):
        """Emit the complete C source for the grammar tables.

        The output is, in order: banner, include and forward
        declaration, the arc/state/dfa tables, the label table, and the
        ``_PyParser_Grammar`` initializer tying them together.
        """
        preamble = (
            "/* Generated by Parser/pgen */\n\n",
            '#include "grammar.h"\n',
            "grammar _PyParser_Grammar;\n",
        )
        for chunk in preamble:
            writer(chunk)

        self.print_dfas(writer)
        self.print_labels(writer)

        dfa_count = len(self.dfas)
        label_count = len(self.labels)
        initializer = (
            "grammar _PyParser_Grammar = {\n",
            "    {n_dfas},\n".format(n_dfas=dfa_count),
            "    dfas,\n",
            "    {{{n_labels}, labels}},\n".format(n_labels=label_count),
            "    {start_number}\n".format(start_number=self.start),
            "};\n",
        )
        for chunk in initializer:
            writer(chunk)

    def print_labels(self, writer):
        """Emit the ``labels`` C array, one ``{number, name}`` row per label.

        A label whose name is None is written with a literal ``0`` in
        the name slot; any other name is written as a quoted C string.
        """
        opening = "static const label labels[{n_labels}] = {{\n"
        writer(opening.format(n_labels=len(self.labels)))

        row = '    {{{label}, {label_name}}},\n'
        for label, name in self.labels:
            if name is None:
                c_name = 0
            else:
                c_name = '"{}"'.format(name)
            writer(row.format(label=label, label_name=c_name))

        writer("};\n")

    def print_dfas(self, writer):
        """Emit the per-DFA state tables followed by the ``dfas`` C array.

        Each ``dfas`` row holds the symbol number, the symbol name, the
        number of states, a reference to ``states_<position>`` and the
        first set encoded as a string of octal-escaped bytes.
        """
        self.print_states(writer)
        writer("static const dfa dfas[{}] = {{\n".format(len(self.dfas)))

        for position, entry in enumerate(self.dfas.items()):
            symbol = entry[0]
            dfa, first_sets = entry[1]

            row_head = '    {{{dfa_symbol}, "{symbol_name}", '.format(
                dfa_symbol=symbol, symbol_name=self.number2symbol[symbol]
            )
            row_sizes = "{n_states}, states_{dfa_index},\n".format(
                n_states=len(dfa), dfa_index=position
            )
            writer(row_head + row_sizes + '     "')

            # One bit per label: bit (token & 7) of byte (token >> 3).
            first_bits = bytearray((len(self.labels) >> 3) + 1)
            for token in first_sets:
                byte_index = token >> 3
                bit_mask = 1 << (token & 7)
                first_bits[byte_index] |= bit_mask

            for byte in first_bits:
                writer("\\%03o" % (byte & 0xFF))
            writer('"},\n')

        writer("};\n")

    def print_states(self, write):
        """Emit, for every DFA, its arc arrays and then its ``states_<n>`` array.

        Each state row records the number of arcs and the name of the
        arc array written for that state by ``print_arcs``.
        """
        opening = "static state states_{dfa_index}[{n_states}] = {{\n"
        row = "    {{{n_arcs}, arcs_{dfa_index}_{state_index}}},\n"

        for dfa_number, dfa_states in enumerate(self.states):
            # The arc arrays must precede the state array that names them.
            self.print_arcs(write, dfa_number, dfa_states)
            write(opening.format(dfa_index=dfa_number, n_states=len(dfa_states)))
            for state_number, arcs in enumerate(dfa_states):
                write(
                    row.format(
                        n_arcs=len(arcs),
                        dfa_index=dfa_number,
                        state_index=state_number,
                    )
                )
            write("};\n")

    def print_arcs(self, write, dfaindex, states):
        """Emit one ``arcs_<dfaindex>_<state>`` C array per state in ``states``.

        Every arc is written as a ``{label, target_state}`` row.
        """
        opening = "static const arc arcs_{dfa_index}_{state_index}[{n_arcs}] = {{\n"
        row = "    {{{from_label}, {to_state}}},\n"

        for state_number, arcs in enumerate(states):
            write(
                opening.format(
                    dfa_index=dfaindex, state_index=state_number, n_arcs=len(arcs)
                )
            )
            for label, target in arcs:
                write(row.format(from_label=label, to_state=target))
            write("};\n")
bpo-35808: Retire pgen and use pgen2 to generate the parser (GH-11814) Pgen is the oldest piece of technology in the CPython repository, building it requires various #if[n]def PGEN hacks in other parts of the code and it also depends more and more on CPython internals. This commit removes the old pgen C code and replaces it for a new version implemented in pure Python. This is a modified and adapted version of lib2to3/pgen2 that can generate grammar files compatibles with the current parser. This commit also eliminates all the #ifdef and code branches related to pgen, simplifying the code and making it more maintainable. The regen-grammar step now uses $(PYTHON_FOR_REGEN) that can be any version of the interpreter, so the new pgen code maintains compatibility with older versions of the interpreter (this also allows regenerating the grammar with the current CI solution that uses Python3.5). The new pgen Python module also makes use of the Grammar/Tokens file that holds the token specification, so is always kept in sync and avoids having to maintain duplicate token definitions. 2019-03-01 19:34:44 -04:00			`import collections`


Clean implementation of Parser/pgen and fix some style issues (GH-12156) 2019-03-04 03:26:13 -04:00			`class Grammar:`
			`"""Pgen parsing tables class.`
bpo-35808: Retire pgen and use pgen2 to generate the parser (GH-11814) Pgen is the oldest piece of technology in the CPython repository, building it requires various #if[n]def PGEN hacks in other parts of the code and it also depends more and more on CPython internals. This commit removes the old pgen C code and replaces it for a new version implemented in pure Python. This is a modified and adapted version of lib2to3/pgen2 that can generate grammar files compatibles with the current parser. This commit also eliminates all the #ifdef and code branches related to pgen, simplifying the code and making it more maintainable. The regen-grammar step now uses $(PYTHON_FOR_REGEN) that can be any version of the interpreter, so the new pgen code maintains compatibility with older versions of the interpreter (this also allows regenerating the grammar with the current CI solution that uses Python3.5). The new pgen Python module also makes use of the Grammar/Tokens file that holds the token specification, so is always kept in sync and avoids having to maintain duplicate token definitions. 2019-03-01 19:34:44 -04:00
			`The instance variables are as follows:`

			`symbol2number -- a dict mapping symbol names to numbers. Symbol`
			`numbers are always 256 or higher, to distinguish`
			`them from token numbers, which are between 0 and`
			`255 (inclusive).`

			`number2symbol -- a dict mapping numbers to symbol names;`
			`these two are each other's inverse.`

			`states -- a list of DFAs, where each DFA is a list of`
			`states, each state is a list of arcs, and each`
			`arc is a (i, j) pair where i is a label and j is`
			`a state number. The DFA number is the index into`
			`this list. (This name is slightly confusing.)`
			`Final states are represented by a special arc of`
			`the form (0, j) where j is its own state number.`

			`dfas -- a dict mapping symbol numbers to (DFA, first)`
			`pairs, where DFA is an item from the states list`
			`above, and first is a set of tokens that can`
Clean implementation of Parser/pgen and fix some style issues (GH-12156) 2019-03-04 03:26:13 -04:00			`begin this grammar rule.`
bpo-35808: Retire pgen and use pgen2 to generate the parser (GH-11814) Pgen is the oldest piece of technology in the CPython repository, building it requires various #if[n]def PGEN hacks in other parts of the code and it also depends more and more on CPython internals. This commit removes the old pgen C code and replaces it for a new version implemented in pure Python. This is a modified and adapted version of lib2to3/pgen2 that can generate grammar files compatibles with the current parser. This commit also eliminates all the #ifdef and code branches related to pgen, simplifying the code and making it more maintainable. The regen-grammar step now uses $(PYTHON_FOR_REGEN) that can be any version of the interpreter, so the new pgen code maintains compatibility with older versions of the interpreter (this also allows regenerating the grammar with the current CI solution that uses Python3.5). The new pgen Python module also makes use of the Grammar/Tokens file that holds the token specification, so is always kept in sync and avoids having to maintain duplicate token definitions. 2019-03-01 19:34:44 -04:00
			`labels -- a list of (x, y) pairs where x is either a token`
			`number or a symbol number, and y is either None`
			`or a string; the strings are keywords. The label`
			`number is the index in this list; label numbers`
			`are used to mark state transitions (arcs) in the`
			`DFAs.`

			`start -- the number of the grammar's start symbol.`

			`keywords -- a dict mapping keyword strings to arc labels.`

			`tokens -- a dict mapping token numbers to arc labels.`

			`"""`

			`def __init__(self):`
			`self.symbol2number = collections.OrderedDict()`
			`self.number2symbol = collections.OrderedDict()`
			`self.states = []`
			`self.dfas = collections.OrderedDict()`
			`self.labels = [(0, "EMPTY")]`
			`self.keywords = collections.OrderedDict()`
			`self.tokens = collections.OrderedDict()`
			`self.symbol2label = collections.OrderedDict()`
			`self.start = 256`

			`def produce_graminit_h(self, writer):`
			`writer("/* Generated by Parser/pgen */\n\n")`
			`for number, symbol in self.number2symbol.items():`
			`writer("#define {} {}\n".format(symbol, number))`

			`def produce_graminit_c(self, writer):`
			`writer("/* Generated by Parser/pgen */\n\n")`

			`writer('#include "grammar.h"\n')`
			`writer("grammar _PyParser_Grammar;\n")`

			`self.print_dfas(writer)`
			`self.print_labels(writer)`

			`writer("grammar _PyParser_Grammar = {\n")`
			`writer(" {n_dfas},\n".format(n_dfas=len(self.dfas)))`
			`writer(" dfas,\n")`
			`writer(" {{{n_labels}, labels}},\n".format(n_labels=len(self.labels)))`
			`writer(" {start_number}\n".format(start_number=self.start))`
			`writer("};\n")`

			`def print_labels(self, writer):`
			`writer(`
use `const` in graminit.c (GH-12713) 2019-04-23 06:29:57 -03:00			`"static const label labels[{n_labels}] = {{\n".format(n_labels=len(self.labels))`
bpo-35808: Retire pgen and use pgen2 to generate the parser (GH-11814) Pgen is the oldest piece of technology in the CPython repository, building it requires various #if[n]def PGEN hacks in other parts of the code and it also depends more and more on CPython internals. This commit removes the old pgen C code and replaces it for a new version implemented in pure Python. This is a modified and adapted version of lib2to3/pgen2 that can generate grammar files compatibles with the current parser. This commit also eliminates all the #ifdef and code branches related to pgen, simplifying the code and making it more maintainable. The regen-grammar step now uses $(PYTHON_FOR_REGEN) that can be any version of the interpreter, so the new pgen code maintains compatibility with older versions of the interpreter (this also allows regenerating the grammar with the current CI solution that uses Python3.5). The new pgen Python module also makes use of the Grammar/Tokens file that holds the token specification, so is always kept in sync and avoids having to maintain duplicate token definitions. 2019-03-01 19:34:44 -04:00			`)`
			`for label, name in self.labels:`
Clean implementation of Parser/pgen and fix some style issues (GH-12156) 2019-03-04 03:26:13 -04:00			`label_name = '"{}"'.format(name) if name is not None else 0`
			`writer(`
			`' {{{label}, {label_name}}},\n'.format(`
			`label=label, label_name=label_name`
bpo-35808: Retire pgen and use pgen2 to generate the parser (GH-11814) Pgen is the oldest piece of technology in the CPython repository, building it requires various #if[n]def PGEN hacks in other parts of the code and it also depends more and more on CPython internals. This commit removes the old pgen C code and replaces it for a new version implemented in pure Python. This is a modified and adapted version of lib2to3/pgen2 that can generate grammar files compatibles with the current parser. This commit also eliminates all the #ifdef and code branches related to pgen, simplifying the code and making it more maintainable. The regen-grammar step now uses $(PYTHON_FOR_REGEN) that can be any version of the interpreter, so the new pgen code maintains compatibility with older versions of the interpreter (this also allows regenerating the grammar with the current CI solution that uses Python3.5). The new pgen Python module also makes use of the Grammar/Tokens file that holds the token specification, so is always kept in sync and avoids having to maintain duplicate token definitions. 2019-03-01 19:34:44 -04:00			`)`
Clean implementation of Parser/pgen and fix some style issues (GH-12156) 2019-03-04 03:26:13 -04:00			`)`
bpo-35808: Retire pgen and use pgen2 to generate the parser (GH-11814) Pgen is the oldest piece of technology in the CPython repository, building it requires various #if[n]def PGEN hacks in other parts of the code and it also depends more and more on CPython internals. This commit removes the old pgen C code and replaces it for a new version implemented in pure Python. This is a modified and adapted version of lib2to3/pgen2 that can generate grammar files compatibles with the current parser. This commit also eliminates all the #ifdef and code branches related to pgen, simplifying the code and making it more maintainable. The regen-grammar step now uses $(PYTHON_FOR_REGEN) that can be any version of the interpreter, so the new pgen code maintains compatibility with older versions of the interpreter (this also allows regenerating the grammar with the current CI solution that uses Python3.5). The new pgen Python module also makes use of the Grammar/Tokens file that holds the token specification, so is always kept in sync and avoids having to maintain duplicate token definitions. 2019-03-01 19:34:44 -04:00			`writer("};\n")`

			`def print_dfas(self, writer):`
			`self.print_states(writer)`
use `const` in graminit.c (GH-12713) 2019-04-23 06:29:57 -03:00			`writer("static const dfa dfas[{}] = {{\n".format(len(self.dfas)))`
bpo-35808: Retire pgen and use pgen2 to generate the parser (GH-11814) Pgen is the oldest piece of technology in the CPython repository, building it requires various #if[n]def PGEN hacks in other parts of the code and it also depends more and more on CPython internals. This commit removes the old pgen C code and replaces it for a new version implemented in pure Python. This is a modified and adapted version of lib2to3/pgen2 that can generate grammar files compatibles with the current parser. This commit also eliminates all the #ifdef and code branches related to pgen, simplifying the code and making it more maintainable. The regen-grammar step now uses $(PYTHON_FOR_REGEN) that can be any version of the interpreter, so the new pgen code maintains compatibility with older versions of the interpreter (this also allows regenerating the grammar with the current CI solution that uses Python3.5). The new pgen Python module also makes use of the Grammar/Tokens file that holds the token specification, so is always kept in sync and avoids having to maintain duplicate token definitions. 2019-03-01 19:34:44 -04:00			`for dfaindex, dfa_elem in enumerate(self.dfas.items()):`
			`symbol, (dfa, first_sets) = dfa_elem`
			`writer(`
			`' {{{dfa_symbol}, "{symbol_name}", '.format(`
			`dfa_symbol=symbol, symbol_name=self.number2symbol[symbol]`
			`)`
Remove d_initial from the parser as it is unused (GH-12212) d_initial, the first state of a particular DFA in the parser has always been initialized to 0 in the old pgen as well as the new pgen. As this value is not used and the first state of each DFA is assumed to be the first element in the array representing it, remove d_initial from the parser to reduce complexity. 2019-03-09 11:35:50 -04:00			`+ "{n_states}, states_{dfa_index},\n".format(`
bpo-35808: Retire pgen and use pgen2 to generate the parser (GH-11814) Pgen is the oldest piece of technology in the CPython repository, building it requires various #if[n]def PGEN hacks in other parts of the code and it also depends more and more on CPython internals. This commit removes the old pgen C code and replaces it for a new version implemented in pure Python. This is a modified and adapted version of lib2to3/pgen2 that can generate grammar files compatibles with the current parser. This commit also eliminates all the #ifdef and code branches related to pgen, simplifying the code and making it more maintainable. The regen-grammar step now uses $(PYTHON_FOR_REGEN) that can be any version of the interpreter, so the new pgen code maintains compatibility with older versions of the interpreter (this also allows regenerating the grammar with the current CI solution that uses Python3.5). The new pgen Python module also makes use of the Grammar/Tokens file that holds the token specification, so is always kept in sync and avoids having to maintain duplicate token definitions. 2019-03-01 19:34:44 -04:00			`n_states=len(dfa), dfa_index=dfaindex`
			`)`
Clean implementation of Parser/pgen and fix some style issues (GH-12156) 2019-03-04 03:26:13 -04:00			`+ ' "'`
bpo-35808: Retire pgen and use pgen2 to generate the parser (GH-11814) Pgen is the oldest piece of technology in the CPython repository, building it requires various #if[n]def PGEN hacks in other parts of the code and it also depends more and more on CPython internals. This commit removes the old pgen C code and replaces it for a new version implemented in pure Python. This is a modified and adapted version of lib2to3/pgen2 that can generate grammar files compatibles with the current parser. This commit also eliminates all the #ifdef and code branches related to pgen, simplifying the code and making it more maintainable. The regen-grammar step now uses $(PYTHON_FOR_REGEN) that can be any version of the interpreter, so the new pgen code maintains compatibility with older versions of the interpreter (this also allows regenerating the grammar with the current CI solution that uses Python3.5). The new pgen Python module also makes use of the Grammar/Tokens file that holds the token specification, so is always kept in sync and avoids having to maintain duplicate token definitions. 2019-03-01 19:34:44 -04:00			`)`

			`bitset = bytearray((len(self.labels) >> 3) + 1)`
			`for token in first_sets:`
			`bitset[token >> 3] \|= 1 << (token & 7)`
			`for byte in bitset:`
			`writer("\\%03o" % (byte & 0xFF))`
			`writer('"},\n')`
			`writer("};\n")`

			`def print_states(self, write):`
			`for dfaindex, dfa in enumerate(self.states):`
			`self.print_arcs(write, dfaindex, dfa)`
			`write(`
			`"static state states_{dfa_index}[{n_states}] = {{\n".format(`
			`dfa_index=dfaindex, n_states=len(dfa)`
			`)`
			`)`
			`for stateindex, state in enumerate(dfa):`
			`narcs = len(state)`
			`write(`
			`" {{{n_arcs}, arcs_{dfa_index}_{state_index}}},\n".format(`
			`n_arcs=narcs, dfa_index=dfaindex, state_index=stateindex`
			`)`
			`)`
			`write("};\n")`

			`def print_arcs(self, write, dfaindex, states):`
			`for stateindex, state in enumerate(states):`
			`narcs = len(state)`
			`write(`
use `const` in graminit.c (GH-12713) 2019-04-23 06:29:57 -03:00			`"static const arc arcs_{dfa_index}_{state_index}[{n_arcs}] = {{\n".format(`
bpo-35808: Retire pgen and use pgen2 to generate the parser (GH-11814) Pgen is the oldest piece of technology in the CPython repository, building it requires various #if[n]def PGEN hacks in other parts of the code and it also depends more and more on CPython internals. This commit removes the old pgen C code and replaces it for a new version implemented in pure Python. This is a modified and adapted version of lib2to3/pgen2 that can generate grammar files compatibles with the current parser. This commit also eliminates all the #ifdef and code branches related to pgen, simplifying the code and making it more maintainable. The regen-grammar step now uses $(PYTHON_FOR_REGEN) that can be any version of the interpreter, so the new pgen code maintains compatibility with older versions of the interpreter (this also allows regenerating the grammar with the current CI solution that uses Python3.5). The new pgen Python module also makes use of the Grammar/Tokens file that holds the token specification, so is always kept in sync and avoids having to maintain duplicate token definitions. 2019-03-01 19:34:44 -04:00			`dfa_index=dfaindex, state_index=stateindex, n_arcs=narcs`
			`)`
			`)`
			`for a, b in state:`
			`write(`
			`" {{{from_label}, {to_state}}},\n".format(`
			`from_label=a, to_state=b`
			`)`
			`)`
			`write("};\n")`