# cpython/Lib/lib2to3/pgen2/grammar.py

# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""This module defines the data structures used to represent a grammar.

These are a bit arcane because they are derived from the data
structures used by Python's 'pgen' parser generator.

There's also a table here mapping operators to their names in the
token module; the Python tokenize module reports all operators as the
fallback token code OP, but the parser needs the actual token code.

"""

# Python imports
import pickle

# Local imports
from . import token, tokenize


class Grammar(object):
    """Pgen parsing tables conversion class.

    Once initialized, this class supplies the grammar tables for the
    parsing engine implemented by parse.py.  The parsing engine
    accesses the instance variables directly.  The class here does not
    provide initialization of the tables; several subclasses exist to
    do this (see the conv and pgen modules).

    The load() method reads the tables from a pickle file, which is
    much faster than the other ways offered by subclasses.  The pickle
    file is written by calling dump() (after loading the grammar
    tables using a subclass).  The report() method prints a readable
    representation of the tables to stdout, for debugging.

    The instance variables are as follows:

    symbol2number -- a dict mapping symbol names to numbers.  Symbol
                     numbers are always 256 or higher, to distinguish
                     them from token numbers, which are between 0 and
                     255 (inclusive).

    number2symbol -- a dict mapping numbers to symbol names;
                     these two are each other's inverse.

    states        -- a list of DFAs, where each DFA is a list of
                     states, each state is a list of arcs, and each
                     arc is a (i, j) pair where i is a label and j is
                     a state number.  The DFA number is the index into
                     this list.  (This name is slightly confusing.)
                     Final states are represented by a special arc of
                     the form (0, j) where j is its own state number.

    dfas          -- a dict mapping symbol numbers to (DFA, first)
                     pairs, where DFA is an item from the states list
                     above, and first is a set of tokens that can
                     begin this grammar rule (represented by a dict
                     whose values are always 1).

    labels        -- a list of (x, y) pairs where x is either a token
                     number or a symbol number, and y is either None
                     or a string; the strings are keywords.  The label
                     number is the index in this list; label numbers
                     are used to mark state transitions (arcs) in the
                     DFAs.

    start         -- the number of the grammar's start symbol.

    keywords      -- a dict mapping keyword strings to arc labels.

    tokens        -- a dict mapping token numbers to arc labels.

    symbol2label  -- a dict, empty until a subclass fills it in;
                     presumably maps symbol names to arc labels
                     (NOTE(review): confirm against the pgen module).

    """

    def __init__(self):
        """Create an empty grammar; subclasses or load() fill the tables."""
        self.symbol2number = {}
        self.number2symbol = {}
        self.states = []
        self.dfas = {}
        # Label 0 is reserved for the (0, "EMPTY") entry.
        self.labels = [(0, "EMPTY")]
        self.keywords = {}
        self.tokens = {}
        self.symbol2label = {}
        self.start = 256

    def dump(self, filename):
        """Dump the grammar tables to a pickle file.

        The instance __dict__ is pickled with protocol 2 into *filename*,
        which is opened in binary write mode.
        """
        # The with-block closes the file even when pickling raises.
        with open(filename, "wb") as f:
            pickle.dump(self.__dict__, f, 2)

    def load(self, filename):
        """Load the grammar tables from a pickle file.

        The unpickled dict is merged into the instance __dict__, so any
        attribute stored by dump() overwrites the current value.
        """
        # SECURITY: unpickling can execute arbitrary code; only load
        # grammar pickles that come from a trusted source.
        with open(filename, "rb") as f:
            d = pickle.load(f)
        self.__dict__.update(d)

    def copy(self):
        """
        Copy the grammar.

        The returned object is a new instance of the same class.  Each
        table (dict or list) is copied one level deep, so the containers
        are independent but the items stored in them are shared with the
        original.
        """
        new = self.__class__()
        for dict_attr in ("symbol2number", "number2symbol", "dfas", "keywords",
                          "tokens", "symbol2label"):
            setattr(new, dict_attr, getattr(self, dict_attr).copy())
        new.labels = self.labels[:]
        new.states = self.states[:]
        new.start = self.start
        return new

    def report(self):
        """Dump the grammar tables to standard output, for debugging."""
        from pprint import pprint
        # Single-argument print(...) calls produce the same output whether
        # print is a statement (Python 2) or a function (Python 3).
        for title, table in (("s2n", self.symbol2number),
                             ("n2s", self.number2symbol),
                             ("states", self.states),
                             ("dfas", self.dfas),
                             ("labels", self.labels)):
            print(title)
            pprint(table)
        print("start %s" % (self.start,))


# Operator spelling -> token number table.  tokenize reports every
# operator under one generic code, so the specific codes are looked up
# here by name in the token module.

opmap_raw = """
( LPAR
) RPAR
[ LSQB
] RSQB
: COLON
, COMMA
; SEMI
+ PLUS
- MINUS
* STAR
/ SLASH
| VBAR
& AMPER
< LESS
> GREATER
= EQUAL
. DOT
% PERCENT
` BACKQUOTE
{ LBRACE
} RBRACE
@ AT
== EQEQUAL
!= NOTEQUAL
<> NOTEQUAL
<= LESSEQUAL
>= GREATEREQUAL
~ TILDE
^ CIRCUMFLEX
<< LEFTSHIFT
>> RIGHTSHIFT
** DOUBLESTAR
+= PLUSEQUAL
-= MINEQUAL
*= STAREQUAL
/= SLASHEQUAL
%= PERCENTEQUAL
&= AMPEREQUAL
|= VBAREQUAL
^= CIRCUMFLEXEQUAL
<<= LEFTSHIFTEQUAL
>>= RIGHTSHIFTEQUAL
**= DOUBLESTAREQUAL
// DOUBLESLASH
//= DOUBLESLASHEQUAL
-> RARROW
"""

# Each non-empty row of the table is "<operator> <token constant name>".
opmap = {}
for line in opmap_raw.splitlines():
    if not line:
        # The literal starts with a newline, which yields an empty row.
        continue
    op, name = line.split()
    opmap[op] = getattr(token, name)
Import lib2to3. 2008-03-19 01:43:46 -03:00			`# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.`
			`# Licensed to PSF under a Contributor Agreement.`

			`"""This module defines the data structures used to represent a grammar.`

			`These are a bit arcane because they are derived from the data`
			`structures used by Python's 'pgen' parser generator.`

			`There's also a table here mapping operators to their names in the`
			`token module; the Python tokenize module reports all operators as the`
			`fallback token code OP, but the parser needs the actual token code.`

			`"""`

			`# Python imports`
			`import pickle`

			`# Local imports`
			`from . import token, tokenize`


			`class Grammar(object):`
			`"""Pgen parsing tables tables conversion class.`

			`Once initialized, this class supplies the grammar tables for the`
			`parsing engine implemented by parse.py. The parsing engine`
			`accesses the instance variables directly. The class here does not`
			`provide initialization of the tables; several subclasses exist to`
			`do this (see the conv and pgen modules).`

			`The load() method reads the tables from a pickle file, which is`
			`much faster than the other ways offered by subclasses. The pickle`
			`file is written by calling dump() (after loading the grammar`
			`tables using a subclass). The report() method prints a readable`
			`representation of the tables to stdout, for debugging.`

			`The instance variables are as follows:`

			`symbol2number -- a dict mapping symbol names to numbers. Symbol`
			`numbers are always 256 or higher, to distinguish`
			`them from token numbers, which are between 0 and`
			`255 (inclusive).`

			`number2symbol -- a dict mapping numbers to symbol names;`
			`these two are each other's inverse.`

			`states -- a list of DFAs, where each DFA is a list of`
			`states, each state is is a list of arcs, and each`
			`arc is a (i, j) pair where i is a label and j is`
			`a state number. The DFA number is the index into`
			`this list. (This name is slightly confusing.)`
			`Final states are represented by a special arc of`
			`the form (0, j) where j is its own state number.`

			`dfas -- a dict mapping symbol numbers to (DFA, first)`
			`pairs, where DFA is an item from the states list`
			`above, and first is a set of tokens that can`
			`begin this grammar rule (represented by a dict`
			`whose values are always 1).`

			`labels -- a list of (x, y) pairs where x is either a token`
			`number or a symbol number, and y is either None`
			`or a string; the strings are keywords. The label`
			`number is the index in this list; label numbers`
			`are used to mark state transitions (arcs) in the`
			`DFAs.`

			`start -- the number of the grammar's start symbol.`

			`keywords -- a dict mapping keyword strings to arc labels.`

			`tokens -- a dict mapping token numbers to arc labels.`

			`"""`

			`def __init__(self):`
			`self.symbol2number = {}`
			`self.number2symbol = {}`
			`self.states = []`
			`self.dfas = {}`
			`self.labels = [(0, "EMPTY")]`
			`self.keywords = {}`
			`self.tokens = {}`
			`self.symbol2label = {}`
			`self.start = 256`

			`def dump(self, filename):`
			`"""Dump the grammar tables to a pickle file."""`
			`f = open(filename, "wb")`
			`pickle.dump(self.__dict__, f, 2)`
			`f.close()`

			`def load(self, filename):`
			`"""Load the grammar tables from a pickle file."""`
			`f = open(filename, "rb")`
			`d = pickle.load(f)`
			`f.close()`
			`self.__dict__.update(d)`

Merged revisions 73771,73811,73840,73842,73848-73849,73861,73957-73960,73964-73969,73972-73974,73977,73981,73984,74065,74113 via svnmerge from svn+ssh://pythondev@svn.python.org/sandbox/trunk/2to3/lib2to3 ........ r73771 \| benjamin.peterson \| 2009-07-02 10:56:55 -0500 (Thu, 02 Jul 2009) \| 1 line force the imports fixer to be run after the import one #6400 ........ r73811 \| benjamin.peterson \| 2009-07-03 09:03:14 -0500 (Fri, 03 Jul 2009) \| 1 line check for sep, not pathsep when looking for a subpackage #6408 ........ r73840 \| benjamin.peterson \| 2009-07-04 09:52:28 -0500 (Sat, 04 Jul 2009) \| 1 line don't print diffs by default; it's annoying ........ r73842 \| benjamin.peterson \| 2009-07-04 09:58:46 -0500 (Sat, 04 Jul 2009) \| 1 line complain when not showing diffs or writing ........ r73848 \| alexandre.vassalotti \| 2009-07-04 23:38:19 -0500 (Sat, 04 Jul 2009) \| 2 lines Fix test_refactor_stdin to handle print_output() method with 4 arguments. ........ r73849 \| alexandre.vassalotti \| 2009-07-04 23:43:18 -0500 (Sat, 04 Jul 2009) \| 5 lines Issue 2370: Add fixer for the removal of operator.isCallable() and operator.sequenceIncludes(). Patch contributed by Jeff Balogh (and updated by me). ........ r73861 \| benjamin.peterson \| 2009-07-05 09:15:53 -0500 (Sun, 05 Jul 2009) \| 1 line cleanup and use unicode where appropiate ........ r73957 \| benjamin.peterson \| 2009-07-11 15:49:56 -0500 (Sat, 11 Jul 2009) \| 1 line fix calls to str() with unicode() ........ r73958 \| benjamin.peterson \| 2009-07-11 15:51:51 -0500 (Sat, 11 Jul 2009) \| 1 line more str() -> unicode() ........ r73959 \| benjamin.peterson \| 2009-07-11 16:40:08 -0500 (Sat, 11 Jul 2009) \| 1 line add tests for refactor_dir() ........ r73960 \| benjamin.peterson \| 2009-07-11 16:44:32 -0500 (Sat, 11 Jul 2009) \| 1 line don't parse files just because they end with 'py' (no dot) ........ r73964 \| benjamin.peterson \| 2009-07-11 17:30:15 -0500 (Sat, 11 Jul 2009) \| 1 line simplify ........ 
r73965 \| benjamin.peterson \| 2009-07-11 17:31:30 -0500 (Sat, 11 Jul 2009) \| 1 line remove usage of get_prefix() ........ r73966 \| benjamin.peterson \| 2009-07-11 17:33:35 -0500 (Sat, 11 Jul 2009) \| 1 line revert unintended change in 73965 ........ r73967 \| benjamin.peterson \| 2009-07-11 17:34:44 -0500 (Sat, 11 Jul 2009) \| 1 line avoid expensive checks and assume the node did change ........ r73968 \| benjamin.peterson \| 2009-07-11 20:46:46 -0500 (Sat, 11 Jul 2009) \| 1 line use a regular dict for the heads to avoid adding lists in the loop ........ r73969 \| benjamin.peterson \| 2009-07-11 20:50:43 -0500 (Sat, 11 Jul 2009) \| 1 line prefix headnode functions with '_' ........ r73972 \| benjamin.peterson \| 2009-07-11 21:25:45 -0500 (Sat, 11 Jul 2009) \| 1 line try to make the head node dict as sparse as possible ........ r73973 \| benjamin.peterson \| 2009-07-11 21:59:49 -0500 (Sat, 11 Jul 2009) \| 1 line a better idea; add an option to not print diffs ........ r73974 \| benjamin.peterson \| 2009-07-11 22:00:29 -0500 (Sat, 11 Jul 2009) \| 1 line add space ........ r73977 \| benjamin.peterson \| 2009-07-12 10:16:07 -0500 (Sun, 12 Jul 2009) \| 1 line update get_headnode_dict tests for recent changes ........ r73981 \| benjamin.peterson \| 2009-07-12 12:06:39 -0500 (Sun, 12 Jul 2009) \| 4 lines detect when "from __future__ import print_function" is given Deprecate the 'print_function' option and the -p flag ........ r73984 \| benjamin.peterson \| 2009-07-12 16:16:37 -0500 (Sun, 12 Jul 2009) \| 1 line add tests for Call; thanks Joe Amenta ........ r74065 \| benjamin.peterson \| 2009-07-17 12:52:49 -0500 (Fri, 17 Jul 2009) \| 1 line pathname2url and url2pathname are in urllib.request not urllib.parse #6496 ........ r74113 \| benjamin.peterson \| 2009-07-20 08:56:57 -0500 (Mon, 20 Jul 2009) \| 1 line fix deprecation warnings in tests ........ 2009-07-20 12:33:09 -03:00			`def copy(self):`
			`"""`
			`Copy the grammar.`
			`"""`
			`new = self.__class__()`
			`for dict_attr in ("symbol2number", "number2symbol", "dfas", "keywords",`
			`"tokens", "symbol2label"):`
			`setattr(new, dict_attr, getattr(self, dict_attr).copy())`
			`new.labels = self.labels[:]`
			`new.states = self.states[:]`
			`new.start = self.start`
			`return new`

Import lib2to3. 2008-03-19 01:43:46 -03:00			`def report(self):`
			`"""Dump the grammar tables to standard output, for debugging."""`
			`from pprint import pprint`
			`print "s2n"`
			`pprint(self.symbol2number)`
			`print "n2s"`
			`pprint(self.number2symbol)`
			`print "states"`
			`pprint(self.states)`
			`print "dfas"`
			`pprint(self.dfas)`
			`print "labels"`
			`pprint(self.labels)`
			`print "start", self.start`


			`# Map from operator to number (since tokenize doesn't do this)`

			`opmap_raw = """`
			`( LPAR`
			`) RPAR`
			`[ LSQB`
			`] RSQB`
			`: COLON`
			`, COMMA`
			`; SEMI`
			`+ PLUS`
			`- MINUS`
			`* STAR`
			`/ SLASH`
			`\| VBAR`
			`& AMPER`
			`< LESS`
			`> GREATER`
			`= EQUAL`
			`. DOT`
			`% PERCENT`
			`` ` BACKQUOTE ``
			`{ LBRACE`
			`} RBRACE`
			`@ AT`
			`== EQEQUAL`
			`!= NOTEQUAL`
			`<> NOTEQUAL`
			`<= LESSEQUAL`
			`>= GREATEREQUAL`
			`~ TILDE`
			`^ CIRCUMFLEX`
			`<< LEFTSHIFT`
			`>> RIGHTSHIFT`
			`** DOUBLESTAR`
			`+= PLUSEQUAL`
			`-= MINEQUAL`
			`*= STAREQUAL`
			`/= SLASHEQUAL`
			`%= PERCENTEQUAL`
			`&= AMPEREQUAL`
			`\|= VBAREQUAL`
			`^= CIRCUMFLEXEQUAL`
			`<<= LEFTSHIFTEQUAL`
			`>>= RIGHTSHIFTEQUAL`
			`**= DOUBLESTAREQUAL`
			`// DOUBLESLASH`
			`//= DOUBLESLASHEQUAL`
			`-> RARROW`
			`"""`

			`opmap = {}`
			`for line in opmap_raw.splitlines():`
			`if line:`
			`op, name = line.split()`
			`opmap[op] = getattr(token, name)`