# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""This module defines the data structures used to represent a grammar.

These are a bit arcane because they are derived from the data
structures used by Python's 'pgen' parser generator.

There's also a table here mapping operators to their names in the
token module; the Python tokenize module reports all operators as the
fallback token code OP, but the parser needs the actual token code.

"""

# Python imports
import pickle

# Local imports
from . import token, tokenize
class Grammar(object):
    """Pgen parsing tables conversion class.

    Once initialized, this class supplies the grammar tables for the
    parsing engine implemented by parse.py.  The parsing engine
    accesses the instance variables directly.  The class here does not
    provide initialization of the tables; several subclasses exist to
    do this (see the conv and pgen modules).

    The load() method reads the tables from a pickle file, which is
    much faster than the other ways offered by subclasses.  The pickle
    file is written by calling dump() (after loading the grammar
    tables using a subclass).  The report() method prints a readable
    representation of the tables to stdout, for debugging.

    The instance variables are as follows:

    symbol2number -- a dict mapping symbol names to numbers.  Symbol
                     numbers are always 256 or higher, to distinguish
                     them from token numbers, which are between 0 and
                     255 (inclusive).

    number2symbol -- a dict mapping numbers to symbol names;
                     these two are each other's inverse.

    states        -- a list of DFAs, where each DFA is a list of
                     states, each state is a list of arcs, and each
                     arc is a (i, j) pair where i is a label and j is
                     a state number.  The DFA number is the index into
                     this list.  (This name is slightly confusing.)
                     Final states are represented by a special arc of
                     the form (0, j) where j is its own state number.

    dfas          -- a dict mapping symbol numbers to (DFA, first)
                     pairs, where DFA is an item from the states list
                     above, and first is a set of tokens that can
                     begin this grammar rule (represented by a dict
                     whose values are always 1).

    labels        -- a list of (x, y) pairs where x is either a token
                     number or a symbol number, and y is either None
                     or a string; the strings are keywords.  The label
                     number is the index in this list; label numbers
                     are used to mark state transitions (arcs) in the
                     DFAs.

    start         -- the number of the grammar's start symbol.

    keywords      -- a dict mapping keyword strings to arc labels.

    tokens        -- a dict mapping token numbers to arc labels.

    symbol2label  -- a dict mapping symbol names to arc labels.

    """

    def __init__(self):
        self.symbol2number = {}
        self.number2symbol = {}
        self.states = []
        self.dfas = {}
        self.labels = [(0, "EMPTY")]
        self.keywords = {}
        self.tokens = {}
        self.symbol2label = {}
        self.start = 256

    def dump(self, filename):
        """Dump the grammar tables to a pickle file."""
        # Protocol 2 keeps the pickle compact and loadable everywhere
        # this package runs.
        with open(filename, "wb") as handle:
            pickle.dump(self.__dict__, handle, 2)

    def load(self, filename):
        """Load the grammar tables from a pickle file."""
        with open(filename, "rb") as handle:
            state = pickle.load(handle)
        self.__dict__.update(state)

    def copy(self):
        """
        Copy the grammar.
        """
        # Dict-valued tables get a shallow copy of their own; the list
        # tables are sliced so the clone owns independent containers.
        duplicate = self.__class__()
        for attr in ("symbol2number", "number2symbol", "dfas", "keywords",
                     "tokens", "symbol2label"):
            setattr(duplicate, attr, getattr(self, attr).copy())
        duplicate.labels = self.labels[:]
        duplicate.states = self.states[:]
        duplicate.start = self.start
        return duplicate

    def report(self):
        """Dump the grammar tables to standard output, for debugging."""
        from pprint import pprint
        for heading, table in (("s2n", self.symbol2number),
                               ("n2s", self.number2symbol),
                               ("states", self.states),
                               ("dfas", self.dfas),
                               ("labels", self.labels)):
            print(heading)
            pprint(table)
        print("start", self.start)
# Map from operator to number (since tokenize doesn't do this)

opmap_raw = """
( LPAR
) RPAR
[ LSQB
] RSQB
: COLON
, COMMA
; SEMI
+ PLUS
- MINUS
* STAR
/ SLASH
| VBAR
& AMPER
< LESS
> GREATER
= EQUAL
. DOT
% PERCENT
` BACKQUOTE
{ LBRACE
} RBRACE
@ AT
== EQEQUAL
!= NOTEQUAL
<> NOTEQUAL
<= LESSEQUAL
>= GREATEREQUAL
~ TILDE
^ CIRCUMFLEX
<< LEFTSHIFT
>> RIGHTSHIFT
** DOUBLESTAR
+= PLUSEQUAL
-= MINEQUAL
*= STAREQUAL
/= SLASHEQUAL
%= PERCENTEQUAL
&= AMPEREQUAL
|= VBAREQUAL
^= CIRCUMFLEXEQUAL
<<= LEFTSHIFTEQUAL
>>= RIGHTSHIFTEQUAL
**= DOUBLESTAREQUAL
// DOUBLESLASH
//= DOUBLESLASHEQUAL
-> RARROW
"""

# Expand the table above into a dict mapping each operator string to
# its numeric token code, looked up by name in the token module.
opmap = {}
for entry in opmap_raw.splitlines():
    if not entry:
        continue
    op, name = entry.split()
    opmap[op] = getattr(token, name)