lib2to3.pgen3.driver.load_grammar() now creates a stable cache file

between runs given the same Grammar.txt input regardless of the hash
randomization setting.
This commit is contained in:
Gregory P. Smith ext:(%20%5BGoogle%20Inc.%5D) 2016-09-08 00:46:26 +00:00
commit 0c578d62fc
6 changed files with 116 additions and 16 deletions

View File

@ -106,16 +106,19 @@ class Driver(object):
return self.parse_tokens(tokens, debug)
def _generate_pickle_name(gt):
head, tail = os.path.splitext(gt)
if tail == ".txt":
tail = ""
return head + tail + ".".join(map(str, sys.version_info)) + ".pickle"
def load_grammar(gt="Grammar.txt", gp=None,
save=True, force=False, logger=None):
"""Load the grammar (maybe from a pickle)."""
if logger is None:
logger = logging.getLogger()
if gp is None:
head, tail = os.path.splitext(gt)
if tail == ".txt":
tail = ""
gp = head + tail + ".".join(map(str, sys.version_info)) + ".pickle"
gp = _generate_pickle_name(gt) if gp is None else gp
if force or not _newer(gp, gt):
logger.info("Generating grammar tables from %s", gt)
g = pgen.generate_grammar(gt)
@ -124,7 +127,7 @@ def load_grammar(gt="Grammar.txt", gp=None,
try:
g.dump(gp)
except OSError as e:
logger.info("Writing failed:"+str(e))
logger.info("Writing failed: %s", e)
else:
g = grammar.Grammar()
g.load(gp)

View File

@ -13,6 +13,7 @@ fallback token code OP, but the parser needs the actual token code.
"""
# Python imports
import collections
import pickle
# Local imports
@ -85,9 +86,21 @@ class Grammar(object):
self.start = 256
def dump(self, filename):
"""Dump the grammar tables to a pickle file."""
"""Dump the grammar tables to a pickle file.
dump() recursively changes all dict to OrderedDict, so the pickled file
is not exactly the same as what was passed in to dump(). load() uses the
pickled file to create the tables, but only changes OrderedDict to dict
at the top level; it does not recursively change OrderedDict to dict.
So, the loaded tables are different from the original tables that were
passed to load() in that some of the OrderedDict (from the pickled file)
are not changed back to dict. For parsing, this has no effect on
performance because OrderedDict uses dict's __getitem__ with nothing in
between.
"""
with open(filename, "wb") as f:
pickle.dump(self.__dict__, f, 2)
d = _make_deterministic(self.__dict__)
pickle.dump(d, f, 2)
def load(self, filename):
"""Load the grammar tables from a pickle file."""
@ -124,6 +137,17 @@ class Grammar(object):
print("start", self.start)
def _make_deterministic(top):
if isinstance(top, dict):
return collections.OrderedDict(
sorted(((k, _make_deterministic(v)) for k, v in top.items())))
if isinstance(top, list):
return [_make_deterministic(e) for e in top]
if isinstance(top, tuple):
return tuple(_make_deterministic(e) for e in top)
return top
# Map from operator to number (since tokenize doesn't do this)
opmap_raw = """

View File

@ -39,7 +39,7 @@ class ParserGenerator(object):
states = []
for state in dfa:
arcs = []
for label, next in state.arcs.items():
for label, next in sorted(state.arcs.items()):
arcs.append((self.make_label(c, label), dfa.index(next)))
if state.isfinal:
arcs.append((0, dfa.index(state)))
@ -52,7 +52,7 @@ class ParserGenerator(object):
def make_first(self, c, name):
rawfirst = self.first[name]
first = {}
for label in rawfirst:
for label in sorted(rawfirst):
ilabel = self.make_label(c, label)
##assert ilabel not in first # XXX failed on <> ... !=
first[ilabel] = 1
@ -192,7 +192,7 @@ class ParserGenerator(object):
for label, next in nfastate.arcs:
if label is not None:
addclosure(next, arcs.setdefault(label, {}))
for label, nfaset in arcs.items():
for label, nfaset in sorted(arcs.items()):
for st in states:
if st.nfaset == nfaset:
break
@ -222,7 +222,7 @@ class ParserGenerator(object):
print("Dump of DFA for", name)
for i, state in enumerate(dfa):
print(" State", i, state.isfinal and "(final)" or "")
for label, next in state.arcs.items():
for label, next in sorted(state.arcs.items()):
print(" %s -> %d" % (label, dfa.index(next)))
def simplify_dfa(self, dfa):

View File

@ -9,13 +9,13 @@ from textwrap import dedent
# Local imports
from lib2to3 import pytree, refactor
from lib2to3.pgen2 import driver
from lib2to3.pgen2 import driver as pgen2_driver
test_dir = os.path.dirname(__file__)
proj_dir = os.path.normpath(os.path.join(test_dir, ".."))
grammar_path = os.path.join(test_dir, "..", "Grammar.txt")
grammar = driver.load_grammar(grammar_path)
driver = driver.Driver(grammar, convert=pytree.convert)
grammar = pgen2_driver.load_grammar(grammar_path)
driver = pgen2_driver.Driver(grammar, convert=pytree.convert)
def parse_string(string):
return driver.parse_string(reformat(string), debug=True)

View File

@ -15,11 +15,15 @@ from test.support import verbose
# Python imports
import os
import shutil
import subprocess
import sys
import tempfile
import unittest
import warnings
import subprocess
# Local imports
from lib2to3.pgen2 import driver as pgen2_driver
from lib2to3.pgen2 import tokenize
from ..pgen2.parse import ParseError
from lib2to3.pygram import python_symbols as syms
@ -34,6 +38,71 @@ class TestDriver(support.TestCase):
self.assertEqual(t.children[1].children[0].type, syms.print_stmt)
class TestPgen2Caching(support.TestCase):
def test_load_grammar_from_txt_file(self):
pgen2_driver.load_grammar(support.grammar_path, save=False, force=True)
def test_load_grammar_from_pickle(self):
# Make a copy of the grammar file in a temp directory we are
# guaranteed to be able to write to.
tmpdir = tempfile.mkdtemp()
try:
grammar_copy = os.path.join(
tmpdir, os.path.basename(support.grammar_path))
shutil.copy(support.grammar_path, grammar_copy)
pickle_name = pgen2_driver._generate_pickle_name(grammar_copy)
pgen2_driver.load_grammar(grammar_copy, save=True, force=True)
self.assertTrue(os.path.exists(pickle_name))
os.unlink(grammar_copy) # Only the pickle remains...
pgen2_driver.load_grammar(grammar_copy, save=False, force=False)
finally:
shutil.rmtree(tmpdir)
@unittest.skipIf(sys.executable is None, 'sys.executable required')
def test_load_grammar_from_subprocess(self):
tmpdir = tempfile.mkdtemp()
tmpsubdir = os.path.join(tmpdir, 'subdir')
try:
os.mkdir(tmpsubdir)
grammar_base = os.path.basename(support.grammar_path)
grammar_copy = os.path.join(tmpdir, grammar_base)
grammar_sub_copy = os.path.join(tmpsubdir, grammar_base)
shutil.copy(support.grammar_path, grammar_copy)
shutil.copy(support.grammar_path, grammar_sub_copy)
pickle_name = pgen2_driver._generate_pickle_name(grammar_copy)
pickle_sub_name = pgen2_driver._generate_pickle_name(
grammar_sub_copy)
self.assertNotEqual(pickle_name, pickle_sub_name)
# Generate a pickle file from this process.
pgen2_driver.load_grammar(grammar_copy, save=True, force=True)
self.assertTrue(os.path.exists(pickle_name))
# Generate a new pickle file in a subprocess with a most likely
# different hash randomization seed.
sub_env = dict(os.environ)
sub_env['PYTHONHASHSEED'] = 'random'
subprocess.check_call(
[sys.executable, '-c', """
from lib2to3.pgen2 import driver as pgen2_driver
pgen2_driver.load_grammar(%r, save=True, force=True)
""" % (grammar_sub_copy,)],
env=sub_env)
self.assertTrue(os.path.exists(pickle_sub_name))
with open(pickle_name, 'rb') as pickle_f_1, \
open(pickle_sub_name, 'rb') as pickle_f_2:
self.assertEqual(
pickle_f_1.read(), pickle_f_2.read(),
msg='Grammar caches generated using different hash seeds'
' were not identical.')
finally:
shutil.rmtree(tmpdir)
class GrammarTest(support.TestCase):
def validate(self, code):
support.parse_string(code)

View File

@ -99,6 +99,10 @@ Core and Builtins
Library
-------
- lib2to3.pgen3.driver.load_grammar() now creates a stable cache file
between runs given the same Grammar.txt input regardless of the hash
randomization setting.
- Issue #28005: Allow ImportErrors in encoding implementation to propagate.
- Issue #27570: Avoid zero-length memcpy() etc calls with null source