From 5b9f4988c94f47fa35e84f154a7b5aa17bc04722 Mon Sep 17 00:00:00 2001
From: Pablo Galindo
Date: Tue, 28 Apr 2020 13:11:55 +0100
Subject: [PATCH] bpo-40334: Refactor peg_generator to receive a Tokens file
 when building c code (GH-19745)
---
 Makefile.pre.in                          |   4 +-
 PCbuild/regen.vcxproj                    |   2 +-
 Tools/peg_generator/Makefile             |  11 +-
 Tools/peg_generator/pegen/__main__.py    | 127 ++++++++++++------
 Tools/peg_generator/pegen/build.py       | 103 ++++++++++----
 Tools/peg_generator/pegen/c_generator.py |  37 +++--
 Tools/peg_generator/pegen/testutil.py    |  10 +-
 .../scripts/test_parse_directory.py      |  17 ++-
 8 files changed, 220 insertions(+), 91 deletions(-)

diff --git a/Makefile.pre.in b/Makefile.pre.in
index 18fa97bec33..200fd319ebb 100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -823,7 +823,9 @@ regen-grammar: regen-token
 .PHONY: regen-pegen
 regen-pegen:
 	@$(MKDIR_P) $(srcdir)/Parser/pegen
-	PYTHONPATH=$(srcdir)/Tools/peg_generator $(PYTHON_FOR_REGEN) -m pegen -c -q $(srcdir)/Grammar/python.gram \
+	PYTHONPATH=$(srcdir)/Tools/peg_generator $(PYTHON_FOR_REGEN) -m pegen -q c \
+		$(srcdir)/Grammar/python.gram \
+		$(srcdir)/Grammar/Tokens \
 		-o $(srcdir)/Parser/pegen/parse.new.c
 	$(UPDATE_FILE) $(srcdir)/Parser/pegen/parse.c $(srcdir)/Parser/pegen/parse.new.c

diff --git a/PCbuild/regen.vcxproj b/PCbuild/regen.vcxproj
index 9fe8d6d0c3e..285a8a1b9e4 100644
--- a/PCbuild/regen.vcxproj
+++ b/PCbuild/regen.vcxproj
@@ -168,7 +168,7 @@
-
+
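With this change pegen is invoked through a target-language subcommand (`c` or `python`) instead of the old `-c/--cpython` flag, and the C backend takes the Tokens file as a second positional argument. Based on the regen-pegen rule above, the manual equivalent is roughly (paths relative to the source root; simplified from the $(srcdir)-prefixed form):

    PYTHONPATH=Tools/peg_generator python -m pegen -q c \
        Grammar/python.gram Grammar/Tokens \
        -o Parser/pegen/parse.new.c
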
diff --git a/Tools/peg_generator/Makefile b/Tools/peg_generator/Makefile
index fb67a21b67b..a37cbfcaa85 100644
--- a/Tools/peg_generator/Makefile
+++ b/Tools/peg_generator/Makefile
@@ -10,6 +10,7 @@ CPYTHON ?= ../../Lib
 MYPY ?= mypy
 
 GRAMMAR = ../../Grammar/python.gram
+TOKENS = ../../Grammar/Tokens
 TESTFILE = data/cprog.py
 TIMEFILE = data/xxl.py
 TESTDIR = .
@@ -20,8 +21,8 @@ data/xxl.py:
 
 build: peg_extension/parse.c
 
-peg_extension/parse.c: $(GRAMMAR) pegen/*.py peg_extension/peg_extension.c ../../Parser/pegen/pegen.c ../../Parser/pegen/parse_string.c ../../Parser/pegen/*.h pegen/grammar_parser.py
-	$(PYTHON) -m pegen -q -c $(GRAMMAR) -o peg_extension/parse.c --compile-extension
+peg_extension/parse.c: $(GRAMMAR) $(TOKENS) pegen/*.py peg_extension/peg_extension.c ../../Parser/pegen/pegen.c ../../Parser/pegen/parse_string.c ../../Parser/pegen/*.h pegen/grammar_parser.py
+	$(PYTHON) -m pegen -q c $(GRAMMAR) $(TOKENS) -o peg_extension/parse.c --compile-extension
 
 clean:
 	-rm -f peg_extension/*.o peg_extension/*.so peg_extension/parse.c
@@ -79,7 +80,8 @@ time_stdlib_parse: data/xxl.py
 
 test_local:
 	$(PYTHON) scripts/test_parse_directory.py \
-		-g $(GRAMMAR) \
+		--grammar-file $(GRAMMAR) \
+		--tokens-file $(TOKENS) \
 		-d $(TESTDIR) \
 		$(TESTFLAGS) \
 		--exclude "*/failset/*" \
@@ -88,7 +90,8 @@ test_local:
 
 test_global: $(CPYTHON)
 	$(PYTHON) scripts/test_parse_directory.py \
-		-g $(GRAMMAR) \
+		--grammar-file $(GRAMMAR) \
+		--tokens-file $(TOKENS) \
 		-d $(CPYTHON) \
 		$(TESTFLAGS) \
 		--exclude "*/test2to3/*" \
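The new $(TOKENS) input is CPython's Grammar/Tokens file. As generate_token_definitions in pegen/build.py (further below) expects, every non-blank, non-comment line carries a token name, optionally followed by the literal it matches; names with a literal become "exact" tokens, bare names "non-exact" ones. An abridged excerpt for illustration:

    ENDMARKER
    NAME
    NUMBER
    STRING
    LPAR                    '('
    RPAR                    ')'
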
diff --git a/Tools/peg_generator/pegen/__main__.py b/Tools/peg_generator/pegen/__main__.py
index 6696d135a8b..1dcbaad1c38 100755
--- a/Tools/peg_generator/pegen/__main__.py
+++ b/Tools/peg_generator/pegen/__main__.py
@@ -11,6 +11,64 @@ import time
 import token
 import traceback
 
+from typing import Tuple
+
+from pegen.build import Grammar, Parser, Tokenizer, ParserGenerator
+
+
+def generate_c_code(
+    args: argparse.Namespace,
+) -> Tuple[Grammar, Parser, Tokenizer, ParserGenerator]:
+    from pegen.build import build_c_parser_and_generator
+
+    verbose = args.verbose
+    verbose_tokenizer = verbose >= 3
+    verbose_parser = verbose == 2 or verbose >= 4
+    try:
+        grammar, parser, tokenizer, gen = build_c_parser_and_generator(
+            args.grammar_filename,
+            args.tokens_filename,
+            args.output,
+            args.compile_extension,
+            verbose_tokenizer,
+            verbose_parser,
+            args.verbose,
+            keep_asserts_in_extension=False if args.optimized else True,
+            skip_actions=args.skip_actions,
+        )
+        return grammar, parser, tokenizer, gen
+    except Exception as err:
+        if args.verbose:
+            raise  # Show traceback
+        traceback.print_exception(err.__class__, err, None)
+        sys.stderr.write("For full traceback, use -v\n")
+        sys.exit(1)
+
+
+def generate_python_code(
+    args: argparse.Namespace,
+) -> Tuple[Grammar, Parser, Tokenizer, ParserGenerator]:
+    from pegen.build import build_python_parser_and_generator
+
+    verbose = args.verbose
+    verbose_tokenizer = verbose >= 3
+    verbose_parser = verbose == 2 or verbose >= 4
+    try:
+        grammar, parser, tokenizer, gen = build_python_parser_and_generator(
+            args.grammar_filename,
+            args.output,
+            verbose_tokenizer,
+            verbose_parser,
+            skip_actions=args.skip_actions,
+        )
+        return grammar, parser, tokenizer, gen
+    except Exception as err:
+        if args.verbose:
+            raise  # Show traceback
+        traceback.print_exception(err.__class__, err, None)
+        sys.stderr.write("For full traceback, use -v\n")
+        sys.exit(1)
+
 
 argparser = argparse.ArgumentParser(
     prog="pegen", description="Experimental PEG-like parser generator"
@@ -23,63 +81,52 @@ argparser.add_argument(
     default=0,
     help="Print timing stats; repeat for more debug output",
 )
-argparser.add_argument(
-    "-c", "--cpython", action="store_true", help="Generate C code for inclusion into CPython"
+subparsers = argparser.add_subparsers(help="target language for the generated code")
+
+c_parser = subparsers.add_parser("c", help="Generate C code for inclusion into CPython")
+c_parser.set_defaults(func=generate_c_code)
+c_parser.add_argument("grammar_filename", help="Grammar description")
+c_parser.add_argument("tokens_filename", help="Tokens description")
+c_parser.add_argument(
+    "-o", "--output", metavar="OUT", default="parse.c", help="Where to write the generated parser"
 )
-argparser.add_argument(
+c_parser.add_argument(
     "--compile-extension",
     action="store_true",
     help="Compile generated C code into an extension module",
 )
-argparser.add_argument(
+c_parser.add_argument(
+    "--optimized", action="store_true", help="Compile the extension in optimized mode"
+)
+c_parser.add_argument(
+    "--skip-actions", action="store_true", help="Suppress code emission for rule actions",
+)
+
+python_parser = subparsers.add_parser("python", help="Generate Python code")
+python_parser.set_defaults(func=generate_python_code)
+python_parser.add_argument("grammar_filename", help="Grammar description")
+python_parser.add_argument(
     "-o",
     "--output",
     metavar="OUT",
-    help="Where to write the generated parser (default parse.py or parse.c)",
+    default="parse.py",
+    help="Where to write the generated parser",
 )
-argparser.add_argument("filename", help="Grammar description")
-argparser.add_argument(
-    "--optimized", action="store_true", help="Compile the extension in optimized mode"
-)
-argparser.add_argument(
+python_parser.add_argument(
     "--skip-actions", action="store_true", help="Suppress code emission for rule actions",
 )
 
 
 def main() -> None:
-    from pegen.build import build_parser_and_generator
     from pegen.testutil import print_memstats
 
     args = argparser.parse_args()
-    verbose = args.verbose
-    verbose_tokenizer = verbose >= 3
-    verbose_parser = verbose == 2 or verbose >= 4
+    if "func" not in args:
+        argparser.error("Must specify the target language mode ('c' or 'python')")
+
     t0 = time.time()
-
-    output_file = args.output
-    if not output_file:
-        if args.cpython:
-            output_file = "parse.c"
-        else:
-            output_file = "parse.py"
-
-    try:
-        grammar, parser, tokenizer, gen = build_parser_and_generator(
-            args.filename,
-            output_file,
-            args.compile_extension,
-            verbose_tokenizer,
-            verbose_parser,
-            args.verbose,
-            keep_asserts_in_extension=False if args.optimized else True,
-            skip_actions=args.skip_actions,
-        )
-    except Exception as err:
-        if args.verbose:
-            raise  # Show traceback
-        traceback.print_exception(err.__class__, err, None)
-        sys.stderr.write("For full traceback, use -v\n")
-        sys.exit(1)
+    grammar, parser, tokenizer, gen = args.func(args)
+    t1 = time.time()
 
     if not args.quiet:
         if args.verbose:
@@ -110,8 +157,6 @@ def main() -> None:
         else:
             print()
 
-    t1 = time.time()
-
     if args.verbose:
         dt = t1 - t0
         diag = tokenizer.diagnose()
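The new command-line layout relies on argparse subparsers, with each subcommand storing its handler via set_defaults(func=...) and main() dispatching through args.func. A minimal, self-contained sketch of that pattern (toy names and handlers, not pegen's actual options):

    import argparse

    parser = argparse.ArgumentParser(prog="demo")
    subparsers = parser.add_subparsers(help="target language")

    c_cmd = subparsers.add_parser("c", help="generate C")
    c_cmd.add_argument("grammar_filename")
    c_cmd.add_argument("tokens_filename")
    c_cmd.set_defaults(func=lambda args: print("C:", args.grammar_filename))

    py_cmd = subparsers.add_parser("python", help="generate Python")
    py_cmd.add_argument("grammar_filename")
    py_cmd.set_defaults(func=lambda args: print("Py:", args.grammar_filename))

    args = parser.parse_args(["c", "my.gram", "Tokens"])
    if "func" not in args:  # argparse.Namespace supports `in`; no subcommand was given
        parser.error("Must specify the target language mode ('c' or 'python')")
    args.func(args)  # dispatch, as main() above does
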
diff --git a/Tools/peg_generator/pegen/build.py b/Tools/peg_generator/pegen/build.py
index 0f5d73ee5fe..94248ffd943 100644
--- a/Tools/peg_generator/pegen/build.py
+++ b/Tools/peg_generator/pegen/build.py
@@ -3,8 +3,9 @@ import shutil
 import tokenize
 import sys
 import sysconfig
+import itertools
 
-from typing import Optional, Tuple
+from typing import Optional, Tuple, List, IO, Iterator, Set, Dict
 
 from pegen.c_generator import CParserGenerator
 from pegen.grammar import Grammar
@@ -17,12 +18,12 @@ from pegen.tokenizer import Tokenizer
 MOD_DIR = pathlib.Path(__file__).parent
 
 
-def get_extra_flags(compiler_flags, compiler_py_flags_nodist):
+def get_extra_flags(compiler_flags: str, compiler_py_flags_nodist: str) -> List[str]:
     flags = sysconfig.get_config_var(compiler_flags)
     py_flags_nodist = sysconfig.get_config_var(compiler_py_flags_nodist)
     if flags is None or py_flags_nodist is None:
         return []
-    return f'{flags} {py_flags_nodist}'.split()
+    return f"{flags} {py_flags_nodist}".split()
 
 
 def compile_c_extension(
@@ -45,15 +46,15 @@ def compile_c_extension(
     from distutils.core import Distribution, Extension
     from distutils.command.clean import clean  # type: ignore
     from distutils.command.build_ext import build_ext  # type: ignore
-    from distutils.tests.support import fixup_build_ext
+    from distutils.tests.support import fixup_build_ext  # type: ignore
 
     if verbose:
         distutils.log.set_verbosity(distutils.log.DEBUG)
 
     source_file_path = pathlib.Path(generated_source_path)
     extension_name = source_file_path.stem
-    extra_compile_args = get_extra_flags('CFLAGS', 'PY_CFLAGS_NODIST')
-    extra_link_args = get_extra_flags('LDFLAGS', 'PY_LDFLAGS_NODIST')
+    extra_compile_args = get_extra_flags("CFLAGS", "PY_CFLAGS_NODIST")
+    extra_link_args = get_extra_flags("LDFLAGS", "PY_LDFLAGS_NODIST")
     if keep_asserts:
         extra_compile_args.append("-UNDEBUG")
     extension = [
@@ -111,39 +112,69 @@ def build_parser(
     return grammar, parser, tokenizer
 
 
-def build_generator(
-    tokenizer: Tokenizer,
+def generate_token_definitions(tokens: IO[str]) -> Tuple[Dict[str, int], Set[str]]:
+    exact_tokens = {}
+    non_exact_tokens = set()
+    numbers = itertools.count(0)
+
+    for line in tokens:
+        line = line.strip()
+
+        if not line or line.startswith("#"):
+            continue
+
+        pieces = line.split()
+        index = next(numbers)
+
+        if len(pieces) == 1:
+            (token,) = pieces
+            non_exact_tokens.add(token)
+        elif len(pieces) == 2:
+            _, op = pieces
+            exact_tokens[op.strip("'")] = index
+        else:
+            raise ValueError(f"Unexpected line found in Tokens file: {line}")
+
+    return exact_tokens, non_exact_tokens
+
+
+def build_c_generator(
     grammar: Grammar,
     grammar_file: str,
+    tokens_file: str,
     output_file: str,
     compile_extension: bool = False,
     verbose_c_extension: bool = False,
     keep_asserts_in_extension: bool = True,
     skip_actions: bool = False,
 ) -> ParserGenerator:
-    # TODO: Allow other extensions; pass the output type as an argument.
-    if not output_file.endswith((".c", ".py")):
-        raise RuntimeError("Your output file must either be a .c or .py file")
+    with open(tokens_file, "r") as tok_file:
+        exact_tok, non_exact_tok = generate_token_definitions(tok_file)
     with open(output_file, "w") as file:
-        gen: ParserGenerator
-        if output_file.endswith(".c"):
-            gen = CParserGenerator(grammar, file, skip_actions=skip_actions)
-        elif output_file.endswith(".py"):
-            gen = PythonParserGenerator(grammar, file)  # TODO: skip_actions
-        else:
-            assert False  # Should have been checked above
+        gen: ParserGenerator = CParserGenerator(
+            grammar, exact_tok, non_exact_tok, file, skip_actions=skip_actions
+        )
         gen.generate(grammar_file)
 
-    if compile_extension and output_file.endswith(".c"):
+    if compile_extension:
         compile_c_extension(
             output_file, verbose=verbose_c_extension, keep_asserts=keep_asserts_in_extension
         )
-
     return gen
 
 
-def build_parser_and_generator(
+def build_python_generator(
+    grammar: Grammar, grammar_file: str, output_file: str, skip_actions: bool = False,
+) -> ParserGenerator:
+    with open(output_file, "w") as file:
+        gen: ParserGenerator = PythonParserGenerator(grammar, file)  # TODO: skip_actions
+        gen.generate(grammar_file)
+    return gen
+
+
+def build_c_parser_and_generator(
     grammar_file: str,
+    tokens_file: str,
     output_file: str,
     compile_extension: bool = False,
     verbose_tokenizer: bool = False,
@@ -152,10 +183,11 @@ def build_c_parser_and_generator(
     keep_asserts_in_extension: bool = True,
     skip_actions: bool = False,
 ) -> Tuple[Grammar, Parser, Tokenizer, ParserGenerator]:
-    """Generate rules, parser, tokenizer, parser generator for a given grammar
+    """Generate rules, C parser, tokenizer, parser generator for a given grammar
 
     Args:
         grammar_file (string): Path for the grammar file
+        tokens_file (string): Path for the tokens file
         output_file (string): Path for the output file
         compile_extension (bool, optional): Whether to compile the C extension.
           Defaults to False.
@@ -170,10 +202,10 @@ def build_c_parser_and_generator(
         skip_actions (bool, optional): Whether to pretend no rule has any actions.
     """
     grammar, parser, tokenizer = build_parser(grammar_file, verbose_tokenizer, verbose_parser)
-    gen = build_generator(
-        tokenizer,
+    gen = build_c_generator(
         grammar,
         grammar_file,
+        tokens_file,
         output_file,
         compile_extension,
         verbose_c_extension,
@@ -182,3 +214,26 @@ def build_c_parser_and_generator(
     )
 
     return grammar, parser, tokenizer, gen
+
+
+def build_python_parser_and_generator(
+    grammar_file: str,
+    output_file: str,
+    verbose_tokenizer: bool = False,
+    verbose_parser: bool = False,
+    skip_actions: bool = False,
+) -> Tuple[Grammar, Parser, Tokenizer, ParserGenerator]:
+    """Generate rules, python parser, tokenizer, parser generator for a given grammar
+
+    Args:
+        grammar_file (string): Path for the grammar file
+        output_file (string): Path for the output file
+        verbose_tokenizer (bool, optional): Whether to display additional output
+          when generating the tokenizer. Defaults to False.
+        verbose_parser (bool, optional): Whether to display additional output
+          when generating the parser. Defaults to False.
+        skip_actions (bool, optional): Whether to pretend no rule has any actions.
+    """
+    grammar, parser, tokenizer = build_parser(grammar_file, verbose_tokenizer, verbose_parser)
+    gen = build_python_generator(grammar, grammar_file, output_file, skip_actions=skip_actions,)
+    return grammar, parser, tokenizer, gen
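For illustration, here is how generate_token_definitions above splits a shortened, hypothetical Tokens excerpt; the index assigned to each exact token is simply its position among the non-blank, non-comment lines (sketch assumes Tools/peg_generator is on sys.path):

    import io
    from pegen.build import generate_token_definitions

    sample = io.StringIO("ENDMARKER\nNAME\nLPAR '('\nRPAR ')'\n")
    exact, non_exact = generate_token_definitions(sample)
    assert exact == {"(": 2, ")": 3}        # literal -> line index
    assert non_exact == {"ENDMARKER", "NAME"}  # bare names
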
+ """ + grammar, parser, tokenizer = build_parser(grammar_file, verbose_tokenizer, verbose_parser) + gen = build_python_generator(grammar, grammar_file, output_file, skip_actions=skip_actions,) + return grammar, parser, tokenizer, gen diff --git a/Tools/peg_generator/pegen/c_generator.py b/Tools/peg_generator/pegen/c_generator.py index 6c4b8f1e7df..a01c3097c36 100644 --- a/Tools/peg_generator/pegen/c_generator.py +++ b/Tools/peg_generator/pegen/c_generator.py @@ -1,6 +1,6 @@ import ast import re -from typing import Any, cast, Dict, IO, Optional, List, Text, Tuple +from typing import Any, cast, Dict, IO, Optional, List, Text, Tuple, Set from pegen.grammar import ( Cut, @@ -22,7 +22,6 @@ from pegen.grammar import ( ) from pegen import grammar from pegen.parser_generator import dedupe, ParserGenerator -from pegen.tokenizer import exact_token_types EXTENSION_PREFIX = """\ #include "pegen.h" @@ -43,8 +42,15 @@ _PyPegen_parse(Parser *p) class CCallMakerVisitor(GrammarVisitor): - def __init__(self, parser_generator: ParserGenerator): + def __init__( + self, + parser_generator: ParserGenerator, + exact_tokens: Dict[str, int], + non_exact_tokens: Set[str], + ): self.gen = parser_generator + self.exact_tokens = exact_tokens + self.non_exact_tokens = non_exact_tokens self.cache: Dict[Any, Any] = {} self.keyword_cache: Dict[str, int] = {} @@ -55,10 +61,7 @@ class CCallMakerVisitor(GrammarVisitor): def visit_NameLeaf(self, node: NameLeaf) -> Tuple[str, str]: name = node.value - if name in ("NAME", "NUMBER", "STRING"): - name = name.lower() - return f"{name}_var", f"_PyPegen_{name}_token(p)" - if name in ("NEWLINE", "DEDENT", "INDENT", "ENDMARKER", "ASYNC", "AWAIT"): + if name in self.non_exact_tokens: name = name.lower() return f"{name}_var", f"_PyPegen_{name}_token(p)" return f"{name}_var", f"{name}_rule(p)" @@ -68,12 +71,12 @@ class CCallMakerVisitor(GrammarVisitor): if re.match(r"[a-zA-Z_]\w*\Z", val): # This is a keyword return self.keyword_helper(val) else: - assert val in exact_token_types, f"{node.value} is not a known literal" - type = exact_token_types[val] + assert val in self.exact_tokens, f"{node.value} is not a known literal" + type = self.exact_tokens[val] return "literal", f"_PyPegen_expect_token(p, {type})" def visit_Rhs(self, node: Rhs) -> Tuple[Optional[str], str]: - def can_we_inline(node): + def can_we_inline(node: Rhs) -> int: if len(node.alts) != 1 or len(node.alts[0].items) != 1: return False # If the alternative has an action we cannot inline @@ -152,12 +155,16 @@ class CParserGenerator(ParserGenerator, GrammarVisitor): def __init__( self, grammar: grammar.Grammar, + exact_tokens: Dict[str, int], + non_exact_tokens: Set[str], file: Optional[IO[Text]], debug: bool = False, skip_actions: bool = False, ): super().__init__(grammar, file) - self.callmakervisitor: CCallMakerVisitor = CCallMakerVisitor(self) + self.callmakervisitor: CCallMakerVisitor = CCallMakerVisitor( + self, exact_tokens, non_exact_tokens + ) self._varname_counter = 0 self.debug = debug self.skip_actions = skip_actions @@ -184,7 +191,11 @@ class CParserGenerator(ParserGenerator, GrammarVisitor): self.print(f"}}") def out_of_memory_return( - self, expr: str, returnval: str, message: str = "Parser out of memory", cleanup_code=None + self, + expr: str, + returnval: str, + message: str = "Parser out of memory", + cleanup_code: Optional[str] = None, ) -> None: self.print(f"if ({expr}) {{") with self.indent(): @@ -465,7 +476,7 @@ class CParserGenerator(ParserGenerator, GrammarVisitor): self.visit(item, names=names) 
self.print(")") - def emit_action(self, node: Alt, cleanup_code=None) -> None: + def emit_action(self, node: Alt, cleanup_code: Optional[str] = None) -> None: self.print(f"res = {node.action};") self.print("if (res == NULL && PyErr_Occurred()) {") diff --git a/Tools/peg_generator/pegen/testutil.py b/Tools/peg_generator/pegen/testutil.py index 5a91862be12..1f79d8f702f 100644 --- a/Tools/peg_generator/pegen/testutil.py +++ b/Tools/peg_generator/pegen/testutil.py @@ -5,6 +5,7 @@ import pathlib import sys import textwrap import tokenize +import token from typing import Any, cast, Dict, IO, Type, Final @@ -16,6 +17,11 @@ from pegen.parser import Parser from pegen.python_generator import PythonParserGenerator from pegen.tokenizer import Tokenizer +EXACT_TOKENS = token.EXACT_TOKEN_TYPES # type: ignore +NON_EXACT_TOKENS = { + name for index, name in token.tok_name.items() if index not in EXACT_TOKENS.values() +} + def generate_parser(grammar: Grammar) -> Type[Parser]: # Generate a parser. @@ -70,7 +76,7 @@ def import_file(full_name: str, path: str) -> Any: def generate_c_parser_source(grammar: Grammar) -> str: out = io.StringIO() - genr = CParserGenerator(grammar, out) + genr = CParserGenerator(grammar, EXACT_TOKENS, NON_EXACT_TOKENS, out) genr.generate("") return out.getvalue() @@ -90,7 +96,7 @@ def generate_parser_c_extension( assert not os.listdir(path) source = path / "parse.c" with open(source, "w") as file: - genr = CParserGenerator(grammar, file, debug=debug) + genr = CParserGenerator(grammar, EXACT_TOKENS, NON_EXACT_TOKENS, file, debug=debug) genr.generate("parse.c") compile_c_extension(str(source), build_dir=str(path)) diff --git a/Tools/peg_generator/scripts/test_parse_directory.py b/Tools/peg_generator/scripts/test_parse_directory.py index 06a38fca67a..6511a2d932f 100755 --- a/Tools/peg_generator/scripts/test_parse_directory.py +++ b/Tools/peg_generator/scripts/test_parse_directory.py @@ -13,7 +13,7 @@ from pathlib import PurePath from typing import List, Optional, Any sys.path.insert(0, os.getcwd()) -from pegen.build import build_parser_and_generator +from pegen.build import build_c_parser_and_generator from pegen.testutil import print_memstats from scripts import show_parse @@ -26,7 +26,8 @@ argparser = argparse.ArgumentParser( description="Helper program to test directories or files for pegen", ) argparser.add_argument("-d", "--directory", help="Directory path containing files to test") -argparser.add_argument("-g", "--grammar-file", help="Grammar file path") +argparser.add_argument("--grammar-file", help="Grammar file path") +argparser.add_argument("--tokens-file", help="Tokens file path") argparser.add_argument( "-e", "--exclude", action="append", default=[], help="Glob(s) for matching files to exclude" ) @@ -114,6 +115,7 @@ def compare_trees( def parse_directory( directory: str, grammar_file: str, + tokens_file: str, verbose: bool, excluded_files: List[str], skip_actions: bool, @@ -131,15 +133,16 @@ def parse_directory( print("You must specify a directory of files to test.", file=sys.stderr) return 1 - if grammar_file: + if grammar_file and tokens_file: if not os.path.exists(grammar_file): print(f"The specified grammar file, {grammar_file}, does not exist.", file=sys.stderr) return 1 try: if not extension and parser == "pegen": - build_parser_and_generator( + build_c_parser_and_generator( grammar_file, + tokens_file, "peg_extension/parse.c", compile_extension=True, skip_actions=skip_actions, @@ -154,7 +157,9 @@ def parse_directory( return 1 else: - print("A grammar file was not 
diff --git a/Tools/peg_generator/scripts/test_parse_directory.py b/Tools/peg_generator/scripts/test_parse_directory.py
index 06a38fca67a..6511a2d932f 100755
--- a/Tools/peg_generator/scripts/test_parse_directory.py
+++ b/Tools/peg_generator/scripts/test_parse_directory.py
@@ -13,7 +13,7 @@ from pathlib import PurePath
 from typing import List, Optional, Any
 
 sys.path.insert(0, os.getcwd())
-from pegen.build import build_parser_and_generator
+from pegen.build import build_c_parser_and_generator
 from pegen.testutil import print_memstats
 from scripts import show_parse
 
@@ -26,7 +26,8 @@ argparser = argparse.ArgumentParser(
     description="Helper program to test directories or files for pegen",
 )
 argparser.add_argument("-d", "--directory", help="Directory path containing files to test")
-argparser.add_argument("-g", "--grammar-file", help="Grammar file path")
+argparser.add_argument("--grammar-file", help="Grammar file path")
+argparser.add_argument("--tokens-file", help="Tokens file path")
 argparser.add_argument(
     "-e", "--exclude", action="append", default=[], help="Glob(s) for matching files to exclude"
 )
@@ -114,6 +115,7 @@ def compare_trees(
 def parse_directory(
     directory: str,
     grammar_file: str,
+    tokens_file: str,
     verbose: bool,
     excluded_files: List[str],
     skip_actions: bool,
@@ -131,15 +133,16 @@ def parse_directory(
         print("You must specify a directory of files to test.", file=sys.stderr)
         return 1
 
-    if grammar_file:
+    if grammar_file and tokens_file:
         if not os.path.exists(grammar_file):
             print(f"The specified grammar file, {grammar_file}, does not exist.", file=sys.stderr)
             return 1
 
         try:
             if not extension and parser == "pegen":
-                build_parser_and_generator(
+                build_c_parser_and_generator(
                     grammar_file,
+                    tokens_file,
                     "peg_extension/parse.c",
                     compile_extension=True,
                     skip_actions=skip_actions,
@@ -154,7 +157,9 @@ def parse_directory(
             return 1
 
     else:
-        print("A grammar file was not provided - attempting to use existing file...\n")
+        print(
+            "A grammar file or a tokens file was not provided - attempting to use existing parser from stdlib...\n"
+        )
 
     if parser == "pegen":
         try:
@@ -264,6 +269,7 @@ def main() -> None:
     args = argparser.parse_args()
     directory = args.directory
     grammar_file = args.grammar_file
+    tokens_file = args.tokens_file
     verbose = args.verbose
     excluded_files = args.exclude
     skip_actions = args.skip_actions
@@ -273,6 +279,7 @@ def main() -> None:
     parse_directory(
         directory,
         grammar_file,
+        tokens_file,
        verbose,
        excluded_files,
        skip_actions,
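Taken together with the Makefile's test_local target above, a typical invocation of the updated script (run from Tools/peg_generator; exact paths may vary) now looks like:

    python scripts/test_parse_directory.py \
        --grammar-file ../../Grammar/python.gram \
        --tokens-file ../../Grammar/Tokens \
        -d .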