From 1bfe659ee5c6f07c55487d9ef7c2e653cf697f72 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Wed, 27 May 2020 23:20:07 +0300 Subject: [PATCH] [3.9] Backport GH-20370 and GH-20436: Soft keywords (GH-20458) --- Lib/test/test_peg_generator/test_c_parser.py | 42 ++++++++++++++++++++ Parser/pegen/pegen.c | 34 +++++++++++++++- Parser/pegen/pegen.h | 2 + Tools/peg_generator/pegen/c_generator.py | 33 ++++++++++++--- 4 files changed, 105 insertions(+), 6 deletions(-) diff --git a/Lib/test/test_peg_generator/test_c_parser.py b/Lib/test/test_peg_generator/test_c_parser.py index f66b92def9f..a5d88501f77 100644 --- a/Lib/test/test_peg_generator/test_c_parser.py +++ b/Lib/test/test_peg_generator/test_c_parser.py @@ -402,3 +402,45 @@ class TestCParser(TempdirManager, unittest.TestCase): parse.parse_string("a", mode=0) """ self.run_test(grammar_source, test_source) + + def test_no_soft_keywords(self) -> None: + grammar_source = """ + start: expr+ NEWLINE? ENDMARKER + expr: 'foo' + """ + grammar = parse_string(grammar_source, GrammarParser) + parser_source = generate_c_parser_source(grammar) + assert "expect_soft_keyword" not in parser_source + + def test_soft_keywords(self) -> None: + grammar_source = """ + start: expr+ NEWLINE? ENDMARKER + expr: "foo" + """ + grammar = parse_string(grammar_source, GrammarParser) + parser_source = generate_c_parser_source(grammar) + assert "expect_soft_keyword" in parser_source + + def test_soft_keywords_parse(self) -> None: + grammar_source = """ + start: "if" expr '+' expr NEWLINE + expr: NAME + """ + test_source = """ + valid_cases = ["if if + if"] + invalid_cases = ["if if"] + self.check_input_strings_for_grammar(valid_cases, invalid_cases) + """ + self.run_test(grammar_source, test_source) + + def test_soft_keywords_lookahead(self) -> None: + grammar_source = """ + start: &"if" "if" expr '+' expr NEWLINE + expr: NAME + """ + test_source = """ + valid_cases = ["if if + if"] + invalid_cases = ["if if"] + self.check_input_strings_for_grammar(valid_cases, invalid_cases) + """ + self.run_test(grammar_source, test_source) diff --git a/Parser/pegen/pegen.c b/Parser/pegen/pegen.c index cd87a9ffd93..b858b6b9d38 100644 --- a/Parser/pegen/pegen.c +++ b/Parser/pegen/pegen.c @@ -708,7 +708,6 @@ _PyPegen_is_memoized(Parser *p, int type, void *pres) return 0; } - int _PyPegen_lookahead_with_name(int positive, expr_ty (func)(Parser *), Parser *p) { @@ -718,6 +717,15 @@ _PyPegen_lookahead_with_name(int positive, expr_ty (func)(Parser *), Parser *p) return (res != NULL) == positive; } +int +_PyPegen_lookahead_with_string(int positive, expr_ty (func)(Parser *, const char*), Parser *p, const char* arg) +{ + int mark = p->mark; + void *res = func(p, arg); + p->mark = mark; + return (res != NULL) == positive; +} + int _PyPegen_lookahead_with_int(int positive, Token *(func)(Parser *, int), Parser *p, int arg) { @@ -753,6 +761,30 @@ _PyPegen_expect_token(Parser *p, int type) return t; } +expr_ty +_PyPegen_expect_soft_keyword(Parser *p, const char *keyword) +{ + if (p->mark == p->fill) { + if (_PyPegen_fill_token(p) < 0) { + p->error_indicator = 1; + return NULL; + } + } + Token *t = p->tokens[p->mark]; + if (t->type != NAME) { + return NULL; + } + char* s = PyBytes_AsString(t->bytes); + if (!s) { + p->error_indicator = 1; + return NULL; + } + if (strcmp(s, keyword) != 0) { + return NULL; + } + return _PyPegen_name_token(p); +} + Token * _PyPegen_get_last_nonnwhitespace_token(Parser *p) { diff --git a/Parser/pegen/pegen.h b/Parser/pegen/pegen.h index bd3056e6f2b..25853921fb3 100644 --- a/Parser/pegen/pegen.h +++ b/Parser/pegen/pegen.h @@ -118,10 +118,12 @@ int _PyPegen_update_memo(Parser *p, int mark, int type, void *node); int _PyPegen_is_memoized(Parser *p, int type, void *pres); int _PyPegen_lookahead_with_name(int, expr_ty (func)(Parser *), Parser *); +int _PyPegen_lookahead_with_string(int , expr_ty (func)(Parser *, const char*), Parser *, const char*); int _PyPegen_lookahead_with_int(int, Token *(func)(Parser *, int), Parser *, int); int _PyPegen_lookahead(int, void *(func)(Parser *), Parser *); Token *_PyPegen_expect_token(Parser *p, int type); +expr_ty _PyPegen_expect_soft_keyword(Parser *p, const char *keyword); Token *_PyPegen_get_last_nonnwhitespace_token(Parser *); int _PyPegen_fill_token(Parser *p); expr_ty _PyPegen_name_token(Parser *p); diff --git a/Tools/peg_generator/pegen/c_generator.py b/Tools/peg_generator/pegen/c_generator.py index 8bc23911bbb..ce1d6bb7bf3 100644 --- a/Tools/peg_generator/pegen/c_generator.py +++ b/Tools/peg_generator/pegen/c_generator.py @@ -58,7 +58,8 @@ class NodeTypes(Enum): STRING_TOKEN = 2 GENERIC_TOKEN = 3 KEYWORD = 4 - CUT_OPERATOR = 5 + SOFT_KEYWORD = 5 + CUT_OPERATOR = 6 BASE_NODETYPES = { @@ -117,6 +118,16 @@ class CCallMakerVisitor(GrammarVisitor): comment=f"token='{keyword}'", ) + def soft_keyword_helper(self, value: str) -> FunctionCall: + return FunctionCall( + assigned_variable="_keyword", + function="_PyPegen_expect_soft_keyword", + arguments=["p", value], + return_type="expr_ty", + nodetype=NodeTypes.SOFT_KEYWORD, + comment=f"soft_keyword='{value}'", + ) + def visit_NameLeaf(self, node: NameLeaf) -> FunctionCall: name = node.value if name in self.non_exact_tokens: @@ -154,7 +165,10 @@ class CCallMakerVisitor(GrammarVisitor): def visit_StringLeaf(self, node: StringLeaf) -> FunctionCall: val = ast.literal_eval(node.value) if re.match(r"[a-zA-Z_]\w*\Z", val): # This is a keyword - return self.keyword_helper(val) + if node.value.endswith("'"): + return self.keyword_helper(val) + else: + return self.soft_keyword_helper(node.value) else: assert val in self.exact_tokens, f"{node.value} is not a known literal" type = self.exact_tokens[val] @@ -204,6 +218,12 @@ class CCallMakerVisitor(GrammarVisitor): arguments=[positive, call.function, *call.arguments], return_type="int", ) + elif call.nodetype == NodeTypes.SOFT_KEYWORD: + return FunctionCall( + function=f"_PyPegen_lookahead_with_string", + arguments=[positive, call.function, *call.arguments], + return_type="int", + ) elif call.nodetype in {NodeTypes.GENERIC_TOKEN, NodeTypes.KEYWORD}: return FunctionCall( function=f"_PyPegen_lookahead_with_int", @@ -656,8 +676,9 @@ class CParserGenerator(ParserGenerator, GrammarVisitor): self.print("{") # We have parsed successfully all the conditions for the option. with self.indent(): + node_str = str(node).replace('"', '\\"') self.print( - f'D(fprintf(stderr, "%*c+ {rulename}[%d-%d]: %s succeeded!\\n", p->level, \' \', _mark, p->mark, "{node}"));' + f'D(fprintf(stderr, "%*c+ {rulename}[%d-%d]: %s succeeded!\\n", p->level, \' \', _mark, p->mark, "{node_str}"));' ) # Prepare to emmit the rule action and do so if node.action and "EXTRA" in node.action: @@ -710,8 +731,9 @@ class CParserGenerator(ParserGenerator, GrammarVisitor): self.print(f"{{ // {node}") with self.indent(): self._check_for_errors() + node_str = str(node).replace('"', '\\"') self.print( - f'D(fprintf(stderr, "%*c> {rulename}[%d-%d]: %s\\n", p->level, \' \', _mark, p->mark, "{node}"));' + f'D(fprintf(stderr, "%*c> {rulename}[%d-%d]: %s\\n", p->level, \' \', _mark, p->mark, "{node_str}"));' ) # Prepare variable declarations for the alternative vars = self.collect_vars(node) @@ -733,9 +755,10 @@ class CParserGenerator(ParserGenerator, GrammarVisitor): self.handle_alt_normal(node, is_gather, rulename) self.print("p->mark = _mark;") + node_str = str(node).replace('"', '\\"') self.print( f"D(fprintf(stderr, \"%*c%s {rulename}[%d-%d]: %s failed!\\n\", p->level, ' ',\n" - f' p->error_indicator ? "ERROR!" : "-", _mark, p->mark, "{node}"));' + f' p->error_indicator ? "ERROR!" : "-", _mark, p->mark, "{node_str}"));' ) if "_cut_var" in vars: self.print("if (_cut_var) {")