bpo-40334: Avoid collisions between parser variables and grammar variables (GH-19987)

This is for the C generator:
- Disallow rule and variable names starting with `_`
- Rename most local variable names generated by the parser to start with `_`

Exceptions:
- Renaming `p` to `_p` will be a separate PR
- There are still some names that might clash, e.g.
  - anything starting with `Py`
  - C reserved words (`if` etc.)
  - Macros like `EXTRA` and `CHECK`
This commit is contained in:
Pablo Galindo 2020-05-10 05:34:50 +01:00 committed by GitHub
parent 2c3d508c5f
commit ac7a92cc0a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 5758 additions and 5716 deletions

View File

@@ -540,6 +540,33 @@ class TestPegen(unittest.TestCase):
with self.assertRaises(GrammarError): with self.assertRaises(GrammarError):
parser_class = make_parser(grammar) parser_class = make_parser(grammar)
def test_invalid_rule_name(self) -> None:
grammar = """
start: _a b
_a: 'a'
b: 'b'
"""
with self.assertRaisesRegex(GrammarError, "cannot start with underscore: '_a'"):
parser_class = make_parser(grammar)
def test_invalid_variable_name(self) -> None:
grammar = """
start: a b
a: _x='a'
b: 'b'
"""
with self.assertRaisesRegex(GrammarError, "cannot start with underscore: '_x'"):
parser_class = make_parser(grammar)
def test_invalid_variable_name_in_temporal_rule(self) -> None:
grammar = """
start: a b
a: (_x='a' | 'b') | 'c'
b: 'b'
"""
with self.assertRaisesRegex(GrammarError, "cannot start with underscore: '_x'"):
parser_class = make_parser(grammar)
class TestGrammarVisitor: class TestGrammarVisitor:
class Visitor(GrammarVisitor): class Visitor(GrammarVisitor):

File diff suppressed because it is too large. (Load Diff)

View File

@@ -132,7 +132,7 @@ void *_PyPegen_dummy_name(Parser *p, ...);
#define UNUSED(expr) do { (void)(expr); } while (0) #define UNUSED(expr) do { (void)(expr); } while (0)
#define EXTRA_EXPR(head, tail) head->lineno, head->col_offset, tail->end_lineno, tail->end_col_offset, p->arena #define EXTRA_EXPR(head, tail) head->lineno, head->col_offset, tail->end_lineno, tail->end_col_offset, p->arena
#define EXTRA start_lineno, start_col_offset, end_lineno, end_col_offset, p->arena #define EXTRA _start_lineno, _start_col_offset, _end_lineno, _end_col_offset, p->arena
#define RAISE_SYNTAX_ERROR(msg, ...) _PyPegen_raise_error(p, PyExc_SyntaxError, 1, msg, ##__VA_ARGS__) #define RAISE_SYNTAX_ERROR(msg, ...) _PyPegen_raise_error(p, PyExc_SyntaxError, 1, msg, ##__VA_ARGS__)
#define RAISE_INDENTATION_ERROR(msg, ...) _PyPegen_raise_error(p, PyExc_IndentationError, 1, msg, ##__VA_ARGS__) #define RAISE_INDENTATION_ERROR(msg, ...) _PyPegen_raise_error(p, PyExc_IndentationError, 1, msg, ##__VA_ARGS__)
#define RAISE_SYNTAX_ERROR_NO_COL_OFFSET(msg, ...) _PyPegen_raise_error(p, PyExc_SyntaxError, 0, msg, ##__VA_ARGS__) #define RAISE_SYNTAX_ERROR_NO_COL_OFFSET(msg, ...) _PyPegen_raise_error(p, PyExc_SyntaxError, 0, msg, ##__VA_ARGS__)

View File

@@ -1,5 +1,5 @@
import ast import ast
from dataclasses import dataclass, field from dataclasses import dataclass
import re import re
from typing import Any, Dict, IO, Optional, List, Text, Tuple, Set from typing import Any, Dict, IO, Optional, List, Text, Tuple, Set
from enum import Enum from enum import Enum
@@ -101,7 +101,7 @@ class CCallMakerVisitor(GrammarVisitor):
if keyword not in self.keyword_cache: if keyword not in self.keyword_cache:
self.keyword_cache[keyword] = self.gen.keyword_type() self.keyword_cache[keyword] = self.gen.keyword_type()
return FunctionCall( return FunctionCall(
assigned_variable="keyword", assigned_variable="_keyword",
function="_PyPegen_expect_token", function="_PyPegen_expect_token",
arguments=["p", self.keyword_cache[keyword]], arguments=["p", self.keyword_cache[keyword]],
return_type="Token *", return_type="Token *",
@@ -140,7 +140,7 @@ class CCallMakerVisitor(GrammarVisitor):
function=f"{name}_rule", function=f"{name}_rule",
arguments=["p"], arguments=["p"],
return_type=type, return_type=type,
comment=f"{node}" comment=f"{node}",
) )
def visit_StringLeaf(self, node: StringLeaf) -> FunctionCall: def visit_StringLeaf(self, node: StringLeaf) -> FunctionCall:
@@ -151,7 +151,7 @@ class CCallMakerVisitor(GrammarVisitor):
assert val in self.exact_tokens, f"{node.value} is not a known literal" assert val in self.exact_tokens, f"{node.value} is not a known literal"
type = self.exact_tokens[val] type = self.exact_tokens[val]
return FunctionCall( return FunctionCall(
assigned_variable="literal", assigned_variable="_literal",
function=f"_PyPegen_expect_token", function=f"_PyPegen_expect_token",
arguments=["p", type], arguments=["p", type],
nodetype=NodeTypes.GENERIC_TOKEN, nodetype=NodeTypes.GENERIC_TOKEN,
@@ -175,8 +175,10 @@ class CCallMakerVisitor(GrammarVisitor):
else: else:
name = self.gen.name_node(node) name = self.gen.name_node(node)
self.cache[node] = FunctionCall( self.cache[node] = FunctionCall(
assigned_variable=f"{name}_var", function=f"{name}_rule", arguments=["p"], assigned_variable=f"{name}_var",
comment=f"{node}" function=f"{name}_rule",
arguments=["p"],
comment=f"{node}",
) )
return self.cache[node] return self.cache[node]
@@ -217,11 +219,11 @@ class CCallMakerVisitor(GrammarVisitor):
def visit_Opt(self, node: Opt) -> FunctionCall: def visit_Opt(self, node: Opt) -> FunctionCall:
call = self.visit(node.node) call = self.visit(node.node)
return FunctionCall( return FunctionCall(
assigned_variable="opt_var", assigned_variable="_opt_var",
function=call.function, function=call.function,
arguments=call.arguments, arguments=call.arguments,
force_true=True, force_true=True,
comment=f"{node}" comment=f"{node}",
) )
def visit_Repeat0(self, node: Repeat0) -> FunctionCall: def visit_Repeat0(self, node: Repeat0) -> FunctionCall:
@@ -268,7 +270,7 @@ class CCallMakerVisitor(GrammarVisitor):
def visit_Cut(self, node: Cut) -> FunctionCall: def visit_Cut(self, node: Cut) -> FunctionCall:
return FunctionCall( return FunctionCall(
assigned_variable="cut_var", assigned_variable="_cut_var",
return_type="int", return_type="int",
function="1", function="1",
nodetype=NodeTypes.CUT_OPERATOR, nodetype=NodeTypes.CUT_OPERATOR,
@@ -418,46 +420,46 @@ class CParserGenerator(ParserGenerator, GrammarVisitor):
self.print("p->error_indicator = 1;") self.print("p->error_indicator = 1;")
self.print("return NULL;") self.print("return NULL;")
self.print("}") self.print("}")
self.print("int start_lineno = p->tokens[mark]->lineno;") self.print("int _start_lineno = p->tokens[_mark]->lineno;")
self.print("UNUSED(start_lineno); // Only used by EXTRA macro") self.print("UNUSED(_start_lineno); // Only used by EXTRA macro")
self.print("int start_col_offset = p->tokens[mark]->col_offset;") self.print("int _start_col_offset = p->tokens[_mark]->col_offset;")
self.print("UNUSED(start_col_offset); // Only used by EXTRA macro") self.print("UNUSED(_start_col_offset); // Only used by EXTRA macro")
def _set_up_token_end_metadata_extraction(self) -> None: def _set_up_token_end_metadata_extraction(self) -> None:
self.print("Token *token = _PyPegen_get_last_nonnwhitespace_token(p);") self.print("Token *_token = _PyPegen_get_last_nonnwhitespace_token(p);")
self.print("if (token == NULL) {") self.print("if (_token == NULL) {")
with self.indent(): with self.indent():
self.print("return NULL;") self.print("return NULL;")
self.print("}") self.print("}")
self.print(f"int end_lineno = token->end_lineno;") self.print("int _end_lineno = _token->end_lineno;")
self.print("UNUSED(end_lineno); // Only used by EXTRA macro") self.print("UNUSED(_end_lineno); // Only used by EXTRA macro")
self.print(f"int end_col_offset = token->end_col_offset;") self.print("int _end_col_offset = _token->end_col_offset;")
self.print("UNUSED(end_col_offset); // Only used by EXTRA macro") self.print("UNUSED(_end_col_offset); // Only used by EXTRA macro")
def _set_up_rule_memoization(self, node: Rule, result_type: str) -> None: def _set_up_rule_memoization(self, node: Rule, result_type: str) -> None:
self.print("{") self.print("{")
with self.indent(): with self.indent():
self.print(f"{result_type} res = NULL;") self.print(f"{result_type} _res = NULL;")
self.print(f"if (_PyPegen_is_memoized(p, {node.name}_type, &res))") self.print(f"if (_PyPegen_is_memoized(p, {node.name}_type, &_res))")
with self.indent(): with self.indent():
self.print("return res;") self.print("return _res;")
self.print("int mark = p->mark;") self.print("int _mark = p->mark;")
self.print("int resmark = p->mark;") self.print("int _resmark = p->mark;")
self.print("while (1) {") self.print("while (1) {")
with self.indent(): with self.indent():
self.call_with_errorcheck_return( self.call_with_errorcheck_return(
f"_PyPegen_update_memo(p, mark, {node.name}_type, res)", "res" f"_PyPegen_update_memo(p, _mark, {node.name}_type, _res)", "_res"
) )
self.print("p->mark = mark;") self.print("p->mark = _mark;")
self.print(f"void *raw = {node.name}_raw(p);") self.print(f"void *_raw = {node.name}_raw(p);")
self.print("if (raw == NULL || p->mark <= resmark)") self.print("if (_raw == NULL || p->mark <= _resmark)")
with self.indent(): with self.indent():
self.print("break;") self.print("break;")
self.print("resmark = p->mark;") self.print(f"_resmark = p->mark;")
self.print("res = raw;") self.print("_res = _raw;")
self.print("}") self.print("}")
self.print("p->mark = resmark;") self.print(f"p->mark = _resmark;")
self.print("return res;") self.print("return _res;")
self.print("}") self.print("}")
self.print(f"static {result_type}") self.print(f"static {result_type}")
self.print(f"{node.name}_raw(Parser *p)") self.print(f"{node.name}_raw(Parser *p)")
@@ -473,12 +475,12 @@ class CParserGenerator(ParserGenerator, GrammarVisitor):
with self.indent(): with self.indent():
self.print("return NULL;") self.print("return NULL;")
self.print("}") self.print("}")
self.print(f"{result_type} res = NULL;") self.print(f"{result_type} _res = NULL;")
if memoize: if memoize:
self.print(f"if (_PyPegen_is_memoized(p, {node.name}_type, &res))") self.print(f"if (_PyPegen_is_memoized(p, {node.name}_type, &_res))")
with self.indent(): with self.indent():
self.print("return res;") self.print("return _res;")
self.print("int mark = p->mark;") self.print("int _mark = p->mark;")
if any(alt.action and "EXTRA" in alt.action for alt in rhs.alts): if any(alt.action and "EXTRA" in alt.action for alt in rhs.alts):
self._set_up_token_start_metadata_extraction() self._set_up_token_start_metadata_extraction()
self.visit( self.visit(
@@ -488,13 +490,13 @@ class CParserGenerator(ParserGenerator, GrammarVisitor):
rulename=node.name if memoize else None, rulename=node.name if memoize else None,
) )
if self.debug: if self.debug:
self.print(f'fprintf(stderr, "Fail at %d: {node.name}\\n", p->mark);') self.print('fprintf(stderr, "Fail at %d: {node.name}\\n", p->mark);')
self.print("res = NULL;") self.print("_res = NULL;")
self.print(" done:") self.print(" done:")
with self.indent(): with self.indent():
if memoize: if memoize:
self.print(f"_PyPegen_insert_memo(p, mark, {node.name}_type, res);") self.print(f"_PyPegen_insert_memo(p, _mark, {node.name}_type, _res);")
self.print("return res;") self.print("return _res;")
def _handle_loop_rule_body(self, node: Rule, rhs: Rhs) -> None: def _handle_loop_rule_body(self, node: Rule, rhs: Rhs) -> None:
memoize = self._should_memoize(node) memoize = self._should_memoize(node)
@@ -505,17 +507,17 @@ class CParserGenerator(ParserGenerator, GrammarVisitor):
with self.indent(): with self.indent():
self.print("return NULL;") self.print("return NULL;")
self.print("}") self.print("}")
self.print(f"void *res = NULL;") self.print("void *_res = NULL;")
if memoize: if memoize:
self.print(f"if (_PyPegen_is_memoized(p, {node.name}_type, &res))") self.print(f"if (_PyPegen_is_memoized(p, {node.name}_type, &_res))")
with self.indent(): with self.indent():
self.print("return res;") self.print("return _res;")
self.print("int mark = p->mark;") self.print("int _mark = p->mark;")
self.print("int start_mark = p->mark;") self.print("int _start_mark = p->mark;")
self.print("void **children = PyMem_Malloc(sizeof(void *));") self.print("void **_children = PyMem_Malloc(sizeof(void *));")
self.out_of_memory_return(f"!children", "NULL") self.out_of_memory_return(f"!_children", "NULL")
self.print("ssize_t children_capacity = 1;") self.print("ssize_t _children_capacity = 1;")
self.print("ssize_t n = 0;") self.print("ssize_t _n = 0;")
if any(alt.action and "EXTRA" in alt.action for alt in rhs.alts): if any(alt.action and "EXTRA" in alt.action for alt in rhs.alts):
self._set_up_token_start_metadata_extraction() self._set_up_token_start_metadata_extraction()
self.visit( self.visit(
@@ -525,23 +527,23 @@ class CParserGenerator(ParserGenerator, GrammarVisitor):
rulename=node.name if memoize else None, rulename=node.name if memoize else None,
) )
if is_repeat1: if is_repeat1:
self.print("if (n == 0 || p->error_indicator) {") self.print("if (_n == 0 || p->error_indicator) {")
with self.indent(): with self.indent():
self.print("PyMem_Free(children);") self.print("PyMem_Free(_children);")
self.print("return NULL;") self.print("return NULL;")
self.print("}") self.print("}")
self.print("asdl_seq *seq = _Py_asdl_seq_new(n, p->arena);") self.print("asdl_seq *_seq = _Py_asdl_seq_new(_n, p->arena);")
self.out_of_memory_return( self.out_of_memory_return(
f"!seq", "!_seq",
"NULL", "NULL",
message=f"asdl_seq_new {node.name}", message=f"asdl_seq_new {node.name}",
cleanup_code="PyMem_Free(children);", cleanup_code="PyMem_Free(_children);",
) )
self.print("for (int i = 0; i < n; i++) asdl_seq_SET(seq, i, children[i]);") self.print("for (int i = 0; i < _n; i++) asdl_seq_SET(_seq, i, _children[i]);")
self.print("PyMem_Free(children);") self.print("PyMem_Free(_children);")
if node.name: if node.name:
self.print(f"_PyPegen_insert_memo(p, start_mark, {node.name}_type, seq);") self.print(f"_PyPegen_insert_memo(p, _start_mark, {node.name}_type, _seq);")
self.print("return seq;") self.print("return _seq;")
def visit_Rule(self, node: Rule) -> None: def visit_Rule(self, node: Rule) -> None:
is_loop = node.is_loop() is_loop = node.is_loop()
@@ -599,9 +601,9 @@ class CParserGenerator(ParserGenerator, GrammarVisitor):
self.print(")") self.print(")")
def emit_action(self, node: Alt, cleanup_code: Optional[str] = None) -> None: def emit_action(self, node: Alt, cleanup_code: Optional[str] = None) -> None:
self.print(f"res = {node.action};") self.print(f"_res = {node.action};")
self.print("if (res == NULL && PyErr_Occurred()) {") self.print("if (_res == NULL && PyErr_Occurred()) {")
with self.indent(): with self.indent():
self.print("p->error_indicator = 1;") self.print("p->error_indicator = 1;")
if cleanup_code: if cleanup_code:
@@ -611,7 +613,7 @@ class CParserGenerator(ParserGenerator, GrammarVisitor):
if self.debug: if self.debug:
self.print( self.print(
f'fprintf(stderr, "Hit with action [%d-%d]: %s\\n", mark, p->mark, "{node}");' f'fprintf(stderr, "Hit with action [%d-%d]: %s\\n", _mark, p->mark, "{node}");'
) )
def emit_default_action(self, is_gather: bool, node: Alt) -> None: def emit_default_action(self, is_gather: bool, node: Alt) -> None:
@@ -619,7 +621,7 @@ class CParserGenerator(ParserGenerator, GrammarVisitor):
if is_gather: if is_gather:
assert len(self.local_variable_names) == 2 assert len(self.local_variable_names) == 2
self.print( self.print(
f"res = _PyPegen_seq_insert_in_front(p, " f"_res = _PyPegen_seq_insert_in_front(p, "
f"{self.local_variable_names[0]}, {self.local_variable_names[1]});" f"{self.local_variable_names[0]}, {self.local_variable_names[1]});"
) )
else: else:
@@ -628,17 +630,17 @@ class CParserGenerator(ParserGenerator, GrammarVisitor):
f'fprintf(stderr, "Hit without action [%d:%d]: %s\\n", mark, p->mark, "{node}");' f'fprintf(stderr, "Hit without action [%d:%d]: %s\\n", mark, p->mark, "{node}");'
) )
self.print( self.print(
f"res = _PyPegen_dummy_name(p, {', '.join(self.local_variable_names)});" f"_res = _PyPegen_dummy_name(p, {', '.join(self.local_variable_names)});"
) )
else: else:
if self.debug: if self.debug:
self.print( self.print(
f'fprintf(stderr, "Hit with default action [%d:%d]: %s\\n", mark, p->mark, "{node}");' f'fprintf(stderr, "Hit with default action [%d:%d]: %s\\n", mark, p->mark, "{node}");'
) )
self.print(f"res = {self.local_variable_names[0]};") self.print(f"_res = {self.local_variable_names[0]};")
def emit_dummy_action(self) -> None: def emit_dummy_action(self) -> None:
self.print(f"res = _PyPegen_dummy_name(p);") self.print("_res = _PyPegen_dummy_name(p);")
def handle_alt_normal(self, node: Alt, is_gather: bool) -> None: def handle_alt_normal(self, node: Alt, is_gather: bool) -> None:
self.join_conditions(keyword="if", node=node) self.join_conditions(keyword="if", node=node)
@@ -671,20 +673,22 @@ class CParserGenerator(ParserGenerator, GrammarVisitor):
if self.skip_actions: if self.skip_actions:
self.emit_dummy_action() self.emit_dummy_action()
elif node.action: elif node.action:
self.emit_action(node, cleanup_code="PyMem_Free(children);") self.emit_action(node, cleanup_code="PyMem_Free(_children);")
else: else:
self.emit_default_action(is_gather, node) self.emit_default_action(is_gather, node)
# Add the result of rule to the temporary buffer of children. This buffer # Add the result of rule to the temporary buffer of children. This buffer
# will populate later an asdl_seq with all elements to return. # will populate later an asdl_seq with all elements to return.
self.print("if (n == children_capacity) {") self.print("if (_n == _children_capacity) {")
with self.indent(): with self.indent():
self.print("children_capacity *= 2;") self.print("_children_capacity *= 2;")
self.print("children = PyMem_Realloc(children, children_capacity*sizeof(void *));") self.print(
self.out_of_memory_return(f"!children", "NULL", message=f"realloc {rulename}") "_children = PyMem_Realloc(_children, _children_capacity*sizeof(void *));"
)
self.out_of_memory_return(f"!_children", "NULL", message=f"realloc {rulename}")
self.print("}") self.print("}")
self.print(f"children[n++] = res;") self.print("_children[_n++] = _res;")
self.print("mark = p->mark;") self.print("_mark = p->mark;")
self.print("}") self.print("}")
def visit_Alt( def visit_Alt(
@@ -699,11 +703,11 @@ class CParserGenerator(ParserGenerator, GrammarVisitor):
var_type = "void *" var_type = "void *"
else: else:
var_type += " " var_type += " "
if v == "cut_var": if v == "_cut_var":
v += " = 0" # cut_var must be initialized v += " = 0" # cut_var must be initialized
self.print(f"{var_type}{v};") self.print(f"{var_type}{v};")
if v == "opt_var": if v == "_opt_var":
self.print("UNUSED(opt_var); // Silence compiler warnings") self.print("UNUSED(_opt_var); // Silence compiler warnings")
with self.local_variable_context(): with self.local_variable_context():
if is_loop: if is_loop:
@@ -711,9 +715,9 @@ class CParserGenerator(ParserGenerator, GrammarVisitor):
else: else:
self.handle_alt_normal(node, is_gather) self.handle_alt_normal(node, is_gather)
self.print("p->mark = mark;") self.print("p->mark = _mark;")
if "cut_var" in vars: if "_cut_var" in vars:
self.print("if (cut_var) return NULL;") self.print("if (_cut_var) return NULL;")
self.print("}") self.print("}")
def collect_vars(self, node: Alt) -> Dict[Optional[str], Optional[str]]: def collect_vars(self, node: Alt) -> Dict[Optional[str], Optional[str]]:

View File

@@ -27,6 +27,11 @@ class RuleCheckingVisitor(GrammarVisitor):
# TODO: Add line/col info to (leaf) nodes # TODO: Add line/col info to (leaf) nodes
raise GrammarError(f"Dangling reference to rule {node.value!r}") raise GrammarError(f"Dangling reference to rule {node.value!r}")
def visit_NamedItem(self, node: NameLeaf) -> None:
if node.name and node.name.startswith("_"):
raise GrammarError(f"Variable names cannot start with underscore: '{node.name}'")
self.visit(node.item)
class ParserGenerator: class ParserGenerator:
@@ -36,6 +41,7 @@ class ParserGenerator:
self.grammar = grammar self.grammar = grammar
self.tokens = tokens self.tokens = tokens
self.rules = grammar.rules self.rules = grammar.rules
self.validate_rule_names()
if "trailer" not in grammar.metas and "start" not in self.rules: if "trailer" not in grammar.metas and "start" not in self.rules:
raise GrammarError("Grammar without a trailer must have a 'start' rule") raise GrammarError("Grammar without a trailer must have a 'start' rule")
checker = RuleCheckingVisitor(self.rules, self.tokens) checker = RuleCheckingVisitor(self.rules, self.tokens)
@@ -51,6 +57,11 @@ class ParserGenerator:
self.all_rules: Dict[str, Rule] = {} # Rules + temporal rules self.all_rules: Dict[str, Rule] = {} # Rules + temporal rules
self._local_variable_stack: List[List[str]] = [] self._local_variable_stack: List[List[str]] = []
def validate_rule_names(self):
for rule in self.rules:
if rule.startswith("_"):
raise GrammarError(f"Rule names cannot start with underscore: '{rule}'")
@contextlib.contextmanager @contextlib.contextmanager
def local_variable_context(self) -> Iterator[None]: def local_variable_context(self) -> Iterator[None]:
self._local_variable_stack.append([]) self._local_variable_stack.append([])