bpo-43822: Improve syntax errors for missing commas (GH-25377)

2021-04-15 21:38:45 +01:00 · 2021-04-15 21:38:45 +01:00 · b280248be8
parent e692f55979
commit b280248be8
13 changed files with 1235 additions and 1034 deletions
--- a/Doc/library/token-list.inc
+++ b/Doc/library/token-list.inc
@ -211,6 +211,8 @@

 .. data:: TYPE_COMMENT

+.. data:: SOFT_KEYWORD
+
 .. data:: ERRORTOKEN

 .. data:: N_TOKENS
--- a/Grammar/Tokens
+++ b/Grammar/Tokens
@ -59,6 +59,7 @@ AWAIT
 ASYNC
 TYPE_IGNORE
 TYPE_COMMENT
+SOFT_KEYWORD
 ERRORTOKEN

 # These aren't used by the C tokenizer but are needed for tokenize.py
--- a/Grammar/python.gram
+++ b/Grammar/python.gram
@ -7,6 +7,7 @@ _PyPegen_parse(Parser *p)
    // Initialize keywords
    p->keywords = reserved_keywords;
    p->n_keyword_lists = n_keyword_lists;
+    p->soft_keywords = soft_keywords;

    // Run parser
    void *result = NULL;
@ -459,6 +460,7 @@ expressions[expr_ty]:
    | a=expression ',' { _PyAST_Tuple(CHECK(asdl_expr_seq*, _PyPegen_singleton_seq(p, a)), Load, EXTRA) }
    | expression
 expression[expr_ty] (memo):
+    | invalid_expression
    | a=disjunction 'if' b=disjunction 'else' c=expression { _PyAST_IfExp(b, a, c, EXTRA) }
    | disjunction
    | lambdef
@ -778,6 +780,13 @@ invalid_kwarg:
    | expression a='=' {
        RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
            a, "expression cannot contain assignment, perhaps you meant \"==\"?") }
+
+invalid_expression:
+    # Error rule: a disjunction immediately followed by another expression is
+    # reported as a probable missing comma between the two.
+    # The !(NAME STRING) lookahead keeps this error from being shown for some invalid string prefixes like: kf"dsfsdf"
+    # Soft keywords are excluded by the same lookahead because they can be parsed as NAME NAME
+    | !(NAME STRING | SOFT_KEYWORD) a=disjunction expression {
+        RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, a->lineno, a->end_col_offset - 1, "invalid syntax. Perhaps you forgot a comma?") }
+
 invalid_named_expression:
    | a=expression ':=' expression {
        RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
--- a/Include/token.h
+++ b/Include/token.h
@ -69,8 +69,9 @@ extern "C" {
 #define ASYNC           56
 #define TYPE_IGNORE     57
 #define TYPE_COMMENT    58
-#define ERRORTOKEN      59
-#define N_TOKENS        63
+#define SOFT_KEYWORD    59
+#define ERRORTOKEN      60
+#define N_TOKENS        64
 #define NT_OFFSET       256

 /* Special definitions for cooperation with parser */
--- a/Lib/test/test_genexps.py
+++ b/Lib/test/test_genexps.py
@ -103,7 +103,7 @@ Verify that parenthesis are required when used as a keyword argument value
    >>> dict(a = i for i in range(10))
    Traceback (most recent call last):
       ...
-    SyntaxError: invalid syntax
+    SyntaxError: invalid syntax. Maybe you meant '==' or ':=' instead of '='?

 Verify that parenthesis are required when used as a keyword argument value

--- a/Lib/test/test_syntax.py
+++ b/Lib/test/test_syntax.py
@ -248,22 +248,36 @@ SyntaxError: did you forget parentheses around the comprehension target?

 # Missing commas in literals collections should not
 # produce special error messages regarding missing
-# parentheses
+# parentheses, but about missing commas instead

 >>> [1, 2 3]
 Traceback (most recent call last):
-SyntaxError: invalid syntax
+SyntaxError: invalid syntax. Perhaps you forgot a comma?

 >>> {1, 2 3}
 Traceback (most recent call last):
-SyntaxError: invalid syntax
+SyntaxError: invalid syntax. Perhaps you forgot a comma?

 >>> {1:2, 2:5 3:12}
 Traceback (most recent call last):
-SyntaxError: invalid syntax
+SyntaxError: invalid syntax. Perhaps you forgot a comma?

 >>> (1, 2 3)
 Traceback (most recent call last):
+SyntaxError: invalid syntax. Perhaps you forgot a comma?
+
+# Make sure soft keyword constructs don't raise specialized
+# errors regarding missing commas
+
+>>> match x:
+...     y = 3
+Traceback (most recent call last):
+SyntaxError: invalid syntax
+
+>>> match x:
+...     case y:
+...        3 $ 3
+Traceback (most recent call last):
 SyntaxError: invalid syntax

 From compiler_complex_args():
@ -864,7 +878,7 @@ leading to spurious errors.
   SyntaxError: cannot assign to attribute here. Maybe you meant '==' instead of '='?

 Ensure that early = are not matched by the parser as invalid comparisons
-   >>> f(2, 4, x=34); {1,2 a}
+   >>> f(2, 4, x=34); 1 $ 2
   Traceback (most recent call last):
   SyntaxError: invalid syntax

--- a/Lib/token.py
+++ b/Lib/token.py
@ -62,12 +62,13 @@ AWAIT = 55
 ASYNC = 56
 TYPE_IGNORE = 57
 TYPE_COMMENT = 58
+SOFT_KEYWORD = 59
 # These aren't used by the C tokenizer but are needed for tokenize.py
-ERRORTOKEN = 59
-COMMENT = 60
-NL = 61
-ENCODING = 62
-N_TOKENS = 63
+ERRORTOKEN = 60
+COMMENT = 61
+NL = 62
+ENCODING = 63
+N_TOKENS = 64
 # Special definitions for cooperation with parser
 NT_OFFSET = 256

--- a/Builtins/2021-04-13-02-32-18.bpo-43822.lej0OO.rst
+++ b/Builtins/2021-04-13-02-32-18.bpo-43822.lej0OO.rst
@ -0,0 +1,2 @@
+Improve syntax errors in the parser for missing commas between expressions.
+Patch by Pablo Galindo.
--- a/Parser/parser.c
+++ b/Parser/parser.c
--- a/Parser/pegen.c
+++ b/Parser/pegen.c
@ -943,6 +943,23 @@ _PyPegen_string_token(Parser *p)
    return _PyPegen_expect_token(p, STRING);
 }

+
+/* Match a NAME token whose text is one of the grammar's soft keywords.
+ *
+ * Expects a NAME token at the current parser position and compares its bytes
+ * against every entry of the NULL-terminated p->soft_keywords array.
+ *
+ * Returns the result of _PyPegen_name_token(p) when the token text is exactly
+ * equal to a soft keyword; returns NULL when the next token is not a NAME or
+ * when its text is not a soft keyword.
+ */
+expr_ty _PyPegen_soft_keyword_token(Parser *p) {
+    Token *t = _PyPegen_expect_token(p, NAME);
+    if (t == NULL) {
+        return NULL;
+    }
+    char *the_token;
+    Py_ssize_t size;
+    PyBytes_AsStringAndSize(t->bytes, &the_token, &size);
+    for (char **keyword = p->soft_keywords; *keyword != NULL; keyword++) {
+        /* Require equal lengths first: strncmp() on its own only compares the
+         * first `size` bytes, so a NAME that is merely a prefix of a soft
+         * keyword would otherwise be accepted as that keyword. */
+        if (strlen(*keyword) == (size_t)size
+            && strncmp(*keyword, the_token, (size_t)size) == 0) {
+            /* NOTE(review): the NAME was already matched by the
+             * _PyPegen_expect_token() call above; confirm that calling
+             * _PyPegen_name_token() here does not expect a second NAME. */
+            return _PyPegen_name_token(p);
+        }
+    }
+    return NULL;
+}
+
 static PyObject *
 parsenumber_raw(const char *s)
 {
@ -1151,6 +1168,7 @@ _PyPegen_Parser_New(struct tok_state *tok, int start_rule, int flags,
    p->tok = tok;
    p->keywords = NULL;
    p->n_keyword_lists = -1;
+    p->soft_keywords = NULL;
    p->tokens = PyMem_Malloc(sizeof(Token *));
    if (!p->tokens) {
        PyMem_Free(p);
--- a/Parser/pegen.h
+++ b/Parser/pegen.h
@ -59,6 +59,7 @@ typedef struct {
    int fill, size;
    PyArena *arena;
    KeywordToken **keywords;
+    char **soft_keywords;
    int n_keyword_lists;
    int start_rule;
    int *errcode;
@ -125,6 +126,7 @@ int _PyPegen_lookahead(int, void *(func)(Parser *), Parser *);
 Token *_PyPegen_expect_token(Parser *p, int type);
 Token *_PyPegen_expect_forced_token(Parser *p, int type, const char* expected);
 expr_ty _PyPegen_expect_soft_keyword(Parser *p, const char *keyword);
+expr_ty _PyPegen_soft_keyword_token(Parser *p);
 Token *_PyPegen_get_last_nonnwhitespace_token(Parser *);
 int _PyPegen_fill_token(Parser *p);
 expr_ty _PyPegen_name_token(Parser *p);
--- a/Parser/token.c
+++ b/Parser/token.c
@ -65,6 +65,7 @@ const char * const _PyParser_TokenNames[] = {
    "ASYNC",
    "TYPE_IGNORE",
    "TYPE_COMMENT",
+    "SOFT_KEYWORD",
    "<ERRORTOKEN>",
    "<COMMENT>",
    "<NL>",
--- a/Tools/peg_generator/pegen/c_generator.py
+++ b/Tools/peg_generator/pegen/c_generator.py
@ -46,6 +46,7 @@ _PyPegen_parse(Parser *p)
    // Initialize keywords
    p->keywords = reserved_keywords;
    p->n_keyword_lists = n_keyword_lists;
+    p->soft_keywords = soft_keywords;

    return start_rule(p);
 }
@ -66,6 +67,7 @@ BASE_NODETYPES = {
    "NAME": NodeTypes.NAME_TOKEN,
    "NUMBER": NodeTypes.NUMBER_TOKEN,
    "STRING": NodeTypes.STRING_TOKEN,
+    "SOFT_KEYWORD": NodeTypes.SOFT_KEYWORD,
 }


@ -411,6 +413,7 @@ class CParserGenerator(ParserGenerator, GrammarVisitor):
        if subheader:
            self.print(subheader)
        self._setup_keywords()
+        self._setup_soft_keywords()
        for i, (rulename, rule) in enumerate(self.todo.items(), 1000):
            comment = "  // Left-recursive" if rule.left_recursive else ""
            self.print(f"#define {rulename}_type {i}{comment}")
@ -474,6 +477,15 @@ class CParserGenerator(ParserGenerator, GrammarVisitor):
                    self.print("},")
        self.print("};")

+    def _setup_soft_keywords(self) -> None:
+        soft_keywords = sorted(self.callmakervisitor.soft_keywords)
+        self.print("static char *soft_keywords[] = {")
+        with self.indent():
+            for keyword in soft_keywords:
+                self.print(f'"{keyword}",')
+            self.print("NULL,")
+        self.print("};")
+
    def _set_up_token_start_metadata_extraction(self) -> None:
        self.print("if (p->mark == p->fill && _PyPegen_fill_token(p) < 0) {")
        with self.indent():