bpo-40335: Correctly handle multi-line strings in tokenize error scenarios (GH-19619)

Co-authored-by: Guido van Rossum <gvanrossum@gmail.com>
2020-04-21 01:53:04 +01:00 · 2020-04-21 01:53:04 +01:00 · 11a7f158ef
parent 6a9e80a931
commit 11a7f158ef
3 changed files with 37 additions and 23 deletions
--- a/Lib/test/test_exceptions.py
+++ b/Lib/test/test_exceptions.py
@@ -188,7 +188,7 @@ class ExceptionTests(unittest.TestCase):
                if not isinstance(src, str):
                    src = src.decode(encoding, 'replace')
                line = src.split('\n')[lineno-1]
-                self.assertEqual(cm.exception.text.rstrip('\n'), line)
+                self.assertIn(line, cm.exception.text)

        check('def fact(x):\n\treturn x!\n', 2, 10)
        check('1 +\n', 1, 4)
@@ -217,6 +217,16 @@ class ExceptionTests(unittest.TestCase):
        check(b'\xce\xb1 = 0xI', 1, 6)
        check(b'# -*- coding: iso8859-7 -*-\n\xe1 = 0xI', 2, 6,
              encoding='iso8859-7')
+        check(b"""if 1:
+            def foo():
+                '''
+
+            def bar():
+                pass
+
+            def baz():
+                '''quux'''
+            """, 9, 20)

        # Errors thrown by symtable.c
        check('x = [(yield i) for i in range(3)]', 1, 5)
--- a/Parser/parsetok.c
+++ b/Parser/parsetok.c
@@ -251,25 +251,7 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
        const char *line_start;

        type = PyTokenizer_Get(tok, &a, &b);
-        if (type == ERRORTOKEN) {
-            err_ret->error = tok->done;
-            break;
-        }
-        if (type == ENDMARKER && started) {
-            type = NEWLINE; /* Add an extra newline */
-            started = 0;
-            /* Add the right number of dedent tokens,
-               except if a certain flag is given --
-               codeop.py uses this. */
-            if (tok->indent &&
-                !(*flags & PyPARSE_DONT_IMPLY_DEDENT))
-            {
-                tok->pendin = -tok->indent;
-                tok->indent = 0;
-            }
-        }
-        else
-            started = 1;
+
        len = (a != NULL && b != NULL) ? b - a : 0;
        str = (char *) PyObject_MALLOC(len + 1);
        if (str == NULL) {
@@ -328,6 +310,27 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
            continue;
        }

+        if (type == ERRORTOKEN) {
+            err_ret->error = tok->done;
+            break;
+        }
+        if (type == ENDMARKER && started) {
+            type = NEWLINE; /* Add an extra newline */
+            started = 0;
+            /* Add the right number of dedent tokens,
+               except if a certain flag is given --
+               codeop.py uses this. */
+            if (tok->indent &&
+                !(*flags & PyPARSE_DONT_IMPLY_DEDENT))
+            {
+                tok->pendin = -tok->indent;
+                tok->indent = 0;
+            }
+        }
+        else {
+            started = 1;
+        }
+
        if ((err_ret->error =
             PyParser_AddToken(ps, (int)type, str,
                               lineno, col_offset, tok->lineno, end_col_offset,
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1392,13 +1392,14 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
        if (nonascii && !verify_identifier(tok)) {
            return ERRORTOKEN;
        }
+
+        *p_start = tok->start;
+        *p_end = tok->cur;
+
        if (c == '"' || c == '\'') {
            tok->done = E_BADPREFIX;
            return ERRORTOKEN;
        }
-        *p_start = tok->start;
-        *p_end = tok->cur;
-
        /* async/await parsing block. */
        if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
            /* May be an 'async' or 'await' token.  For Python 3.7 or