bpo-39219: Fix SyntaxError attributes in the tokenizer. (GH-17828)

* Always set the text attribute.
* Correct the offset attribute for non-ASCII sources.

(cherry picked from commit 0cc6b5e559)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
2020-02-12 02:35:10 -08:00 · 2020-02-12 02:35:10 -08:00 · efd878cdb4
parent 0b8f738eb3
commit efd878cdb4
3 changed files with 47 additions and 5 deletions
--- a/Lib/test/test_exceptions.py
+++ b/Lib/test/test_exceptions.py
@ -179,17 +179,25 @@ class ExceptionTests(unittest.TestCase):
        ckmsg(s, "inconsistent use of tabs and spaces in indentation", TabError)

    def testSyntaxErrorOffset(self):
-        def check(src, lineno, offset):
+        def check(src, lineno, offset, encoding='utf-8'):
            with self.assertRaises(SyntaxError) as cm:
                compile(src, '<fragment>', 'exec')
            self.assertEqual(cm.exception.lineno, lineno)
            self.assertEqual(cm.exception.offset, offset)
+            if cm.exception.text is not None:
+                if not isinstance(src, str):
+                    src = src.decode(encoding, 'replace')
+                line = src.split('\n')[lineno-1]
+                self.assertEqual(cm.exception.text.rstrip('\n'), line)

        check('def fact(x):\n\treturn x!\n', 2, 10)
        check('1 +\n', 1, 4)
        check('def spam():\n  print(1)\n print(2)', 3, 10)
        check('Python = "Python" +', 1, 20)
        check('Python = "\u1e54\xfd\u0163\u0125\xf2\xf1" +', 1, 20)
+        check(b'# -*- coding: cp1251 -*-\nPython = "\xcf\xb3\xf2\xee\xed" +',
+              2, 19, encoding='cp1251')
+        check(b'Python = "\xcf\xb3\xf2\xee\xed" +', 1, 18)
        check('x = "a', 1, 7)
        check('lambda x: x = 2', 1, 1)

@ -205,6 +213,10 @@ class ExceptionTests(unittest.TestCase):
        check('0010 + 2', 1, 4)
        check('x = 32e-+4', 1, 8)
        check('x = 0o9', 1, 6)
+        check('\u03b1 = 0xI', 1, 6)
+        check(b'\xce\xb1 = 0xI', 1, 6)
+        check(b'# -*- coding: iso8859-7 -*-\n\xe1 = 0xI', 2, 6,
+              encoding='iso8859-7')

        # Errors thrown by symtable.c
        check('x = [(yield i) for i in range(3)]', 1, 5)
--- a/Builtins/2020-01-05-13-36-08.bpo-39219.uHtKd4.rst
+++ b/Builtins/2020-01-05-13-36-08.bpo-39219.uHtKd4.rst
@ -0,0 +1,2 @@
+Syntax errors raised in the tokenizer now always set correct "text" and
+"offset" attributes.
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@ -1,6 +1,7 @@

 /* Tokenizer implementation */

+#define PY_SSIZE_T_CLEAN
 #include "Python.h"

 #include <ctype.h>
@ -1034,17 +1035,44 @@ tok_backup(struct tok_state *tok, int c)
 static int
 syntaxerror(struct tok_state *tok, const char *format, ...)
 {
+    PyObject *errmsg, *errtext, *args;  /* sets SyntaxError with message, location and source text */
    va_list vargs;
 #ifdef HAVE_STDARG_PROTOTYPES
    va_start(vargs, format);
 #else
    va_start(vargs);
 #endif
-    PyErr_FormatV(PyExc_SyntaxError, format, vargs);
+    errmsg = PyUnicode_FromFormatV(format, vargs);  /* only formats the message; the exception is set further down */
    va_end(vargs);
-    PyErr_SyntaxLocationObject(tok->filename,
-                               tok->lineno,
-                               (int)(tok->cur - tok->line_start));
+    if (!errmsg) {
+        goto error;
+    }
+
+    errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
+                                   "replace");  /* decode just the bytes from line start up to the cursor */
+    if (!errtext) {
+        goto error;
+    }
+    int offset = (int)PyUnicode_GET_LENGTH(errtext);  /* length of the decoded prefix, i.e. code points rather than bytes */
+    Py_ssize_t line_len = strcspn(tok->line_start, "\n");  /* byte count up to the first '\n' or the terminating NUL */
+    if (line_len != tok->cur - tok->line_start) {  /* cursor is not at that position: decode the whole line instead */
+        Py_DECREF(errtext);
+        errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
+                                       "replace");
+    }
+    if (!errtext) {  /* only the second decode can leave errtext NULL here */
+        goto error;
+    }
+
+    args = Py_BuildValue("(O(OiiN))", errmsg,
+                         tok->filename, tok->lineno, offset, errtext);  /* "N" gives away the errtext reference; "O" does not */
+    if (args) {
+        PyErr_SetObject(PyExc_SyntaxError, args);  /* args is (errmsg, (filename, lineno, offset, errtext)) */
+        Py_DECREF(args);
+    }
+
+error:  /* the success path falls through to here as well */
+    Py_XDECREF(errmsg);  /* errmsg is NULL when formatting failed, hence XDECREF */
    tok->done = E_ERROR;
    return ERRORTOKEN;
 }