From 0cc6b5e559b8303b18fdd56c2befd900fe7b5e35 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 12 Feb 2020 12:17:00 +0200 Subject: [PATCH] bpo-39219: Fix SyntaxError attributes in the tokenizer. (GH-17828) * Always set the text attribute. * Correct the offset attribute for non-ascii sources. --- Lib/test/test_exceptions.py | 14 +++++++- .../2020-01-05-13-36-08.bpo-39219.uHtKd4.rst | 2 ++ Parser/tokenizer.c | 36 ++++++++++++++++--- 3 files changed, 47 insertions(+), 5 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2020-01-05-13-36-08.bpo-39219.uHtKd4.rst diff --git a/Lib/test/test_exceptions.py b/Lib/test/test_exceptions.py index 4d1aa4bca62..22a22363a7d 100644 --- a/Lib/test/test_exceptions.py +++ b/Lib/test/test_exceptions.py @@ -179,17 +179,25 @@ class ExceptionTests(unittest.TestCase): ckmsg(s, "inconsistent use of tabs and spaces in indentation", TabError) def testSyntaxErrorOffset(self): - def check(src, lineno, offset): + def check(src, lineno, offset, encoding='utf-8'): with self.assertRaises(SyntaxError) as cm: compile(src, '', 'exec') self.assertEqual(cm.exception.lineno, lineno) self.assertEqual(cm.exception.offset, offset) + if cm.exception.text is not None: + if not isinstance(src, str): + src = src.decode(encoding, 'replace') + line = src.split('\n')[lineno-1] + self.assertEqual(cm.exception.text.rstrip('\n'), line) check('def fact(x):\n\treturn x!\n', 2, 10) check('1 +\n', 1, 4) check('def spam():\n print(1)\n print(2)', 3, 10) check('Python = "Python" +', 1, 20) check('Python = "\u1e54\xfd\u0163\u0125\xf2\xf1" +', 1, 20) + check(b'# -*- coding: cp1251 -*-\nPython = "\xcf\xb3\xf2\xee\xed" +', + 2, 19, encoding='cp1251') + check(b'Python = "\xcf\xb3\xf2\xee\xed" +', 1, 18) check('x = "a', 1, 7) check('lambda x: x = 2', 1, 1) @@ -205,6 +213,10 @@ class ExceptionTests(unittest.TestCase): check('0010 + 2', 1, 4) check('x = 32e-+4', 1, 8) check('x = 0o9', 1, 6) + check('\u03b1 = 0xI', 1, 6) + check(b'\xce\xb1 = 0xI', 1, 6) + 
check(b'# -*- coding: iso8859-7 -*-\n\xe1 = 0xI', 2, 6, + encoding='iso8859-7') # Errors thrown by symtable.c check('x = [(yield i) for i in range(3)]', 1, 5) diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-01-05-13-36-08.bpo-39219.uHtKd4.rst b/Misc/NEWS.d/next/Core and Builtins/2020-01-05-13-36-08.bpo-39219.uHtKd4.rst new file mode 100644 index 00000000000..dac8360df71 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2020-01-05-13-36-08.bpo-39219.uHtKd4.rst @@ -0,0 +1,2 @@ +Syntax errors raised in the tokenizer now always set correct "text" and +"offset" attributes. diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index c37cd927df5..630b0aaab03 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1,6 +1,7 @@ /* Tokenizer implementation */ +#define PY_SSIZE_T_CLEAN #include "Python.h" #include <ctype.h> @@ -1034,17 +1035,44 @@ tok_backup(struct tok_state *tok, int c) static int syntaxerror(struct tok_state *tok, const char *format, ...) { + PyObject *errmsg, *errtext, *args; va_list vargs; #ifdef HAVE_STDARG_PROTOTYPES va_start(vargs, format); #else va_start(vargs); #endif - PyErr_FormatV(PyExc_SyntaxError, format, vargs); + errmsg = PyUnicode_FromFormatV(format, vargs); va_end(vargs); - PyErr_SyntaxLocationObject(tok->filename, - tok->lineno, - (int)(tok->cur - tok->line_start)); + if (!errmsg) { + goto error; + } + + errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start, + "replace"); + if (!errtext) { + goto error; + } + int offset = (int)PyUnicode_GET_LENGTH(errtext); + Py_ssize_t line_len = strcspn(tok->line_start, "\n"); + if (line_len != tok->cur - tok->line_start) { + Py_DECREF(errtext); + errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len, + "replace"); + } + if (!errtext) { + goto error; + } + + args = Py_BuildValue("(O(OiiN))", errmsg, + tok->filename, tok->lineno, offset, errtext); + if (args) { + PyErr_SetObject(PyExc_SyntaxError, args); + Py_DECREF(args); + } + +error: + Py_XDECREF(errmsg); tok->done = E_ERROR;
return ERRORTOKEN; }