From 76c1b4d5c5a610c09943e1ee7ae18f1957804730 Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Fri, 1 May 2020 16:13:43 +0300 Subject: [PATCH] bpo-40334: Improve column offsets for thrown syntax errors by Pegen (GH-19782) --- Grammar/python.gram | 2 +- Lib/test/test_cmd_line_script.py | 6 +- Lib/test/test_exceptions.py | 42 +++++----- Parser/pegen/parse.c | 2 +- Parser/pegen/pegen.c | 129 ++++++++++++------------------- Parser/pegen/pegen.h | 11 +-- 6 files changed, 80 insertions(+), 112 deletions(-) diff --git a/Grammar/python.gram b/Grammar/python.gram index 38107fcf735..3813d8845be 100644 --- a/Grammar/python.gram +++ b/Grammar/python.gram @@ -609,7 +609,7 @@ invalid_assignment: | expression ':' expression ['=' annotated_rhs] { RAISE_SYNTAX_ERROR("illegal target for annotation") } | a=expression ('=' | augassign) (yield_expr | star_expressions) { - RAISE_SYNTAX_ERROR("cannot assign to %s", _PyPegen_get_expr_name(a)) } + RAISE_SYNTAX_ERROR_NO_COL_OFFSET("cannot assign to %s", _PyPegen_get_expr_name(a)) } invalid_block: | NEWLINE !INDENT { RAISE_INDENTATION_ERROR("expected an indented block") } invalid_comprehension: diff --git a/Lib/test/test_cmd_line_script.py b/Lib/test/test_cmd_line_script.py index f0130e376ae..1fc9500738f 100644 --- a/Lib/test/test_cmd_line_script.py +++ b/Lib/test/test_cmd_line_script.py @@ -599,7 +599,7 @@ class CmdLineTest(unittest.TestCase): exitcode, stdout, stderr = assert_python_failure(script_name) text = io.TextIOWrapper(io.BytesIO(stderr), 'ascii').read() # Confirm that the caret is located under the first 1 character - self.assertIn("\n 1 + 1 = 2\n ^", text) + self.assertIn("\n 1 + 1 = 2\n ^", text) def test_syntaxerror_indented_caret_position(self): script = textwrap.dedent("""\ @@ -611,7 +611,7 @@ class CmdLineTest(unittest.TestCase): exitcode, stdout, stderr = assert_python_failure(script_name) text = io.TextIOWrapper(io.BytesIO(stderr), 'ascii').read() # Confirm that the caret is located under the first 1 character - self.assertIn("\n 1 + 1 = 2\n ^", text) + self.assertIn("\n 1 + 1 = 2\n ^", text) # Try the same with a form feed at the start of the indented line script = ( @@ -622,7 +622,7 @@ class CmdLineTest(unittest.TestCase): exitcode, stdout, stderr = assert_python_failure(script_name) text = io.TextIOWrapper(io.BytesIO(stderr), "ascii").read() self.assertNotIn("\f", text) - self.assertIn("\n 1 + 1 = 2\n ^", text) + self.assertIn("\n 1 + 1 = 2\n ^", text) def test_syntaxerror_multi_line_fstring(self): script = 'foo = f"""{}\nfoo"""\n' diff --git a/Lib/test/test_exceptions.py b/Lib/test/test_exceptions.py index a207fb48632..354b3f48437 100644 --- a/Lib/test/test_exceptions.py +++ b/Lib/test/test_exceptions.py @@ -178,19 +178,19 @@ class ExceptionTests(unittest.TestCase): s = '''if True:\n print()\n\texec "mixed tabs and spaces"''' ckmsg(s, "inconsistent use of tabs and spaces in indentation", TabError) - @support.skip_if_new_parser("Pegen column offsets might be different") - def testSyntaxErrorOffset(self): - def check(src, lineno, offset, encoding='utf-8'): - with self.assertRaises(SyntaxError) as cm: - compile(src, '', 'exec') - self.assertEqual(cm.exception.lineno, lineno) - self.assertEqual(cm.exception.offset, offset) - if cm.exception.text is not None: - if not isinstance(src, str): - src = src.decode(encoding, 'replace') - line = src.split('\n')[lineno-1] - self.assertIn(line, cm.exception.text) + def check(self, src, lineno, offset, encoding='utf-8'): + with self.assertRaises(SyntaxError) as cm: + compile(src, '', 'exec') + self.assertEqual(cm.exception.lineno, lineno) + self.assertEqual(cm.exception.offset, offset) + if cm.exception.text is not None: + if not isinstance(src, str): + src = src.decode(encoding, 'replace') + line = src.split('\n')[lineno-1] + self.assertIn(line, cm.exception.text) + def testSyntaxErrorOffset(self): + check = self.check check('def fact(x):\n\treturn x!\n', 2, 10) check('1 +\n', 1, 4) check('def spam():\n print(1)\n print(2)', 3, 10) @@ -238,20 +238,20 @@ class ExceptionTests(unittest.TestCase): check('nonlocal x', 1, 1) check('def f():\n global x\n nonlocal x', 2, 3) - # Errors thrown by ast.c - check('for 1 in []: pass', 1, 5) - check('def f(*):\n pass', 1, 7) - check('[*x for x in xs]', 1, 2) - check('def f():\n x, y: int', 2, 3) - check('(yield i) = 2', 1, 1) - check('foo(x for x in range(10), 100)', 1, 5) - check('foo(1=2)', 1, 5) - # Errors thrown by future.c check('from __future__ import doesnt_exist', 1, 1) check('from __future__ import braces', 1, 1) check('x=1\nfrom __future__ import division', 2, 1) + @support.skip_if_new_parser("Pegen column offsets might be different") + def testSyntaxErrorOffsetCustom(self): + self.check('for 1 in []: pass', 1, 5) + self.check('def f(*):\n pass', 1, 7) + self.check('[*x for x in xs]', 1, 2) + self.check('def f():\n x, y: int', 2, 3) + self.check('(yield i) = 2', 1, 1) + self.check('foo(x for x in range(10), 100)', 1, 5) + self.check('foo(1=2)', 1, 5) @cpython_only def testSettingException(self): diff --git a/Parser/pegen/parse.c b/Parser/pegen/parse.c index 2be5e384ae5..33c92c232c5 100644 --- a/Parser/pegen/parse.c +++ b/Parser/pegen/parse.c @@ -10515,7 +10515,7 @@ invalid_assignment_rule(Parser *p) (_tmp_132_var = _tmp_132_rule(p)) ) { - res = RAISE_SYNTAX_ERROR ( "cannot assign to %s" , _PyPegen_get_expr_name ( a ) ); + res = RAISE_SYNTAX_ERROR_NO_COL_OFFSET ( "cannot assign to %s" , _PyPegen_get_expr_name ( a ) ); if (res == NULL && PyErr_Occurred()) { p->error_indicator = 1; return NULL; diff --git a/Parser/pegen/pegen.c b/Parser/pegen/pegen.c index 40c09ffcc3a..a7add8fbb14 100644 --- a/Parser/pegen/pegen.c +++ b/Parser/pegen/pegen.c @@ -145,11 +145,15 @@ byte_offset_to_character_offset(PyObject *line, int col_offset) if (!str) { return 0; } - PyObject *text = PyUnicode_DecodeUTF8(str, col_offset, NULL); + PyObject *text = PyUnicode_DecodeUTF8(str, col_offset, "replace"); if (!text) { return 0; } Py_ssize_t size = PyUnicode_GET_LENGTH(text); + str = PyUnicode_AsUTF8(text); + if (str != NULL && (int)strlen(str) == col_offset) { + size = strlen(str); + } Py_DECREF(text); return size; } @@ -297,68 +301,23 @@ error: } static inline PyObject * -get_error_line(char *buffer) +get_error_line(char *buffer, int is_file) { - char *newline = strchr(buffer, '\n'); + const char *newline; + if (is_file) { + newline = strrchr(buffer, '\n'); + } else { + newline = strchr(buffer, '\n'); + } + if (newline) { - return PyUnicode_FromStringAndSize(buffer, newline - buffer); + return PyUnicode_DecodeUTF8(buffer, newline - buffer, "replace"); } else { - return PyUnicode_FromString(buffer); + return PyUnicode_DecodeUTF8(buffer, strlen(buffer), "replace"); } } -static int -tokenizer_error_with_col_offset(Parser *p, PyObject *errtype, const char *errmsg) -{ - PyObject *errstr = NULL; - PyObject *value = NULL; - size_t col_number = -1; - - errstr = PyUnicode_FromString(errmsg); - if (!errstr) { - return -1; - } - - PyObject *loc = NULL; - if (p->start_rule == Py_file_input) { - loc = PyErr_ProgramTextObject(p->tok->filename, p->tok->lineno); - } - if (!loc) { - loc = get_error_line(p->tok->buf); - } - - if (loc) { - col_number = p->tok->cur - p->tok->buf; - } - else { - Py_INCREF(Py_None); - loc = Py_None; - } - - PyObject *tmp = Py_BuildValue("(OiiN)", p->tok->filename, p->tok->lineno, - col_number, loc); - if (!tmp) { - goto error; - } - - value = PyTuple_Pack(2, errstr, tmp); - Py_DECREF(tmp); - if (!value) { - goto error; - } - PyErr_SetObject(errtype, value); - - Py_XDECREF(value); - Py_XDECREF(errstr); - return -1; - -error: - Py_XDECREF(errstr); - Py_XDECREF(loc); - return -1; -} - static int tokenizer_error(Parser *p) { @@ -376,20 +335,20 @@ tokenizer_error(Parser *p) msg = "invalid character in identifier"; break; case E_BADPREFIX: - return tokenizer_error_with_col_offset(p, - errtype, "invalid string prefix"); + RAISE_SYNTAX_ERROR("invalid string prefix"); + return -1; case E_EOFS: - return tokenizer_error_with_col_offset(p, - errtype, "EOF while scanning triple-quoted string literal"); + RAISE_SYNTAX_ERROR("EOF while scanning triple-quoted string literal"); + return -1; case E_EOLS: - return tokenizer_error_with_col_offset(p, - errtype, "EOL while scanning string literal"); + RAISE_SYNTAX_ERROR("EOL while scanning string literal"); + return -1; case E_EOF: - return tokenizer_error_with_col_offset(p, - errtype, "unexpected EOF while parsing"); + RAISE_SYNTAX_ERROR("unexpected EOF while parsing"); + return -1; case E_DEDENT: - return tokenizer_error_with_col_offset(p, - PyExc_IndentationError, "unindent does not match any outer indentation level"); + RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level"); + return -1; case E_INTR: if (!PyErr_Occurred()) { PyErr_SetNone(PyExc_KeyboardInterrupt); @@ -421,14 +380,14 @@ tokenizer_error(Parser *p) } void * -_PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...) +_PyPegen_raise_error(Parser *p, PyObject *errtype, int with_col_number, const char *errmsg, ...) { PyObject *value = NULL; PyObject *errstr = NULL; PyObject *loc = NULL; PyObject *tmp = NULL; Token *t = p->tokens[p->fill - 1]; - Py_ssize_t col_number = 0; + Py_ssize_t col_number = !with_col_number; va_list va; va_start(va, errmsg); @@ -443,14 +402,20 @@ _PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...) } if (!loc) { - loc = get_error_line(p->tok->buf); + loc = get_error_line(p->tok->buf, p->start_rule == Py_file_input); } - if (loc) { - int col_offset = t->col_offset == -1 ? 0 : t->col_offset; - col_number = byte_offset_to_character_offset(loc, col_offset) + 1; + if (loc && with_col_number) { + int col_offset; + if (t->col_offset == -1) { + col_offset = Py_SAFE_DOWNCAST(p->tok->cur - p->tok->buf, + intptr_t, int); + } else { + col_offset = t->col_offset + 1; + } + col_number = byte_offset_to_character_offset(loc, col_offset); } - else { + else if (!loc) { Py_INCREF(Py_None); loc = Py_None; } @@ -632,14 +597,6 @@ _PyPegen_fill_token(Parser *p) type = PyTokenizer_Get(p->tok, &start, &end); } - if (type == ERRORTOKEN) { - if (p->tok->done == E_DECODE) { - return raise_decode_error(p); - } - else { - return tokenizer_error(p); - } - } if (type == ENDMARKER && p->start_rule == Py_single_input && p->parsing_started) { type = NEWLINE; /* Add an extra newline */ p->parsing_started = 0; @@ -700,6 +657,16 @@ _PyPegen_fill_token(Parser *p) t->end_col_offset = p->tok->lineno == 1 ? p->starting_col_offset + end_col_offset : end_col_offset; p->fill += 1; + + if (type == ERRORTOKEN) { + if (p->tok->done == E_DECODE) { + return raise_decode_error(p); + } + else { + return tokenizer_error(p); + } + } + return 0; } diff --git a/Parser/pegen/pegen.h b/Parser/pegen/pegen.h index 1620f926094..cbe6f197ac7 100644 --- a/Parser/pegen/pegen.h +++ b/Parser/pegen/pegen.h @@ -126,14 +126,15 @@ expr_ty _PyPegen_name_token(Parser *p); expr_ty _PyPegen_number_token(Parser *p); void *_PyPegen_string_token(Parser *p); const char *_PyPegen_get_expr_name(expr_ty); -void *_PyPegen_raise_error(Parser *p, PyObject *, const char *errmsg, ...); +void *_PyPegen_raise_error(Parser *p, PyObject *errtype, int with_col_number, const char *errmsg, ...); void *_PyPegen_dummy_name(Parser *p, ...); #define UNUSED(expr) do { (void)(expr); } while (0) #define EXTRA_EXPR(head, tail) head->lineno, head->col_offset, tail->end_lineno, tail->end_col_offset, p->arena #define EXTRA start_lineno, start_col_offset, end_lineno, end_col_offset, p->arena -#define RAISE_SYNTAX_ERROR(msg, ...) _PyPegen_raise_error(p, PyExc_SyntaxError, msg, ##__VA_ARGS__) -#define RAISE_INDENTATION_ERROR(msg, ...) _PyPegen_raise_error(p, PyExc_IndentationError, msg, ##__VA_ARGS__) +#define RAISE_SYNTAX_ERROR(msg, ...) _PyPegen_raise_error(p, PyExc_SyntaxError, 1, msg, ##__VA_ARGS__) +#define RAISE_INDENTATION_ERROR(msg, ...) _PyPegen_raise_error(p, PyExc_IndentationError, 1, msg, ##__VA_ARGS__) +#define RAISE_SYNTAX_ERROR_NO_COL_OFFSET(msg, ...) _PyPegen_raise_error(p, PyExc_SyntaxError, 0, msg, ##__VA_ARGS__) Py_LOCAL_INLINE(void *) CHECK_CALL(Parser *p, void *result) @@ -190,8 +191,8 @@ INVALID_VERSION_CHECK(Parser *p, int version, char *msg, void *node) } if (p->feature_version < version) { p->error_indicator = 1; - return _PyPegen_raise_error(p, PyExc_SyntaxError, "%s only supported in Python 3.%i and greater", - msg, version); + return RAISE_SYNTAX_ERROR("%s only supported in Python 3.%i and greater", + msg, version); } return node; }