bpo-40593: Improve syntax errors for invalid characters in source code. (GH-20033)

This commit is contained in:
Serhiy Storchaka 2020-05-12 12:42:04 +03:00 committed by GitHub
parent f3a5b7ada0
commit 74ea6b5a75
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 92 additions and 45 deletions

View File

@ -1222,6 +1222,8 @@ PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void);
and where the hash values are equal (i.e. a very probable match) */ and where the hash values are equal (i.e. a very probable match) */
PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *); PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *);
PyAPI_FUNC(Py_ssize_t) _PyUnicode_ScanIdentifier(PyObject *);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View File

@ -29,7 +29,6 @@ extern "C" {
#define E_EOFS 23 /* EOF in triple-quoted string */ #define E_EOFS 23 /* EOF in triple-quoted string */
#define E_EOLS 24 /* EOL in single-quoted string */ #define E_EOLS 24 /* EOL in single-quoted string */
#define E_LINECONT 25 /* Unexpected characters after a line continuation */ #define E_LINECONT 25 /* Unexpected characters after a line continuation */
#define E_IDENTIFIER 26 /* Invalid characters in identifier */
#define E_BADSINGLE 27 /* Ill-formed single statement input */ #define E_BADSINGLE 27 /* Ill-formed single statement input */
#ifdef __cplusplus #ifdef __cplusplus

View File

@ -583,7 +583,7 @@ non-important content
]) ])
# Different error message is raised for other whitespace characters. # Different error message is raised for other whitespace characters.
self.assertAllRaise(SyntaxError, 'invalid character in identifier', self.assertAllRaise(SyntaxError, r"invalid non-printable character U\+00A0",
["f'''{\xa0}'''", ["f'''{\xa0}'''",
"\xa0", "\xa0",
]) ])

View File

@ -57,6 +57,9 @@ class MiscSourceEncodingTest(unittest.TestCase):
# one byte in common with the UTF-16-LE BOM # one byte in common with the UTF-16-LE BOM
self.assertRaises(SyntaxError, eval, b'\xff\x20') self.assertRaises(SyntaxError, eval, b'\xff\x20')
# one byte in common with the UTF-8 BOM
self.assertRaises(SyntaxError, eval, b'\xef\x20')
# two bytes in common with the UTF-8 BOM # two bytes in common with the UTF-8 BOM
self.assertRaises(SyntaxError, eval, b'\xef\xbb\x20') self.assertRaises(SyntaxError, eval, b'\xef\xbb\x20')

View File

@ -20,9 +20,11 @@ class PEP3131Test(unittest.TestCase):
def test_invalid(self): def test_invalid(self):
try: try:
from test import badsyntax_3131 from test import badsyntax_3131
except SyntaxError as s: except SyntaxError as err:
self.assertEqual(str(s), self.assertEqual(str(err),
"invalid character in identifier (badsyntax_3131.py, line 2)") "invalid character '€' (U+20AC) (badsyntax_3131.py, line 2)")
self.assertEqual(err.lineno, 2)
self.assertEqual(err.offset, 1)
else: else:
self.fail("expected exception didn't occur") self.fail("expected exception didn't occur")

View File

@ -0,0 +1 @@
Improved syntax errors for invalid characters in source code.

View File

@ -12309,31 +12309,22 @@ unicode_isnumeric_impl(PyObject *self)
Py_RETURN_TRUE; Py_RETURN_TRUE;
} }
int Py_ssize_t
PyUnicode_IsIdentifier(PyObject *self) _PyUnicode_ScanIdentifier(PyObject *self)
{ {
Py_ssize_t i; Py_ssize_t i;
int ready = PyUnicode_IS_READY(self); if (PyUnicode_READY(self) == -1)
return -1;
Py_ssize_t len = ready ? PyUnicode_GET_LENGTH(self) : PyUnicode_GET_SIZE(self); Py_ssize_t len = PyUnicode_GET_LENGTH(self);
if (len == 0) { if (len == 0) {
/* an empty string is not a valid identifier */ /* an empty string is not a valid identifier */
return 0; return 0;
} }
int kind = 0; int kind = PyUnicode_KIND(self);
const void *data = NULL; const void *data = PyUnicode_DATA(self);
const wchar_t *wstr = NULL; Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Py_UCS4 ch;
if (ready) {
kind = PyUnicode_KIND(self);
data = PyUnicode_DATA(self);
ch = PyUnicode_READ(kind, data, 0);
}
else {
wstr = _PyUnicode_WSTR(self);
ch = wstr[0];
}
/* PEP 3131 says that the first character must be in /* PEP 3131 says that the first character must be in
XID_Start and subsequent characters in XID_Continue, XID_Start and subsequent characters in XID_Continue,
and for the ASCII range, the 2.x rules apply (i.e and for the ASCII range, the 2.x rules apply (i.e
@ -12347,18 +12338,45 @@ PyUnicode_IsIdentifier(PyObject *self)
} }
for (i = 1; i < len; i++) { for (i = 1; i < len; i++) {
if (ready) {
ch = PyUnicode_READ(kind, data, i); ch = PyUnicode_READ(kind, data, i);
if (!_PyUnicode_IsXidContinue(ch)) {
return i;
}
}
return i;
}
int
PyUnicode_IsIdentifier(PyObject *self)
{
if (PyUnicode_IS_READY(self)) {
Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
Py_ssize_t len = PyUnicode_GET_LENGTH(self);
/* an empty string is not a valid identifier */
return len && i == len;
} }
else { else {
ch = wstr[i]; Py_ssize_t i, len = PyUnicode_GET_SIZE(self);
if (len == 0) {
/* an empty string is not a valid identifier */
return 0;
} }
const wchar_t *wstr = _PyUnicode_WSTR(self);
Py_UCS4 ch = wstr[0];
if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
return 0;
}
for (i = 1; i < len; i++) {
ch = wstr[i];
if (!_PyUnicode_IsXidContinue(ch)) { if (!_PyUnicode_IsXidContinue(ch)) {
return 0; return 0;
} }
} }
return 1; return 1;
} }
}
/*[clinic input] /*[clinic input]
str.isidentifier as unicode_isidentifier str.isidentifier as unicode_isidentifier

View File

@ -337,9 +337,6 @@ tokenizer_error(Parser *p)
case E_TOKEN: case E_TOKEN:
msg = "invalid token"; msg = "invalid token";
break; break;
case E_IDENTIFIER:
msg = "invalid character in identifier";
break;
case E_EOFS: case E_EOFS:
RAISE_SYNTAX_ERROR("EOF while scanning triple-quoted string literal"); RAISE_SYNTAX_ERROR("EOF while scanning triple-quoted string literal");
return -1; return -1;

View File

@ -1101,25 +1101,53 @@ static int
verify_identifier(struct tok_state *tok) verify_identifier(struct tok_state *tok)
{ {
PyObject *s; PyObject *s;
int result;
if (tok->decoding_erred) if (tok->decoding_erred)
return 0; return 0;
s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL); s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
if (s == NULL) { if (s == NULL) {
if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) { if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
PyErr_Clear(); tok->done = E_DECODE;
tok->done = E_IDENTIFIER; }
} else { else {
tok->done = E_ERROR; tok->done = E_ERROR;
} }
return 0; return 0;
} }
result = PyUnicode_IsIdentifier(s); Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
if (invalid < 0) {
Py_DECREF(s); Py_DECREF(s);
if (result == 0) { tok->done = E_ERROR;
tok->done = E_IDENTIFIER; return 0;
} }
return result; assert(PyUnicode_GET_LENGTH(s) > 0);
if (invalid < PyUnicode_GET_LENGTH(s)) {
Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
/* Determine the offset in UTF-8 encoded input */
Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
if (s != NULL) {
Py_SETREF(s, PyUnicode_AsUTF8String(s));
}
if (s == NULL) {
tok->done = E_ERROR;
return 0;
}
tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
}
Py_DECREF(s);
// PyUnicode_FromFormatV() does not support %X
char hex[9];
snprintf(hex, sizeof(hex), "%04X", ch);
if (Py_UNICODE_ISPRINTABLE(ch)) {
syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
}
else {
syntaxerror(tok, "invalid non-printable character U+%s", hex);
}
return 0;
}
Py_DECREF(s);
return 1;
} }
static int static int

View File

@ -1603,9 +1603,6 @@ err_input(perrdetail *err)
msg = "unexpected character after line continuation character"; msg = "unexpected character after line continuation character";
break; break;
case E_IDENTIFIER:
msg = "invalid character in identifier";
break;
case E_BADSINGLE: case E_BADSINGLE:
msg = "multiple statements found while compiling a single statement"; msg = "multiple statements found while compiling a single statement";
break; break;