mirror of https://github.com/python/cpython
bpo-40593: Improve syntax errors for invalid characters in source code. (GH-20033)
This commit is contained in:
parent
f3a5b7ada0
commit
74ea6b5a75
|
@ -1222,6 +1222,8 @@ PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void);
|
|||
and where the hash values are equal (i.e. a very probable match) */
|
||||
PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *);
|
||||
|
||||
PyAPI_FUNC(Py_ssize_t) _PyUnicode_ScanIdentifier(PyObject *);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -29,7 +29,6 @@ extern "C" {
|
|||
#define E_EOFS 23 /* EOF in triple-quoted string */
|
||||
#define E_EOLS 24 /* EOL in single-quoted string */
|
||||
#define E_LINECONT 25 /* Unexpected characters after a line continuation */
|
||||
#define E_IDENTIFIER 26 /* Invalid characters in identifier */
|
||||
#define E_BADSINGLE 27 /* Ill-formed single statement input */
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
|
|
@ -583,7 +583,7 @@ non-important content
|
|||
])
|
||||
|
||||
# Different error message is raised for other whitespace characters.
|
||||
self.assertAllRaise(SyntaxError, 'invalid character in identifier',
|
||||
self.assertAllRaise(SyntaxError, r"invalid non-printable character U\+00A0",
|
||||
["f'''{\xa0}'''",
|
||||
"\xa0",
|
||||
])
|
||||
|
|
|
@ -57,6 +57,9 @@ class MiscSourceEncodingTest(unittest.TestCase):
|
|||
# one byte in common with the UTF-16-LE BOM
|
||||
self.assertRaises(SyntaxError, eval, b'\xff\x20')
|
||||
|
||||
# one byte in common with the UTF-8 BOM
|
||||
self.assertRaises(SyntaxError, eval, b'\xef\x20')
|
||||
|
||||
# two bytes in common with the UTF-8 BOM
|
||||
self.assertRaises(SyntaxError, eval, b'\xef\xbb\x20')
|
||||
|
||||
|
|
|
@ -20,9 +20,11 @@ class PEP3131Test(unittest.TestCase):
|
|||
def test_invalid(self):
|
||||
try:
|
||||
from test import badsyntax_3131
|
||||
except SyntaxError as s:
|
||||
self.assertEqual(str(s),
|
||||
"invalid character in identifier (badsyntax_3131.py, line 2)")
|
||||
except SyntaxError as err:
|
||||
self.assertEqual(str(err),
|
||||
"invalid character '€' (U+20AC) (badsyntax_3131.py, line 2)")
|
||||
self.assertEqual(err.lineno, 2)
|
||||
self.assertEqual(err.offset, 1)
|
||||
else:
|
||||
self.fail("expected exception didn't occur")
|
||||
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
Improved syntax errors for invalid characters in source code.
|
|
@ -12309,31 +12309,22 @@ unicode_isnumeric_impl(PyObject *self)
|
|||
Py_RETURN_TRUE;
|
||||
}
|
||||
|
||||
int
|
||||
PyUnicode_IsIdentifier(PyObject *self)
|
||||
Py_ssize_t
|
||||
_PyUnicode_ScanIdentifier(PyObject *self)
|
||||
{
|
||||
Py_ssize_t i;
|
||||
int ready = PyUnicode_IS_READY(self);
|
||||
if (PyUnicode_READY(self) == -1)
|
||||
return -1;
|
||||
|
||||
Py_ssize_t len = ready ? PyUnicode_GET_LENGTH(self) : PyUnicode_GET_SIZE(self);
|
||||
Py_ssize_t len = PyUnicode_GET_LENGTH(self);
|
||||
if (len == 0) {
|
||||
/* an empty string is not a valid identifier */
|
||||
return 0;
|
||||
}
|
||||
|
||||
int kind = 0;
|
||||
const void *data = NULL;
|
||||
const wchar_t *wstr = NULL;
|
||||
Py_UCS4 ch;
|
||||
if (ready) {
|
||||
kind = PyUnicode_KIND(self);
|
||||
data = PyUnicode_DATA(self);
|
||||
ch = PyUnicode_READ(kind, data, 0);
|
||||
}
|
||||
else {
|
||||
wstr = _PyUnicode_WSTR(self);
|
||||
ch = wstr[0];
|
||||
}
|
||||
int kind = PyUnicode_KIND(self);
|
||||
const void *data = PyUnicode_DATA(self);
|
||||
Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
|
||||
/* PEP 3131 says that the first character must be in
|
||||
XID_Start and subsequent characters in XID_Continue,
|
||||
and for the ASCII range, the 2.x rules apply (i.e
|
||||
|
@ -12347,17 +12338,44 @@ PyUnicode_IsIdentifier(PyObject *self)
|
|||
}
|
||||
|
||||
for (i = 1; i < len; i++) {
|
||||
if (ready) {
|
||||
ch = PyUnicode_READ(kind, data, i);
|
||||
}
|
||||
else {
|
||||
ch = wstr[i];
|
||||
}
|
||||
ch = PyUnicode_READ(kind, data, i);
|
||||
if (!_PyUnicode_IsXidContinue(ch)) {
|
||||
return 0;
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
return i;
|
||||
}
|
||||
|
||||
int
|
||||
PyUnicode_IsIdentifier(PyObject *self)
|
||||
{
|
||||
if (PyUnicode_IS_READY(self)) {
|
||||
Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
|
||||
Py_ssize_t len = PyUnicode_GET_LENGTH(self);
|
||||
/* an empty string is not a valid identifier */
|
||||
return len && i == len;
|
||||
}
|
||||
else {
|
||||
Py_ssize_t i, len = PyUnicode_GET_SIZE(self);
|
||||
if (len == 0) {
|
||||
/* an empty string is not a valid identifier */
|
||||
return 0;
|
||||
}
|
||||
|
||||
const wchar_t *wstr = _PyUnicode_WSTR(self);
|
||||
Py_UCS4 ch = wstr[0];
|
||||
if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
for (i = 1; i < len; i++) {
|
||||
ch = wstr[i];
|
||||
if (!_PyUnicode_IsXidContinue(ch)) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
/*[clinic input]
|
||||
|
|
|
@ -337,9 +337,6 @@ tokenizer_error(Parser *p)
|
|||
case E_TOKEN:
|
||||
msg = "invalid token";
|
||||
break;
|
||||
case E_IDENTIFIER:
|
||||
msg = "invalid character in identifier";
|
||||
break;
|
||||
case E_EOFS:
|
||||
RAISE_SYNTAX_ERROR("EOF while scanning triple-quoted string literal");
|
||||
return -1;
|
||||
|
|
|
@ -1101,25 +1101,53 @@ static int
|
|||
verify_identifier(struct tok_state *tok)
|
||||
{
|
||||
PyObject *s;
|
||||
int result;
|
||||
if (tok->decoding_erred)
|
||||
return 0;
|
||||
s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
|
||||
if (s == NULL) {
|
||||
if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
|
||||
PyErr_Clear();
|
||||
tok->done = E_IDENTIFIER;
|
||||
} else {
|
||||
tok->done = E_DECODE;
|
||||
}
|
||||
else {
|
||||
tok->done = E_ERROR;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
result = PyUnicode_IsIdentifier(s);
|
||||
Py_DECREF(s);
|
||||
if (result == 0) {
|
||||
tok->done = E_IDENTIFIER;
|
||||
Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
|
||||
if (invalid < 0) {
|
||||
Py_DECREF(s);
|
||||
tok->done = E_ERROR;
|
||||
return 0;
|
||||
}
|
||||
return result;
|
||||
assert(PyUnicode_GET_LENGTH(s) > 0);
|
||||
if (invalid < PyUnicode_GET_LENGTH(s)) {
|
||||
Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
|
||||
if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
|
||||
/* Determine the offset in UTF-8 encoded input */
|
||||
Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
|
||||
if (s != NULL) {
|
||||
Py_SETREF(s, PyUnicode_AsUTF8String(s));
|
||||
}
|
||||
if (s == NULL) {
|
||||
tok->done = E_ERROR;
|
||||
return 0;
|
||||
}
|
||||
tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
|
||||
}
|
||||
Py_DECREF(s);
|
||||
// PyUnicode_FromFormatV() does not support %X
|
||||
char hex[9];
|
||||
snprintf(hex, sizeof(hex), "%04X", ch);
|
||||
if (Py_UNICODE_ISPRINTABLE(ch)) {
|
||||
syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
|
||||
}
|
||||
else {
|
||||
syntaxerror(tok, "invalid non-printable character U+%s", hex);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
Py_DECREF(s);
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int
|
||||
|
|
|
@ -1603,9 +1603,6 @@ err_input(perrdetail *err)
|
|||
msg = "unexpected character after line continuation character";
|
||||
break;
|
||||
|
||||
case E_IDENTIFIER:
|
||||
msg = "invalid character in identifier";
|
||||
break;
|
||||
case E_BADSINGLE:
|
||||
msg = "multiple statements found while compiling a single statement";
|
||||
break;
|
||||
|
|
Loading…
Reference in New Issue