mirror of https://github.com/python/cpython
bpo-45461: Fix IncrementalDecoder and StreamReader in the "unicode-escape" codec (GH-28939)
They now support escape sequences that are split between input chunks. Add a third parameter, "final", to codecs.unicode_escape_decode(). It defaults to True to match the former behavior.
This commit is contained in:
parent
e71662c1ae
commit
c96d1546b1
|
@ -777,12 +777,20 @@ PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
|
||||||
|
|
||||||
/* --- Unicode-Escape Codecs ---------------------------------------------- */
|
/* --- Unicode-Escape Codecs ---------------------------------------------- */
|
||||||
|
|
||||||
/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
|
/* Variant of PyUnicode_DecodeUnicodeEscape that supports partial decoding. */
|
||||||
chars. */
|
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeStateful(
|
||||||
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscape(
|
|
||||||
const char *string, /* Unicode-Escape encoded string */
|
const char *string, /* Unicode-Escape encoded string */
|
||||||
Py_ssize_t length, /* size of string */
|
Py_ssize_t length, /* size of string */
|
||||||
const char *errors, /* error handling */
|
const char *errors, /* error handling */
|
||||||
|
Py_ssize_t *consumed /* bytes consumed */
|
||||||
|
);
|
||||||
|
/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
|
||||||
|
chars. */
|
||||||
|
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
|
||||||
|
const char *string, /* Unicode-Escape encoded string */
|
||||||
|
Py_ssize_t length, /* size of string */
|
||||||
|
const char *errors, /* error handling */
|
||||||
|
Py_ssize_t *consumed, /* bytes consumed */
|
||||||
const char **first_invalid_escape /* on return, points to first
|
const char **first_invalid_escape /* on return, points to first
|
||||||
invalid escaped char in
|
invalid escaped char in
|
||||||
string. */
|
string. */
|
||||||
|
|
|
@ -21,15 +21,16 @@ class IncrementalEncoder(codecs.IncrementalEncoder):
|
||||||
def encode(self, input, final=False):
|
def encode(self, input, final=False):
|
||||||
return codecs.unicode_escape_encode(input, self.errors)[0]
|
return codecs.unicode_escape_encode(input, self.errors)[0]
|
||||||
|
|
||||||
class IncrementalDecoder(codecs.IncrementalDecoder):
|
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
|
||||||
def decode(self, input, final=False):
|
def _buffer_decode(self, input, errors, final):
|
||||||
return codecs.unicode_escape_decode(input, self.errors)[0]
|
return codecs.unicode_escape_decode(input, errors, final)
|
||||||
|
|
||||||
class StreamWriter(Codec,codecs.StreamWriter):
|
class StreamWriter(Codec,codecs.StreamWriter):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
class StreamReader(Codec,codecs.StreamReader):
|
class StreamReader(Codec,codecs.StreamReader):
|
||||||
pass
|
def decode(self, input, errors='strict'):
|
||||||
|
return codecs.unicode_escape_decode(input, errors, False)
|
||||||
|
|
||||||
### encodings module API
|
### encodings module API
|
||||||
|
|
||||||
|
|
|
@ -114,7 +114,7 @@ class ReadTest(MixInCheckStateHandling):
|
||||||
q = Queue(b"")
|
q = Queue(b"")
|
||||||
r = codecs.getreader(self.encoding)(q)
|
r = codecs.getreader(self.encoding)(q)
|
||||||
result = ""
|
result = ""
|
||||||
for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
|
for (c, partialresult) in zip(input.encode(self.encoding), partialresults, strict=True):
|
||||||
q.write(bytes([c]))
|
q.write(bytes([c]))
|
||||||
result += r.read()
|
result += r.read()
|
||||||
self.assertEqual(result, partialresult)
|
self.assertEqual(result, partialresult)
|
||||||
|
@ -125,7 +125,7 @@ class ReadTest(MixInCheckStateHandling):
|
||||||
# do the check again, this time using an incremental decoder
|
# do the check again, this time using an incremental decoder
|
||||||
d = codecs.getincrementaldecoder(self.encoding)()
|
d = codecs.getincrementaldecoder(self.encoding)()
|
||||||
result = ""
|
result = ""
|
||||||
for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
|
for (c, partialresult) in zip(input.encode(self.encoding), partialresults, strict=True):
|
||||||
result += d.decode(bytes([c]))
|
result += d.decode(bytes([c]))
|
||||||
self.assertEqual(result, partialresult)
|
self.assertEqual(result, partialresult)
|
||||||
# check that there's nothing left in the buffers
|
# check that there's nothing left in the buffers
|
||||||
|
@ -135,7 +135,7 @@ class ReadTest(MixInCheckStateHandling):
|
||||||
# Check whether the reset method works properly
|
# Check whether the reset method works properly
|
||||||
d.reset()
|
d.reset()
|
||||||
result = ""
|
result = ""
|
||||||
for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
|
for (c, partialresult) in zip(input.encode(self.encoding), partialresults, strict=True):
|
||||||
result += d.decode(bytes([c]))
|
result += d.decode(bytes([c]))
|
||||||
self.assertEqual(result, partialresult)
|
self.assertEqual(result, partialresult)
|
||||||
# check that there's nothing left in the buffers
|
# check that there's nothing left in the buffers
|
||||||
|
@ -2353,7 +2353,11 @@ class TypesTest(unittest.TestCase):
|
||||||
(r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
|
(r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
|
||||||
|
|
||||||
|
|
||||||
class UnicodeEscapeTest(unittest.TestCase):
|
class UnicodeEscapeTest(ReadTest, unittest.TestCase):
|
||||||
|
encoding = "unicode-escape"
|
||||||
|
|
||||||
|
test_lone_surrogates = None
|
||||||
|
|
||||||
def test_empty(self):
|
def test_empty(self):
|
||||||
self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
|
self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
|
||||||
self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))
|
self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))
|
||||||
|
@ -2440,6 +2444,44 @@ class UnicodeEscapeTest(unittest.TestCase):
|
||||||
self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
|
self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
|
||||||
self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
|
self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
|
||||||
|
|
||||||
|
def test_partial(self):
|
||||||
|
self.check_partial(
|
||||||
|
"\x00\t\n\r\\\xff\uffff\U00010000",
|
||||||
|
[
|
||||||
|
'',
|
||||||
|
'',
|
||||||
|
'',
|
||||||
|
'\x00',
|
||||||
|
'\x00',
|
||||||
|
'\x00\t',
|
||||||
|
'\x00\t',
|
||||||
|
'\x00\t\n',
|
||||||
|
'\x00\t\n',
|
||||||
|
'\x00\t\n\r',
|
||||||
|
'\x00\t\n\r',
|
||||||
|
'\x00\t\n\r\\',
|
||||||
|
'\x00\t\n\r\\',
|
||||||
|
'\x00\t\n\r\\',
|
||||||
|
'\x00\t\n\r\\',
|
||||||
|
'\x00\t\n\r\\\xff',
|
||||||
|
'\x00\t\n\r\\\xff',
|
||||||
|
'\x00\t\n\r\\\xff',
|
||||||
|
'\x00\t\n\r\\\xff',
|
||||||
|
'\x00\t\n\r\\\xff',
|
||||||
|
'\x00\t\n\r\\\xff',
|
||||||
|
'\x00\t\n\r\\\xff\uffff',
|
||||||
|
'\x00\t\n\r\\\xff\uffff',
|
||||||
|
'\x00\t\n\r\\\xff\uffff',
|
||||||
|
'\x00\t\n\r\\\xff\uffff',
|
||||||
|
'\x00\t\n\r\\\xff\uffff',
|
||||||
|
'\x00\t\n\r\\\xff\uffff',
|
||||||
|
'\x00\t\n\r\\\xff\uffff',
|
||||||
|
'\x00\t\n\r\\\xff\uffff',
|
||||||
|
'\x00\t\n\r\\\xff\uffff',
|
||||||
|
'\x00\t\n\r\\\xff\uffff',
|
||||||
|
'\x00\t\n\r\\\xff\uffff\U00010000',
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
class RawUnicodeEscapeTest(unittest.TestCase):
|
class RawUnicodeEscapeTest(unittest.TestCase):
|
||||||
def test_empty(self):
|
def test_empty(self):
|
||||||
|
|
|
@ -0,0 +1,2 @@
|
||||||
|
Fix incremental decoder and stream reader in the "unicode-escape" codec.
|
||||||
|
Previously they failed if an escape sequence was split between input chunks.
|
|
@ -489,17 +489,20 @@ _codecs_utf_32_ex_decode_impl(PyObject *module, Py_buffer *data,
|
||||||
_codecs.unicode_escape_decode
|
_codecs.unicode_escape_decode
|
||||||
data: Py_buffer(accept={str, buffer})
|
data: Py_buffer(accept={str, buffer})
|
||||||
errors: str(accept={str, NoneType}) = None
|
errors: str(accept={str, NoneType}) = None
|
||||||
|
final: bool(accept={int}) = True
|
||||||
/
|
/
|
||||||
[clinic start generated code]*/
|
[clinic start generated code]*/
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
_codecs_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
|
_codecs_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
|
||||||
const char *errors)
|
const char *errors, int final)
|
||||||
/*[clinic end generated code: output=3ca3c917176b82ab input=8328081a3a569bd6]*/
|
/*[clinic end generated code: output=b284f97b12c635ee input=6154f039a9f7c639]*/
|
||||||
{
|
{
|
||||||
PyObject *decoded = PyUnicode_DecodeUnicodeEscape(data->buf, data->len,
|
Py_ssize_t consumed = data->len;
|
||||||
errors);
|
PyObject *decoded = _PyUnicode_DecodeUnicodeEscapeStateful(data->buf, data->len,
|
||||||
return codec_tuple(decoded, data->len);
|
errors,
|
||||||
|
final ? NULL : &consumed);
|
||||||
|
return codec_tuple(decoded, consumed);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*[clinic input]
|
/*[clinic input]
|
||||||
|
|
|
@ -1063,7 +1063,7 @@ exit:
|
||||||
}
|
}
|
||||||
|
|
||||||
PyDoc_STRVAR(_codecs_unicode_escape_decode__doc__,
|
PyDoc_STRVAR(_codecs_unicode_escape_decode__doc__,
|
||||||
"unicode_escape_decode($module, data, errors=None, /)\n"
|
"unicode_escape_decode($module, data, errors=None, final=True, /)\n"
|
||||||
"--\n"
|
"--\n"
|
||||||
"\n");
|
"\n");
|
||||||
|
|
||||||
|
@ -1072,7 +1072,7 @@ PyDoc_STRVAR(_codecs_unicode_escape_decode__doc__,
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
_codecs_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
|
_codecs_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
|
||||||
const char *errors);
|
const char *errors, int final);
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
_codecs_unicode_escape_decode(PyObject *module, PyObject *const *args, Py_ssize_t nargs)
|
_codecs_unicode_escape_decode(PyObject *module, PyObject *const *args, Py_ssize_t nargs)
|
||||||
|
@ -1080,8 +1080,9 @@ _codecs_unicode_escape_decode(PyObject *module, PyObject *const *args, Py_ssize_
|
||||||
PyObject *return_value = NULL;
|
PyObject *return_value = NULL;
|
||||||
Py_buffer data = {NULL, NULL};
|
Py_buffer data = {NULL, NULL};
|
||||||
const char *errors = NULL;
|
const char *errors = NULL;
|
||||||
|
int final = 1;
|
||||||
|
|
||||||
if (!_PyArg_CheckPositional("unicode_escape_decode", nargs, 1, 2)) {
|
if (!_PyArg_CheckPositional("unicode_escape_decode", nargs, 1, 3)) {
|
||||||
goto exit;
|
goto exit;
|
||||||
}
|
}
|
||||||
if (PyUnicode_Check(args[0])) {
|
if (PyUnicode_Check(args[0])) {
|
||||||
|
@ -1122,8 +1123,15 @@ _codecs_unicode_escape_decode(PyObject *module, PyObject *const *args, Py_ssize_
|
||||||
_PyArg_BadArgument("unicode_escape_decode", "argument 2", "str or None", args[1]);
|
_PyArg_BadArgument("unicode_escape_decode", "argument 2", "str or None", args[1]);
|
||||||
goto exit;
|
goto exit;
|
||||||
}
|
}
|
||||||
|
if (nargs < 3) {
|
||||||
|
goto skip_optional;
|
||||||
|
}
|
||||||
|
final = _PyLong_AsInt(args[2]);
|
||||||
|
if (final == -1 && PyErr_Occurred()) {
|
||||||
|
goto exit;
|
||||||
|
}
|
||||||
skip_optional:
|
skip_optional:
|
||||||
return_value = _codecs_unicode_escape_decode_impl(module, &data, errors);
|
return_value = _codecs_unicode_escape_decode_impl(module, &data, errors, final);
|
||||||
|
|
||||||
exit:
|
exit:
|
||||||
/* Cleanup for data */
|
/* Cleanup for data */
|
||||||
|
@ -2801,4 +2809,4 @@ exit:
|
||||||
#ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF
|
#ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF
|
||||||
#define _CODECS_CODE_PAGE_ENCODE_METHODDEF
|
#define _CODECS_CODE_PAGE_ENCODE_METHODDEF
|
||||||
#endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */
|
#endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */
|
||||||
/*[clinic end generated code: output=557c3b37e4c492ac input=a9049054013a1b77]*/
|
/*[clinic end generated code: output=9e9fb1d5d81577e0 input=a9049054013a1b77]*/
|
||||||
|
|
|
@ -6342,9 +6342,10 @@ PyUnicode_AsUTF16String(PyObject *unicode)
|
||||||
static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
|
static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
|
||||||
|
|
||||||
PyObject *
|
PyObject *
|
||||||
_PyUnicode_DecodeUnicodeEscape(const char *s,
|
_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
|
||||||
Py_ssize_t size,
|
Py_ssize_t size,
|
||||||
const char *errors,
|
const char *errors,
|
||||||
|
Py_ssize_t *consumed,
|
||||||
const char **first_invalid_escape)
|
const char **first_invalid_escape)
|
||||||
{
|
{
|
||||||
const char *starts = s;
|
const char *starts = s;
|
||||||
|
@ -6357,6 +6358,9 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
|
||||||
*first_invalid_escape = NULL;
|
*first_invalid_escape = NULL;
|
||||||
|
|
||||||
if (size == 0) {
|
if (size == 0) {
|
||||||
|
if (consumed) {
|
||||||
|
*consumed = 0;
|
||||||
|
}
|
||||||
_Py_RETURN_UNICODE_EMPTY();
|
_Py_RETURN_UNICODE_EMPTY();
|
||||||
}
|
}
|
||||||
/* Escaped strings will always be longer than the resulting
|
/* Escaped strings will always be longer than the resulting
|
||||||
|
@ -6407,7 +6411,7 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
|
||||||
/* \ - Escapes */
|
/* \ - Escapes */
|
||||||
if (s >= end) {
|
if (s >= end) {
|
||||||
message = "\\ at end of string";
|
message = "\\ at end of string";
|
||||||
goto error;
|
goto incomplete;
|
||||||
}
|
}
|
||||||
c = (unsigned char) *s++;
|
c = (unsigned char) *s++;
|
||||||
|
|
||||||
|
@ -6461,7 +6465,10 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
|
||||||
count = 8;
|
count = 8;
|
||||||
message = "truncated \\UXXXXXXXX escape";
|
message = "truncated \\UXXXXXXXX escape";
|
||||||
hexescape:
|
hexescape:
|
||||||
for (ch = 0; count && s < end; ++s, --count) {
|
for (ch = 0; count; ++s, --count) {
|
||||||
|
if (s >= end) {
|
||||||
|
goto incomplete;
|
||||||
|
}
|
||||||
c = (unsigned char)*s;
|
c = (unsigned char)*s;
|
||||||
ch <<= 4;
|
ch <<= 4;
|
||||||
if (c >= '0' && c <= '9') {
|
if (c >= '0' && c <= '9') {
|
||||||
|
@ -6474,12 +6481,9 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
|
||||||
ch += c - ('A' - 10);
|
ch += c - ('A' - 10);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (count) {
|
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* when we get here, ch is a 32-bit unicode character */
|
/* when we get here, ch is a 32-bit unicode character */
|
||||||
if (ch > MAX_UNICODE) {
|
if (ch > MAX_UNICODE) {
|
||||||
|
@ -6506,14 +6510,20 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
|
||||||
}
|
}
|
||||||
|
|
||||||
message = "malformed \\N character escape";
|
message = "malformed \\N character escape";
|
||||||
if (s < end && *s == '{') {
|
if (s >= end) {
|
||||||
|
goto incomplete;
|
||||||
|
}
|
||||||
|
if (*s == '{') {
|
||||||
const char *start = ++s;
|
const char *start = ++s;
|
||||||
size_t namelen;
|
size_t namelen;
|
||||||
/* look for the closing brace */
|
/* look for the closing brace */
|
||||||
while (s < end && *s != '}')
|
while (s < end && *s != '}')
|
||||||
s++;
|
s++;
|
||||||
|
if (s >= end) {
|
||||||
|
goto incomplete;
|
||||||
|
}
|
||||||
namelen = s - start;
|
namelen = s - start;
|
||||||
if (namelen && s < end) {
|
if (namelen) {
|
||||||
/* found a name. look it up in the unicode database */
|
/* found a name. look it up in the unicode database */
|
||||||
s++;
|
s++;
|
||||||
ch = 0xffffffff; /* in case 'getcode' messes up */
|
ch = 0xffffffff; /* in case 'getcode' messes up */
|
||||||
|
@ -6539,6 +6549,11 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
incomplete:
|
||||||
|
if (consumed) {
|
||||||
|
*consumed = startinpos;
|
||||||
|
break;
|
||||||
|
}
|
||||||
error:
|
error:
|
||||||
endinpos = s-starts;
|
endinpos = s-starts;
|
||||||
writer.min_length = end - s + writer.pos;
|
writer.min_length = end - s + writer.pos;
|
||||||
|
@ -6567,12 +6582,14 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
|
||||||
}
|
}
|
||||||
|
|
||||||
PyObject *
|
PyObject *
|
||||||
PyUnicode_DecodeUnicodeEscape(const char *s,
|
_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
|
||||||
Py_ssize_t size,
|
Py_ssize_t size,
|
||||||
const char *errors)
|
const char *errors,
|
||||||
|
Py_ssize_t *consumed)
|
||||||
{
|
{
|
||||||
const char *first_invalid_escape;
|
const char *first_invalid_escape;
|
||||||
PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
|
PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
|
||||||
|
consumed,
|
||||||
&first_invalid_escape);
|
&first_invalid_escape);
|
||||||
if (result == NULL)
|
if (result == NULL)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
@ -6587,6 +6604,14 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
PyObject *
|
||||||
|
PyUnicode_DecodeUnicodeEscape(const char *s,
|
||||||
|
Py_ssize_t size,
|
||||||
|
const char *errors)
|
||||||
|
{
|
||||||
|
return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
/* Return a Unicode-Escape string version of the Unicode object. */
|
/* Return a Unicode-Escape string version of the Unicode object. */
|
||||||
|
|
||||||
PyObject *
|
PyObject *
|
||||||
|
|
|
@ -115,7 +115,7 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
|
||||||
s = buf;
|
s = buf;
|
||||||
|
|
||||||
const char *first_invalid_escape;
|
const char *first_invalid_escape;
|
||||||
v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);
|
v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape);
|
||||||
|
|
||||||
if (v != NULL && first_invalid_escape != NULL) {
|
if (v != NULL && first_invalid_escape != NULL) {
|
||||||
if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
|
if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
|
||||||
|
|
Loading…
Reference in New Issue