bpo-45461: Fix IncrementalDecoder and StreamReader in the "unicode-escape" codec (GH-28939)

They support now splitting escape sequences between input chunks. Add the third parameter "final" in codecs.unicode_escape_decode(). It is True by default to match the former behavior.
2021-10-14 13:17:00 +03:00 · 2021-10-14 13:17:00 +03:00 · c96d1546b1
parent e71662c1ae
commit c96d1546b1
8 changed files with 123 additions and 34 deletions
--- a/Include/cpython/unicodeobject.h
+++ b/Include/cpython/unicodeobject.h
@ -777,12 +777,20 @@ PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(

 /* --- Unicode-Escape Codecs ---------------------------------------------- */

-/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
-   chars. */
-PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscape(
+/* Variant of PyUnicode_DecodeUnicodeEscape that supports partial decoding. */
+PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeStateful(
        const char *string,     /* Unicode-Escape encoded string */
        Py_ssize_t length,      /* size of string */
        const char *errors,     /* error handling */
+        Py_ssize_t *consumed    /* bytes consumed */
+);
+/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
+   chars. */
+PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
+        const char *string,     /* Unicode-Escape encoded string */
+        Py_ssize_t length,      /* size of string */
+        const char *errors,     /* error handling */
+        Py_ssize_t *consumed,   /* bytes consumed */
        const char **first_invalid_escape  /* on return, points to first
                                              invalid escaped char in
                                              string. */
--- a/Lib/encodings/unicode_escape.py
+++ b/Lib/encodings/unicode_escape.py
@ -21,15 +21,16 @@ class IncrementalEncoder(codecs.IncrementalEncoder):
    def encode(self, input, final=False):
        return codecs.unicode_escape_encode(input, self.errors)[0]

-class IncrementalDecoder(codecs.IncrementalDecoder):
-    def decode(self, input, final=False):
-        return codecs.unicode_escape_decode(input, self.errors)[0]
+class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
+    def _buffer_decode(self, input, errors, final):
+        return codecs.unicode_escape_decode(input, errors, final)

 class StreamWriter(Codec,codecs.StreamWriter):
    pass

 class StreamReader(Codec,codecs.StreamReader):
-    pass
+    def decode(self, input, errors='strict'):
+        return codecs.unicode_escape_decode(input, errors, False)

 ### encodings module API

--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@ -114,7 +114,7 @@ class ReadTest(MixInCheckStateHandling):
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
-        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
+        for (c, partialresult) in zip(input.encode(self.encoding), partialresults, strict=True):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
@ -125,7 +125,7 @@ class ReadTest(MixInCheckStateHandling):
        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
-        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
+        for (c, partialresult) in zip(input.encode(self.encoding), partialresults, strict=True):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
@ -135,7 +135,7 @@ class ReadTest(MixInCheckStateHandling):
        # Check whether the reset method works properly
        d.reset()
        result = ""
-        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
+        for (c, partialresult) in zip(input.encode(self.encoding), partialresults, strict=True):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
@ -2353,7 +2353,11 @@ class TypesTest(unittest.TestCase):
                         (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))


-class UnicodeEscapeTest(unittest.TestCase):
+class UnicodeEscapeTest(ReadTest, unittest.TestCase):
+    encoding = "unicode-escape"
+
+    test_lone_surrogates = None
+
    def test_empty(self):
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))
@ -2440,6 +2444,44 @@ class UnicodeEscapeTest(unittest.TestCase):
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))

+    def test_partial(self):
+        self.check_partial(
+            "\x00\t\n\r\\\xff\uffff\U00010000",
+            [
+                '',
+                '',
+                '',
+                '\x00',
+                '\x00',
+                '\x00\t',
+                '\x00\t',
+                '\x00\t\n',
+                '\x00\t\n',
+                '\x00\t\n\r',
+                '\x00\t\n\r',
+                '\x00\t\n\r\\',
+                '\x00\t\n\r\\',
+                '\x00\t\n\r\\',
+                '\x00\t\n\r\\',
+                '\x00\t\n\r\\\xff',
+                '\x00\t\n\r\\\xff',
+                '\x00\t\n\r\\\xff',
+                '\x00\t\n\r\\\xff',
+                '\x00\t\n\r\\\xff',
+                '\x00\t\n\r\\\xff',
+                '\x00\t\n\r\\\xff\uffff',
+                '\x00\t\n\r\\\xff\uffff',
+                '\x00\t\n\r\\\xff\uffff',
+                '\x00\t\n\r\\\xff\uffff',
+                '\x00\t\n\r\\\xff\uffff',
+                '\x00\t\n\r\\\xff\uffff',
+                '\x00\t\n\r\\\xff\uffff',
+                '\x00\t\n\r\\\xff\uffff',
+                '\x00\t\n\r\\\xff\uffff',
+                '\x00\t\n\r\\\xff\uffff',
+                '\x00\t\n\r\\\xff\uffff\U00010000',
+            ]
+        )

 class RawUnicodeEscapeTest(unittest.TestCase):
    def test_empty(self):
--- a/Misc/NEWS.d/next/Library/2021-10-14-00-19-02.bpo-45461.4LB_tJ.rst
+++ b/Misc/NEWS.d/next/Library/2021-10-14-00-19-02.bpo-45461.4LB_tJ.rst
@ -0,0 +1,2 @@
+Fix incremental decoder and stream reader in the "unicode-escape" codec.
+Previously they failed if the escape sequence was split.
--- a/Modules/_codecsmodule.c
+++ b/Modules/_codecsmodule.c
@ -489,17 +489,20 @@ _codecs_utf_32_ex_decode_impl(PyObject *module, Py_buffer *data,
 _codecs.unicode_escape_decode
    data: Py_buffer(accept={str, buffer})
    errors: str(accept={str, NoneType}) = None
+    final: bool(accept={int}) = True
    /
 [clinic start generated code]*/

 static PyObject *
 _codecs_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
-                                   const char *errors)
-/*[clinic end generated code: output=3ca3c917176b82ab input=8328081a3a569bd6]*/
+                                   const char *errors, int final)
+/*[clinic end generated code: output=b284f97b12c635ee input=6154f039a9f7c639]*/
 {
-    PyObject *decoded = PyUnicode_DecodeUnicodeEscape(data->buf, data->len,
-                                                      errors);
-    return codec_tuple(decoded, data->len);
+    Py_ssize_t consumed = data->len;
+    PyObject *decoded = _PyUnicode_DecodeUnicodeEscapeStateful(data->buf, data->len,
+                                                               errors,
+                                                               final ? NULL : &consumed);
+    return codec_tuple(decoded, consumed);
 }

 /*[clinic input]
--- a/Modules/clinic/_codecsmodule.c.h
+++ b/Modules/clinic/_codecsmodule.c.h
@ -1063,7 +1063,7 @@ exit:
 }

 PyDoc_STRVAR(_codecs_unicode_escape_decode__doc__,
-"unicode_escape_decode($module, data, errors=None, /)\n"
+"unicode_escape_decode($module, data, errors=None, final=True, /)\n"
 "--\n"
 "\n");

@ -1072,7 +1072,7 @@ PyDoc_STRVAR(_codecs_unicode_escape_decode__doc__,

 static PyObject *
 _codecs_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
-                                   const char *errors);
+                                   const char *errors, int final);

 static PyObject *
 _codecs_unicode_escape_decode(PyObject *module, PyObject *const *args, Py_ssize_t nargs)
@ -1080,8 +1080,9 @@ _codecs_unicode_escape_decode(PyObject *module, PyObject *const *args, Py_ssize_
    PyObject *return_value = NULL;
    Py_buffer data = {NULL, NULL};
    const char *errors = NULL;
+    int final = 1;

-    if (!_PyArg_CheckPositional("unicode_escape_decode", nargs, 1, 2)) {
+    if (!_PyArg_CheckPositional("unicode_escape_decode", nargs, 1, 3)) {
        goto exit;
    }
    if (PyUnicode_Check(args[0])) {
@ -1122,8 +1123,15 @@ _codecs_unicode_escape_decode(PyObject *module, PyObject *const *args, Py_ssize_
        _PyArg_BadArgument("unicode_escape_decode", "argument 2", "str or None", args[1]);
        goto exit;
    }
+    if (nargs < 3) {
+        goto skip_optional;
+    }
+    final = _PyLong_AsInt(args[2]);
+    if (final == -1 && PyErr_Occurred()) {
+        goto exit;
+    }
 skip_optional:
-    return_value = _codecs_unicode_escape_decode_impl(module, &data, errors);
+    return_value = _codecs_unicode_escape_decode_impl(module, &data, errors, final);

 exit:
    /* Cleanup for data */
@ -2801,4 +2809,4 @@ exit:
 #ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF
    #define _CODECS_CODE_PAGE_ENCODE_METHODDEF
 #endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */
-/*[clinic end generated code: output=557c3b37e4c492ac input=a9049054013a1b77]*/
+/*[clinic end generated code: output=9e9fb1d5d81577e0 input=a9049054013a1b77]*/
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -6342,9 +6342,10 @@ PyUnicode_AsUTF16String(PyObject *unicode)
 static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;

 PyObject *
-_PyUnicode_DecodeUnicodeEscape(const char *s,
+_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
                               Py_ssize_t size,
                               const char *errors,
+                               Py_ssize_t *consumed,
                               const char **first_invalid_escape)
 {
    const char *starts = s;
@ -6357,6 +6358,9 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
    *first_invalid_escape = NULL;

    if (size == 0) {
+        if (consumed) {
+            *consumed = 0;
+        }
        _Py_RETURN_UNICODE_EMPTY();
    }
    /* Escaped strings will always be longer than the resulting
@ -6407,7 +6411,7 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
        /* \ - Escapes */
        if (s >= end) {
            message = "\\ at end of string";
-            goto error;
+            goto incomplete;
        }
        c = (unsigned char) *s++;

@ -6461,7 +6465,10 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
            count = 8;
            message = "truncated \\UXXXXXXXX escape";
        hexescape:
-            for (ch = 0; count && s < end; ++s, --count) {
+            for (ch = 0; count; ++s, --count) {
+                if (s >= end) {
+                    goto incomplete;
+                }
                c = (unsigned char)*s;
                ch <<= 4;
                if (c >= '0' && c <= '9') {
@ -6474,12 +6481,9 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
                    ch += c - ('A' - 10);
                }
                else {
-                    break;
+                    goto error;
                }
            }
-            if (count) {
-                goto error;
-            }

            /* when we get here, ch is a 32-bit unicode character */
            if (ch > MAX_UNICODE) {
@ -6506,14 +6510,20 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
            }

            message = "malformed \\N character escape";
-            if (s < end && *s == '{') {
+            if (s >= end) {
+                goto incomplete;
+            }
+            if (*s == '{') {
                const char *start = ++s;
                size_t namelen;
                /* look for the closing brace */
                while (s < end && *s != '}')
                    s++;
+                if (s >= end) {
+                    goto incomplete;
+                }
                namelen = s - start;
-                if (namelen && s < end) {
+                if (namelen) {
                    /* found a name.  look it up in the unicode database */
                    s++;
                    ch = 0xffffffff; /* in case 'getcode' messes up */
@ -6539,6 +6549,11 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
            continue;
        }

+      incomplete:
+        if (consumed) {
+            *consumed = startinpos;
+            break;
+        }
      error:
        endinpos = s-starts;
        writer.min_length = end - s + writer.pos;
@ -6567,12 +6582,14 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
 }

 PyObject *
-PyUnicode_DecodeUnicodeEscape(const char *s,
+_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
                              Py_ssize_t size,
-                              const char *errors)
+                              const char *errors,
+                              Py_ssize_t *consumed)
 {
    const char *first_invalid_escape;
-    PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
+    PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
+                                                      consumed,
                                                      &first_invalid_escape);
    if (result == NULL)
        return NULL;
@ -6587,6 +6604,14 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
    return result;
 }

+PyObject *
+PyUnicode_DecodeUnicodeEscape(const char *s,
+                              Py_ssize_t size,
+                              const char *errors)
+{
+    return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
+}
+
 /* Return a Unicode-Escape string version of the Unicode object. */

 PyObject *
--- a/Parser/string_parser.c
+++ b/Parser/string_parser.c
@ -115,7 +115,7 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
    s = buf;

    const char *first_invalid_escape;
-    v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);
+    v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape);

    if (v != NULL && first_invalid_escape != NULL) {
        if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {