From 39aa98346d5dd8ac591a7cafb467af21c53f1e5d Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 14 Oct 2021 20:04:19 +0300 Subject: [PATCH] bpo-45467: Fix IncrementalDecoder and StreamReader in the "raw-unicode-escape" codec (GH-28944) They support now splitting escape sequences between input chunks. Add the third parameter "final" in codecs.raw_unicode_escape_decode(). It is True by default to match the former behavior. --- Include/cpython/unicodeobject.h | 10 +++ Lib/encodings/raw_unicode_escape.py | 9 +-- Lib/test/test_codecs.py | 35 +++++++++- .../2021-10-14-13-31-19.bpo-45467.Q7Ma6A.rst | 2 + Modules/_codecsmodule.c | 13 ++-- Modules/clinic/_codecsmodule.c.h | 18 ++++-- Objects/unicodeobject.c | 64 +++++++++++++------ 7 files changed, 116 insertions(+), 35 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2021-10-14-13-31-19.bpo-45467.Q7Ma6A.rst diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index bc5a3b4bd0b..ab4aebf5e70 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -796,6 +796,16 @@ PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal( string. */ ); +/* --- Raw-Unicode-Escape Codecs ---------------------------------------------- */ + +/* Variant of PyUnicode_DecodeRawUnicodeEscape that supports partial decoding. */ +PyAPI_FUNC(PyObject*) _PyUnicode_DecodeRawUnicodeEscapeStateful( + const char *string, /* Unicode-Escape encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors, /* error handling */ + Py_ssize_t *consumed /* bytes consumed */ +); + /* --- Latin-1 Codecs ----------------------------------------------------- */ PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String( diff --git a/Lib/encodings/raw_unicode_escape.py b/Lib/encodings/raw_unicode_escape.py index 2b919b40d37..46c8e070dd1 100644 --- a/Lib/encodings/raw_unicode_escape.py +++ b/Lib/encodings/raw_unicode_escape.py @@ -21,15 +21,16 @@ class IncrementalEncoder(codecs.IncrementalEncoder): def encode(self, input, final=False): return codecs.raw_unicode_escape_encode(input, self.errors)[0] -class IncrementalDecoder(codecs.IncrementalDecoder): - def decode(self, input, final=False): - return codecs.raw_unicode_escape_decode(input, self.errors)[0] +class IncrementalDecoder(codecs.BufferedIncrementalDecoder): + def _buffer_decode(self, input, errors, final): + return codecs.raw_unicode_escape_decode(input, errors, final) class StreamWriter(Codec,codecs.StreamWriter): pass class StreamReader(Codec,codecs.StreamReader): - pass + def decode(self, input, errors='strict'): + return codecs.raw_unicode_escape_decode(input, errors, False) ### encodings module API diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 288a3006cde..506b51c428f 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -2483,7 +2483,11 @@ class UnicodeEscapeTest(ReadTest, unittest.TestCase): ] ) -class RawUnicodeEscapeTest(unittest.TestCase): +class RawUnicodeEscapeTest(ReadTest, unittest.TestCase): + encoding = "raw-unicode-escape" + + test_lone_surrogates = None + def test_empty(self): self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0)) self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0)) @@ -2532,6 +2536,35 @@ class RawUnicodeEscapeTest(unittest.TestCase): self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10)) self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10)) + def test_partial(self): + self.check_partial( + "\x00\t\n\r\\\xff\uffff\U00010000", + [ + '\x00', + '\x00\t', + '\x00\t\n', + '\x00\t\n\r', + '\x00\t\n\r', + '\x00\t\n\r\\\xff', + '\x00\t\n\r\\\xff', + '\x00\t\n\r\\\xff', + '\x00\t\n\r\\\xff', + '\x00\t\n\r\\\xff', + '\x00\t\n\r\\\xff', + '\x00\t\n\r\\\xff\uffff', + '\x00\t\n\r\\\xff\uffff', + '\x00\t\n\r\\\xff\uffff', + '\x00\t\n\r\\\xff\uffff', + '\x00\t\n\r\\\xff\uffff', + '\x00\t\n\r\\\xff\uffff', + '\x00\t\n\r\\\xff\uffff', + '\x00\t\n\r\\\xff\uffff', + '\x00\t\n\r\\\xff\uffff', + '\x00\t\n\r\\\xff\uffff', + '\x00\t\n\r\\\xff\uffff\U00010000', + ] + ) + class EscapeEncodeTest(unittest.TestCase): diff --git a/Misc/NEWS.d/next/Library/2021-10-14-13-31-19.bpo-45467.Q7Ma6A.rst b/Misc/NEWS.d/next/Library/2021-10-14-13-31-19.bpo-45467.Q7Ma6A.rst new file mode 100644 index 00000000000..f2c0ae4ae51 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2021-10-14-13-31-19.bpo-45467.Q7Ma6A.rst @@ -0,0 +1,2 @@ +Fix incremental decoder and stream reader in the "raw-unicode-escape" codec. +Previously they failed if the escape sequence was split. diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c index fc74127ce56..50afc097b35 100644 --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -509,17 +509,20 @@ _codecs_unicode_escape_decode_impl(PyObject *module, Py_buffer *data, _codecs.raw_unicode_escape_decode data: Py_buffer(accept={str, buffer}) errors: str(accept={str, NoneType}) = None + final: bool(accept={int}) = True / [clinic start generated code]*/ static PyObject * _codecs_raw_unicode_escape_decode_impl(PyObject *module, Py_buffer *data, - const char *errors) -/*[clinic end generated code: output=c98eeb56028070a6 input=d2f5159ce3b3392f]*/ + const char *errors, int final) +/*[clinic end generated code: output=11dbd96301e2879e input=2d166191beb3235a]*/ { - PyObject *decoded = PyUnicode_DecodeRawUnicodeEscape(data->buf, data->len, - errors); - return codec_tuple(decoded, data->len); + Py_ssize_t consumed = data->len; + PyObject *decoded = _PyUnicode_DecodeRawUnicodeEscapeStateful(data->buf, data->len, + errors, + final ? NULL : &consumed); + return codec_tuple(decoded, consumed); } /*[clinic input] diff --git a/Modules/clinic/_codecsmodule.c.h b/Modules/clinic/_codecsmodule.c.h index a7086dd6e18..855ac77a7f7 100644 --- a/Modules/clinic/_codecsmodule.c.h +++ b/Modules/clinic/_codecsmodule.c.h @@ -1143,7 +1143,7 @@ exit: } PyDoc_STRVAR(_codecs_raw_unicode_escape_decode__doc__, -"raw_unicode_escape_decode($module, data, errors=None, /)\n" +"raw_unicode_escape_decode($module, data, errors=None, final=True, /)\n" "--\n" "\n"); @@ -1152,7 +1152,7 @@ PyDoc_STRVAR(_codecs_raw_unicode_escape_decode__doc__, static PyObject * _codecs_raw_unicode_escape_decode_impl(PyObject *module, Py_buffer *data, - const char *errors); + const char *errors, int final); static PyObject * _codecs_raw_unicode_escape_decode(PyObject *module, PyObject *const *args, Py_ssize_t nargs) @@ -1160,8 +1160,9 @@ _codecs_raw_unicode_escape_decode(PyObject *module, PyObject *const *args, Py_ss PyObject *return_value = NULL; Py_buffer data = {NULL, NULL}; const char *errors = NULL; + int final = 1; - if (!_PyArg_CheckPositional("raw_unicode_escape_decode", nargs, 1, 2)) { + if (!_PyArg_CheckPositional("raw_unicode_escape_decode", nargs, 1, 3)) { goto exit; } if (PyUnicode_Check(args[0])) { @@ -1202,8 +1203,15 @@ _codecs_raw_unicode_escape_decode(PyObject *module, PyObject *const *args, Py_ss _PyArg_BadArgument("raw_unicode_escape_decode", "argument 2", "str or None", args[1]); goto exit; } + if (nargs < 3) { + goto skip_optional; + } + final = _PyLong_AsInt(args[2]); + if (final == -1 && PyErr_Occurred()) { + goto exit; + } skip_optional: - return_value = _codecs_raw_unicode_escape_decode_impl(module, &data, errors); + return_value = _codecs_raw_unicode_escape_decode_impl(module, &data, errors, final); exit: /* Cleanup for data */ @@ -2809,4 +2817,4 @@ exit: #ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF #define _CODECS_CODE_PAGE_ENCODE_METHODDEF #endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */ -/*[clinic end generated code: output=9e9fb1d5d81577e0 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=814dae36b6f885cb input=a9049054013a1b77]*/ diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index af3b3335d60..386052f31be 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -6379,8 +6379,6 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s, unsigned char c = (unsigned char) *s++; Py_UCS4 ch; int count; - Py_ssize_t startinpos; - Py_ssize_t endinpos; const char *message; #define WRITE_ASCII_CHAR(ch) \ @@ -6407,7 +6405,7 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s, continue; } - startinpos = s - starts - 1; + Py_ssize_t startinpos = s - starts - 1; /* \ - Escapes */ if (s >= end) { message = "\\ at end of string"; @@ -6554,8 +6552,8 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s, *consumed = startinpos; break; } - error: - endinpos = s-starts; + error:; + Py_ssize_t endinpos = s-starts; writer.min_length = end - s + writer.pos; if (unicode_decode_call_errorhandler_writer( errors, &errorHandler, @@ -6735,9 +6733,10 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode) /* --- Raw Unicode Escape Codec ------------------------------------------- */ PyObject * -PyUnicode_DecodeRawUnicodeEscape(const char *s, - Py_ssize_t size, - const char *errors) +_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s, + Py_ssize_t size, + const char *errors, + Py_ssize_t *consumed) { const char *starts = s; _PyUnicodeWriter writer; @@ -6746,6 +6745,9 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, PyObject *exc = NULL; if (size == 0) { + if (consumed) { + *consumed = 0; + } _Py_RETURN_UNICODE_EMPTY(); } @@ -6764,8 +6766,6 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, unsigned char c = (unsigned char) *s++; Py_UCS4 ch; int count; - Py_ssize_t startinpos; - Py_ssize_t endinpos; const char *message; #define WRITE_CHAR(ch) \ @@ -6780,11 +6780,21 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, } while(0) /* Non-escape characters are interpreted as Unicode ordinals */ - if (c != '\\' || s >= end) { + if (c != '\\' || (s >= end && !consumed)) { WRITE_CHAR(c); continue; } + Py_ssize_t startinpos = s - starts - 1; + /* \ - Escapes */ + if (s >= end) { + assert(consumed); + // Set message to silent compiler warning. + // Actually it is never used. + message = "\\ at end of string"; + goto incomplete; + } + c = (unsigned char) *s++; if (c == 'u') { count = 4; @@ -6800,10 +6810,12 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, WRITE_CHAR(c); continue; } - startinpos = s - starts - 2; /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */ - for (ch = 0; count && s < end; ++s, --count) { + for (ch = 0; count; ++s, --count) { + if (s >= end) { + goto incomplete; + } c = (unsigned char)*s; ch <<= 4; if (c >= '0' && c <= '9') { @@ -6816,18 +6828,23 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, ch += c - ('A' - 10); } else { - break; + goto error; } } - if (!count) { - if (ch <= MAX_UNICODE) { - WRITE_CHAR(ch); - continue; - } + if (ch > MAX_UNICODE) { message = "\\Uxxxxxxxx out of range"; + goto error; } + WRITE_CHAR(ch); + continue; - endinpos = s-starts; + incomplete: + if (consumed) { + *consumed = startinpos; + break; + } + error:; + Py_ssize_t endinpos = s-starts; writer.min_length = end - s + writer.pos; if (unicode_decode_call_errorhandler_writer( errors, &errorHandler, @@ -6849,7 +6866,14 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, Py_XDECREF(errorHandler); Py_XDECREF(exc); return NULL; +} +PyObject * +PyUnicode_DecodeRawUnicodeEscape(const char *s, + Py_ssize_t size, + const char *errors) +{ + return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL); }