From 1d65d9192dac57776693c55a9ccefbde2ca74c23 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 5 Oct 2015 13:43:50 +0200 Subject: [PATCH] Issue #25301: The UTF-8 decoder is now up to 15 times as fast for error handlers: ``ignore``, ``replace`` and ``surrogateescape``. --- Doc/whatsnew/3.6.rst | 3 +++ Lib/test/test_codecs.py | 12 +++++++++++ Misc/NEWS | 3 +++ Objects/unicodeobject.c | 48 +++++++++++++++++++++++++++++++++-------- 4 files changed, 57 insertions(+), 9 deletions(-) diff --git a/Doc/whatsnew/3.6.rst b/Doc/whatsnew/3.6.rst index ca83ef91a06..24fd822c7b0 100644 --- a/Doc/whatsnew/3.6.rst +++ b/Doc/whatsnew/3.6.rst @@ -123,6 +123,9 @@ Optimizations * The UTF-8 encoder is now up to 75 times as fast for error handlers: ``ignore``, ``replace``, ``surrogateescape``, ``surrogatepass``. +* The UTF-8 decoder is now up to 15 times as fast for error handlers: + ``ignore``, ``replace`` and ``surrogateescape``. + Build and C API Changes ======================= diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index bdc331e4911..7b6883fcc51 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -788,6 +788,18 @@ class UTF8Test(ReadTest, unittest.TestCase): self.check_state_handling_decode(self.encoding, u, u.encode(self.encoding)) + def test_decode_error(self): + for data, error_handler, expected in ( + (b'[\x80\xff]', 'ignore', '[]'), + (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'), + (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'), + (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'), + ): + with self.subTest(data=data, error_handler=error_handler, + expected=expected): + self.assertEqual(data.decode(self.encoding, error_handler), + expected) + def test_lone_surrogates(self): super().test_lone_surrogates() # not sure if this is making sense for diff --git a/Misc/NEWS b/Misc/NEWS index d8093771c59..3991d6bb86a 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -10,6 +10,9 @@ Release date: XXXX-XX-XX Core and Builtins ----------------- 
+- Issue #25301: The UTF-8 decoder is now up to 15 times as fast for error + handlers: ``ignore``, ``replace`` and ``surrogateescape``. + - Issue #24848: Fixed a number of bugs in UTF-7 decoding of misformed data. - Issue #25267: The UTF-8 encoder is now up to 75 times as fast for error diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index bc982876c51..56614e6b8d9 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -4714,8 +4714,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s, Py_ssize_t startinpos; Py_ssize_t endinpos; const char *errmsg = ""; - PyObject *errorHandler = NULL; + PyObject *error_handler_obj = NULL; PyObject *exc = NULL; + _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; if (size == 0) { if (consumed) @@ -4740,6 +4741,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s, while (s < end) { Py_UCS4 ch; int kind = writer.kind; + if (kind == PyUnicode_1BYTE_KIND) { if (PyUnicode_IS_ASCII(writer.buffer)) ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos); @@ -4778,24 +4780,52 @@ PyUnicode_DecodeUTF8Stateful(const char *s, continue; } - if (unicode_decode_call_errorhandler_writer( - errors, &errorHandler, - "utf-8", errmsg, - &starts, &end, &startinpos, &endinpos, &exc, &s, - &writer)) - goto onError; + if (error_handler == _Py_ERROR_UNKNOWN) + error_handler = get_error_handler(errors); + + switch (error_handler) { + case _Py_ERROR_IGNORE: + s += (endinpos - startinpos); + break; + + case _Py_ERROR_REPLACE: + if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0) + goto onError; + s += (endinpos - startinpos); + break; + + case _Py_ERROR_SURROGATEESCAPE: + if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0) + goto onError; + for (Py_ssize_t i=startinpos; i