Issue #24870: Optimize the ASCII decoder for the error handlers surrogateescape, ignore and replace.

Initial patch written by Naoki Inada. The decoder is now up to 60 times as fast for these error handlers. Also add unit tests for the ASCII decoder.
2015-09-21 23:06:27 +02:00 · 2015-09-21 23:06:27 +02:00 · f96418de05
parent ba45295938
commit f96418de05
3 changed files with 95 additions and 6 deletions
--- a/Doc/whatsnew/3.6.rst
+++ b/Doc/whatsnew/3.6.rst
@ -106,7 +106,8 @@ operator
 Optimizations
 =============

-* None yet.
+* The ASCII decoder is now up to 60 times as fast for error handlers:
+  ``surrogateescape``, ``ignore`` and ``replace``.


 Build and C API Changes
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@ -27,6 +27,7 @@ def coding_checker(self, coder):
        self.assertEqual(coder(input), (expect, len(input)))
    return check

+
 class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
@ -47,6 +48,7 @@ class Queue(object):
            self._buffer = self._buffer[size:]
            return s

+
 class MixInCheckStateHandling:
    def check_state_handling_decode(self, encoding, u, s):
        for i in range(len(s)+1):
@ -80,6 +82,7 @@ class MixInCheckStateHandling:
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)

+
 class ReadTest(MixInCheckStateHandling):
    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
@ -383,6 +386,7 @@ class ReadTest(MixInCheckStateHandling):
            self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
                             before + backslashreplace + after)

+
 class UTF32Test(ReadTest, unittest.TestCase):
    encoding = "utf-32"
    if sys.byteorder == 'little':
@ -478,6 +482,7 @@ class UTF32Test(ReadTest, unittest.TestCase):
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])

+
 class UTF32LETest(ReadTest, unittest.TestCase):
    encoding = "utf-32-le"
    ill_formed_sequence = b"\x80\xdc\x00\x00"
@ -523,6 +528,7 @@ class UTF32LETest(ReadTest, unittest.TestCase):
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])

+
 class UTF32BETest(ReadTest, unittest.TestCase):
    encoding = "utf-32-be"
    ill_formed_sequence = b"\x00\x00\xdc\x80"
@ -797,6 +803,7 @@ class UTF8Test(ReadTest, unittest.TestCase):
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0z".decode("utf-8", "surrogatepass")

+
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
 class CP65001Test(ReadTest, unittest.TestCase):
@ -1136,6 +1143,7 @@ class EscapeDecodeTest(unittest.TestCase):
        self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
        self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))

+
 class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        f = io.BytesIO()
@ -1255,6 +1263,7 @@ for i in punycode_testcases:
    if len(i)!=2:
        print(repr(i))

+
 class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for uni, puny in punycode_testcases:
@ -1274,6 +1283,7 @@ class PunycodeTest(unittest.TestCase):
            puny = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, puny.decode("punycode"))

+
 class UnicodeInternalTest(unittest.TestCase):
    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
@ -1528,6 +1538,7 @@ class NameprepTest(unittest.TestCase):
                except Exception as e:
                    raise support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))

+
 class IDNACodecTest(unittest.TestCase):
    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
@ -1614,6 +1625,7 @@ class IDNACodecTest(unittest.TestCase):
            self.assertRaises(Exception,
                b"python.org".decode, "idna", errors)

+
 class CodecsModuleTest(unittest.TestCase):

    def test_decode(self):
@ -1722,6 +1734,7 @@ class CodecsModuleTest(unittest.TestCase):
            self.assertRaises(UnicodeError,
                codecs.decode, b'abc', 'undefined', errors)

+
 class StreamReaderTest(unittest.TestCase):

    def setUp(self):
@ -1732,6 +1745,7 @@ class StreamReaderTest(unittest.TestCase):
        f = self.reader(self.stream)
        self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00'])

+
 class EncodedFileTest(unittest.TestCase):

    def test_basic(self):
@ -1862,6 +1876,7 @@ broken_unicode_with_stateful = [
    "unicode_internal"
 ]

+
 class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    def test_basics(self):
        s = "abc123"  # all codecs should be able to encode these
@ -2024,6 +2039,7 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))

+
 class CharmapTest(unittest.TestCase):
    def test_decode_with_string_map(self):
        self.assertEqual(
@ -2274,6 +2290,7 @@ class WithStmtTest(unittest.TestCase):
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")

+
 class TypesTest(unittest.TestCase):
    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
@ -2564,6 +2581,7 @@ else:
    bytes_transform_encodings.append("bz2_codec")
    transform_aliases["bz2_codec"] = ["bz2"]

+
 class TransformCodecTest(unittest.TestCase):

    def test_basics(self):
@ -3041,5 +3059,19 @@ class CodePageTest(unittest.TestCase):
        self.assertEqual(decoded, ('abc', 3))


+class ASCIITest(unittest.TestCase):
+    def test_decode(self):
+        for data, error_handler, expected in (
+            (b'[\x80\xff]', 'ignore', '[]'),
+            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
+            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
+            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
+        ):
+            with self.subTest(data=data, error_handler=error_handler,
+                              expected=expected):
+                self.assertEqual(data.decode('ascii', error_handler),
+                                 expected)
+
+
 if __name__ == "__main__":
    unittest.main()
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -6644,6 +6644,28 @@ PyUnicode_AsLatin1String(PyObject *unicode)

 /* --- 7-bit ASCII Codec -------------------------------------------------- */

+typedef enum {
+    _Py_ERROR_UNKNOWN=0,
+    _Py_ERROR_SURROGATEESCAPE,
+    _Py_ERROR_REPLACE,
+    _Py_ERROR_IGNORE,
+    _Py_ERROR_OTHER
+} _Py_error_handler;
+
+static _Py_error_handler
+get_error_handler(const char *errors)
+{
+    if (errors == NULL)
+        return _Py_ERROR_OTHER;
+    if (strcmp(errors, "surrogateescape") == 0)
+        return _Py_ERROR_SURROGATEESCAPE;
+    if (strcmp(errors, "ignore") == 0)
+        return _Py_ERROR_IGNORE;
+    if (strcmp(errors, "replace") == 0)
+        return _Py_ERROR_REPLACE;
+    return _Py_ERROR_OTHER;
+}
+
 PyObject *
 PyUnicode_DecodeASCII(const char *s,
                      Py_ssize_t size,
@ -6657,8 +6679,9 @@ PyUnicode_DecodeASCII(const char *s,
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
-    PyObject *errorHandler = NULL;
+    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
+    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;

    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();
@ -6687,12 +6710,45 @@ PyUnicode_DecodeASCII(const char *s,
            PyUnicode_WRITE(kind, data, writer.pos, c);
            writer.pos++;
            ++s;
+            continue;
        }
-        else {
+
+        /* byte outside the range 0x00..0x7f: call the error handler */
+
+        if (error_handler == _Py_ERROR_UNKNOWN)
+            error_handler = get_error_handler(errors);
+
+        switch (error_handler)
+        {
+        case _Py_ERROR_REPLACE:
+        case _Py_ERROR_SURROGATEESCAPE:
+            /* Fast-path: the error handler only writes one character,
+               but we must switch to UCS2 at the first write */
+            if (kind < PyUnicode_2BYTE_KIND) {
+                if (_PyUnicodeWriter_Prepare(&writer, size - writer.pos,
+                                             0xffff) < 0)
+                    return NULL;
+                kind = writer.kind;
+                data = writer.data;
+            }
+
+            if (error_handler == _Py_ERROR_REPLACE)
+                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
+            else
+                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
+            writer.pos++;
+            ++s;
+            break;
+
+        case _Py_ERROR_IGNORE:
+            ++s;
+            break;
+
+        default:
            startinpos = s-starts;
            endinpos = startinpos + 1;
            if (unicode_decode_call_errorhandler_writer(
-                    errors, &errorHandler,
+                    errors, &error_handler_obj,
                    "ascii", "ordinal not in range(128)",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
@ -6701,13 +6757,13 @@ PyUnicode_DecodeASCII(const char *s,
            data = writer.data;
        }
    }
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return NULL;
 }