diff --git a/Doc/whatsnew/3.6.rst b/Doc/whatsnew/3.6.rst index 48ff38a241f..8a2b5d3021d 100644 --- a/Doc/whatsnew/3.6.rst +++ b/Doc/whatsnew/3.6.rst @@ -106,7 +106,8 @@ operator Optimizations ============= -* None yet. +* The ASCII decoder is now up to 60 times as fast for error handlers: + ``surrogateescape``, ``ignore`` and ``replace``. Build and C API Changes diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index a4a6f95ca2d..e0e31199cca 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -27,6 +27,7 @@ def coding_checker(self, coder): self.assertEqual(coder(input), (expect, len(input))) return check + class Queue(object): """ queue: write bytes at one end, read bytes from the other end @@ -47,6 +48,7 @@ class Queue(object): self._buffer = self._buffer[size:] return s + class MixInCheckStateHandling: def check_state_handling_decode(self, encoding, u, s): for i in range(len(s)+1): @@ -80,6 +82,7 @@ class MixInCheckStateHandling: part2 = d.encode(u[i:], True) self.assertEqual(s, part1+part2) + class ReadTest(MixInCheckStateHandling): def check_partial(self, input, partialresults): # get a StreamReader for the encoding and feed the bytestring version @@ -383,6 +386,7 @@ class ReadTest(MixInCheckStateHandling): self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"), before + backslashreplace + after) + class UTF32Test(ReadTest, unittest.TestCase): encoding = "utf-32" if sys.byteorder == 'little': @@ -478,6 +482,7 @@ class UTF32Test(ReadTest, unittest.TestCase): self.assertEqual('\U00010000' * 1024, codecs.utf_32_decode(encoded_be)[0]) + class UTF32LETest(ReadTest, unittest.TestCase): encoding = "utf-32-le" ill_formed_sequence = b"\x80\xdc\x00\x00" @@ -523,6 +528,7 @@ class UTF32LETest(ReadTest, unittest.TestCase): self.assertEqual('\U00010000' * 1024, codecs.utf_32_le_decode(encoded)[0]) + class UTF32BETest(ReadTest, unittest.TestCase): encoding = "utf-32-be" ill_formed_sequence = b"\x00\x00\xdc\x80" @@ -797,6 +803,7 @@ class UTF8Test(ReadTest, unittest.TestCase): with self.assertRaises(UnicodeDecodeError): b"abc\xed\xa0z".decode("utf-8", "surrogatepass") + @unittest.skipUnless(sys.platform == 'win32', 'cp65001 is a Windows-only codec') class CP65001Test(ReadTest, unittest.TestCase): @@ -1136,6 +1143,7 @@ class EscapeDecodeTest(unittest.TestCase): self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8)) self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8)) + class RecodingTest(unittest.TestCase): def test_recoding(self): f = io.BytesIO() @@ -1255,6 +1263,7 @@ for i in punycode_testcases: if len(i)!=2: print(repr(i)) + class PunycodeTest(unittest.TestCase): def test_encode(self): for uni, puny in punycode_testcases: @@ -1274,6 +1283,7 @@ class PunycodeTest(unittest.TestCase): puny = puny.decode("ascii").encode("ascii") self.assertEqual(uni, puny.decode("punycode")) + class UnicodeInternalTest(unittest.TestCase): @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t') def test_bug1251300(self): @@ -1528,6 +1538,7 @@ class NameprepTest(unittest.TestCase): except Exception as e: raise support.TestFailed("Test 3.%d: %s" % (pos+1, str(e))) + class IDNACodecTest(unittest.TestCase): def test_builtin_decode(self): self.assertEqual(str(b"python.org", "idna"), "python.org") @@ -1614,6 +1625,7 @@ class IDNACodecTest(unittest.TestCase): self.assertRaises(Exception, b"python.org".decode, "idna", errors) + class CodecsModuleTest(unittest.TestCase): def test_decode(self): @@ -1722,6 +1734,7 @@ class CodecsModuleTest(unittest.TestCase): self.assertRaises(UnicodeError, codecs.decode, b'abc', 'undefined', errors) + class StreamReaderTest(unittest.TestCase): def setUp(self): @@ -1732,6 +1745,7 @@ class StreamReaderTest(unittest.TestCase): f = self.reader(self.stream) self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00']) + class EncodedFileTest(unittest.TestCase): def test_basic(self): @@ -1862,6 +1876,7 @@ broken_unicode_with_stateful = [ "unicode_internal" ] + class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling): def test_basics(self): s = "abc123" # all codecs should be able to encode these @@ -2024,6 +2039,7 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling): self.check_state_handling_decode(encoding, u, u.encode(encoding)) self.check_state_handling_encode(encoding, u, u.encode(encoding)) + class CharmapTest(unittest.TestCase): def test_decode_with_string_map(self): self.assertEqual( @@ -2274,6 +2290,7 @@ class WithStmtTest(unittest.TestCase): info.streamwriter, 'strict') as srw: self.assertEqual(srw.read(), "\xfc") + class TypesTest(unittest.TestCase): def test_decode_unicode(self): # Most decoders don't accept unicode input @@ -2564,6 +2581,7 @@ else: bytes_transform_encodings.append("bz2_codec") transform_aliases["bz2_codec"] = ["bz2"] + class TransformCodecTest(unittest.TestCase): def test_basics(self): @@ -3041,5 +3059,19 @@ class CodePageTest(unittest.TestCase): self.assertEqual(decoded, ('abc', 3)) +class ASCIITest(unittest.TestCase): + def test_decode(self): + for data, error_handler, expected in ( + (b'[\x80\xff]', 'ignore', '[]'), + (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'), + (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'), + (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'), + ): + with self.subTest(data=data, error_handler=error_handler, + expected=expected): + self.assertEqual(data.decode('ascii', error_handler), + expected) + + if __name__ == "__main__": unittest.main() diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 0709789991e..b8840e1e360 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -6644,6 +6644,28 @@ PyUnicode_AsLatin1String(PyObject *unicode) /* --- 7-bit ASCII Codec -------------------------------------------------- */ +typedef enum { + _Py_ERROR_UNKNOWN=0, + _Py_ERROR_SURROGATEESCAPE, + _Py_ERROR_REPLACE, + _Py_ERROR_IGNORE, + _Py_ERROR_OTHER +} _Py_error_handler; + +static _Py_error_handler +get_error_handler(const char *errors) +{ + if (errors == NULL) + return _Py_ERROR_OTHER; + if (strcmp(errors, "surrogateescape") == 0) + return _Py_ERROR_SURROGATEESCAPE; + if (strcmp(errors, "ignore") == 0) + return _Py_ERROR_IGNORE; + if (strcmp(errors, "replace") == 0) + return _Py_ERROR_REPLACE; + return _Py_ERROR_OTHER; +} + PyObject * PyUnicode_DecodeASCII(const char *s, Py_ssize_t size, @@ -6657,8 +6679,9 @@ PyUnicode_DecodeASCII(const char *s, Py_ssize_t endinpos; Py_ssize_t outpos; const char *e; - PyObject *errorHandler = NULL; + PyObject *error_handler_obj = NULL; PyObject *exc = NULL; + _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; if (size == 0) _Py_RETURN_UNICODE_EMPTY(); @@ -6687,12 +6710,45 @@ PyUnicode_DecodeASCII(const char *s, PyUnicode_WRITE(kind, data, writer.pos, c); writer.pos++; ++s; + continue; } - else { + + /* byte outsize range 0x00..0x7f: call the error handler */ + + if (error_handler == _Py_ERROR_UNKNOWN) + error_handler = get_error_handler(errors); + + switch (error_handler) + { + case _Py_ERROR_REPLACE: + case _Py_ERROR_SURROGATEESCAPE: + /* Fast-path: the error handler only writes one character, + but we must switch to UCS2 at the first write */ + if (kind < PyUnicode_2BYTE_KIND) { + if (_PyUnicodeWriter_Prepare(&writer, size - writer.pos, + 0xffff) < 0) + return NULL; + kind = writer.kind; + data = writer.data; + } + + if (error_handler == _Py_ERROR_REPLACE) + PyUnicode_WRITE(kind, data, writer.pos, 0xfffd); + else + PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00); + writer.pos++; + ++s; + break; + + case _Py_ERROR_IGNORE: + ++s; + break; + + default: startinpos = s-starts; endinpos = startinpos + 1; if (unicode_decode_call_errorhandler_writer( - errors, &errorHandler, + errors, &error_handler_obj, "ascii", "ordinal not in range(128)", &starts, &e, &startinpos, &endinpos, &exc, &s, &writer)) @@ -6701,13 +6757,13 @@ PyUnicode_DecodeASCII(const char *s, data = writer.data; } } - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); return _PyUnicodeWriter_Finish(&writer); onError: _PyUnicodeWriter_Dealloc(&writer); - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); return NULL; }