Optimize ascii/latin1+surrogateescape encoders

Issue #25227: Optimize ASCII and latin1 encoders with the ``surrogateescape``
error handler: the encoders are now up to 3 times as fast.

Initial patch written by Serhiy Storchaka.
This commit is contained in:
Victor Stinner 2015-09-29 12:32:13 +02:00
parent 5fbeabcbb6
commit c3713e9706
4 changed files with 83 additions and 0 deletions

View File

@ -117,6 +117,9 @@ Optimizations
* The ASCII decoder is now up to 60 times as fast for error handlers:
``surrogateescape``, ``ignore`` and ``replace``.
* The ASCII and the Latin1 encoders are now up to 3 times as fast for the error
handler ``surrogateescape``.
Build and C API Changes
=======================

View File

@ -3060,7 +3060,31 @@ class CodePageTest(unittest.TestCase):
class ASCIITest(unittest.TestCase):
def test_encode(self):
self.assertEqual('abc123'.encode('ascii'), b'abc123')
def test_encode_error(self):
for data, error_handler, expected in (
('[\x80\xff\u20ac]', 'ignore', b'[]'),
('[\x80\xff\u20ac]', 'replace', b'[???]'),
('[\x80\xff\u20ac]', 'xmlcharrefreplace', b'[€ÿ€]'),
('[\x80\xff\u20ac]', 'backslashreplace', b'[\\x80\\xff\\u20ac]'),
('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
):
with self.subTest(data=data, error_handler=error_handler,
expected=expected):
self.assertEqual(data.encode('ascii', error_handler),
expected)
def test_encode_surrogateescape_error(self):
with self.assertRaises(UnicodeEncodeError):
# the first character can be decoded, but not the second
'\udc80\xff'.encode('ascii', 'surrogateescape')
def test_decode(self):
self.assertEqual(b'abc'.decode('ascii'), 'abc')
def test_decode_error(self):
for data, error_handler, expected in (
(b'[\x80\xff]', 'ignore', '[]'),
(b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
@ -3073,5 +3097,41 @@ class ASCIITest(unittest.TestCase):
expected)
class Latin1Test(unittest.TestCase):
    """Tests for the latin1 codec: plain round-trips and error handlers."""

    def test_encode(self):
        """Characters in U+0000..U+00FF encode to the byte of the same value."""
        for data, expected in (
            ('abc', b'abc'),
            ('\x80\xe9\xff', b'\x80\xe9\xff'),
        ):
            with self.subTest(data=data, expected=expected):
                self.assertEqual(data.encode('latin1'), expected)

    def test_encode_errors(self):
        """Check the latin1 encoder's output for each built-in error handler.

        Every case is a ``(text, error_handler, expected_bytes)`` triple and
        is run in its own subtest.
        """
        for data, error_handler, expected in (
            ('[\u20ac\udc80]', 'ignore', b'[]'),
            ('[\u20ac\udc80]', 'replace', b'[??]'),
            ('[\u20ac\udc80]', 'backslashreplace', b'[\\u20ac\\udc80]'),
            # Numeric character references must be spelled out in ASCII:
            # U+20AC is 8364 and U+DC80 is 56448 in decimal.
            ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
            # Lone surrogates U+DC80..U+DCFF map back to bytes 0x80..0xFF.
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        ):
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                self.assertEqual(data.encode('latin1', error_handler),
                                 expected)

    def test_encode_surrogateescape_error(self):
        """surrogateescape must still fail on a non-escapable character."""
        with self.assertRaises(UnicodeEncodeError):
            # the first character can be encoded, but not the second
            '\udc80\u20ac'.encode('latin1', 'surrogateescape')

    def test_decode(self):
        """Every byte decodes to the code point of the same value."""
        for data, expected in (
            (b'abc', 'abc'),
            (b'[\x80\xff]', '[\x80\xff]'),
        ):
            with self.subTest(data=data, expected=expected):
                self.assertEqual(data.decode('latin1'), expected)
# Script entry point: hand control to unittest.main() when run directly.
if __name__ == "__main__":
unittest.main()

View File

@ -10,6 +10,10 @@ Release date: XXXX-XX-XX
Core and Builtins
-----------------
- Issue #25227: Optimize ASCII and latin1 encoders with the ``surrogateescape``
error handler: the encoders are now up to 3 times as fast. Initial patch
written by Serhiy Storchaka.
- Issue #25003: On Solaris 11.3 or newer, os.urandom() now uses the
getrandom() function instead of the getentropy() function. The getentropy()
function is blocking to generate very good quality entropy, os.urandom()

View File

@ -6532,6 +6532,22 @@ unicode_encode_ucs1(PyObject *unicode,
pos = collend;
break;
case _Py_ERROR_SURROGATEESCAPE:
for (i = collstart; i < collend; ++i) {
ch = PyUnicode_READ(kind, data, i);
if (ch < 0xdc80 || 0xdcff < ch) {
/* Not a UTF-8b surrogate */
break;
}
*str++ = (char)(ch - 0xdc00);
++pos;
}
if (i >= collend)
break;
collstart = pos;
assert(collstart != collend);
/* fallback to general error handling */
default:
repunicode = unicode_encode_call_errorhandler(errors, &error_handler_obj,
encoding, reason, unicode, &exc,