Optimize ascii/latin1+surrogateescape encoders
Issue #25227: Optimize ASCII and latin1 encoders with the ``surrogateescape`` error handler: the encoders are now up to 3 times as fast. Initial patch written by Serhiy Storchaka.
This commit is contained in:
parent
5fbeabcbb6
commit
c3713e9706
|
@ -117,6 +117,9 @@ Optimizations
|
|||
* The ASCII decoder is now up to 60 times as fast for error handlers:
|
||||
``surrogateescape``, ``ignore`` and ``replace``.
|
||||
|
||||
* The ASCII and the Latin1 encoders are now up to 3 times as fast for the error
|
||||
handler ``surrogateescape``.
|
||||
|
||||
|
||||
Build and C API Changes
|
||||
=======================
|
||||
|
|
|
@ -3060,7 +3060,31 @@ class CodePageTest(unittest.TestCase):
|
|||
|
||||
|
||||
class ASCIITest(unittest.TestCase):
|
||||
def test_encode(self):
|
||||
self.assertEqual('abc123'.encode('ascii'), b'abc123')
|
||||
|
||||
def test_encode_error(self):
|
||||
for data, error_handler, expected in (
|
||||
('[\x80\xff\u20ac]', 'ignore', b'[]'),
|
||||
('[\x80\xff\u20ac]', 'replace', b'[???]'),
|
||||
('[\x80\xff\u20ac]', 'xmlcharrefreplace', b'[&#128;&#255;&#8364;]'),
|
||||
('[\x80\xff\u20ac]', 'backslashreplace', b'[\\x80\\xff\\u20ac]'),
|
||||
('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
|
||||
):
|
||||
with self.subTest(data=data, error_handler=error_handler,
|
||||
expected=expected):
|
||||
self.assertEqual(data.encode('ascii', error_handler),
|
||||
expected)
|
||||
|
||||
def test_encode_surrogateescape_error(self):
|
||||
with self.assertRaises(UnicodeEncodeError):
|
||||
# the first character can be encoded, but not the second
|
||||
'\udc80\xff'.encode('ascii', 'surrogateescape')
|
||||
|
||||
def test_decode(self):
|
||||
self.assertEqual(b'abc'.decode('ascii'), 'abc')
|
||||
|
||||
def test_decode_error(self):
|
||||
for data, error_handler, expected in (
|
||||
(b'[\x80\xff]', 'ignore', '[]'),
|
||||
(b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
|
||||
|
@ -3073,5 +3097,41 @@ class ASCIITest(unittest.TestCase):
|
|||
expected)
|
||||
|
||||
|
||||
class Latin1Test(unittest.TestCase):
|
||||
def test_encode(self):
|
||||
for data, expected in (
|
||||
('abc', b'abc'),
|
||||
('\x80\xe9\xff', b'\x80\xe9\xff'),
|
||||
):
|
||||
with self.subTest(data=data, expected=expected):
|
||||
self.assertEqual(data.encode('latin1'), expected)
|
||||
|
||||
def test_encode_errors(self):
|
||||
for data, error_handler, expected in (
|
||||
('[\u20ac\udc80]', 'ignore', b'[]'),
|
||||
('[\u20ac\udc80]', 'replace', b'[??]'),
|
||||
('[\u20ac\udc80]', 'backslashreplace', b'[\\u20ac\\udc80]'),
|
||||
('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
|
||||
('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
|
||||
):
|
||||
with self.subTest(data=data, error_handler=error_handler,
|
||||
expected=expected):
|
||||
self.assertEqual(data.encode('latin1', error_handler),
|
||||
expected)
|
||||
|
||||
def test_encode_surrogateescape_error(self):
|
||||
with self.assertRaises(UnicodeEncodeError):
|
||||
# the first character can be encoded, but not the second
|
||||
'\udc80\u20ac'.encode('latin1', 'surrogateescape')
|
||||
|
||||
def test_decode(self):
|
||||
for data, expected in (
|
||||
(b'abc', 'abc'),
|
||||
(b'[\x80\xff]', '[\x80\xff]'),
|
||||
):
|
||||
with self.subTest(data=data, expected=expected):
|
||||
self.assertEqual(data.decode('latin1'), expected)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
|
|
@ -10,6 +10,10 @@ Release date: XXXX-XX-XX
|
|||
Core and Builtins
|
||||
-----------------
|
||||
|
||||
- Issue #25227: Optimize ASCII and latin1 encoders with the ``surrogateescape``
|
||||
error handler: the encoders are now up to 3 times as fast. Initial patch
|
||||
written by Serhiy Storchaka.
|
||||
|
||||
- Issue #25003: On Solaris 11.3 or newer, os.urandom() now uses the
|
||||
getrandom() function instead of the getentropy() function. The getentropy()
|
||||
function is blocking to generate very good quality entropy, os.urandom()
|
||||
|
|
|
@ -6532,6 +6532,22 @@ unicode_encode_ucs1(PyObject *unicode,
|
|||
pos = collend;
|
||||
break;
|
||||
|
||||
case _Py_ERROR_SURROGATEESCAPE:
|
||||
for (i = collstart; i < collend; ++i) {
|
||||
ch = PyUnicode_READ(kind, data, i);
|
||||
if (ch < 0xdc80 || 0xdcff < ch) {
|
||||
/* Not a UTF-8b surrogate */
|
||||
break;
|
||||
}
|
||||
*str++ = (char)(ch - 0xdc00);
|
||||
++pos;
|
||||
}
|
||||
if (i >= collend)
|
||||
break;
|
||||
collstart = pos;
|
||||
assert(collstart != collend);
|
||||
/* fallback to general error handling */
|
||||
|
||||
default:
|
||||
repunicode = unicode_encode_call_errorhandler(errors, &error_handler_obj,
|
||||
encoding, reason, unicode, &exc,
|
||||
|
|
Loading…
Reference in New Issue