bpo-32583: Fix a possible crash in builtin Unicode decoders (#5325)
When a customized decode error handler is used, it is possible for builtin decoders to write out of bounds and then crash.
This commit is contained in:
parent
57750be4ad
commit
86fdad093b
|
@ -1044,6 +1044,58 @@ class CodecCallbackTest(unittest.TestCase):
|
|||
for (encoding, data) in baddata:
|
||||
self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")
|
||||
|
||||
# issue32583
|
||||
def test_crashing_decode_handler(self):
|
||||
# better generating one more character to fill the extra space slot
|
||||
# so in debug build it can steadily fail
|
||||
def forward_shorter_than_end(exc):
|
||||
if isinstance(exc, UnicodeDecodeError):
|
||||
# size one character, 0 < forward < exc.end
|
||||
return ('\ufffd', exc.start+1)
|
||||
else:
|
||||
raise TypeError("don't know how to handle %r" % exc)
|
||||
codecs.register_error(
|
||||
"test.forward_shorter_than_end", forward_shorter_than_end)
|
||||
|
||||
self.assertEqual(
|
||||
b'\xd8\xd8\xd8\xd8\xd8\x00\x00\x00'.decode(
|
||||
'utf-16-le', 'test.forward_shorter_than_end'),
|
||||
'\ufffd\ufffd\ufffd\ufffd\xd8\x00'
|
||||
)
|
||||
self.assertEqual(
|
||||
b'\xd8\xd8\xd8\xd8\x00\xd8\x00\x00'.decode(
|
||||
'utf-16-be', 'test.forward_shorter_than_end'),
|
||||
'\ufffd\ufffd\ufffd\ufffd\xd8\x00'
|
||||
)
|
||||
self.assertEqual(
|
||||
b'\x11\x11\x11\x11\x11\x00\x00\x00\x00\x00\x00'.decode(
|
||||
'utf-32-le', 'test.forward_shorter_than_end'),
|
||||
'\ufffd\ufffd\ufffd\u1111\x00'
|
||||
)
|
||||
self.assertEqual(
|
||||
b'\x11\x11\x11\x00\x00\x11\x11\x00\x00\x00\x00'.decode(
|
||||
'utf-32-be', 'test.forward_shorter_than_end'),
|
||||
'\ufffd\ufffd\ufffd\u1111\x00'
|
||||
)
|
||||
|
||||
def replace_with_long(exc):
|
||||
if isinstance(exc, UnicodeDecodeError):
|
||||
exc.object = b"\x00" * 8
|
||||
return ('\ufffd', exc.start)
|
||||
else:
|
||||
raise TypeError("don't know how to handle %r" % exc)
|
||||
codecs.register_error("test.replace_with_long", replace_with_long)
|
||||
|
||||
self.assertEqual(
|
||||
b'\x00'.decode('utf-16', 'test.replace_with_long'),
|
||||
'\ufffd\x00\x00\x00\x00'
|
||||
)
|
||||
self.assertEqual(
|
||||
b'\x00'.decode('utf-32', 'test.replace_with_long'),
|
||||
'\ufffd\x00\x00'
|
||||
)
|
||||
|
||||
|
||||
def test_fake_error_class(self):
|
||||
handlers = [
|
||||
codecs.strict_errors,
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
Fix a possible crash in builtin Unicode decoders caused by out-of-bounds
writes when using customized decode error handlers.
|
|
@ -4190,7 +4190,10 @@ unicode_decode_call_errorhandler_writer(
|
|||
Py_ssize_t insize;
|
||||
Py_ssize_t newpos;
|
||||
Py_ssize_t replen;
|
||||
Py_ssize_t remain;
|
||||
PyObject *inputobj = NULL;
|
||||
int need_to_grow = 0;
|
||||
const char *new_inptr;
|
||||
|
||||
if (*errorHandler == NULL) {
|
||||
*errorHandler = PyCodec_LookupError(errors);
|
||||
|
@ -4221,6 +4224,7 @@ unicode_decode_call_errorhandler_writer(
|
|||
inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
|
||||
if (!inputobj)
|
||||
goto onError;
|
||||
remain = *inend - *input - *endinpos;
|
||||
*input = PyBytes_AS_STRING(inputobj);
|
||||
insize = PyBytes_GET_SIZE(inputobj);
|
||||
*inend = *input + insize;
|
||||
|
@ -4238,6 +4242,19 @@ unicode_decode_call_errorhandler_writer(
|
|||
replen = PyUnicode_GET_LENGTH(repunicode);
|
||||
if (replen > 1) {
|
||||
writer->min_length += replen - 1;
|
||||
need_to_grow = 1;
|
||||
}
|
||||
new_inptr = *input + newpos;
|
||||
if (*inend - new_inptr > remain) {
|
||||
/* We don't know the decoding algorithm here so we make the worst
|
||||
assumption that one byte decodes to one unicode character.
|
||||
If unfortunately one byte could decode to more unicode characters,
|
||||
the decoder may write out-of-bound then. Is it possible for the
|
||||
algorithms using this function? */
|
||||
writer->min_length += *inend - new_inptr - remain;
|
||||
need_to_grow = 1;
|
||||
}
|
||||
if (need_to_grow) {
|
||||
writer->overallocate = 1;
|
||||
if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
|
||||
PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
|
||||
|
@ -4247,7 +4264,7 @@ unicode_decode_call_errorhandler_writer(
|
|||
goto onError;
|
||||
|
||||
*endinpos = newpos;
|
||||
*inptr = *input + newpos;
|
||||
*inptr = new_inptr;
|
||||
|
||||
/* we made it! */
|
||||
Py_DECREF(restuple);
|
||||
|
@ -5572,7 +5589,8 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
|
|||
#endif
|
||||
|
||||
/* Note: size will always be longer than the resulting Unicode
|
||||
character count */
|
||||
character count normally. Error handler will take care of
|
||||
resizing when needed. */
|
||||
_PyUnicodeWriter_Init(&writer);
|
||||
writer.min_length = (e - q + 1) / 2;
|
||||
if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
|
||||
|
|
Loading…
Reference in New Issue