bpo-32583: Fix possible crashing in builtin Unicode decoders (#5325)

When using customized decode error handlers, it is possible for builtin decoders
to write out-of-bounds and then crash.
This commit is contained in:
Xiang Zhang 2018-01-31 20:48:05 +08:00 committed by Ned Deily
parent 57750be4ad
commit 86fdad093b
3 changed files with 74 additions and 2 deletions

View File

@@ -1044,6 +1044,58 @@ class CodecCallbackTest(unittest.TestCase):
for (encoding, data) in baddata:
self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")
# issue32583
def test_crashing_decode_handler(self):
# better generating one more character to fill the extra space slot
# so in debug build it can steadily fail
def forward_shorter_than_end(exc):
if isinstance(exc, UnicodeDecodeError):
# size one character, 0 < forward < exc.end
return ('\ufffd', exc.start+1)
else:
raise TypeError("don't know how to handle %r" % exc)
codecs.register_error(
"test.forward_shorter_than_end", forward_shorter_than_end)
self.assertEqual(
b'\xd8\xd8\xd8\xd8\xd8\x00\x00\x00'.decode(
'utf-16-le', 'test.forward_shorter_than_end'),
'\ufffd\ufffd\ufffd\ufffd\xd8\x00'
)
self.assertEqual(
b'\xd8\xd8\xd8\xd8\x00\xd8\x00\x00'.decode(
'utf-16-be', 'test.forward_shorter_than_end'),
'\ufffd\ufffd\ufffd\ufffd\xd8\x00'
)
self.assertEqual(
b'\x11\x11\x11\x11\x11\x00\x00\x00\x00\x00\x00'.decode(
'utf-32-le', 'test.forward_shorter_than_end'),
'\ufffd\ufffd\ufffd\u1111\x00'
)
self.assertEqual(
b'\x11\x11\x11\x00\x00\x11\x11\x00\x00\x00\x00'.decode(
'utf-32-be', 'test.forward_shorter_than_end'),
'\ufffd\ufffd\ufffd\u1111\x00'
)
def replace_with_long(exc):
if isinstance(exc, UnicodeDecodeError):
exc.object = b"\x00" * 8
return ('\ufffd', exc.start)
else:
raise TypeError("don't know how to handle %r" % exc)
codecs.register_error("test.replace_with_long", replace_with_long)
self.assertEqual(
b'\x00'.decode('utf-16', 'test.replace_with_long'),
'\ufffd\x00\x00\x00\x00'
)
self.assertEqual(
b'\x00'.decode('utf-32', 'test.replace_with_long'),
'\ufffd\x00\x00'
)
def test_fake_error_class(self):
handlers = [
codecs.strict_errors,

View File

@@ -0,0 +1,2 @@
Fix possible crashing in builtin Unicode decoders caused by write
out-of-bound errors when using customized decode error handlers.

View File

@@ -4190,7 +4190,10 @@ unicode_decode_call_errorhandler_writer(
Py_ssize_t insize;
Py_ssize_t newpos;
Py_ssize_t replen;
Py_ssize_t remain;
PyObject *inputobj = NULL;
int need_to_grow = 0;
const char *new_inptr;
if (*errorHandler == NULL) {
*errorHandler = PyCodec_LookupError(errors);
@@ -4221,6 +4224,7 @@ unicode_decode_call_errorhandler_writer(
inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
if (!inputobj)
goto onError;
remain = *inend - *input - *endinpos;
*input = PyBytes_AS_STRING(inputobj);
insize = PyBytes_GET_SIZE(inputobj);
*inend = *input + insize;
@@ -4238,6 +4242,19 @@ unicode_decode_call_errorhandler_writer(
replen = PyUnicode_GET_LENGTH(repunicode);
if (replen > 1) {
writer->min_length += replen - 1;
need_to_grow = 1;
}
new_inptr = *input + newpos;
if (*inend - new_inptr > remain) {
/* We don't know the decoding algorithm here so we make the worst
assumption that one byte decodes to one unicode character.
If unfortunately one byte could decode to more unicode characters,
the decoder may write out-of-bound then. Is it possible for the
algorithms using this function? */
writer->min_length += *inend - new_inptr - remain;
need_to_grow = 1;
}
if (need_to_grow) {
writer->overallocate = 1;
if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
@@ -4247,7 +4264,7 @@ unicode_decode_call_errorhandler_writer(
goto onError;
*endinpos = newpos;
*inptr = *input + newpos;
*inptr = new_inptr;
/* we made it! */
Py_DECREF(restuple);
@@ -5572,7 +5589,8 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
#endif
/* Note: size will always be longer than the resulting Unicode
character count */
character count normally. Error handler will take care of
resizing when needed. */
_PyUnicodeWriter_Init(&writer);
writer.min_length = (e - q + 1) / 2;
if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)