mirror of https://github.com/python/cpython
bpo-36819: Fix crashes in built-in encoders with weird error handlers (GH-28593)
If the error handler returns a position less than or equal to the starting position of the non-encodable characters, most of the built-in encoders didn't properly resize the output buffer. This led to out-of-bounds writes and segfaults.
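For illustration, here is a minimal sketch of the kind of error handler this change guards against. It mirrors the RepeatedPosReturn helper added to the tests below; the class name RewindingHandler and the handler name "demo.rewind" are made up for this example.

import codecs

class RewindingHandler:
    # Rewind the encoder to position 0 a bounded number of times before
    # finally letting it move past the unencodable character.
    def __init__(self, repl="x"):
        self.repl = repl
        self.count = 0

    def handle(self, exc):
        if self.count > 0:
            self.count -= 1
            return (self.repl, 0)        # continue from before the failing character
        return (self.repl, exc.end)      # finally skip past it

handler = RewindingHandler()
codecs.register_error("demo.rewind", handler.handle)

handler.count = 50
# Each rewind makes the encoder emit "abcd" again, so the result is
# b"abcdx" * 51.  Before this fix, the extra output could overrun the
# encoder's preallocated buffer and crash the interpreter.
assert "abcd\udc80".encode("utf-8", "demo.rewind") == b"abcdx" * 51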
parent 614420df97
commit 18b07d773e
@@ -1,5 +1,6 @@
import codecs
import html.entities
import itertools
import sys
import unicodedata
import unittest

@@ -22,6 +23,18 @@ class PosReturn:
            self.pos = len(exc.object)
        return ("<?>", oldpos)

class RepeatedPosReturn:
    def __init__(self, repl="<?>"):
        self.repl = repl
        self.pos = 0
        self.count = 0

    def handle(self, exc):
        if self.count > 0:
            self.count -= 1
            return (self.repl, self.pos)
        return (self.repl, exc.end)

# A UnicodeEncodeError object with a bad start attribute
class BadStartUnicodeEncodeError(UnicodeEncodeError):
    def __init__(self):
@@ -783,20 +796,104 @@ class CodecCallbackTest(unittest.TestCase):
            codecs.lookup_error("namereplace")
        )

    def test_unencodablereplacement(self):
    def test_encode_nonascii_replacement(self):
        def handle(exc):
            if isinstance(exc, UnicodeEncodeError):
                return (repl, exc.end)
            raise TypeError("don't know how to handle %r" % exc)
        codecs.register_error("test.replacing", handle)

        for enc, input, repl in (
            ("ascii", "[¤]", "abc"),
            ("iso-8859-1", "[€]", "½¾"),
            ("iso-8859-15", "[¤]", "œŸ"),
        ):
            res = input.encode(enc, "test.replacing")
            self.assertEqual(res, ("[" + repl + "]").encode(enc))

        for enc, input, repl in (
            ("utf-8", "[\udc80]", "\U0001f40d"),
            ("utf-16", "[\udc80]", "\U0001f40d"),
            ("utf-32", "[\udc80]", "\U0001f40d"),
        ):
            with self.subTest(encoding=enc):
                with self.assertRaises(UnicodeEncodeError) as cm:
                    input.encode(enc, "test.replacing")
                exc = cm.exception
                self.assertEqual(exc.start, 1)
                self.assertEqual(exc.end, 2)
                self.assertEqual(exc.object, input)

    def test_encode_unencodable_replacement(self):
        def unencrepl(exc):
            if isinstance(exc, UnicodeEncodeError):
                return ("\u4242", exc.end)
                return (repl, exc.end)
            else:
                raise TypeError("don't know how to handle %r" % exc)
        codecs.register_error("test.unencreplhandler", unencrepl)
        for enc in ("ascii", "iso-8859-1", "iso-8859-15"):
            self.assertRaises(
                UnicodeEncodeError,
                "\u4242".encode,
                enc,
                "test.unencreplhandler"
            )

        for enc, input, repl in (
            ("ascii", "[¤]", "½"),
            ("iso-8859-1", "[€]", "œ"),
            ("iso-8859-15", "[¤]", "½"),
            ("utf-8", "[\udc80]", "\udcff"),
            ("utf-16", "[\udc80]", "\udcff"),
            ("utf-32", "[\udc80]", "\udcff"),
        ):
            with self.subTest(encoding=enc):
                with self.assertRaises(UnicodeEncodeError) as cm:
                    input.encode(enc, "test.unencreplhandler")
                exc = cm.exception
                self.assertEqual(exc.start, 1)
                self.assertEqual(exc.end, 2)
                self.assertEqual(exc.object, input)

    def test_encode_bytes_replacement(self):
        def handle(exc):
            if isinstance(exc, UnicodeEncodeError):
                return (repl, exc.end)
            raise TypeError("don't know how to handle %r" % exc)
        codecs.register_error("test.replacing", handle)

        # It works even if the bytes sequence is not decodable.
        for enc, input, repl in (
            ("ascii", "[¤]", b"\xbd\xbe"),
            ("iso-8859-1", "[€]", b"\xbd\xbe"),
            ("iso-8859-15", "[¤]", b"\xbd\xbe"),
            ("utf-8", "[\udc80]", b"\xbd\xbe"),
            ("utf-16le", "[\udc80]", b"\xbd\xbe"),
            ("utf-16be", "[\udc80]", b"\xbd\xbe"),
            ("utf-32le", "[\udc80]", b"\xbc\xbd\xbe\xbf"),
            ("utf-32be", "[\udc80]", b"\xbc\xbd\xbe\xbf"),
        ):
            with self.subTest(encoding=enc):
                res = input.encode(enc, "test.replacing")
                self.assertEqual(res, "[".encode(enc) + repl + "]".encode(enc))

    def test_encode_odd_bytes_replacement(self):
        def handle(exc):
            if isinstance(exc, UnicodeEncodeError):
                return (repl, exc.end)
            raise TypeError("don't know how to handle %r" % exc)
        codecs.register_error("test.replacing", handle)

        input = "[\udc80]"
        # Tests in which the replacement bytestring contains not whole number
        # of code units.
        for enc, repl in (
            *itertools.product(("utf-16le", "utf-16be"),
                               [b"a", b"abc"]),
            *itertools.product(("utf-32le", "utf-32be"),
                               [b"a", b"ab", b"abc", b"abcde"]),
        ):
            with self.subTest(encoding=enc, repl=repl):
                with self.assertRaises(UnicodeEncodeError) as cm:
                    input.encode(enc, "test.replacing")
                exc = cm.exception
                self.assertEqual(exc.start, 1)
                self.assertEqual(exc.end, 2)
                self.assertEqual(exc.object, input)
                self.assertEqual(exc.reason, "surrogates not allowed")

    def test_badregistercall(self):
        # enhance coverage of:
@@ -940,6 +1037,68 @@ class CodecCallbackTest(unittest.TestCase):
            self.assertRaises(ValueError, codecs.charmap_encode, "\xff", err, D())
            self.assertRaises(TypeError, codecs.charmap_encode, "\xff", err, {0xff: 300})

    def test_decodehelper_bug36819(self):
        handler = RepeatedPosReturn("x")
        codecs.register_error("test.bug36819", handler.handle)

        testcases = [
            ("ascii", b"\xff"),
            ("utf-8", b"\xff"),
            ("utf-16be", b'\xdc\x80'),
            ("utf-32be", b'\x00\x00\xdc\x80'),
            ("iso-8859-6", b"\xff"),
        ]
        for enc, bad in testcases:
            input = "abcd".encode(enc) + bad
            with self.subTest(encoding=enc):
                handler.count = 50
                decoded = input.decode(enc, "test.bug36819")
                self.assertEqual(decoded, 'abcdx' * 51)

    def test_encodehelper_bug36819(self):
        handler = RepeatedPosReturn()
        codecs.register_error("test.bug36819", handler.handle)

        input = "abcd\udc80"
        encodings = ["ascii", "latin1", "utf-8", "utf-16", "utf-32"]  # built-in
        encodings += ["iso-8859-15"]  # charmap codec
        if sys.platform == 'win32':
            encodings = ["mbcs", "oem"]  # code page codecs

        handler.repl = "\udcff"
        for enc in encodings:
            with self.subTest(encoding=enc):
                handler.count = 50
                with self.assertRaises(UnicodeEncodeError) as cm:
                    input.encode(enc, "test.bug36819")
                exc = cm.exception
                self.assertEqual(exc.start, 4)
                self.assertEqual(exc.end, 5)
                self.assertEqual(exc.object, input)
        if sys.platform == "win32":
            handler.count = 50
            with self.assertRaises(UnicodeEncodeError) as cm:
                codecs.code_page_encode(437, input, "test.bug36819")
            exc = cm.exception
            self.assertEqual(exc.start, 4)
            self.assertEqual(exc.end, 5)
            self.assertEqual(exc.object, input)

        handler.repl = "x"
        for enc in encodings:
            with self.subTest(encoding=enc):
                # The interpreter should segfault after a handful of attempts.
                # 50 was chosen to try to ensure a segfault without a fix,
                # but not OOM a machine with one.
                handler.count = 50
                encoded = input.encode(enc, "test.bug36819")
                self.assertEqual(encoded.decode(enc), "abcdx" * 51)
        if sys.platform == "win32":
            handler.count = 50
            encoded = codecs.code_page_encode(437, input, "test.bug36819")
            self.assertEqual(encoded[0].decode(), "abcdx" * 51)
            self.assertEqual(encoded[1], len(input))

    def test_translatehelper(self):
        # enhance coverage of:
        # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
@@ -0,0 +1,2 @@
Fix crashes in built-in encoders with error handlers that return a position
less than or equal to the starting position of non-encodable characters.
@@ -387,8 +387,19 @@ STRINGLIB(utf8_encoder)(_PyBytesWriter *writer,
                if (!rep)
                    goto error;

                /* subtract preallocated bytes */
                writer->min_size -= max_char_size * (newpos - startpos);
                if (newpos < startpos) {
                    writer->overallocate = 1;
                    p = _PyBytesWriter_Prepare(writer, p,
                                               max_char_size * (startpos - newpos));
                    if (p == NULL)
                        goto error;
                }
                else {
                    /* subtract preallocated bytes */
                    writer->min_size -= max_char_size * (newpos - startpos);
                    /* Only overallocate the buffer if it's not the last write */
                    writer->overallocate = (newpos < size);
                }

                if (PyBytes_Check(rep)) {
                    p = _PyBytesWriter_WriteBytes(writer, p,
@@ -5868,7 +5868,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,

    pos = 0;
    while (pos < len) {
        Py_ssize_t repsize, moreunits;
        Py_ssize_t newpos, repsize, moreunits;

        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,

@@ -5885,7 +5885,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &pos);
                str, &exc, pos, pos + 1, &newpos);
        if (!rep)
            goto error;

@@ -5893,7 +5893,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
            repsize = PyBytes_GET_SIZE(rep);
            if (repsize & 3) {
                raise_encode_exception(&exc, encoding,
                                       str, pos - 1, pos,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }

@@ -5906,28 +5906,30 @@ _PyUnicode_EncodeUTF32(PyObject *str,
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
                                       str, pos - 1, pos,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
        }
        moreunits += pos - newpos;
        pos = newpos;

        /* four bytes are reserved for each surrogate */
        if (moreunits > 1) {
        if (moreunits > 0) {
            Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
            if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
                /* integer overflow */
                PyErr_NoMemory();
                goto error;
            }
            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * moreunits) < 0)
                goto error;
            out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
        }

        if (PyBytes_Check(rep)) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += moreunits;
            out += repsize / 4;
        } else /* rep is unicode */ {
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
@@ -6205,7 +6207,7 @@ _PyUnicode_EncodeUTF16(PyObject *str,

    pos = 0;
    while (pos < len) {
        Py_ssize_t repsize, moreunits;
        Py_ssize_t newpos, repsize, moreunits;

        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,

@@ -6222,7 +6224,7 @@ _PyUnicode_EncodeUTF16(PyObject *str,
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &pos);
                str, &exc, pos, pos + 1, &newpos);
        if (!rep)
            goto error;

@@ -6230,7 +6232,7 @@ _PyUnicode_EncodeUTF16(PyObject *str,
            repsize = PyBytes_GET_SIZE(rep);
            if (repsize & 1) {
                raise_encode_exception(&exc, encoding,
                                       str, pos - 1, pos,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }

@@ -6243,28 +6245,30 @@ _PyUnicode_EncodeUTF16(PyObject *str,
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
                                       str, pos - 1, pos,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
        }
        moreunits += pos - newpos;
        pos = newpos;

        /* two bytes are reserved for each surrogate */
        if (moreunits > 1) {
        if (moreunits > 0) {
            Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
            if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
                /* integer overflow */
                PyErr_NoMemory();
                goto error;
            }
            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * moreunits) < 0)
                goto error;
            out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
        }

        if (PyBytes_Check(rep)) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += moreunits;
            out += repsize / 2;
        } else /* rep is unicode */ {
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
@@ -7167,8 +7171,19 @@ unicode_encode_ucs1(PyObject *unicode,
                if (rep == NULL)
                    goto onError;

                /* subtract preallocated bytes */
                writer.min_size -= newpos - collstart;
                if (newpos < collstart) {
                    writer.overallocate = 1;
                    str = _PyBytesWriter_Prepare(&writer, str,
                                                 collstart - newpos);
                    if (str == NULL)
                        goto onError;
                }
                else {
                    /* subtract preallocated bytes */
                    writer.min_size -= newpos - collstart;
                    /* Only overallocate the buffer if it's not the last write */
                    writer.overallocate = (newpos < size);
                }

                if (PyBytes_Check(rep)) {
                    /* Directly copy bytes result to output. */
@@ -7944,13 +7959,14 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes,
                  pos, pos + 1, &newpos);
        if (rep == NULL)
            goto error;
        pos = newpos;

        Py_ssize_t morebytes = pos - newpos;
        if (PyBytes_Check(rep)) {
            outsize = PyBytes_GET_SIZE(rep);
            if (outsize != 1) {
            morebytes += outsize;
            if (morebytes > 0) {
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
@@ -7971,9 +7987,10 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes,
            }

            outsize = PyUnicode_GET_LENGTH(rep);
            if (outsize != 1) {
            morebytes += outsize;
            if (morebytes > 0) {
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
@@ -7996,6 +8013,7 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes,
                out++;
            }
        }
        pos = newpos;
        Py_DECREF(rep);
    }
    /* write a NUL byte */