diff --git a/Lib/test/test_exceptions.py b/Lib/test/test_exceptions.py
index ba858c49400..b3c21cd4f3d 100644
--- a/Lib/test/test_exceptions.py
+++ b/Lib/test/test_exceptions.py
@@ -8,6 +8,7 @@ import pickle
 import weakref
 import errno
 from codecs import BOM_UTF8
+from itertools import product
 from textwrap import dedent
 
 from test.support import (captured_stderr, check_impl_detail,
@@ -1336,6 +1337,29 @@ class ExceptionTests(unittest.TestCase):
         for klass in klasses:
             self.assertEqual(str(klass.__new__(klass)), "")
 
+    def test_unicode_error_str_does_not_crash(self):
+        # Regression test: str() of a UnicodeError must return a str even when
+        # start/end are negative or beyond the object's length (no crash).
+        # See https://github.com/python/cpython/issues/123378.
+        for start, end, objlen in product(
+            range(-5, 5),
+            range(-5, 5),
+            range(7),
+        ):
+            obj = 'a' * objlen
+            with self.subTest('encode', objlen=objlen, start=start, end=end):
+                exc = UnicodeEncodeError('utf-8', obj, start, end, '')
+                self.assertIsInstance(str(exc), str)
+
+            with self.subTest('translate', objlen=objlen, start=start, end=end):
+                exc = UnicodeTranslateError(obj, start, end, '')
+                self.assertIsInstance(str(exc), str)
+
+            encoded = obj.encode()  # bytes object for the decode case
+            with self.subTest('decode', objlen=objlen, start=start, end=end):
+                exc = UnicodeDecodeError('utf-8', encoded, start, end, '')
+                self.assertIsInstance(str(exc), str)
+
     @no_tracing
     def test_badisinstance(self):
         # Bug #2542: if issubclass(e, MyException) raises an exception,
diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2024-10-03-14-39-41.gh-issue-123378.dCxANf.rst b/Misc/NEWS.d/next/Core_and_Builtins/2024-10-03-14-39-41.gh-issue-123378.dCxANf.rst
new file mode 100644
index 00000000000..5cd34535d67
--- /dev/null
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2024-10-03-14-39-41.gh-issue-123378.dCxANf.rst
@@ -0,0 +1,3 @@
+Fix a crash in the :meth:`~object.__str__` method of :exc:`UnicodeError`
+objects when the :attr:`UnicodeError.start` and :attr:`UnicodeError.end`
+values are invalid or out-of-range. Patch by Bénédikt Tran.
diff --git a/Objects/exceptions.c b/Objects/exceptions.c
index b3910855165..c685481b13a 100644
--- a/Objects/exceptions.c
+++ b/Objects/exceptions.c
@@ -2994,46 +2994,55 @@ UnicodeEncodeError_init(PyObject *self, PyObject *args, PyObject *kwds)
 static PyObject *
 UnicodeEncodeError_str(PyObject *self)
 {
-    PyUnicodeErrorObject *uself = (PyUnicodeErrorObject *)self;
+    PyUnicodeErrorObject *exc = (PyUnicodeErrorObject *)self;
     PyObject *result = NULL;
     PyObject *reason_str = NULL;
     PyObject *encoding_str = NULL;
 
-    if (!uself->object)
+    if (exc->object == NULL) {
         /* Not properly initialized. */
         return PyUnicode_FromString("");
+    }
 
     /* Get reason and encoding as strings, which they might not be if
        they've been modified after we were constructed. */
-    reason_str = PyObject_Str(uself->reason);
-    if (reason_str == NULL)
+    reason_str = PyObject_Str(exc->reason);
+    if (reason_str == NULL) {
         goto done;
-    encoding_str = PyObject_Str(uself->encoding);
-    if (encoding_str == NULL)
+    }
+    encoding_str = PyObject_Str(exc->encoding);
+    if (encoding_str == NULL) {
         goto done;
+    }
 
-    if (uself->start < PyUnicode_GET_LENGTH(uself->object) && uself->end == uself->start+1) {
-        Py_UCS4 badchar = PyUnicode_ReadChar(uself->object, uself->start);
+    Py_ssize_t len = PyUnicode_GET_LENGTH(exc->object);  // assumes str; confirm
+    Py_ssize_t start = exc->start, end = exc->end;
+    /* Index the object only if [start, end) is one in-bounds position. */
+    if ((start >= 0 && start < len) && (end >= 0 && end <= len) && end == start + 1) {
+        Py_UCS4 badchar = PyUnicode_ReadChar(exc->object, start);
         const char *fmt;
-        if (badchar <= 0xff)
+        if (badchar <= 0xff) {
             fmt = "'%U' codec can't encode character '\\x%02x' in position %zd: %U";
-        else if (badchar <= 0xffff)
+        }
+        else if (badchar <= 0xffff) {
             fmt = "'%U' codec can't encode character '\\u%04x' in position %zd: %U";
-        else
+        }
+        else {
             fmt = "'%U' codec can't encode character '\\U%08x' in position %zd: %U";
+        }
         result = PyUnicode_FromFormat(
             fmt,
             encoding_str,
             (int)badchar,
-            uself->start,
+            start,
             reason_str);
     }
     else {
         result = PyUnicode_FromFormat(
             "'%U' codec can't encode characters in position %zd-%zd: %U",
             encoding_str,
-            uself->start,
-            uself->end-1,
+            start,
+            end - 1,
             reason_str);
     }
 done:
@@ -3107,41 +3116,46 @@ error:
 static PyObject *
 UnicodeDecodeError_str(PyObject *self)
 {
-    PyUnicodeErrorObject *uself = (PyUnicodeErrorObject *)self;
+    PyUnicodeErrorObject *exc = (PyUnicodeErrorObject *)self;
     PyObject *result = NULL;
     PyObject *reason_str = NULL;
     PyObject *encoding_str = NULL;
 
-    if (!uself->object)
+    if (exc->object == NULL) {
         /* Not properly initialized. */
         return PyUnicode_FromString("");
+    }
 
     /* Get reason and encoding as strings, which they might not be if
        they've been modified after we were constructed. */
-    reason_str = PyObject_Str(uself->reason);
-    if (reason_str == NULL)
+    reason_str = PyObject_Str(exc->reason);
+    if (reason_str == NULL) {
         goto done;
-    encoding_str = PyObject_Str(uself->encoding);
-    if (encoding_str == NULL)
+    }
+    encoding_str = PyObject_Str(exc->encoding);
+    if (encoding_str == NULL) {
         goto done;
+    }
 
-    if (uself->start < PyBytes_GET_SIZE(uself->object) && uself->end == uself->start+1) {
-        int byte = (int)(PyBytes_AS_STRING(((PyUnicodeErrorObject *)self)->object)[uself->start]&0xff);
+    Py_ssize_t len = PyBytes_GET_SIZE(exc->object);  // assumes bytes; confirm
+    Py_ssize_t start = exc->start, end = exc->end;
+    /* Index the object only if [start, end) is one in-bounds position. */
+    if ((start >= 0 && start < len) && (end >= 0 && end <= len) && end == start + 1) {
+        int badbyte = (int)(PyBytes_AS_STRING(exc->object)[start] & 0xff);
         result = PyUnicode_FromFormat(
             "'%U' codec can't decode byte 0x%02x in position %zd: %U",
             encoding_str,
-            byte,
-            uself->start,
+            badbyte,
+            start,
             reason_str);
     }
     else {
         result = PyUnicode_FromFormat(
             "'%U' codec can't decode bytes in position %zd-%zd: %U",
             encoding_str,
-            uself->start,
-            uself->end-1,
-            reason_str
-            );
+            start,
+            end - 1,
+            reason_str);
     }
 done:
     Py_XDECREF(reason_str);
@@ -3204,42 +3218,49 @@ UnicodeTranslateError_init(PyUnicodeErrorObject *self, PyObject *args,
 static PyObject *
 UnicodeTranslateError_str(PyObject *self)
 {
-    PyUnicodeErrorObject *uself = (PyUnicodeErrorObject *)self;
+    PyUnicodeErrorObject *exc = (PyUnicodeErrorObject *)self;
     PyObject *result = NULL;
     PyObject *reason_str = NULL;
 
-    if (!uself->object)
+    if (exc->object == NULL) {
         /* Not properly initialized. */
         return PyUnicode_FromString("");
+    }
 
     /* Get reason as a string, which it might not be if it's been
        modified after we were constructed. */
-    reason_str = PyObject_Str(uself->reason);
-    if (reason_str == NULL)
+    reason_str = PyObject_Str(exc->reason);
+    if (reason_str == NULL) {
         goto done;
+    }
 
-    if (uself->start < PyUnicode_GET_LENGTH(uself->object) && uself->end == uself->start+1) {
-        Py_UCS4 badchar = PyUnicode_ReadChar(uself->object, uself->start);
+    Py_ssize_t len = PyUnicode_GET_LENGTH(exc->object);  // assumes str; confirm
+    Py_ssize_t start = exc->start, end = exc->end;
+    /* Index the object only if [start, end) is one in-bounds position. */
+    if ((start >= 0 && start < len) && (end >= 0 && end <= len) && end == start + 1) {
+        Py_UCS4 badchar = PyUnicode_ReadChar(exc->object, start);
         const char *fmt;
-        if (badchar <= 0xff)
+        if (badchar <= 0xff) {
             fmt = "can't translate character '\\x%02x' in position %zd: %U";
-        else if (badchar <= 0xffff)
+        }
+        else if (badchar <= 0xffff) {
             fmt = "can't translate character '\\u%04x' in position %zd: %U";
-        else
+        }
+        else {
             fmt = "can't translate character '\\U%08x' in position %zd: %U";
+        }
         result = PyUnicode_FromFormat(
             fmt,
             (int)badchar,
-            uself->start,
-            reason_str
-        );
-    } else {
+            start,
+            reason_str);
+    }
+    else {
         result = PyUnicode_FromFormat(
             "can't translate characters in position %zd-%zd: %U",
-            uself->start,
-            uself->end-1,
-            reason_str
-        );
+            start,
+            end - 1,
+            reason_str);
     }
 done:
     Py_XDECREF(reason_str);