From f1e751e933aa8c39c0e9cfa4cdc3f8f9f0538202 Mon Sep 17 00:00:00 2001
From: Victor Stinner
Date: Fri, 20 Oct 2023 20:03:11 +0200
Subject: [PATCH] gh-111089: PyUnicode_AsUTF8AndSize() sets size on error (#111106)

On error, PyUnicode_AsUTF8AndSize() now sets the size argument to -1,
to avoid undefined value.
---
 Doc/c-api/unicode.rst       | 4 ++--
 Modules/_testcapi/unicode.c | 2 +-
 Objects/unicodeobject.c     | 9 ++++++++-
 3 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst
index d17e63dc089..5fa37963e07 100644
--- a/Doc/c-api/unicode.rst
+++ b/Doc/c-api/unicode.rst
@@ -971,8 +971,8 @@ These are the UTF-8 codec APIs:
    returned buffer always has an extra null byte appended (not included in
    *size*), regardless of whether there are any other null code points.
 
-   In the case of an error, ``NULL`` is returned with an exception set and no
-   *size* is stored.
+   On error, set an exception, set *size* to ``-1`` (if it's not NULL) and
+   return ``NULL``.
 
    This caches the UTF-8 representation of the string in the Unicode object, and
    subsequent calls will return a pointer to the same buffer.  The caller is not
diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c
index d52d88a65d8..a10183dddec 100644
--- a/Modules/_testcapi/unicode.c
+++ b/Modules/_testcapi/unicode.c
@@ -634,7 +634,7 @@ unicode_asutf8andsize(PyObject *self, PyObject *args)
     NULLABLE(unicode);
     s = PyUnicode_AsUTF8AndSize(unicode, &size);
     if (s == NULL) {
-        assert(size == UNINITIALIZED_SIZE);
+        assert(size == -1);
         return NULL;
     }
 
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 07d1b6e726b..80b19567c63 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -3820,17 +3820,24 @@ PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
 {
     if (!PyUnicode_Check(unicode)) {
         PyErr_BadArgument();
+        if (psize) {
+            *psize = -1;
+        }
         return NULL;
     }
 
     if (PyUnicode_UTF8(unicode) == NULL) {
         if (unicode_fill_utf8(unicode) == -1) {
+            if (psize) {
+                *psize = -1;
+            }
             return NULL;
         }
     }
 
-    if (psize)
+    if (psize) {
         *psize = PyUnicode_UTF8_LENGTH(unicode);
+    }
     return PyUnicode_UTF8(unicode);
 }
 