mirror of https://github.com/python/cpython
gh-111089: PyUnicode_AsUTF8() now raises on embedded NUL (#111091)
* PyUnicode_AsUTF8() now raises an exception if the string contains embedded null characters. * Update related C API tests (test_capi.test_unicode). * type_new_set_doc() uses PyUnicode_AsUTF8AndSize() to silently truncate doc containing null bytes. Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
This commit is contained in:
parent
59ea0f523e
commit
d731579bfb
|
@ -992,11 +992,19 @@ These are the UTF-8 codec APIs:
|
||||||
|
|
||||||
As :c:func:`PyUnicode_AsUTF8AndSize`, but does not store the size.
|
As :c:func:`PyUnicode_AsUTF8AndSize`, but does not store the size.
|
||||||
|
|
||||||
|
Raise an exception if the *unicode* string contains embedded null
|
||||||
|
characters. To accept embedded null characters and truncate on purpose
|
||||||
|
at the first null byte, ``PyUnicode_AsUTF8AndSize(unicode, NULL)`` can be
|
||||||
|
used instead.
|
||||||
|
|
||||||
.. versionadded:: 3.3
|
.. versionadded:: 3.3
|
||||||
|
|
||||||
.. versionchanged:: 3.7
|
.. versionchanged:: 3.7
|
||||||
The return type is now ``const char *`` rather than ``char *``.
|
The return type is now ``const char *`` rather than ``char *``.
|
||||||
|
|
||||||
|
.. versionchanged:: 3.13
|
||||||
|
Raise an exception if the string contains embedded null characters.
|
||||||
|
|
||||||
|
|
||||||
UTF-32 Codecs
|
UTF-32 Codecs
|
||||||
"""""""""""""
|
"""""""""""""
|
||||||
|
|
|
@ -1109,6 +1109,12 @@ Porting to Python 3.13
|
||||||
are now undefined by ``<Python.h>``.
|
are now undefined by ``<Python.h>``.
|
||||||
(Contributed by Victor Stinner in :gh:`85283`.)
|
(Contributed by Victor Stinner in :gh:`85283`.)
|
||||||
|
|
||||||
|
* The :c:func:`PyUnicode_AsUTF8` function now raises an exception if the string
|
||||||
|
contains embedded null characters. To accept embedded null characters and
|
||||||
|
truncate on purpose at the first null byte,
|
||||||
|
``PyUnicode_AsUTF8AndSize(unicode, NULL)`` can be used instead.
|
||||||
|
(Contributed by Victor Stinner in :gh:`111089`.)
|
||||||
|
|
||||||
Deprecated
|
Deprecated
|
||||||
----------
|
----------
|
||||||
|
|
||||||
|
|
|
@ -442,18 +442,18 @@ PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
|
||||||
|
|
||||||
/* --- Manage the default encoding ---------------------------------------- */
|
/* --- Manage the default encoding ---------------------------------------- */
|
||||||
|
|
||||||
/* Returns a pointer to the default encoding (UTF-8) of the
|
// Returns a pointer to the default encoding (UTF-8) of the
|
||||||
Unicode object unicode.
|
// Unicode object unicode.
|
||||||
|
//
|
||||||
Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
|
// Raise an exception if the string contains embedded null characters.
|
||||||
in the unicodeobject.
|
// Use PyUnicode_AsUTF8AndSize() to accept embedded null characters.
|
||||||
|
//
|
||||||
Use of this API is DEPRECATED since no size information can be
|
// This function caches the UTF-8 encoded string in the Unicode object
|
||||||
extracted from the returned data.
|
// and subsequent calls will return the same string. The memory is released
|
||||||
*/
|
// when the Unicode object is deallocated.
|
||||||
|
|
||||||
PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode);
|
PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode);
|
||||||
|
|
||||||
|
|
||||||
/* === Characters Type APIs =============================================== */
|
/* === Characters Type APIs =============================================== */
|
||||||
|
|
||||||
/* These should not be used directly. Use the Py_UNICODE_IS* and
|
/* These should not be used directly. Use the Py_UNICODE_IS* and
|
||||||
|
|
|
@ -443,17 +443,15 @@ PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
|
||||||
PyObject *unicode /* Unicode object */
|
PyObject *unicode /* Unicode object */
|
||||||
);
|
);
|
||||||
|
|
||||||
/* Returns a pointer to the default encoding (UTF-8) of the
|
// Returns a pointer to the default encoding (UTF-8) of the
|
||||||
Unicode object unicode and the size of the encoded representation
|
// Unicode object unicode and the size of the encoded representation
|
||||||
in bytes stored in *size.
|
// in bytes stored in `*size` (if size is not NULL).
|
||||||
|
//
|
||||||
In case of an error, no *size is set.
|
// On error, `*size` is set to 0 (if size is not NULL).
|
||||||
|
//
|
||||||
This function caches the UTF-8 encoded string in the unicodeobject
|
// This function caches the UTF-8 encoded string in the Unicode object
|
||||||
and subsequent calls will return the same string. The memory is released
|
// and subsequent calls will return the same string. The memory is released
|
||||||
when the unicodeobject is deallocated.
|
// when the Unicode object is deallocated.
|
||||||
*/
|
|
||||||
|
|
||||||
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030A0000
|
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030A0000
|
||||||
PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize(
|
PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize(
|
||||||
PyObject *unicode,
|
PyObject *unicode,
|
||||||
|
|
|
@ -882,7 +882,10 @@ class CAPITest(unittest.TestCase):
|
||||||
self.assertEqual(unicode_asutf8('abc', 4), b'abc\0')
|
self.assertEqual(unicode_asutf8('abc', 4), b'abc\0')
|
||||||
self.assertEqual(unicode_asutf8('абв', 7), b'\xd0\xb0\xd0\xb1\xd0\xb2\0')
|
self.assertEqual(unicode_asutf8('абв', 7), b'\xd0\xb0\xd0\xb1\xd0\xb2\0')
|
||||||
self.assertEqual(unicode_asutf8('\U0001f600', 5), b'\xf0\x9f\x98\x80\0')
|
self.assertEqual(unicode_asutf8('\U0001f600', 5), b'\xf0\x9f\x98\x80\0')
|
||||||
self.assertEqual(unicode_asutf8('abc\0def', 8), b'abc\0def\0')
|
|
||||||
|
# disallow embedded null characters
|
||||||
|
self.assertRaises(ValueError, unicode_asutf8, 'abc\0', 0)
|
||||||
|
self.assertRaises(ValueError, unicode_asutf8, 'abc\0def', 0)
|
||||||
|
|
||||||
self.assertRaises(UnicodeEncodeError, unicode_asutf8, '\ud8ff', 0)
|
self.assertRaises(UnicodeEncodeError, unicode_asutf8, '\ud8ff', 0)
|
||||||
self.assertRaises(TypeError, unicode_asutf8, b'abc', 0)
|
self.assertRaises(TypeError, unicode_asutf8, b'abc', 0)
|
||||||
|
|
|
@ -0,0 +1,2 @@
|
||||||
|
The :c:func:`PyUnicode_AsUTF8` function now raises an exception if the
|
||||||
|
string contains embedded null characters. Patch by Victor Stinner.
|
|
@ -3499,13 +3499,14 @@ type_new_set_doc(PyTypeObject *type)
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
const char *doc_str = PyUnicode_AsUTF8(doc);
|
Py_ssize_t doc_size;
|
||||||
|
const char *doc_str = PyUnicode_AsUTF8AndSize(doc, &doc_size);
|
||||||
if (doc_str == NULL) {
|
if (doc_str == NULL) {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Silently truncate the docstring if it contains a null byte
|
// Silently truncate the docstring if it contains a null byte
|
||||||
Py_ssize_t size = strlen(doc_str) + 1;
|
Py_ssize_t size = doc_size + 1;
|
||||||
char *tp_doc = (char *)PyObject_Malloc(size);
|
char *tp_doc = (char *)PyObject_Malloc(size);
|
||||||
if (tp_doc == NULL) {
|
if (tp_doc == NULL) {
|
||||||
PyErr_NoMemory();
|
PyErr_NoMemory();
|
||||||
|
|
|
@ -3837,7 +3837,13 @@ PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
|
||||||
const char *
|
const char *
|
||||||
PyUnicode_AsUTF8(PyObject *unicode)
|
PyUnicode_AsUTF8(PyObject *unicode)
|
||||||
{
|
{
|
||||||
return PyUnicode_AsUTF8AndSize(unicode, NULL);
|
Py_ssize_t size;
|
||||||
|
const char *utf8 = PyUnicode_AsUTF8AndSize(unicode, &size);
|
||||||
|
if (utf8 != NULL && strlen(utf8) != (size_t)size) {
|
||||||
|
PyErr_SetString(PyExc_ValueError, "embedded null character");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
return utf8;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
Loading…
Reference in New Issue