gh-119182: Add PyUnicodeWriter_DecodeUTF8Stateful() (#120639)

Add PyUnicodeWriter_WriteWideChar() and
PyUnicodeWriter_DecodeUTF8Stateful() functions.

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
This commit is contained in:
Victor Stinner 2024-06-21 19:33:15 +02:00 committed by GitHub
parent aed31beca9
commit 4123226bbd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 333 additions and 69 deletions

View File

@ -1551,9 +1551,17 @@ object.
On success, return ``0``.
On error, set an exception, leave the writer unchanged, and return ``-1``.
To use a different error handler than ``strict``,
:c:func:`PyUnicode_DecodeUTF8` can be used with
:c:func:`PyUnicodeWriter_WriteStr`.
See also :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`.
.. c:function:: int PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer, const wchar_t *str, Py_ssize_t size)
Write the wide string *str* into *writer*.
*size* is the number of wide characters. If *size* is equal to ``-1``, call
``wcslen(str)`` to get the string length.
On success, return ``0``.
On error, set an exception, leave the writer unchanged, and return ``-1``.
.. c:function:: int PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj)
@ -1586,3 +1594,24 @@ object.
On success, return ``0``.
On error, set an exception, leave the writer unchanged, and return ``-1``.
.. c:function:: int PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer, const char *string, Py_ssize_t length, const char *errors, Py_ssize_t *consumed)
Decode the string *string* from UTF-8 with the *errors* error handler and
write the output into *writer*.
*length* is the string length in bytes. If *length* is equal to ``-1``, call
``strlen(string)`` to get the string length.
*errors* is an error handler name, such as ``"replace"``. If *errors* is
``NULL``, use the strict error handler.
If *consumed* is not ``NULL``, set *\*consumed* to the number of decoded
bytes on success.
If *consumed* is ``NULL``, treat trailing incomplete UTF-8 byte sequences
as an error.
On success, return ``0``.
On error, set an exception, leave the writer unchanged, and return ``-1``.
See also :c:func:`PyUnicodeWriter_WriteUTF8`.

View File

@ -291,10 +291,12 @@ New Features
* :c:func:`PyUnicodeWriter_Finish`.
* :c:func:`PyUnicodeWriter_WriteChar`.
* :c:func:`PyUnicodeWriter_WriteUTF8`.
* :c:func:`PyUnicodeWriter_WriteWideChar`.
* :c:func:`PyUnicodeWriter_WriteStr`.
* :c:func:`PyUnicodeWriter_WriteRepr`.
* :c:func:`PyUnicodeWriter_WriteSubstring`.
* :c:func:`PyUnicodeWriter_Format`.
* :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`.
(Contributed by Victor Stinner in :gh:`119182`.)

View File

@ -459,6 +459,10 @@ PyAPI_FUNC(int) PyUnicodeWriter_WriteUTF8(
PyUnicodeWriter *writer,
const char *str,
Py_ssize_t size);
/* Write the wide string 'str' into 'writer'.

   'size' is the number of wide characters; if it is -1, wcslen(str) is
   used to get the string length.

   Return 0 on success. On error, set an exception, leave the writer
   unchanged and return -1. */
PyAPI_FUNC(int) PyUnicodeWriter_WriteWideChar(
PyUnicodeWriter *writer,
const wchar_t *str,
Py_ssize_t size);
PyAPI_FUNC(int) PyUnicodeWriter_WriteStr(
PyUnicodeWriter *writer,
@ -475,6 +479,12 @@ PyAPI_FUNC(int) PyUnicodeWriter_Format(
PyUnicodeWriter *writer,
const char *format,
...);
/* Decode 'string' from UTF-8 with the 'errors' error handler and write the
   output into 'writer'.

   'length' is the string length in bytes; if it is -1, strlen(string) is
   used. If 'errors' is NULL, the strict error handler is used.

   If 'consumed' is not NULL, *consumed is set to the number of decoded
   bytes on success. If 'consumed' is NULL, a trailing incomplete UTF-8
   byte sequence is treated as an error.

   Return 0 on success. On error, set an exception, leave the writer
   unchanged and return -1. */
PyAPI_FUNC(int) PyUnicodeWriter_DecodeUTF8Stateful(
PyUnicodeWriter *writer,
const char *string, /* UTF-8 encoded string */
Py_ssize_t length, /* size of string */
const char *errors, /* error handling */
Py_ssize_t *consumed); /* bytes consumed */
/* --- Private _PyUnicodeWriter API --------------------------------------- */

View File

@ -374,6 +374,119 @@ test_unicodewriter_recover_error(PyObject *self, PyObject *Py_UNUSED(args))
}
/* Test PyUnicodeWriter_DecodeUTF8Stateful() called without a 'consumed'
   output argument (consumed == NULL), using the "ignore" and "replace"
   error handlers on invalid and on truncated UTF-8 input.

   Returns None on success, or NULL if any writer call reports an error. */
static PyObject *
test_unicodewriter_decode_utf8(PyObject *self, PyObject *Py_UNUSED(args))
{
    PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
    if (!writer) {
        return NULL;
    }

    // The writes are chained with ||: evaluation stops at the first call
    // that fails, so they run in order exactly once each.
    //  1. invalid byte dropped by the "ignore" handler
    //  2. separator
    //  3. invalid byte substituted by the "replace" handler
    //  4. separator
    //  5. incomplete trailing UTF-8 sequence with the "replace" handler
    if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "ign\xFFore", -1, "ignore", NULL) < 0
        || PyUnicodeWriter_WriteChar(writer, '-') < 0
        || PyUnicodeWriter_DecodeUTF8Stateful(writer, "replace\xFF", -1, "replace", NULL) < 0
        || PyUnicodeWriter_WriteChar(writer, '-') < 0
        || PyUnicodeWriter_DecodeUTF8Stateful(writer, "incomplete\xC3", -1, "replace", NULL) < 0)
    {
        PyUnicodeWriter_Discard(writer);
        return NULL;
    }

    PyObject *result = PyUnicodeWriter_Finish(writer);
    if (!result) {
        return NULL;
    }

    // "\xef\xbf\xbd" is U+FFFD encoded as UTF-8: expected once for the
    // replaced invalid byte and once for the replaced incomplete sequence.
    assert(PyUnicode_EqualToUTF8(result,
                                 "ignore-replace\xef\xbf\xbd"
                                 "-incomplete\xef\xbf\xbd"));
    Py_DECREF(result);

    Py_RETURN_NONE;
}
/* Test PyUnicodeWriter_DecodeUTF8Stateful() with a non-NULL 'consumed'
   output argument.

   Before every call 'consumed' is set to a sentinel (12345) so that the
   following assertion proves the function really stored a value.

   Returns None on success, or NULL if a writer call that is expected to
   succeed reports an error. */
static PyObject *
test_unicodewriter_decode_utf8_consumed(PyObject *self, PyObject *Py_UNUSED(args))
{
    PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
    if (writer == NULL) {
        return NULL;
    }
    Py_ssize_t consumed;

    // valid string
    consumed = 12345;
    if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "text", -1, NULL, &consumed) < 0) {
        goto error;
    }
    assert(consumed == 4);
    if (PyUnicodeWriter_WriteChar(writer, '-') < 0) {
        goto error;
    }

    // non-ASCII
    consumed = 12345;
    if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "\xC3\xA9-\xE2\x82\xAC", 6, NULL, &consumed) < 0) {
        goto error;
    }
    assert(consumed == 6);
    if (PyUnicodeWriter_WriteChar(writer, '-') < 0) {
        goto error;
    }

    // consumed is 0 if write fails
    //
    // The call is made outside of assert(): an assert() argument is not
    // evaluated when NDEBUG is defined, which would silently skip the call
    // under test and leave no exception for PyErr_Clear() to clear.
    consumed = 12345;
    int res = PyUnicodeWriter_DecodeUTF8Stateful(writer, "invalid\xFF", -1, NULL, &consumed);
    assert(res < 0);
    (void)res;  // avoid an unused-variable warning if assert() is compiled out
    PyErr_Clear();
    assert(consumed == 0);

    // ignore error handler
    consumed = 12345;
    if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "more\xFF", -1, "ignore", &consumed) < 0) {
        goto error;
    }
    assert(consumed == 5);
    if (PyUnicodeWriter_WriteChar(writer, '-') < 0) {
        goto error;
    }

    // incomplete trailing UTF-8 sequence: the dangling lead byte is not
    // counted as consumed (10 of the 11 input bytes)
    consumed = 12345;
    if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "incomplete\xC3", -1, "ignore", &consumed) < 0) {
        goto error;
    }
    assert(consumed == 10);

    PyObject *result = PyUnicodeWriter_Finish(writer);
    if (result == NULL) {
        return NULL;
    }
    // Nothing from the failed "invalid\xFF" write may appear in the result.
    assert(PyUnicode_EqualToUTF8(result,
                                 "text-\xC3\xA9-\xE2\x82\xAC-"
                                 "more-incomplete"));
    Py_DECREF(result);

    Py_RETURN_NONE;

error:
    PyUnicodeWriter_Discard(writer);
    return NULL;
}
static PyObject *
test_unicodewriter_format(PyObject *self, PyObject *Py_UNUSED(args))
{
@ -436,6 +549,42 @@ test_unicodewriter_format_recover_error(PyObject *self, PyObject *Py_UNUSED(args
}
/* Test PyUnicodeWriter_WriteWideChar() with an explicit size that truncates
   the input, with a one-character string, and with size == -1.

   Returns None on success, or NULL if any writer call reports an error. */
static PyObject *
test_unicodewriter_widechar(PyObject *self, PyObject *Py_UNUSED(args))
{
    PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
    if (!writer) {
        return NULL;
    }

    // The writes are chained with ||: evaluation stops at the first call
    // that fails, so they run in order exactly once each.
    //  1. only the first 8 wide characters are written (" IGNORED" is not)
    //  2. single-character string with an explicit size
    //  3. size == -1: the length is computed from the string itself
    //  4. plain character write after the wide-character writes
    if (PyUnicodeWriter_WriteWideChar(writer, L"latin1=\xE9 IGNORED", 8) < 0
        || PyUnicodeWriter_WriteWideChar(writer, L"-", 1) < 0
        || PyUnicodeWriter_WriteWideChar(writer, L"euro=\u20AC", -1) < 0
        || PyUnicodeWriter_WriteChar(writer, '.') < 0)
    {
        PyUnicodeWriter_Discard(writer);
        return NULL;
    }

    PyObject *result = PyUnicodeWriter_Finish(writer);
    if (!result) {
        return NULL;
    }

    assert(PyUnicode_EqualToUTF8(result,
                                 "latin1=\xC3\xA9-euro=\xE2\x82\xAC."));
    Py_DECREF(result);

    Py_RETURN_NONE;
}
static PyMethodDef TestMethods[] = {
{"unicode_new", unicode_new, METH_VARARGS},
{"unicode_fill", unicode_fill, METH_VARARGS},
@ -448,8 +597,11 @@ static PyMethodDef TestMethods[] = {
{"test_unicodewriter_utf8", test_unicodewriter_utf8, METH_NOARGS},
{"test_unicodewriter_invalid_utf8", test_unicodewriter_invalid_utf8, METH_NOARGS},
{"test_unicodewriter_recover_error", test_unicodewriter_recover_error, METH_NOARGS},
{"test_unicodewriter_decode_utf8", test_unicodewriter_decode_utf8, METH_NOARGS},
{"test_unicodewriter_decode_utf8_consumed", test_unicodewriter_decode_utf8_consumed, METH_NOARGS},
{"test_unicodewriter_format", test_unicodewriter_format, METH_NOARGS},
{"test_unicodewriter_format_recover_error", test_unicodewriter_format_recover_error, METH_NOARGS},
{"test_unicodewriter_widechar", test_unicodewriter_widechar, METH_NOARGS},
{NULL},
};

View File

@ -1374,46 +1374,6 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
return obj;
}
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4 code points stored
   in the 4-byte data of 'unicode'.

   A high surrogate immediately followed by a low surrogate is joined into
   one code point; every other unit is copied unchanged.

   The caller must have sized 'unicode' so that its length equals the number
   of code points produced (checked by the assertions below). */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);

    Py_UCS4 *dst = PyUnicode_4BYTE_DATA(unicode);
    const wchar_t *src = begin;

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));

        // The bounds test sits between the two surrogate tests so that
        // src[1] is never read past 'end'.
        int is_pair = (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
                       && (src + 1) < end
                       && Py_UNICODE_IS_LOW_SURROGATE(src[1]));
        if (!is_pair) {
            *dst++ = *src++;
            continue;
        }

        *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
        src += 2;
    }

    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
static int
unicode_check_modifiable(PyObject *unicode)
{
@ -1937,6 +1897,62 @@ unicode_char(Py_UCS4 ch)
return unicode;
}
/* Copy 'size' wide characters starting at 'u' into the buffer 'data', whose
   element width is selected by 'kind' (PyUnicode_1BYTE_KIND,
   PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND).

   'num_surrogates' is the number of surrogate pairs in the input. It is only
   consulted by assertions: when wchar_t is 16-bit, the 4-byte path joins
   each pair into one code point and must write exactly
   size - num_surrogates items; otherwise it must be 0.

   No bounds checking is done on 'data'; the caller provides enough room. */
static inline void
unicode_write_widechar(int kind, void *data,
                       const wchar_t *u, Py_ssize_t size,
                       Py_ssize_t num_surrogates)
{
    const wchar_t *stop = u + size;

    if (kind == PyUnicode_1BYTE_KIND) {
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, u, stop, data);
        return;
    }

    if (kind == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
        // Same element width on both sides: a raw copy is enough.
        memcpy(data, u, size * 2);
#else
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, u, stop, data);
#endif
        return;
    }

    if (kind != PyUnicode_4BYTE_KIND) {
        Py_UNREACHABLE();
    }

#if SIZEOF_WCHAR_T == 2
    // Convert a 16-bits wchar_t representation to UCS4, decoding surrogate
    // pairs along the way.
    Py_UCS4 *out = (Py_UCS4*)data;
#  ifndef NDEBUG
    Py_UCS4 *out_limit = (Py_UCS4*)data + (size - num_surrogates);
#  endif
    const wchar_t *src = u;
    while (src < stop) {
        assert(out < out_limit);
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src + 1) < stop
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *out++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *out++ = *src++;
        }
    }
    assert(out == out_limit);
#else
    // 32-bit wchar_t: elements already have the UCS4 width.
    assert(num_surrogates == 0);
    memcpy(data, u, size * 4);
#endif
}
PyObject *
PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
{
@ -1989,36 +2005,65 @@ PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
if (!unicode)
return NULL;
switch (PyUnicode_KIND(unicode)) {
case PyUnicode_1BYTE_KIND:
_PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
u, u + size, PyUnicode_1BYTE_DATA(unicode));
break;
case PyUnicode_2BYTE_KIND:
#if Py_UNICODE_SIZE == 2
memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
#else
_PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
u, u + size, PyUnicode_2BYTE_DATA(unicode));
#endif
break;
case PyUnicode_4BYTE_KIND:
#if SIZEOF_WCHAR_T == 2
/* This is the only case which has to process surrogates, thus
a simple copy loop is not enough and we need a function. */
unicode_convert_wchar_to_ucs4(u, u + size, unicode);
#else
assert(num_surrogates == 0);
memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
#endif
break;
default:
Py_UNREACHABLE();
}
unicode_write_widechar(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
u, size, num_surrogates);
return unicode_result(unicode);
}
/* Write the wide string 'str' into the writer.

   'size' is the number of wide characters to write; if it is negative,
   wcslen(str) is used instead. An empty string is a successful no-op.

   Return 0 on success, -1 on error. */
int
PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer,
                              const wchar_t *str,
                              Py_ssize_t size)
{
    _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;

    if (size < 0) {
        size = (Py_ssize_t)wcslen(str);
    }

    if (size == 0) {
        return 0;
    }

#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
       non-Unicode locales and hence needs conversion to UCS-4 first. */
    if (_Py_LocaleUsesNonUnicodeWchar()) {
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(str, size);
        if (!converted) {
            return -1;
        }

        PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
        PyMem_Free(converted);
        /* Building the temporary string can fail (e.g. out of memory).
           Bail out before the NULL pointer is handed to
           _PyUnicodeWriter_WriteStr() and Py_DECREF(). */
        if (unicode == NULL) {
            return -1;
        }

        int res = _PyUnicodeWriter_WriteStr(writer, unicode);
        Py_DECREF(unicode);
        return res;
    }
#endif

    /* Scan the input once to learn the widest character (which decides the
       buffer kind) and how many surrogate pairs will collapse into a single
       code point. */
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
    if (find_maxchar_surrogates(str, str + size,
                                &maxchar, &num_surrogates) == -1) {
        return -1;
    }

    /* Each surrogate pair yields one character, hence size - num_surrogates
       output characters. */
    if (_PyUnicodeWriter_Prepare(writer, size - num_surrogates, maxchar) < 0) {
        return -1;
    }

    /* Read kind/data only after Prepare(), then compute the byte address of
       the current write position (pos counts characters of 'kind' bytes). */
    int kind = writer->kind;
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
    unicode_write_widechar(kind, data, str, size, num_surrogates);

    writer->pos += size - num_surrogates;
    return 0;
}
PyObject *
PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
{
@ -13649,6 +13694,32 @@ PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer,
return res;
}
/* Decode 'string' from UTF-8 using the 'errors' error handler and append the
   result to the writer.

   A negative 'length' means "use strlen(string)". 'consumed' may be NULL; it
   is forwarded unchanged to the decoder.

   Return the decoder's result. When that result is negative, the writer
   position is restored to its value on entry and, if 'consumed' is not
   NULL, *consumed is reset to 0. */
int
PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer,
                                   const char *string,
                                   Py_ssize_t length,
                                   const char *errors,
                                   Py_ssize_t *consumed)
{
    _PyUnicodeWriter *impl = (_PyUnicodeWriter*)writer;
    const Py_ssize_t nbytes = (length >= 0) ? length
                                            : (Py_ssize_t)strlen(string);

    // Remember where the write started so a failed decode can be undone.
    const Py_ssize_t saved_pos = impl->pos;

    int status = unicode_decode_utf8_writer(impl, string, nbytes,
                                            _Py_ERROR_UNKNOWN, errors,
                                            consumed);
    if (status >= 0) {
        return status;
    }

    // Failure: discard whatever was written and report nothing consumed.
    impl->pos = saved_pos;
    if (consumed != NULL) {
        *consumed = 0;
    }
    return status;
}
int
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
const char *str, Py_ssize_t len)