gh-119182: Add PyUnicodeWriter_DecodeUTF8Stateful() (#120639)

Add PyUnicodeWriter_WriteWideChar() and PyUnicodeWriter_DecodeUTF8Stateful() functions. Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
2024-06-21 19:33:15 +02:00 · 2024-06-21 19:33:15 +02:00 · 4123226bbd
parent aed31beca9
commit 4123226bbd
5 changed files with 333 additions and 69 deletions
--- a/Doc/c-api/unicode.rst
+++ b/Doc/c-api/unicode.rst
@ -1551,9 +1551,17 @@ object.
   On success, return ``0``.
   On error, set an exception, leave the writer unchanged, and return ``-1``.
-   To use a different error handler than ``strict``,
+   See also :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`.
-   :c:func:`PyUnicode_DecodeUTF8` can be used with
+
-   :c:func:`PyUnicodeWriter_WriteStr`.
+.. c:function:: int PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer, const wchar_t *str, Py_ssize_t size)
   Write the wide string *str* into *writer*.
   *size* is a number of wide characters. If *size* is equal to ``-1``, call
   ``wcslen(str)`` to get the string length.
   On success, return ``0``.
   On error, set an exception, leave the writer unchanged, and return ``-1``.
 .. c:function:: int PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj)
@ -1586,3 +1594,24 @@ object.
   On success, return ``0``.
   On error, set an exception, leave the writer unchanged, and return ``-1``.
 .. c:function:: int PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer, const char *string, Py_ssize_t length, const char *errors, Py_ssize_t *consumed)
   Decode the string *string* from UTF-8 with *errors* error handler and write the
   output into *writer*.
   *length* is the string length in bytes. If *length* is equal to ``-1``, call
   ``strlen(string)`` to get the string length.
   *errors* is an error handler name, such as ``"replace"``. If *errors* is
   ``NULL``, use the strict error handler.
   If *consumed* is not ``NULL``, set *\*consumed* to the number of decoded
   bytes on success.
   If *consumed* is ``NULL``, treat trailing incomplete UTF-8 byte sequences
   as an error.
   On success, return ``0``.
   On error, set an exception, leave the writer unchanged, and return ``-1``.
   See also :c:func:`PyUnicodeWriter_WriteUTF8`.
--- a/Doc/whatsnew/3.14.rst
+++ b/Doc/whatsnew/3.14.rst
@ -291,10 +291,12 @@ New Features
  * :c:func:`PyUnicodeWriter_Finish`.
  * :c:func:`PyUnicodeWriter_WriteChar`.
  * :c:func:`PyUnicodeWriter_WriteUTF8`.
  * :c:func:`PyUnicodeWriter_WriteWideChar`.
  * :c:func:`PyUnicodeWriter_WriteStr`.
  * :c:func:`PyUnicodeWriter_WriteRepr`.
  * :c:func:`PyUnicodeWriter_WriteSubstring`.
  * :c:func:`PyUnicodeWriter_Format`.
  * :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`.
  (Contributed by Victor Stinner in :gh:`119182`.)
--- a/Include/cpython/unicodeobject.h
+++ b/Include/cpython/unicodeobject.h
@ -459,6 +459,10 @@ PyAPI_FUNC(int) PyUnicodeWriter_WriteUTF8(
    PyUnicodeWriter *writer,
    const char *str,
    Py_ssize_t size);
 /* Write the wide string `str` into `writer`.
    `size` is a number of wide characters; when it is negative (documented
    as -1) the length is obtained with wcslen(str).
    Return 0 on success. On error, set an exception and return -1. */
 PyAPI_FUNC(int) PyUnicodeWriter_WriteWideChar(
    PyUnicodeWriter *writer,
    const wchar_t *str,
    Py_ssize_t size);
 PyAPI_FUNC(int) PyUnicodeWriter_WriteStr(
    PyUnicodeWriter *writer,
@ -475,6 +479,12 @@ PyAPI_FUNC(int) PyUnicodeWriter_Format(
    PyUnicodeWriter *writer,
    const char *format,
    ...);
 /* Decode `string` from UTF-8 using the `errors` error handler (NULL means
    strict) and write the output into `writer`.
    A negative `length` means the length is obtained with strlen(string).
    If `consumed` is not NULL, it receives the number of decoded bytes on
    success and is set to 0 on error; if it is NULL, a trailing incomplete
    UTF-8 sequence is treated as an error.
    Return 0 on success. On error, set an exception and return -1. */
 PyAPI_FUNC(int) PyUnicodeWriter_DecodeUTF8Stateful(
    PyUnicodeWriter *writer,
    const char *string,         /* UTF-8 encoded string */
    Py_ssize_t length,          /* size of string */
    const char *errors,         /* error handling */
    Py_ssize_t *consumed);      /* bytes consumed */
 /* --- Private _PyUnicodeWriter API --------------------------------------- */
--- a/Modules/_testcapi/unicode.c
+++ b/Modules/_testcapi/unicode.c
@ -374,6 +374,119 @@ test_unicodewriter_recover_error(PyObject *self, PyObject *Py_UNUSED(args))
 }
 static PyObject *
 test_unicodewriter_decode_utf8(PyObject *self, PyObject *Py_UNUSED(args))
 {
    /* Exercise PyUnicodeWriter_DecodeUTF8Stateful() with consumed == NULL
       and non-strict error handlers. Returns None on success, NULL with an
       exception set if any writer call fails. */
    PyUnicodeWriter *w = PyUnicodeWriter_Create(0);
    if (!w) {
        return NULL;
    }

    /* "ignore" handler on an invalid byte, followed by a separator */
    if (PyUnicodeWriter_DecodeUTF8Stateful(w, "ign\xFFore", -1, "ignore", NULL) < 0
        || PyUnicodeWriter_WriteChar(w, '-') < 0)
    {
        goto error;
    }

    /* "replace" handler on an invalid byte, followed by a separator */
    if (PyUnicodeWriter_DecodeUTF8Stateful(w, "replace\xFF", -1, "replace", NULL) < 0
        || PyUnicodeWriter_WriteChar(w, '-') < 0)
    {
        goto error;
    }

    /* incomplete trailing UTF-8 sequence: with consumed == NULL it is
       handed to the error handler */
    if (PyUnicodeWriter_DecodeUTF8Stateful(w, "incomplete\xC3", -1, "replace", NULL) < 0) {
        goto error;
    }

    PyObject *str = PyUnicodeWriter_Finish(w);
    if (!str) {
        return NULL;
    }
    /* "\xef\xbf\xbd" is U+FFFD (replacement character) encoded as UTF-8 */
    assert(PyUnicode_EqualToUTF8(str,
                                 "ignore-replace\xef\xbf\xbd"
                                 "-incomplete\xef\xbf\xbd"));
    Py_DECREF(str);
    Py_RETURN_NONE;

 error:
    PyUnicodeWriter_Discard(w);
    return NULL;
 }
 static PyObject *
 test_unicodewriter_decode_utf8_consumed(PyObject *self, PyObject *Py_UNUSED(args))
 {
    /* Exercise PyUnicodeWriter_DecodeUTF8Stateful() with a non-NULL
       `consumed` pointer. Before every call the counter is loaded with a
       sentinel so that a stale value cannot satisfy the check. */
    PyUnicodeWriter *w = PyUnicodeWriter_Create(0);
    if (!w) {
        return NULL;
    }
    Py_ssize_t nbytes;

    /* plain ASCII input: all 4 bytes are consumed */
    nbytes = 12345;
    if (PyUnicodeWriter_DecodeUTF8Stateful(w, "text", -1, NULL, &nbytes) < 0) {
        goto error;
    }
    assert(nbytes == 4);
    if (PyUnicodeWriter_WriteChar(w, '-') < 0) {
        goto error;
    }

    /* multi-byte sequences with an explicit length of 6 bytes */
    nbytes = 12345;
    if (PyUnicodeWriter_DecodeUTF8Stateful(w, "\xC3\xA9-\xE2\x82\xAC", 6, NULL, &nbytes) < 0) {
        goto error;
    }
    assert(nbytes == 6);
    if (PyUnicodeWriter_WriteChar(w, '-') < 0) {
        goto error;
    }

    /* strict decoding of an invalid byte must fail and report 0 bytes.
       NOTE(review): the call lives inside assert(), so it only runs when
       assertions are enabled for this file -- confirm that is intended. */
    nbytes = 12345;
    assert(PyUnicodeWriter_DecodeUTF8Stateful(w, "invalid\xFF", -1, NULL, &nbytes) < 0);
    PyErr_Clear();
    assert(nbytes == 0);

    /* "ignore" handler: the invalid byte still counts as consumed */
    nbytes = 12345;
    if (PyUnicodeWriter_DecodeUTF8Stateful(w, "more\xFF", -1, "ignore", &nbytes) < 0) {
        goto error;
    }
    assert(nbytes == 5);
    if (PyUnicodeWriter_WriteChar(w, '-') < 0) {
        goto error;
    }

    /* trailing incomplete sequence: the dangling byte is not consumed */
    nbytes = 12345;
    if (PyUnicodeWriter_DecodeUTF8Stateful(w, "incomplete\xC3", -1, "ignore", &nbytes) < 0) {
        goto error;
    }
    assert(nbytes == 10);

    PyObject *str = PyUnicodeWriter_Finish(w);
    if (!str) {
        return NULL;
    }
    assert(PyUnicode_EqualToUTF8(str,
                                 "text-\xC3\xA9-\xE2\x82\xAC-"
                                 "more-incomplete"));
    Py_DECREF(str);
    Py_RETURN_NONE;

 error:
    PyUnicodeWriter_Discard(w);
    return NULL;
 }
 static PyObject *
 test_unicodewriter_format(PyObject *self, PyObject *Py_UNUSED(args))
 {
@ -436,6 +549,42 @@ test_unicodewriter_format_recover_error(PyObject *self, PyObject *Py_UNUSED(args
 }
 static PyObject *
 test_unicodewriter_widechar(PyObject *self, PyObject *Py_UNUSED(args))
 {
    /* Exercise PyUnicodeWriter_WriteWideChar() with an explicit size that
       truncates the input, a one-character string, and size == -1.
       Returns None on success, NULL with an exception set on failure. */
    PyUnicodeWriter *w = PyUnicodeWriter_Create(0);
    if (!w) {
        return NULL;
    }

    /* The writes run in order; the first failing one jumps to cleanup. */
    if (PyUnicodeWriter_WriteWideChar(w, L"latin1=\xE9 IGNORED", 8) < 0
        || PyUnicodeWriter_WriteWideChar(w, L"-", 1) < 0
        || PyUnicodeWriter_WriteWideChar(w, L"euro=\u20AC", -1) < 0
        || PyUnicodeWriter_WriteChar(w, '.') < 0)
    {
        goto error;
    }

    PyObject *str = PyUnicodeWriter_Finish(w);
    if (!str) {
        return NULL;
    }
    /* Only the first 8 wide characters of the first string are expected. */
    assert(PyUnicode_EqualToUTF8(str,
                                 "latin1=\xC3\xA9-euro=\xE2\x82\xAC."));
    Py_DECREF(str);
    Py_RETURN_NONE;

 error:
    PyUnicodeWriter_Discard(w);
    return NULL;
 }
 static PyMethodDef TestMethods[] = {
    {"unicode_new",              unicode_new,                    METH_VARARGS},
    {"unicode_fill",             unicode_fill,                   METH_VARARGS},
@ -448,8 +597,11 @@ static PyMethodDef TestMethods[] = {
    {"test_unicodewriter_utf8",  test_unicodewriter_utf8,        METH_NOARGS},
    {"test_unicodewriter_invalid_utf8", test_unicodewriter_invalid_utf8, METH_NOARGS},
    {"test_unicodewriter_recover_error", test_unicodewriter_recover_error, METH_NOARGS},
    {"test_unicodewriter_decode_utf8", test_unicodewriter_decode_utf8, METH_NOARGS},
    {"test_unicodewriter_decode_utf8_consumed", test_unicodewriter_decode_utf8_consumed, METH_NOARGS},
    {"test_unicodewriter_format", test_unicodewriter_format,     METH_NOARGS},
    {"test_unicodewriter_format_recover_error", test_unicodewriter_format_recover_error, METH_NOARGS},
    {"test_unicodewriter_widechar", test_unicodewriter_widechar, METH_NOARGS},
    {NULL},
 };
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -1374,46 +1374,6 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
    return obj;
 }
 #if SIZEOF_WCHAR_T == 2
 /* Helper function to convert a 16-bits wchar_t representation to UCS4, this
   will decode surrogate pairs, the other conversions are implemented as macros
   for efficiency.
   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character.
   begin/end delimit the wchar_t input (end is exclusive); unicode must be a
   4-byte-kind string object whose length equals the number of code points
   produced (both are checked by assertions only). */
 static void
 unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                               PyObject *unicode)
 {
    const wchar_t *iter;
    Py_UCS4 *ucs4_out;
    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    ucs4_out = PyUnicode_4BYTE_DATA(unicode);
    for (iter = begin; iter < end; ) {
        /* debug check: never write past the end of the output buffer */
        assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
                           _PyUnicode_GET_LENGTH(unicode)));
        /* a high surrogate directly followed by a low surrogate (still
           inside the input) is merged into a single code point */
        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
            && (iter+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
        {
            *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
            iter += 2;
        }
        else {
            /* any other unit, including a lone surrogate, is copied as is */
            *ucs4_out++ = *iter;
            iter++;
        }
    }
    /* debug check: the output buffer was filled exactly */
    assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
                        _PyUnicode_GET_LENGTH(unicode)));
 }
 #endif
 static int
 unicode_check_modifiable(PyObject *unicode)
 {
@ -1937,6 +1897,62 @@ unicode_char(Py_UCS4 ch)
    return unicode;
 }
 /* Copy the `size` wchar_t units starting at `u` into the character buffer
    `data`, whose element width is given by `kind` (1, 2 or 4 byte kind).
    `num_surrogates` is only consulted by assertions: with a 16-bit wchar_t
    and the 4-byte kind, exactly `size - num_surrogates` code points are
    expected to be written; with a wider wchar_t it must be 0.
    No bounds or range checking is done here.
    NOTE(review): callers presumably size `data` and pick `kind` from a prior
    scan of the input -- confirm against the call sites. */
 static inline void
 unicode_write_widechar(int kind, void *data,
                        const wchar_t *u, Py_ssize_t size,
                        Py_ssize_t num_surrogates)
 {
    switch (kind) {
    case PyUnicode_1BYTE_KIND:
        /* presumably an element-wise converting copy wchar_t -> 1 byte */
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, u, u + size, data);
        break;
    case PyUnicode_2BYTE_KIND:
 #if SIZEOF_WCHAR_T == 2
        /* same element width on both sides: raw copy of size * 2 bytes */
        memcpy(data, u, size * 2);
 #else
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, u, u + size, data);
 #endif
        break;
    case PyUnicode_4BYTE_KIND:
    {
 #if SIZEOF_WCHAR_T == 2
        // Convert a 16-bits wchar_t representation to UCS4, this will decode
        // surrogate pairs.
        const wchar_t *end = u + size;
        Py_UCS4 *ucs4_out = (Py_UCS4*)data;
 #  ifndef NDEBUG
        // Expected end of the output; only exists in assertion-enabled builds.
        Py_UCS4 *ucs4_end = (Py_UCS4*)data + (size - num_surrogates);
 #  endif
        for (const wchar_t *iter = u; iter < end; ) {
            assert(ucs4_out < ucs4_end);
            // A high surrogate directly followed by a low surrogate (still
            // inside the input) becomes one code point.
            if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
                && (iter+1) < end
                && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
            {
                *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
                iter += 2;
            }
            else {
                // Any other unit, including a lone surrogate, is copied as is.
                *ucs4_out++ = *iter;
                iter++;
            }
        }
        assert(ucs4_out == ucs4_end);
 #else
        // wchar_t is not 16-bit here: no pairs to join, raw copy of
        // size * 4 bytes.
        assert(num_surrogates == 0);
        memcpy(data, u, size * 4);
 #endif
        break;
    }
    default:
        Py_UNREACHABLE();
    }
 }
 PyObject *
 PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
 {
@ -1989,36 +2005,65 @@ PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
    if (!unicode)
        return NULL;
-    switch (PyUnicode_KIND(unicode)) {
+    unicode_write_widechar(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
-    case PyUnicode_1BYTE_KIND:
+                           u, size, num_surrogates);
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                u, u + size, PyUnicode_1BYTE_DATA(unicode));
        break;
    case PyUnicode_2BYTE_KIND:
 #if Py_UNICODE_SIZE == 2
        memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
 #else
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                u, u + size, PyUnicode_2BYTE_DATA(unicode));
 #endif
        break;
    case PyUnicode_4BYTE_KIND:
 #if SIZEOF_WCHAR_T == 2
        /* This is the only case which has to process surrogates, thus
           a simple copy loop is not enough and we need a function. */
        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
 #else
        assert(num_surrogates == 0);
        memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
 #endif
        break;
    default:
        Py_UNREACHABLE();
    }
    return unicode_result(unicode);
 }
 /* Write the wide string `str` into the writer.

    `size` is a number of wide characters; a negative value means the length
    is obtained with wcslen(str). An empty input is a successful no-op.

    Return 0 on success, -1 on error (a failing helper is expected to have
    set the exception). */
 int
 PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer,
                               const wchar_t *str,
                               Py_ssize_t size)
 {
    _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
    if (size < 0) {
        size = (Py_ssize_t)wcslen(str);
    }
    if (size == 0) {
        return 0;
    }
 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
       non-Unicode locales and hence needs conversion to UCS-4 first. */
    if (_Py_LocaleUsesNonUnicodeWchar()) {
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(str, size);
        if (!converted) {
            return -1;
        }
        PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
        /* the converted buffer is no longer needed, whether or not the
           string object could be created */
        PyMem_Free(converted);
        if (unicode == NULL) {
            /* do not hand NULL to the writer or to Py_DECREF() */
            return -1;
        }
        int res = _PyUnicodeWriter_WriteStr(writer, unicode);
        Py_DECREF(unicode);
        return res;
    }
 #endif
    /* Scan the input once to learn the widest character and how many
       surrogate pairs will collapse into a single code point. */
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
    if (find_maxchar_surrogates(str, str + size,
                                &maxchar, &num_surrogates) == -1) {
        return -1;
    }
    /* Each surrogate pair yields one output character, not two. */
    Py_ssize_t nchars = size - num_surrogates;
    if (_PyUnicodeWriter_Prepare(writer, nchars, maxchar) < 0) {
        return -1;
    }
    /* Read kind/data only after Prepare, which is given the new maxchar. */
    int kind = writer->kind;
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
    unicode_write_widechar(kind, data, str, size, num_surrogates);
    writer->pos += nchars;
    return 0;
 }
 PyObject *
 PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
 {
@ -13649,6 +13694,32 @@ PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer,
    return res;
 }
 /* Decode `string` from UTF-8 with the `errors` handler and append the result
    to `writer`. A negative `length` means strlen(string) is used.
    Returns the status of the underlying decoder; when that status is
    negative, the write position is rolled back to its value on entry and
    *consumed (if `consumed` is not NULL) is set to 0. */
 int
 PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer,
                                    const char *string,
                                    Py_ssize_t length,
                                    const char *errors,
                                    Py_ssize_t *consumed)
 {
    _PyUnicodeWriter *impl = (_PyUnicodeWriter*)writer;
    if (length < 0) {
        length = strlen(string);
    }

    /* remember where we started so a failed decode can be undone */
    const Py_ssize_t saved_pos = impl->pos;
    int status = unicode_decode_utf8_writer(impl, string, length,
                                            _Py_ERROR_UNKNOWN, errors, consumed);
    if (status >= 0) {
        return status;
    }

    /* failure: discard whatever was written and report nothing consumed */
    impl->pos = saved_pos;
    if (consumed != NULL) {
        *consumed = 0;
    }
    return status;
 }
 int
 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
                                   const char *str, Py_ssize_t len)