gh-119396: Optimize PyUnicode_FromFormat() UTF-8 decoder (#119398)

Add unicode_decode_utf8_writer() to write directly characters into a _PyUnicodeWriter writer: avoid the creation of a temporary string. Optimize PyUnicode_FromFormat() by using the new unicode_decode_utf8_writer(). Rename unicode_fromformat_write_cstr() to unicode_fromformat_write_utf8(). Microbenchmark on the code: return PyUnicode_FromFormat( "%s %s %s %s %s.", "format", "multiple", "utf8", "short", "strings"); Result: 620 ns +- 8 ns -> 382 ns +- 2 ns: 1.62x faster.
2024-05-22 23:05:26 +02:00 · 2024-05-22 23:05:26 +02:00 · 9b422fc6af
parent 14b063cbf1
commit 9b422fc6af
1 changed files with 141 additions and 62 deletions
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -202,6 +202,11 @@ static PyObject *
 unicode_decode_utf8(const char *s, Py_ssize_t size,
                    _Py_error_handler error_handler, const char *errors,
                    Py_ssize_t *consumed);
 static int
 unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
                           const char *s, Py_ssize_t size,
                           _Py_error_handler error_handler, const char *errors,
                           Py_ssize_t *consumed);
 #ifdef Py_DEBUG
 static inline int unicode_is_finalizing(void);
 static int unicode_is_singleton(PyObject *unicode);
@ -2377,14 +2382,11 @@ unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
 }
 static int
-unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
+unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
                              Py_ssize_t width, Py_ssize_t precision, int flags)
 {
    /* UTF-8 */
    Py_ssize_t length;
    PyObject *unicode;
    int res;
    if (precision == -1) {
        length = strlen(str);
    }
@ -2394,11 +2396,19 @@ unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
            length++;
        }
    }
-    unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
+
    if (width < 0) {
        return unicode_decode_utf8_writer(writer, str, length,
                                          _Py_ERROR_REPLACE, "replace", NULL);
    }
    PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
                                                     "replace", NULL);
    if (unicode == NULL)
        return -1;
-    res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
+    int res = unicode_fromformat_write_str(writer, unicode,
                                           width, -1, flags);
    Py_DECREF(unicode);
    return res;
 }
@ -2700,7 +2710,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
        else {
            /* UTF-8 */
            const char *s = va_arg(*vargs, const char*);
-            if (unicode_fromformat_write_cstr(writer, s, width, precision, flags) < 0)
+            if (unicode_fromformat_write_utf8(writer, s, width, precision, flags) < 0)
                return NULL;
        }
        break;
@ -2739,7 +2749,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
        }
        else {
            assert(str != NULL);
-            if (unicode_fromformat_write_cstr(writer, str, width, precision, flags) < 0)
+            if (unicode_fromformat_write_utf8(writer, str, width, precision, flags) < 0)
                return NULL;
        }
        break;
@ -4737,46 +4747,14 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
    return p - start;
 }
-static PyObject *
+
-unicode_decode_utf8(const char *s, Py_ssize_t size,
+static int
-                    _Py_error_handler error_handler, const char *errors,
+unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
-                    Py_ssize_t *consumed)
+                         const char *starts, const char *s, const char *end,
                         _Py_error_handler error_handler,
                         const char *errors,
                         Py_ssize_t *consumed)
 {
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && (unsigned char)s[0] < 128) {
        if (consumed) {
            *consumed = 1;
        }
        return get_latin1_char((unsigned char)s[0]);
    }
    const char *starts = s;
    const char *end = s + size;
    // fast path: try ASCII string.
    PyObject *u = PyUnicode_New(size, 127);
    if (u == NULL) {
        return NULL;
    }
    s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
    if (s == end) {
        if (consumed) {
            *consumed = size;
        }
        return u;
    }
    // Use _PyUnicodeWriter after fast path is failed.
    _PyUnicodeWriter writer;
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
    writer.pos = s - starts;
    Py_ssize_t startinpos, endinpos;
    const char *errmsg = "";
    PyObject *error_handler_obj = NULL;
@ -4784,18 +4762,18 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
    while (s < end) {
        Py_UCS4 ch;
-        int kind = writer.kind;
+        int kind = writer->kind;
        if (kind == PyUnicode_1BYTE_KIND) {
-            if (PyUnicode_IS_ASCII(writer.buffer))
+            if (PyUnicode_IS_ASCII(writer->buffer))
-                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
+                ch = asciilib_utf8_decode(&s, end, writer->data, &writer->pos);
            else
-                ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
+                ch = ucs1lib_utf8_decode(&s, end, writer->data, &writer->pos);
        } else if (kind == PyUnicode_2BYTE_KIND) {
-            ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
+            ch = ucs2lib_utf8_decode(&s, end, writer->data, &writer->pos);
        } else {
            assert(kind == PyUnicode_4BYTE_KIND);
-            ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
+            ch = ucs4lib_utf8_decode(&s, end, writer->data, &writer->pos);
        }
        switch (ch) {
@ -4826,7 +4804,9 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
            endinpos = startinpos + ch - 1;
            break;
        default:
-            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
+            // ch doesn't fit into kind, so change the buffer kind to write
            // the character
            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
                goto onError;
            continue;
        }
@ -4840,7 +4820,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
            break;
        case _Py_ERROR_REPLACE:
-            if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
+            if (_PyUnicodeWriter_WriteCharInline(writer, 0xfffd) < 0)
                goto onError;
            s += (endinpos - startinpos);
            break;
@ -4849,13 +4829,13 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
        {
            Py_ssize_t i;
-            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
+            if (_PyUnicodeWriter_PrepareKind(writer, PyUnicode_2BYTE_KIND) < 0)
                goto onError;
            for (i=startinpos; i<endinpos; i++) {
                ch = (Py_UCS4)(unsigned char)(starts[i]);
-                PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
+                PyUnicode_WRITE(writer->kind, writer->data, writer->pos,
                                ch + 0xdc00);
-                writer.pos++;
+                writer->pos++;
            }
            s += (endinpos - startinpos);
            break;
@ -4866,8 +4846,13 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
                    errors, &error_handler_obj,
                    "utf-8", errmsg,
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
-                    &writer))
+                    writer)) {
                goto onError;
            }
            if (_PyUnicodeWriter_Prepare(writer, end - s, 127) < 0) {
                return -1;
            }
        }
    }
@ -4877,13 +4862,107 @@ End:
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
-    return _PyUnicodeWriter_Finish(&writer);
+    return 0;
 onError:
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
-    _PyUnicodeWriter_Dealloc(&writer);
+    return -1;
-    return NULL;
+}
 static PyObject *
 unicode_decode_utf8(const char *s, Py_ssize_t size,
                    _Py_error_handler error_handler, const char *errors,
                    Py_ssize_t *consumed)
 {
    if (size == 0) {
        if (consumed) {
            *consumed = 0;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && (unsigned char)s[0] < 128) {
        if (consumed) {
            *consumed = 1;
        }
        return get_latin1_char((unsigned char)s[0]);
    }
    // fast path: try ASCII string.
    const char *starts = s;
    const char *end = s + size;
    PyObject *u = PyUnicode_New(size, 127);
    if (u == NULL) {
        return NULL;
    }
    Py_ssize_t decoded = ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
    if (decoded == size) {
        if (consumed) {
            *consumed = size;
        }
        return u;
    }
    s += decoded;
    size -= decoded;
    // Use _PyUnicodeWriter after fast path is failed.
    _PyUnicodeWriter writer;
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
    writer.pos = decoded;
    if (unicode_decode_utf8_impl(&writer, starts, s, end,
                                 error_handler, errors,
                                 consumed) < 0) {
        _PyUnicodeWriter_Dealloc(&writer);
        return NULL;
    }
    return _PyUnicodeWriter_Finish(&writer);
 }
 static int
 unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
                           const char *s, Py_ssize_t size,
                           _Py_error_handler error_handler, const char *errors,
                           Py_ssize_t *consumed)
 {
    if (size == 0) {
        if (consumed) {
            *consumed = 0;
        }
        return 0;
    }
    // fast path: try ASCII string.
    if (_PyUnicodeWriter_Prepare(writer, size, 127) < 0) {
        return -1;
    }
    const char *starts = s;
    const char *end = s + size;
    Py_ssize_t decoded = 0;
    Py_UCS1 *dest = (Py_UCS1*)writer->data + writer->pos * writer->kind;
    if (writer->kind == PyUnicode_1BYTE_KIND
        && _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
    {
        decoded = ascii_decode(s, end, dest);
        writer->pos += decoded;
        if (decoded == size) {
            if (consumed) {
                *consumed = size;
            }
            return 0;
        }
        s += decoded;
        size -= decoded;
    }
    return unicode_decode_utf8_impl(writer, starts, s, end,
                                    error_handler, errors, consumed);
 }