gh-119396: Optimize PyUnicode_FromFormat() UTF-8 decoder (#119398)

Add unicode_decode_utf8_writer() to write directly characters into a
_PyUnicodeWriter writer: avoid the creation of a temporary string.
Optimize PyUnicode_FromFormat() by using the new
unicode_decode_utf8_writer().

Rename unicode_fromformat_write_cstr() to
unicode_fromformat_write_utf8().

Microbenchmark on the code:

    return PyUnicode_FromFormat(
        "%s %s %s %s %s.",
        "format", "multiple", "utf8", "short", "strings");

Result: 620 ns +- 8 ns -> 382 ns +- 2 ns: 1.62x faster.
This commit is contained in:
Victor Stinner 2024-05-22 23:05:26 +02:00 committed by GitHub
parent 14b063cbf1
commit 9b422fc6af
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed files with 141 additions and 62 deletions

View File

@ -202,6 +202,11 @@ static PyObject *
unicode_decode_utf8(const char *s, Py_ssize_t size,
_Py_error_handler error_handler, const char *errors,
Py_ssize_t *consumed);
static int
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
const char *s, Py_ssize_t size,
_Py_error_handler error_handler, const char *errors,
Py_ssize_t *consumed);
#ifdef Py_DEBUG
static inline int unicode_is_finalizing(void);
static int unicode_is_singleton(PyObject *unicode);
@ -2377,14 +2382,11 @@ unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
}
static int
unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
Py_ssize_t width, Py_ssize_t precision, int flags)
{
/* UTF-8 */
Py_ssize_t length;
PyObject *unicode;
int res;
if (precision == -1) {
length = strlen(str);
}
@ -2394,11 +2396,19 @@ unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
length++;
}
}
unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
if (width < 0) {
return unicode_decode_utf8_writer(writer, str, length,
_Py_ERROR_REPLACE, "replace", NULL);
}
PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
"replace", NULL);
if (unicode == NULL)
return -1;
res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
int res = unicode_fromformat_write_str(writer, unicode,
width, -1, flags);
Py_DECREF(unicode);
return res;
}
@ -2700,7 +2710,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
else {
/* UTF-8 */
const char *s = va_arg(*vargs, const char*);
if (unicode_fromformat_write_cstr(writer, s, width, precision, flags) < 0)
if (unicode_fromformat_write_utf8(writer, s, width, precision, flags) < 0)
return NULL;
}
break;
@ -2739,7 +2749,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
}
else {
assert(str != NULL);
if (unicode_fromformat_write_cstr(writer, str, width, precision, flags) < 0)
if (unicode_fromformat_write_utf8(writer, str, width, precision, flags) < 0)
return NULL;
}
break;
@ -4737,46 +4747,14 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
return p - start;
}
static PyObject *
unicode_decode_utf8(const char *s, Py_ssize_t size,
_Py_error_handler error_handler, const char *errors,
static int
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
const char *starts, const char *s, const char *end,
_Py_error_handler error_handler,
const char *errors,
Py_ssize_t *consumed)
{
if (size == 0) {
if (consumed)
*consumed = 0;
_Py_RETURN_UNICODE_EMPTY();
}
/* ASCII is equivalent to the first 128 ordinals in Unicode. */
if (size == 1 && (unsigned char)s[0] < 128) {
if (consumed) {
*consumed = 1;
}
return get_latin1_char((unsigned char)s[0]);
}
const char *starts = s;
const char *end = s + size;
// fast path: try ASCII string.
PyObject *u = PyUnicode_New(size, 127);
if (u == NULL) {
return NULL;
}
s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
if (s == end) {
if (consumed) {
*consumed = size;
}
return u;
}
// Use _PyUnicodeWriter after fast path is failed.
_PyUnicodeWriter writer;
_PyUnicodeWriter_InitWithBuffer(&writer, u);
writer.pos = s - starts;
Py_ssize_t startinpos, endinpos;
const char *errmsg = "";
PyObject *error_handler_obj = NULL;
@ -4784,18 +4762,18 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
while (s < end) {
Py_UCS4 ch;
int kind = writer.kind;
int kind = writer->kind;
if (kind == PyUnicode_1BYTE_KIND) {
if (PyUnicode_IS_ASCII(writer.buffer))
ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
if (PyUnicode_IS_ASCII(writer->buffer))
ch = asciilib_utf8_decode(&s, end, writer->data, &writer->pos);
else
ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
ch = ucs1lib_utf8_decode(&s, end, writer->data, &writer->pos);
} else if (kind == PyUnicode_2BYTE_KIND) {
ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
ch = ucs2lib_utf8_decode(&s, end, writer->data, &writer->pos);
} else {
assert(kind == PyUnicode_4BYTE_KIND);
ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
ch = ucs4lib_utf8_decode(&s, end, writer->data, &writer->pos);
}
switch (ch) {
@ -4826,7 +4804,9 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
endinpos = startinpos + ch - 1;
break;
default:
if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
// ch doesn't fit into kind, so change the buffer kind to write
// the character
if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
goto onError;
continue;
}
@ -4840,7 +4820,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
break;
case _Py_ERROR_REPLACE:
if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
if (_PyUnicodeWriter_WriteCharInline(writer, 0xfffd) < 0)
goto onError;
s += (endinpos - startinpos);
break;
@ -4849,13 +4829,13 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
{
Py_ssize_t i;
if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
if (_PyUnicodeWriter_PrepareKind(writer, PyUnicode_2BYTE_KIND) < 0)
goto onError;
for (i=startinpos; i<endinpos; i++) {
ch = (Py_UCS4)(unsigned char)(starts[i]);
PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
PyUnicode_WRITE(writer->kind, writer->data, writer->pos,
ch + 0xdc00);
writer.pos++;
writer->pos++;
}
s += (endinpos - startinpos);
break;
@ -4866,9 +4846,14 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
errors, &error_handler_obj,
"utf-8", errmsg,
&starts, &end, &startinpos, &endinpos, &exc, &s,
&writer))
writer)) {
goto onError;
}
if (_PyUnicodeWriter_Prepare(writer, end - s, 127) < 0) {
return -1;
}
}
}
End:
@ -4877,13 +4862,107 @@ End:
Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
return _PyUnicodeWriter_Finish(&writer);
return 0;
onError:
Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
return -1;
}
static PyObject *
unicode_decode_utf8(const char *s, Py_ssize_t size,
_Py_error_handler error_handler, const char *errors,
Py_ssize_t *consumed)
{
if (size == 0) {
if (consumed) {
*consumed = 0;
}
_Py_RETURN_UNICODE_EMPTY();
}
/* ASCII is equivalent to the first 128 ordinals in Unicode. */
if (size == 1 && (unsigned char)s[0] < 128) {
if (consumed) {
*consumed = 1;
}
return get_latin1_char((unsigned char)s[0]);
}
// fast path: try ASCII string.
const char *starts = s;
const char *end = s + size;
PyObject *u = PyUnicode_New(size, 127);
if (u == NULL) {
return NULL;
}
Py_ssize_t decoded = ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
if (decoded == size) {
if (consumed) {
*consumed = size;
}
return u;
}
s += decoded;
size -= decoded;
// Use _PyUnicodeWriter after fast path is failed.
_PyUnicodeWriter writer;
_PyUnicodeWriter_InitWithBuffer(&writer, u);
writer.pos = decoded;
if (unicode_decode_utf8_impl(&writer, starts, s, end,
error_handler, errors,
consumed) < 0) {
_PyUnicodeWriter_Dealloc(&writer);
return NULL;
}
return _PyUnicodeWriter_Finish(&writer);
}
static int
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
const char *s, Py_ssize_t size,
_Py_error_handler error_handler, const char *errors,
Py_ssize_t *consumed)
{
if (size == 0) {
if (consumed) {
*consumed = 0;
}
return 0;
}
// fast path: try ASCII string.
if (_PyUnicodeWriter_Prepare(writer, size, 127) < 0) {
return -1;
}
const char *starts = s;
const char *end = s + size;
Py_ssize_t decoded = 0;
Py_UCS1 *dest = (Py_UCS1*)writer->data + writer->pos * writer->kind;
if (writer->kind == PyUnicode_1BYTE_KIND
&& _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
{
decoded = ascii_decode(s, end, dest);
writer->pos += decoded;
if (decoded == size) {
if (consumed) {
*consumed = size;
}
return 0;
}
s += decoded;
size -= decoded;
}
return unicode_decode_utf8_impl(writer, starts, s, end,
error_handler, errors, consumed);
}