gh-119396: Optimize unicode_repr() (#119617)

Use stringlib to specialize unicode_repr() for each string kind
(UCS1, UCS2, UCS4).

Benchmark:

+-------------------------------------+---------+----------------------+
| Benchmark                           | ref     | change2              |
+=====================================+=========+======================+
| repr('abc')                         | 100 ns  | 103 ns: 1.02x slower |
+-------------------------------------+---------+----------------------+
| repr('a' * 100)                     | 369 ns  | 369 ns: 1.00x slower |
+-------------------------------------+---------+----------------------+
| repr(('a' + squote) * 100)          | 1.21 us | 946 ns: 1.27x faster |
+-------------------------------------+---------+----------------------+
| repr(('a' + nl) * 100)              | 1.23 us | 907 ns: 1.36x faster |
+-------------------------------------+---------+----------------------+
| repr(dquote + ('a' + squote) * 100) | 1.08 us | 858 ns: 1.25x faster |
+-------------------------------------+---------+----------------------+
| Geometric mean                      | (ref)   | 1.16x faster         |
+-------------------------------------+---------+----------------------+
This commit is contained in:
Victor Stinner 2024-05-28 18:05:20 +02:00 committed by GitHub
parent 2da0dc094f
commit 0518edc170
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 131 additions and 102 deletions

View File

@ -1841,6 +1841,7 @@ UNICODE_DEPS = \
$(srcdir)/Objects/stringlib/localeutil.h \ $(srcdir)/Objects/stringlib/localeutil.h \
$(srcdir)/Objects/stringlib/partition.h \ $(srcdir)/Objects/stringlib/partition.h \
$(srcdir)/Objects/stringlib/replace.h \ $(srcdir)/Objects/stringlib/replace.h \
$(srcdir)/Objects/stringlib/repr.h \
$(srcdir)/Objects/stringlib/split.h \ $(srcdir)/Objects/stringlib/split.h \
$(srcdir)/Objects/stringlib/ucs1lib.h \ $(srcdir)/Objects/stringlib/ucs1lib.h \
$(srcdir)/Objects/stringlib/ucs2lib.h \ $(srcdir)/Objects/stringlib/ucs2lib.h \

95
Objects/stringlib/repr.h Normal file
View File

@ -0,0 +1,95 @@
/* stringlib: repr() implementation */
#ifndef STRINGLIB_FASTSEARCH_H
#error must include "stringlib/fastsearch.h" before including this module
#endif
/* Write the repr() of `unicode` into the buffer `odata`.

   The output is: `quote`, the escaped characters of `unicode`, `quote`.
   `quote` is the quote character selected by the caller; occurrences of it
   (and of the backslash) inside the string are backslash-escaped.

   No bounds checking is performed here: the caller must supply a buffer
   large enough to hold the whole quoted and escaped result, with an element
   type wide enough for every character copied verbatim. */
static void
STRINGLIB(repr)(PyObject *unicode, Py_UCS4 quote,
                STRINGLIB_CHAR *odata)
{
    const Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
    const int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);
    STRINGLIB_CHAR *out = odata;

    /* Opening quote */
    *out++ = quote;

    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        /* The quote character and the backslash are escaped with a
           backslash. This test comes first so that it wins over every
           other rule below. */
        if (ch == quote || ch == '\\') {
            out[0] = '\\';
            out[1] = ch;
            out += 2;
            continue;
        }

        /* Special whitespace has a two-character escape:
           '\t', '\n', '\r' */
        Py_UCS4 letter = 0;
        switch (ch) {
        case '\t':
            letter = 't';
            break;
        case '\n':
            letter = 'n';
            break;
        case '\r':
            letter = 'r';
            break;
        default:
            break;
        }
        if (letter != 0) {
            out[0] = '\\';
            out[1] = letter;
            out += 2;
            continue;
        }

        /* Printable characters are copied as-is: ASCII from space up to
           (but excluding) DEL, and non-ASCII characters accepted by
           Py_UNICODE_ISPRINTABLE(). */
        if ((ch >= ' ' && ch < 0x7F)
            || (ch > 0x7F && Py_UNICODE_ISPRINTABLE(ch)))
        {
            *out++ = ch;
            continue;
        }

        /* Everything left is non-printable: ASCII control characters, DEL,
           and non-printable non-ASCII characters (Unicode whitespace and
           control characters). Pick the shortest hexadecimal escape that
           can hold the code point. */
        *out++ = '\\';
        if (ch <= 0xff) {
            /* 8-bit characters: '\xhh' */
            *out++ = 'x';
            *out++ = Py_hexdigits[(ch >> 4) & 0x000F];
            *out++ = Py_hexdigits[ch & 0x000F];
        }
        else if (ch <= 0xffff) {
            /* 16-bit characters: '\uxxxx' */
            *out++ = 'u';
            for (int shift = 12; shift >= 0; shift -= 4) {
                *out++ = Py_hexdigits[(ch >> shift) & 0xF];
            }
        }
        else {
            /* 21-bit characters: '\U00xxxxxx' */
            *out++ = 'U';
            for (int shift = 28; shift >= 0; shift -= 4) {
                *out++ = Py_hexdigits[(ch >> shift) & 0xF];
            }
        }
    }

    /* Closing quote */
    *out = quote;
}

View File

@ -899,6 +899,7 @@ ensure_unicode(PyObject *obj)
#include "stringlib/count.h" #include "stringlib/count.h"
#include "stringlib/find.h" #include "stringlib/find.h"
#include "stringlib/replace.h" #include "stringlib/replace.h"
#include "stringlib/repr.h"
#include "stringlib/find_max_char.h" #include "stringlib/find_max_char.h"
#include "stringlib/undef.h" #include "stringlib/undef.h"
@ -909,6 +910,7 @@ ensure_unicode(PyObject *obj)
#include "stringlib/count.h" #include "stringlib/count.h"
#include "stringlib/find.h" #include "stringlib/find.h"
#include "stringlib/replace.h" #include "stringlib/replace.h"
#include "stringlib/repr.h"
#include "stringlib/find_max_char.h" #include "stringlib/find_max_char.h"
#include "stringlib/undef.h" #include "stringlib/undef.h"
@ -919,6 +921,7 @@ ensure_unicode(PyObject *obj)
#include "stringlib/count.h" #include "stringlib/count.h"
#include "stringlib/find.h" #include "stringlib/find.h"
#include "stringlib/replace.h" #include "stringlib/replace.h"
#include "stringlib/repr.h"
#include "stringlib/find_max_char.h" #include "stringlib/find_max_char.h"
#include "stringlib/undef.h" #include "stringlib/undef.h"
@ -12336,24 +12339,17 @@ unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
static PyObject * static PyObject *
unicode_repr(PyObject *unicode) unicode_repr(PyObject *unicode)
{ {
PyObject *repr; Py_ssize_t isize = PyUnicode_GET_LENGTH(unicode);
Py_ssize_t isize; const void *idata = PyUnicode_DATA(unicode);
Py_ssize_t osize, squote, dquote, i, o;
Py_UCS4 max, quote;
int ikind, okind, unchanged;
const void *idata;
void *odata;
isize = PyUnicode_GET_LENGTH(unicode);
idata = PyUnicode_DATA(unicode);
/* Compute length of output, quote characters, and /* Compute length of output, quote characters, and
maximum character */ maximum character */
osize = 0; Py_ssize_t osize = 0;
max = 127; Py_UCS4 maxch = 127;
squote = dquote = 0; Py_ssize_t squote = 0;
ikind = PyUnicode_KIND(unicode); Py_ssize_t dquote = 0;
for (i = 0; i < isize; i++) { int ikind = PyUnicode_KIND(unicode);
for (Py_ssize_t i = 0; i < isize; i++) {
Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Py_ssize_t incr = 1; Py_ssize_t incr = 1;
switch (ch) { switch (ch) {
@ -12369,7 +12365,7 @@ unicode_repr(PyObject *unicode)
else if (ch < 0x7f) else if (ch < 0x7f)
; ;
else if (Py_UNICODE_ISPRINTABLE(ch)) else if (Py_UNICODE_ISPRINTABLE(ch))
max = ch > max ? ch : max; maxch = (ch > maxch) ? ch : maxch;
else if (ch < 0x100) else if (ch < 0x100)
incr = 4; /* \xHH */ incr = 4; /* \xHH */
else if (ch < 0x10000) else if (ch < 0x10000)
@ -12385,10 +12381,10 @@ unicode_repr(PyObject *unicode)
osize += incr; osize += incr;
} }
quote = '\''; Py_UCS4 quote = '\'';
unchanged = (osize == isize); int changed = (osize != isize);
if (squote) { if (squote) {
unchanged = 0; changed = 1;
if (dquote) if (dquote)
/* Both squote and dquote present. Use squote, /* Both squote and dquote present. Use squote,
and escape them */ and escape them */
@ -12398,99 +12394,35 @@ unicode_repr(PyObject *unicode)
} }
osize += 2; /* quotes */ osize += 2; /* quotes */
repr = PyUnicode_New(osize, max); PyObject *repr = PyUnicode_New(osize, maxch);
if (repr == NULL) if (repr == NULL)
return NULL; return NULL;
okind = PyUnicode_KIND(repr); int okind = PyUnicode_KIND(repr);
odata = PyUnicode_DATA(repr); void *odata = PyUnicode_DATA(repr);
if (!changed) {
PyUnicode_WRITE(okind, odata, 0, quote);
PyUnicode_WRITE(okind, odata, 0, quote);
PyUnicode_WRITE(okind, odata, osize-1, quote);
if (unchanged) {
_PyUnicode_FastCopyCharacters(repr, 1, _PyUnicode_FastCopyCharacters(repr, 1,
unicode, 0, unicode, 0,
isize); isize);
PyUnicode_WRITE(okind, odata, osize-1, quote);
} }
else { else {
for (i = 0, o = 1; i < isize; i++) { switch (okind) {
Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); case PyUnicode_1BYTE_KIND:
ucs1lib_repr(unicode, quote, odata);
/* Escape quotes and backslashes */ break;
if ((ch == quote) || (ch == '\\')) { case PyUnicode_2BYTE_KIND:
PyUnicode_WRITE(okind, odata, o++, '\\'); ucs2lib_repr(unicode, quote, odata);
PyUnicode_WRITE(okind, odata, o++, ch); break;
continue; default:
} assert(okind == PyUnicode_4BYTE_KIND);
ucs4lib_repr(unicode, quote, odata);
/* Map special whitespace to '\t', \n', '\r' */
if (ch == '\t') {
PyUnicode_WRITE(okind, odata, o++, '\\');
PyUnicode_WRITE(okind, odata, o++, 't');
}
else if (ch == '\n') {
PyUnicode_WRITE(okind, odata, o++, '\\');
PyUnicode_WRITE(okind, odata, o++, 'n');
}
else if (ch == '\r') {
PyUnicode_WRITE(okind, odata, o++, '\\');
PyUnicode_WRITE(okind, odata, o++, 'r');
}
/* Map non-printable US ASCII to '\xhh' */
else if (ch < ' ' || ch == 0x7F) {
PyUnicode_WRITE(okind, odata, o++, '\\');
PyUnicode_WRITE(okind, odata, o++, 'x');
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
}
/* Copy ASCII characters as-is */
else if (ch < 0x7F) {
PyUnicode_WRITE(okind, odata, o++, ch);
}
/* Non-ASCII characters */
else {
/* Map Unicode whitespace and control characters
(categories Z* and C* except ASCII space)
*/
if (!Py_UNICODE_ISPRINTABLE(ch)) {
PyUnicode_WRITE(okind, odata, o++, '\\');
/* Map 8-bit characters to '\xhh' */
if (ch <= 0xff) {
PyUnicode_WRITE(okind, odata, o++, 'x');
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
}
/* Map 16-bit characters to '\uxxxx' */
else if (ch <= 0xffff) {
PyUnicode_WRITE(okind, odata, o++, 'u');
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
}
/* Map 21-bit characters to '\U00xxxxxx' */
else {
PyUnicode_WRITE(okind, odata, o++, 'U');
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
}
}
/* Copy characters as-is */
else {
PyUnicode_WRITE(okind, odata, o++, ch);
}
}
} }
} }
/* Closing quote already added at the beginning */
assert(_PyUnicode_CheckConsistency(repr, 1)); assert(_PyUnicode_CheckConsistency(repr, 1));
return repr; return repr;
} }

View File

@ -167,6 +167,7 @@ Objects/stringlib/count.h Objects/stringlib/fastsearch.h
Objects/stringlib/find.h Objects/stringlib/fastsearch.h Objects/stringlib/find.h Objects/stringlib/fastsearch.h
Objects/stringlib/partition.h Objects/stringlib/fastsearch.h Objects/stringlib/partition.h Objects/stringlib/fastsearch.h
Objects/stringlib/replace.h Objects/stringlib/fastsearch.h Objects/stringlib/replace.h Objects/stringlib/fastsearch.h
Objects/stringlib/repr.h Objects/stringlib/fastsearch.h
Objects/stringlib/split.h Objects/stringlib/fastsearch.h Objects/stringlib/split.h Objects/stringlib/fastsearch.h
# @end=tsv@ # @end=tsv@