mirror of https://github.com/python/cpython
gh-119396: Optimize unicode_repr() (#119617)
Use stringlib to specialize unicode_repr() for each string kind (UCS1, UCS2, UCS4).

Benchmark:

+-------------------------------------+---------+----------------------+
| Benchmark                           | ref     | change2              |
+=====================================+=========+======================+
| repr('abc')                         | 100 ns  | 103 ns: 1.02x slower |
+-------------------------------------+---------+----------------------+
| repr('a' * 100)                     | 369 ns  | 369 ns: 1.00x slower |
+-------------------------------------+---------+----------------------+
| repr(('a' + squote) * 100)          | 1.21 us | 946 ns: 1.27x faster |
+-------------------------------------+---------+----------------------+
| repr(('a' + nl) * 100)              | 1.23 us | 907 ns: 1.36x faster |
+-------------------------------------+---------+----------------------+
| repr(dquote + ('a' + squote) * 100) | 1.08 us | 858 ns: 1.25x faster |
+-------------------------------------+---------+----------------------+
| Geometric mean                      | (ref)   | 1.16x faster         |
+-------------------------------------+---------+----------------------+
This commit is contained in:
parent
2da0dc094f
commit
0518edc170
|
@ -1841,6 +1841,7 @@ UNICODE_DEPS = \
|
||||||
$(srcdir)/Objects/stringlib/localeutil.h \
|
$(srcdir)/Objects/stringlib/localeutil.h \
|
||||||
$(srcdir)/Objects/stringlib/partition.h \
|
$(srcdir)/Objects/stringlib/partition.h \
|
||||||
$(srcdir)/Objects/stringlib/replace.h \
|
$(srcdir)/Objects/stringlib/replace.h \
|
||||||
|
$(srcdir)/Objects/stringlib/repr.h \
|
||||||
$(srcdir)/Objects/stringlib/split.h \
|
$(srcdir)/Objects/stringlib/split.h \
|
||||||
$(srcdir)/Objects/stringlib/ucs1lib.h \
|
$(srcdir)/Objects/stringlib/ucs1lib.h \
|
||||||
$(srcdir)/Objects/stringlib/ucs2lib.h \
|
$(srcdir)/Objects/stringlib/ucs2lib.h \
|
||||||
|
|
|
@ -0,0 +1,95 @@
|
||||||
|
/* stringlib: repr() implementation */
|
||||||
|
|
||||||
|
#ifndef STRINGLIB_FASTSEARCH_H
|
||||||
|
#error must include "stringlib/fastsearch.h" before including this module
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
/* Write the repr() form of `unicode` into the buffer `odata`.

   Output layout: the `quote` character, then every character of
   `unicode` in order (escaped where the rules below require it), then
   the `quote` character again.

   unicode -- string whose characters are read with PyUnicode_READ()
   quote   -- quote character to emit at both ends; occurrences of it
              inside the string are backslash-escaped
   odata   -- destination buffer of STRINGLIB_CHAR units

   This function performs no bounds checking and does not allocate:
   the caller must provide a buffer large enough for the escaped text
   plus the two quotes.
   NOTE(review): presumably the caller has also chosen a STRINGLIB_CHAR
   width that can hold every character copied as-is -- confirm against
   the call sites. */
static void
|
||||||
|
STRINGLIB(repr)(PyObject *unicode, Py_UCS4 quote,
|
||||||
|
STRINGLIB_CHAR *odata)
|
||||||
|
{
|
||||||
|
Py_ssize_t isize = PyUnicode_GET_LENGTH(unicode);
|
||||||
|
const void *idata = PyUnicode_DATA(unicode);
|
||||||
|
int ikind = PyUnicode_KIND(unicode);
|
||||||
|
|
||||||
|
*odata++ = quote;
|
||||||
|
for (Py_ssize_t i = 0; i < isize; i++) {
|
||||||
|
Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
|
||||||
|
|
||||||
|
/* Escape quotes and backslashes */
|
||||||
|
if ((ch == quote) || (ch == '\\')) {
|
||||||
|
*odata++ = '\\';
|
||||||
|
*odata++ = ch;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Map special whitespace to '\t', '\n', '\r' */
|
||||||
|
if (ch == '\t') {
|
||||||
|
*odata++ = '\\';
|
||||||
|
*odata++ = 't';
|
||||||
|
}
|
||||||
|
else if (ch == '\n') {
|
||||||
|
*odata++ = '\\';
|
||||||
|
*odata++ = 'n';
|
||||||
|
}
|
||||||
|
else if (ch == '\r') {
|
||||||
|
*odata++ = '\\';
|
||||||
|
*odata++ = 'r';
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Map non-printable US ASCII to '\xhh' */
|
||||||
|
else if (ch < ' ' || ch == 0x7F) {
|
||||||
|
*odata++ = '\\';
|
||||||
|
*odata++ = 'x';
|
||||||
|
*odata++ = Py_hexdigits[(ch >> 4) & 0x000F];
|
||||||
|
*odata++ = Py_hexdigits[ch & 0x000F];
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Copy ASCII characters as-is */
|
||||||
|
else if (ch < 0x7F) {
|
||||||
|
*odata++ = ch;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Non-ASCII characters */
|
||||||
|
else {
|
||||||
|
/* Map Unicode whitespace and control characters
|
||||||
|
(categories Z* and C* except ASCII space)
|
||||||
|
*/
|
||||||
|
if (!Py_UNICODE_ISPRINTABLE(ch)) {
|
||||||
|
/* The backslash is shared by all three escape forms below */
*odata++ = '\\';
|
||||||
|
/* Map 8-bit characters to '\xhh' */
|
||||||
|
if (ch <= 0xff) {
|
||||||
|
*odata++ = 'x';
|
||||||
|
*odata++ = Py_hexdigits[(ch >> 4) & 0x000F];
|
||||||
|
*odata++ = Py_hexdigits[ch & 0x000F];
|
||||||
|
}
|
||||||
|
/* Map 16-bit characters to '\uxxxx' */
|
||||||
|
else if (ch <= 0xffff) {
|
||||||
|
*odata++ = 'u';
|
||||||
|
*odata++ = Py_hexdigits[(ch >> 12) & 0xF];
|
||||||
|
*odata++ = Py_hexdigits[(ch >> 8) & 0xF];
|
||||||
|
*odata++ = Py_hexdigits[(ch >> 4) & 0xF];
|
||||||
|
*odata++ = Py_hexdigits[ch & 0xF];
|
||||||
|
}
|
||||||
|
/* Map 21-bit characters to '\U00xxxxxx' */
|
||||||
|
else {
|
||||||
|
*odata++ = 'U';
|
||||||
|
*odata++ = Py_hexdigits[(ch >> 28) & 0xF];
|
||||||
|
*odata++ = Py_hexdigits[(ch >> 24) & 0xF];
|
||||||
|
*odata++ = Py_hexdigits[(ch >> 20) & 0xF];
|
||||||
|
*odata++ = Py_hexdigits[(ch >> 16) & 0xF];
|
||||||
|
*odata++ = Py_hexdigits[(ch >> 12) & 0xF];
|
||||||
|
*odata++ = Py_hexdigits[(ch >> 8) & 0xF];
|
||||||
|
*odata++ = Py_hexdigits[(ch >> 4) & 0xF];
|
||||||
|
*odata++ = Py_hexdigits[ch & 0xF];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/* Copy characters as-is */
|
||||||
|
else {
|
||||||
|
*odata++ = ch;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/* Closing quote; nothing is written after it */
*odata = quote;
|
||||||
|
}
|
|
@ -899,6 +899,7 @@ ensure_unicode(PyObject *obj)
|
||||||
#include "stringlib/count.h"
|
#include "stringlib/count.h"
|
||||||
#include "stringlib/find.h"
|
#include "stringlib/find.h"
|
||||||
#include "stringlib/replace.h"
|
#include "stringlib/replace.h"
|
||||||
|
#include "stringlib/repr.h"
|
||||||
#include "stringlib/find_max_char.h"
|
#include "stringlib/find_max_char.h"
|
||||||
#include "stringlib/undef.h"
|
#include "stringlib/undef.h"
|
||||||
|
|
||||||
|
@ -909,6 +910,7 @@ ensure_unicode(PyObject *obj)
|
||||||
#include "stringlib/count.h"
|
#include "stringlib/count.h"
|
||||||
#include "stringlib/find.h"
|
#include "stringlib/find.h"
|
||||||
#include "stringlib/replace.h"
|
#include "stringlib/replace.h"
|
||||||
|
#include "stringlib/repr.h"
|
||||||
#include "stringlib/find_max_char.h"
|
#include "stringlib/find_max_char.h"
|
||||||
#include "stringlib/undef.h"
|
#include "stringlib/undef.h"
|
||||||
|
|
||||||
|
@ -919,6 +921,7 @@ ensure_unicode(PyObject *obj)
|
||||||
#include "stringlib/count.h"
|
#include "stringlib/count.h"
|
||||||
#include "stringlib/find.h"
|
#include "stringlib/find.h"
|
||||||
#include "stringlib/replace.h"
|
#include "stringlib/replace.h"
|
||||||
|
#include "stringlib/repr.h"
|
||||||
#include "stringlib/find_max_char.h"
|
#include "stringlib/find_max_char.h"
|
||||||
#include "stringlib/undef.h"
|
#include "stringlib/undef.h"
|
||||||
|
|
||||||
|
@ -12336,24 +12339,17 @@ unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
|
||||||
static PyObject *
|
static PyObject *
|
||||||
unicode_repr(PyObject *unicode)
|
unicode_repr(PyObject *unicode)
|
||||||
{
|
{
|
||||||
PyObject *repr;
|
Py_ssize_t isize = PyUnicode_GET_LENGTH(unicode);
|
||||||
Py_ssize_t isize;
|
const void *idata = PyUnicode_DATA(unicode);
|
||||||
Py_ssize_t osize, squote, dquote, i, o;
|
|
||||||
Py_UCS4 max, quote;
|
|
||||||
int ikind, okind, unchanged;
|
|
||||||
const void *idata;
|
|
||||||
void *odata;
|
|
||||||
|
|
||||||
isize = PyUnicode_GET_LENGTH(unicode);
|
|
||||||
idata = PyUnicode_DATA(unicode);
|
|
||||||
|
|
||||||
/* Compute length of output, quote characters, and
|
/* Compute length of output, quote characters, and
|
||||||
maximum character */
|
maximum character */
|
||||||
osize = 0;
|
Py_ssize_t osize = 0;
|
||||||
max = 127;
|
Py_UCS4 maxch = 127;
|
||||||
squote = dquote = 0;
|
Py_ssize_t squote = 0;
|
||||||
ikind = PyUnicode_KIND(unicode);
|
Py_ssize_t dquote = 0;
|
||||||
for (i = 0; i < isize; i++) {
|
int ikind = PyUnicode_KIND(unicode);
|
||||||
|
for (Py_ssize_t i = 0; i < isize; i++) {
|
||||||
Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
|
Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
|
||||||
Py_ssize_t incr = 1;
|
Py_ssize_t incr = 1;
|
||||||
switch (ch) {
|
switch (ch) {
|
||||||
|
@ -12369,7 +12365,7 @@ unicode_repr(PyObject *unicode)
|
||||||
else if (ch < 0x7f)
|
else if (ch < 0x7f)
|
||||||
;
|
;
|
||||||
else if (Py_UNICODE_ISPRINTABLE(ch))
|
else if (Py_UNICODE_ISPRINTABLE(ch))
|
||||||
max = ch > max ? ch : max;
|
maxch = (ch > maxch) ? ch : maxch;
|
||||||
else if (ch < 0x100)
|
else if (ch < 0x100)
|
||||||
incr = 4; /* \xHH */
|
incr = 4; /* \xHH */
|
||||||
else if (ch < 0x10000)
|
else if (ch < 0x10000)
|
||||||
|
@ -12385,10 +12381,10 @@ unicode_repr(PyObject *unicode)
|
||||||
osize += incr;
|
osize += incr;
|
||||||
}
|
}
|
||||||
|
|
||||||
quote = '\'';
|
Py_UCS4 quote = '\'';
|
||||||
unchanged = (osize == isize);
|
int changed = (osize != isize);
|
||||||
if (squote) {
|
if (squote) {
|
||||||
unchanged = 0;
|
changed = 1;
|
||||||
if (dquote)
|
if (dquote)
|
||||||
/* Both squote and dquote present. Use squote,
|
/* Both squote and dquote present. Use squote,
|
||||||
and escape them */
|
and escape them */
|
||||||
|
@ -12398,99 +12394,35 @@ unicode_repr(PyObject *unicode)
|
||||||
}
|
}
|
||||||
osize += 2; /* quotes */
|
osize += 2; /* quotes */
|
||||||
|
|
||||||
repr = PyUnicode_New(osize, max);
|
PyObject *repr = PyUnicode_New(osize, maxch);
|
||||||
if (repr == NULL)
|
if (repr == NULL)
|
||||||
return NULL;
|
return NULL;
|
||||||
okind = PyUnicode_KIND(repr);
|
int okind = PyUnicode_KIND(repr);
|
||||||
odata = PyUnicode_DATA(repr);
|
void *odata = PyUnicode_DATA(repr);
|
||||||
|
|
||||||
|
if (!changed) {
|
||||||
PyUnicode_WRITE(okind, odata, 0, quote);
|
PyUnicode_WRITE(okind, odata, 0, quote);
|
||||||
PyUnicode_WRITE(okind, odata, osize-1, quote);
|
|
||||||
if (unchanged) {
|
|
||||||
_PyUnicode_FastCopyCharacters(repr, 1,
|
_PyUnicode_FastCopyCharacters(repr, 1,
|
||||||
unicode, 0,
|
unicode, 0,
|
||||||
isize);
|
isize);
|
||||||
|
|
||||||
|
PyUnicode_WRITE(okind, odata, osize-1, quote);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
for (i = 0, o = 1; i < isize; i++) {
|
switch (okind) {
|
||||||
Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
|
case PyUnicode_1BYTE_KIND:
|
||||||
|
ucs1lib_repr(unicode, quote, odata);
|
||||||
/* Escape quotes and backslashes */
|
break;
|
||||||
if ((ch == quote) || (ch == '\\')) {
|
case PyUnicode_2BYTE_KIND:
|
||||||
PyUnicode_WRITE(okind, odata, o++, '\\');
|
ucs2lib_repr(unicode, quote, odata);
|
||||||
PyUnicode_WRITE(okind, odata, o++, ch);
|
break;
|
||||||
continue;
|
default:
|
||||||
|
assert(okind == PyUnicode_4BYTE_KIND);
|
||||||
|
ucs4lib_repr(unicode, quote, odata);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Map special whitespace to '\t', \n', '\r' */
|
|
||||||
if (ch == '\t') {
|
|
||||||
PyUnicode_WRITE(okind, odata, o++, '\\');
|
|
||||||
PyUnicode_WRITE(okind, odata, o++, 't');
|
|
||||||
}
|
|
||||||
else if (ch == '\n') {
|
|
||||||
PyUnicode_WRITE(okind, odata, o++, '\\');
|
|
||||||
PyUnicode_WRITE(okind, odata, o++, 'n');
|
|
||||||
}
|
|
||||||
else if (ch == '\r') {
|
|
||||||
PyUnicode_WRITE(okind, odata, o++, '\\');
|
|
||||||
PyUnicode_WRITE(okind, odata, o++, 'r');
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Map non-printable US ASCII to '\xhh' */
|
|
||||||
else if (ch < ' ' || ch == 0x7F) {
|
|
||||||
PyUnicode_WRITE(okind, odata, o++, '\\');
|
|
||||||
PyUnicode_WRITE(okind, odata, o++, 'x');
|
|
||||||
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
|
|
||||||
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Copy ASCII characters as-is */
|
|
||||||
else if (ch < 0x7F) {
|
|
||||||
PyUnicode_WRITE(okind, odata, o++, ch);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Non-ASCII characters */
|
|
||||||
else {
|
|
||||||
/* Map Unicode whitespace and control characters
|
|
||||||
(categories Z* and C* except ASCII space)
|
|
||||||
*/
|
|
||||||
if (!Py_UNICODE_ISPRINTABLE(ch)) {
|
|
||||||
PyUnicode_WRITE(okind, odata, o++, '\\');
|
|
||||||
/* Map 8-bit characters to '\xhh' */
|
|
||||||
if (ch <= 0xff) {
|
|
||||||
PyUnicode_WRITE(okind, odata, o++, 'x');
|
|
||||||
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
|
|
||||||
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
|
|
||||||
}
|
|
||||||
/* Map 16-bit characters to '\uxxxx' */
|
|
||||||
else if (ch <= 0xffff) {
|
|
||||||
PyUnicode_WRITE(okind, odata, o++, 'u');
|
|
||||||
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
|
|
||||||
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
|
|
||||||
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
|
|
||||||
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
|
|
||||||
}
|
|
||||||
/* Map 21-bit characters to '\U00xxxxxx' */
|
|
||||||
else {
|
|
||||||
PyUnicode_WRITE(okind, odata, o++, 'U');
|
|
||||||
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
|
|
||||||
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
|
|
||||||
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
|
|
||||||
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
|
|
||||||
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
|
|
||||||
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
|
|
||||||
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
|
|
||||||
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
/* Copy characters as-is */
|
|
||||||
else {
|
|
||||||
PyUnicode_WRITE(okind, odata, o++, ch);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
/* Closing quote already added at the beginning */
|
|
||||||
assert(_PyUnicode_CheckConsistency(repr, 1));
|
assert(_PyUnicode_CheckConsistency(repr, 1));
|
||||||
return repr;
|
return repr;
|
||||||
}
|
}
|
||||||
|
|
|
@ -167,6 +167,7 @@ Objects/stringlib/count.h Objects/stringlib/fastsearch.h
|
||||||
Objects/stringlib/find.h Objects/stringlib/fastsearch.h
|
Objects/stringlib/find.h Objects/stringlib/fastsearch.h
|
||||||
Objects/stringlib/partition.h Objects/stringlib/fastsearch.h
|
Objects/stringlib/partition.h Objects/stringlib/fastsearch.h
|
||||||
Objects/stringlib/replace.h Objects/stringlib/fastsearch.h
|
Objects/stringlib/replace.h Objects/stringlib/fastsearch.h
|
||||||
|
Objects/stringlib/repr.h Objects/stringlib/fastsearch.h
|
||||||
Objects/stringlib/split.h Objects/stringlib/fastsearch.h
|
Objects/stringlib/split.h Objects/stringlib/fastsearch.h
|
||||||
|
|
||||||
# @end=tsv@
|
# @end=tsv@
|
||||||
|
|
Loading…
Reference in New Issue