mirror of https://github.com/python/cpython
gh-126024: optimize UTF-8 decoder for short non-ASCII string (#126025)
This commit is contained in:
parent
bfabf96b50
commit
322b486010
|
@ -0,0 +1,2 @@
|
||||||
|
Optimize decoding of short UTF-8 sequences containing non-ASCII characters
|
||||||
|
by approximately 15%.
|
|
@ -4979,39 +4979,228 @@ PyUnicode_DecodeUTF8(const char *s,
|
||||||
#include "stringlib/codecs.h"
|
#include "stringlib/codecs.h"
|
||||||
#include "stringlib/undef.h"
|
#include "stringlib/undef.h"
|
||||||
|
|
||||||
|
#if (SIZEOF_SIZE_T == 8)
|
||||||
/* Mask to quickly check whether a C 'size_t' contains a
|
/* Mask to quickly check whether a C 'size_t' contains a
|
||||||
non-ASCII, UTF8-encoded char. */
|
non-ASCII, UTF8-encoded char. */
|
||||||
#if (SIZEOF_SIZE_T == 8)
|
|
||||||
# define ASCII_CHAR_MASK 0x8080808080808080ULL
|
# define ASCII_CHAR_MASK 0x8080808080808080ULL
|
||||||
|
// used to count codepoints in UTF-8 string.
|
||||||
|
# define VECTOR_0101 0x0101010101010101ULL
|
||||||
|
# define VECTOR_00FF 0x00ff00ff00ff00ffULL
|
||||||
#elif (SIZEOF_SIZE_T == 4)
|
#elif (SIZEOF_SIZE_T == 4)
|
||||||
# define ASCII_CHAR_MASK 0x80808080U
|
# define ASCII_CHAR_MASK 0x80808080U
|
||||||
|
# define VECTOR_0101 0x01010101U
|
||||||
|
# define VECTOR_00FF 0x00ff00ffU
|
||||||
#else
|
#else
|
||||||
# error C 'size_t' size should be either 4 or 8!
|
# error C 'size_t' size should be either 4 or 8!
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if (defined(__clang__) || defined(__GNUC__))
|
||||||
|
#define HAVE_CTZ 1
|
||||||
|
static inline unsigned int
|
||||||
|
ctz(size_t v)
|
||||||
|
{
|
||||||
|
return __builtin_ctzll((unsigned long long)v);
|
||||||
|
}
|
||||||
|
#elif defined(_MSC_VER)
|
||||||
|
#define HAVE_CTZ 1
|
||||||
|
static inline unsigned int
|
||||||
|
ctz(size_t v)
|
||||||
|
{
|
||||||
|
unsigned long pos;
|
||||||
|
#if SIZEOF_SIZE_T == 4
|
||||||
|
_BitScanForward(&pos, v);
|
||||||
|
#else
|
||||||
|
_BitScanForward64(&pos, v);
|
||||||
|
#endif /* SIZEOF_SIZE_T */
|
||||||
|
return pos;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if HAVE_CTZ
|
||||||
|
// load p[0]..p[size-1] as a little-endian size_t
|
||||||
|
// without unaligned access nor read ahead.
|
||||||
|
static size_t
|
||||||
|
load_unaligned(const unsigned char *p, size_t size)
|
||||||
|
{
|
||||||
|
assert(size <= SIZEOF_SIZE_T);
|
||||||
|
union {
|
||||||
|
size_t s;
|
||||||
|
unsigned char b[SIZEOF_SIZE_T];
|
||||||
|
} u;
|
||||||
|
u.s = 0;
|
||||||
|
switch (size) {
|
||||||
|
case 8:
|
||||||
|
u.b[7] = p[7];
|
||||||
|
_Py_FALLTHROUGH;
|
||||||
|
case 7:
|
||||||
|
u.b[6] = p[6];
|
||||||
|
_Py_FALLTHROUGH;
|
||||||
|
case 6:
|
||||||
|
u.b[5] = p[5];
|
||||||
|
_Py_FALLTHROUGH;
|
||||||
|
case 5:
|
||||||
|
u.b[4] = p[4];
|
||||||
|
_Py_FALLTHROUGH;
|
||||||
|
case 4:
|
||||||
|
u.b[3] = p[3];
|
||||||
|
_Py_FALLTHROUGH;
|
||||||
|
case 3:
|
||||||
|
u.b[2] = p[2];
|
||||||
|
_Py_FALLTHROUGH;
|
||||||
|
case 2:
|
||||||
|
u.b[1] = p[1];
|
||||||
|
_Py_FALLTHROUGH;
|
||||||
|
case 1:
|
||||||
|
u.b[0] = p[0];
|
||||||
|
break;
|
||||||
|
case 0:
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
Py_UNREACHABLE();
|
||||||
|
}
|
||||||
|
return u.s;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Find the first non-ASCII character in a byte sequence.
|
||||||
|
*
|
||||||
|
* This function scans a range of bytes from `start` to `end` and returns the
|
||||||
|
* index of the first byte that is not an ASCII character (i.e., has the most
|
||||||
|
* significant bit set). If all characters in the range are ASCII, it returns
|
||||||
|
* `end - start`.
|
||||||
|
*/
|
||||||
|
static Py_ssize_t
|
||||||
|
find_first_nonascii(const unsigned char *start, const unsigned char *end)
|
||||||
|
{
|
||||||
|
const unsigned char *p = start;
|
||||||
|
|
||||||
|
if (end - start >= SIZEOF_SIZE_T) {
|
||||||
|
const unsigned char *p2 = _Py_ALIGN_UP(p, SIZEOF_SIZE_T);
|
||||||
|
if (p < p2) {
|
||||||
|
#if HAVE_CTZ
|
||||||
|
#if defined(_M_AMD64) || defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
|
||||||
|
// x86 and amd64 are little endian and can load unaligned memory.
|
||||||
|
size_t u = *(const size_t*)p & ASCII_CHAR_MASK;
|
||||||
|
#else
|
||||||
|
size_t u = load_unaligned(p, p2 - p) & ASCII_CHAR_MASK;
|
||||||
|
#endif
|
||||||
|
if (u) {
|
||||||
|
return p - start + (ctz(u) - 7) / 8;
|
||||||
|
}
|
||||||
|
p = p2;
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
while (p < p2) {
|
||||||
|
if (*p & 0x80) {
|
||||||
|
return p - start;
|
||||||
|
}
|
||||||
|
p++;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
const unsigned char *e = end - SIZEOF_SIZE_T;
|
||||||
|
while (p <= e) {
|
||||||
|
size_t u = (*(const size_t *)p) & ASCII_CHAR_MASK;
|
||||||
|
if (u) {
|
||||||
|
#if PY_LITTLE_ENDIAN && HAVE_CTZ
|
||||||
|
return p - start + (ctz(u) - 7) / 8;
|
||||||
|
#else
|
||||||
|
// big endian and minor compilers are difficult to test.
|
||||||
|
// fallback to per byte check.
|
||||||
|
break;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
p += SIZEOF_SIZE_T;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#if HAVE_CTZ
|
||||||
|
// we can not use *(const size_t*)p to avoid buffer overrun.
|
||||||
|
size_t u = load_unaligned(p, end - p) & ASCII_CHAR_MASK;
|
||||||
|
if (u) {
|
||||||
|
return p - start + (ctz(u) - 7) / 8;
|
||||||
|
}
|
||||||
|
return end - start;
|
||||||
|
#else
|
||||||
|
while (p < end) {
|
||||||
|
if (*p & 0x80) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
p++;
|
||||||
|
}
|
||||||
|
return p - start;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline int
|
||||||
|
scalar_utf8_start_char(unsigned int ch)
|
||||||
|
{
|
||||||
|
// 0xxxxxxx or 11xxxxxx are first byte.
|
||||||
|
return (~ch >> 7 | ch >> 6) & 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline size_t
|
||||||
|
vector_utf8_start_chars(size_t v)
|
||||||
|
{
|
||||||
|
return ((~v >> 7) | (v >> 6)) & VECTOR_0101;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Count the number of UTF-8 code points in a given byte sequence.
|
||||||
|
static Py_ssize_t
|
||||||
|
utf8_count_codepoints(const unsigned char *s, const unsigned char *end)
|
||||||
|
{
|
||||||
|
Py_ssize_t len = 0;
|
||||||
|
|
||||||
|
if (end - s >= SIZEOF_SIZE_T) {
|
||||||
|
while (!_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
|
||||||
|
len += scalar_utf8_start_char(*s++);
|
||||||
|
}
|
||||||
|
|
||||||
|
while (s + SIZEOF_SIZE_T <= end) {
|
||||||
|
const unsigned char *e = end;
|
||||||
|
if (e - s > SIZEOF_SIZE_T * 255) {
|
||||||
|
e = s + SIZEOF_SIZE_T * 255;
|
||||||
|
}
|
||||||
|
Py_ssize_t vstart = 0;
|
||||||
|
while (s + SIZEOF_SIZE_T <= e) {
|
||||||
|
size_t v = *(size_t*)s;
|
||||||
|
size_t vs = vector_utf8_start_chars(v);
|
||||||
|
vstart += vs;
|
||||||
|
s += SIZEOF_SIZE_T;
|
||||||
|
}
|
||||||
|
vstart = (vstart & VECTOR_00FF) + ((vstart >> 8) & VECTOR_00FF);
|
||||||
|
vstart += vstart >> 16;
|
||||||
|
#if SIZEOF_SIZE_T == 8
|
||||||
|
vstart += vstart >> 32;
|
||||||
|
#endif
|
||||||
|
len += vstart & 0x7ff;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
while (s < end) {
|
||||||
|
len += scalar_utf8_start_char(*s++);
|
||||||
|
}
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
static Py_ssize_t
|
static Py_ssize_t
|
||||||
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
|
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
|
||||||
{
|
{
|
||||||
const char *p = start;
|
|
||||||
|
|
||||||
#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
|
#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
|
||||||
if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)
|
if (_Py_IS_ALIGNED(start, ALIGNOF_SIZE_T)
|
||||||
&& _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
|
&& _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
|
||||||
{
|
{
|
||||||
/* Fast path, see in STRINGLIB(utf8_decode) for
|
/* Fast path, see in STRINGLIB(utf8_decode) for
|
||||||
an explanation. */
|
an explanation. */
|
||||||
/* Help allocation */
|
const char *p = start;
|
||||||
const char *_p = p;
|
Py_UCS1 *q = dest;
|
||||||
Py_UCS1 * q = dest;
|
while (p + SIZEOF_SIZE_T <= end) {
|
||||||
while (_p + SIZEOF_SIZE_T <= end) {
|
size_t value = *(const size_t *) p;
|
||||||
size_t value = *(const size_t *) _p;
|
|
||||||
if (value & ASCII_CHAR_MASK)
|
if (value & ASCII_CHAR_MASK)
|
||||||
break;
|
break;
|
||||||
*((size_t *)q) = value;
|
*((size_t *)q) = value;
|
||||||
_p += SIZEOF_SIZE_T;
|
p += SIZEOF_SIZE_T;
|
||||||
q += SIZEOF_SIZE_T;
|
q += SIZEOF_SIZE_T;
|
||||||
}
|
}
|
||||||
p = _p;
|
|
||||||
while (p < end) {
|
while (p < end) {
|
||||||
if ((unsigned char)*p & 0x80)
|
if ((unsigned char)*p & 0x80)
|
||||||
break;
|
break;
|
||||||
|
@ -5020,31 +5209,12 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
|
||||||
return p - start;
|
return p - start;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
while (p < end) {
|
Py_ssize_t pos = find_first_nonascii((const unsigned char*)start,
|
||||||
/* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
|
(const unsigned char*)end);
|
||||||
for an explanation. */
|
memcpy(dest, start, pos);
|
||||||
if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
|
return pos;
|
||||||
/* Help allocation */
|
|
||||||
const char *_p = p;
|
|
||||||
while (_p + SIZEOF_SIZE_T <= end) {
|
|
||||||
size_t value = *(const size_t *) _p;
|
|
||||||
if (value & ASCII_CHAR_MASK)
|
|
||||||
break;
|
|
||||||
_p += SIZEOF_SIZE_T;
|
|
||||||
}
|
|
||||||
p = _p;
|
|
||||||
if (_p == end)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if ((unsigned char)*p & 0x80)
|
|
||||||
break;
|
|
||||||
++p;
|
|
||||||
}
|
|
||||||
memcpy(dest, start, p - start);
|
|
||||||
return p - start;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static int
|
static int
|
||||||
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
|
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
|
||||||
const char *starts, const char *s, const char *end,
|
const char *starts, const char *s, const char *end,
|
||||||
|
@ -5188,27 +5358,69 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
|
||||||
return get_latin1_char((unsigned char)s[0]);
|
return get_latin1_char((unsigned char)s[0]);
|
||||||
}
|
}
|
||||||
|
|
||||||
// fast path: try ASCII string.
|
// I don't know this check is necessary or not. But there is a test
|
||||||
const char *starts = s;
|
// case that requires size=PY_SSIZE_T_MAX cause MemoryError.
|
||||||
const char *end = s + size;
|
if (PY_SSIZE_T_MAX - sizeof(PyCompactUnicodeObject) < (size_t)size) {
|
||||||
PyObject *u = PyUnicode_New(size, 127);
|
PyErr_NoMemory();
|
||||||
if (u == NULL) {
|
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
Py_ssize_t decoded = ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
|
|
||||||
if (decoded == size) {
|
const char *starts = s;
|
||||||
|
const char *end = s + size;
|
||||||
|
|
||||||
|
Py_ssize_t pos = find_first_nonascii((const unsigned char*)starts, (const unsigned char*)end);
|
||||||
|
if (pos == size) { // fast path: ASCII string.
|
||||||
|
PyObject *u = PyUnicode_New(size, 127);
|
||||||
|
if (u == NULL) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
memcpy(PyUnicode_1BYTE_DATA(u), s, size);
|
||||||
if (consumed) {
|
if (consumed) {
|
||||||
*consumed = size;
|
*consumed = size;
|
||||||
}
|
}
|
||||||
return u;
|
return u;
|
||||||
}
|
}
|
||||||
s += decoded;
|
|
||||||
size -= decoded;
|
int maxchr = 127;
|
||||||
|
Py_ssize_t maxsize = size;
|
||||||
|
|
||||||
|
unsigned char ch = (unsigned char)(s[pos]);
|
||||||
|
// error handler other than strict may remove/replace the invalid byte.
|
||||||
|
// consumed != NULL allows 1~3 bytes remainings.
|
||||||
|
// 0x80 <= ch < 0xc2 is invalid start byte that cause UnicodeDecodeError.
|
||||||
|
// otherwise: check the input and decide the maxchr and maxsize to reduce
|
||||||
|
// reallocation and copy.
|
||||||
|
if (error_handler == _Py_ERROR_STRICT && !consumed && ch >= 0xc2) {
|
||||||
|
// we only calculate the number of codepoints and don't determine the exact maxchr.
|
||||||
|
// This is because writing fast and portable SIMD code to find maxchr is difficult.
|
||||||
|
// If reallocation occurs for a larger maxchar, knowing the exact number of codepoints
|
||||||
|
// means that it is no longer necessary to allocate several times the required amount
|
||||||
|
// of memory.
|
||||||
|
maxsize = utf8_count_codepoints((const unsigned char *)s, (const unsigned char *)end);
|
||||||
|
if (ch < 0xc4) { // latin1
|
||||||
|
maxchr = 0xff;
|
||||||
|
}
|
||||||
|
else if (ch < 0xf0) { // ucs2
|
||||||
|
maxchr = 0xffff;
|
||||||
|
}
|
||||||
|
else { // ucs4
|
||||||
|
maxchr = 0x10ffff;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
PyObject *u = PyUnicode_New(maxsize, maxchr);
|
||||||
|
if (!u) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
// Use _PyUnicodeWriter after fast path is failed.
|
// Use _PyUnicodeWriter after fast path is failed.
|
||||||
_PyUnicodeWriter writer;
|
_PyUnicodeWriter writer;
|
||||||
_PyUnicodeWriter_InitWithBuffer(&writer, u);
|
_PyUnicodeWriter_InitWithBuffer(&writer, u);
|
||||||
writer.pos = decoded;
|
if (maxchr <= 255) {
|
||||||
|
memcpy(PyUnicode_1BYTE_DATA(u), s, pos);
|
||||||
|
s += pos;
|
||||||
|
size -= pos;
|
||||||
|
writer.pos = pos;
|
||||||
|
}
|
||||||
|
|
||||||
if (unicode_decode_utf8_impl(&writer, starts, s, end,
|
if (unicode_decode_utf8_impl(&writer, starts, s, end,
|
||||||
error_handler, errors,
|
error_handler, errors,
|
||||||
|
@ -5268,7 +5480,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
|
||||||
const char *errors,
|
const char *errors,
|
||||||
Py_ssize_t *consumed)
|
Py_ssize_t *consumed)
|
||||||
{
|
{
|
||||||
return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
|
return unicode_decode_utf8(s, size,
|
||||||
|
errors ? _Py_ERROR_UNKNOWN : _Py_ERROR_STRICT,
|
||||||
|
errors, consumed);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue