diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2024-10-27-04-47-28.gh-issue-126024.XCQSqT.rst b/Misc/NEWS.d/next/Core_and_Builtins/2024-10-27-04-47-28.gh-issue-126024.XCQSqT.rst
new file mode 100644
index 00000000000..b41fff30433
--- /dev/null
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2024-10-27-04-47-28.gh-issue-126024.XCQSqT.rst
@@ -0,0 +1,2 @@
+Optimize decoding of short UTF-8 sequences containing non-ASCII characters
+by approximately 15%.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 562e3312b63..ab4f07ed054 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4979,39 +4979,228 @@ PyUnicode_DecodeUTF8(const char *s,
 #include "stringlib/codecs.h"
 #include "stringlib/undef.h"
 
+#if (SIZEOF_SIZE_T == 8)
 /* Mask to quickly check whether a C 'size_t' contains a
    non-ASCII, UTF8-encoded char. */
-#if (SIZEOF_SIZE_T == 8)
 # define ASCII_CHAR_MASK 0x8080808080808080ULL
+// Used to count code points in a UTF-8 string.
+# define VECTOR_0101     0x0101010101010101ULL
+# define VECTOR_00FF     0x00ff00ff00ff00ffULL
 #elif (SIZEOF_SIZE_T == 4)
 # define ASCII_CHAR_MASK 0x80808080U
+# define VECTOR_0101     0x01010101U
+# define VECTOR_00FF     0x00ff00ffU
 #else
 # error C 'size_t' size should be either 4 or 8!
 #endif
 
+#if (defined(__clang__) || defined(__GNUC__))
+#define HAVE_CTZ 1
+static inline unsigned int
+ctz(size_t v)
+{
+    return __builtin_ctzll((unsigned long long)v);
+}
+#elif defined(_MSC_VER)
+#define HAVE_CTZ 1
+static inline unsigned int
+ctz(size_t v)
+{
+    unsigned long pos;
+#if SIZEOF_SIZE_T == 4
+    _BitScanForward(&pos, v);
+#else
+    _BitScanForward64(&pos, v);
+#endif /* SIZEOF_SIZE_T */
+    return pos;
+}
+#endif
+
+#if HAVE_CTZ
+// Load p[0]..p[size-1] as a little-endian size_t
+// without unaligned access or reading ahead.
+static size_t
+load_unaligned(const unsigned char *p, size_t size)
+{
+    assert(size <= SIZEOF_SIZE_T);
+    union {
+        size_t s;
+        unsigned char b[SIZEOF_SIZE_T];
+    } u;
+    u.s = 0;
+    switch (size) {
+    case 8:
+        u.b[7] = p[7];
+        _Py_FALLTHROUGH;
+    case 7:
+        u.b[6] = p[6];
+        _Py_FALLTHROUGH;
+    case 6:
+        u.b[5] = p[5];
+        _Py_FALLTHROUGH;
+    case 5:
+        u.b[4] = p[4];
+        _Py_FALLTHROUGH;
+    case 4:
+        u.b[3] = p[3];
+        _Py_FALLTHROUGH;
+    case 3:
+        u.b[2] = p[2];
+        _Py_FALLTHROUGH;
+    case 2:
+        u.b[1] = p[1];
+        _Py_FALLTHROUGH;
+    case 1:
+        u.b[0] = p[0];
+        break;
+    case 0:
+        break;
+    default:
+        Py_UNREACHABLE();
+    }
+    return u.s;
+}
+#endif
+
+/*
+ * Find the first non-ASCII character in a byte sequence.
+ *
+ * This function scans a range of bytes from `start` to `end` and returns the
+ * index of the first byte that is not an ASCII character (i.e., has the most
+ * significant bit set). If all characters in the range are ASCII, it returns
+ * `end - start`.
+ */
+static Py_ssize_t
+find_first_nonascii(const unsigned char *start, const unsigned char *end)
+{
+    const unsigned char *p = start;
+
+    if (end - start >= SIZEOF_SIZE_T) {
+        const unsigned char *p2 = _Py_ALIGN_UP(p, SIZEOF_SIZE_T);
+        if (p < p2) {
+#if HAVE_CTZ
+#if defined(_M_AMD64) || defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
+            // x86 and amd64 are little-endian and can load unaligned memory.
+            size_t u = *(const size_t*)p & ASCII_CHAR_MASK;
+#else
+            size_t u = load_unaligned(p, p2 - p) & ASCII_CHAR_MASK;
+#endif
+            if (u) {
+                return p - start + (ctz(u) - 7) / 8;
+            }
+            p = p2;
+        }
+#else
+        while (p < p2) {
+            if (*p & 0x80) {
+                return p - start;
+            }
+            p++;
+        }
+#endif
+        const unsigned char *e = end - SIZEOF_SIZE_T;
+        while (p <= e) {
+            size_t u = (*(const size_t *)p) & ASCII_CHAR_MASK;
+            if (u) {
+#if PY_LITTLE_ENDIAN && HAVE_CTZ
+                return p - start + (ctz(u) - 7) / 8;
+#else
+                // Big-endian platforms and less common compilers are hard to test;
+                // fall back to a per-byte check.
+                break;
+#endif
+            }
+            p += SIZEOF_SIZE_T;
+        }
+    }
+#if HAVE_CTZ
+    // We cannot use *(const size_t*)p here; it could read past the end of the buffer.
+    size_t u = load_unaligned(p, end - p) & ASCII_CHAR_MASK;
+    if (u) {
+        return p - start + (ctz(u) - 7) / 8;
+    }
+    return end - start;
+#else
+    while (p < end) {
+        if (*p & 0x80) {
+            break;
+        }
+        p++;
+    }
+    return p - start;
+#endif
+}
+
+static inline int
+scalar_utf8_start_char(unsigned int ch)
+{
+    // A byte of the form 0xxxxxxx or 11xxxxxx is the first byte of a code point.
+    return (~ch >> 7 | ch >> 6) & 1;
+}
+
+static inline size_t
+vector_utf8_start_chars(size_t v)
+{
+    return ((~v >> 7) | (v >> 6)) & VECTOR_0101;
+}
+
+
+// Count the number of UTF-8 code points in a given byte sequence.
+static Py_ssize_t
+utf8_count_codepoints(const unsigned char *s, const unsigned char *end)
+{
+    Py_ssize_t len = 0;
+
+    if (end - s >= SIZEOF_SIZE_T) {
+        while (!_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
+            len += scalar_utf8_start_char(*s++);
+        }
+
+        while (s + SIZEOF_SIZE_T <= end) {
+            const unsigned char *e = end;
+            if (e - s > SIZEOF_SIZE_T * 255) {
+                e = s + SIZEOF_SIZE_T * 255;
+            }
+            Py_ssize_t vstart = 0;
+            while (s + SIZEOF_SIZE_T <= e) {
+                size_t v = *(size_t*)s;
+                size_t vs = vector_utf8_start_chars(v);
+                vstart += vs;
+                s += SIZEOF_SIZE_T;
+            }
+            vstart = (vstart & VECTOR_00FF) + ((vstart >> 8) & VECTOR_00FF);
+            vstart += vstart >> 16;
+#if SIZEOF_SIZE_T == 8
+            vstart += vstart >> 32;
+#endif
+            len += vstart & 0x7ff;
+        }
+    }
+    while (s < end) {
+        len += scalar_utf8_start_char(*s++);
+    }
+    return len;
+}
+
 static Py_ssize_t
 ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
 {
-    const char *p = start;
-
 #if SIZEOF_SIZE_T <= SIZEOF_VOID_P
-    if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)
+    if (_Py_IS_ALIGNED(start, ALIGNOF_SIZE_T)
         && _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
     {
         /* Fast path, see in STRINGLIB(utf8_decode) for
            an explanation. */
-        /* Help allocation */
-        const char *_p = p;
-        Py_UCS1 * q = dest;
-        while (_p + SIZEOF_SIZE_T <= end) {
-            size_t value = *(const size_t *) _p;
+        const char *p = start;
+        Py_UCS1 *q = dest;
+        while (p + SIZEOF_SIZE_T <= end) {
+            size_t value = *(const size_t *) p;
             if (value & ASCII_CHAR_MASK)
                 break;
             *((size_t *)q) = value;
-            _p += SIZEOF_SIZE_T;
+            p += SIZEOF_SIZE_T;
             q += SIZEOF_SIZE_T;
         }
-        p = _p;
         while (p < end) {
             if ((unsigned char)*p & 0x80)
                 break;
@@ -5020,31 +5209,12 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
         return p - start;
     }
 #endif
-    while (p < end) {
-        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
-           for an explanation. */
-        if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
-            /* Help allocation */
-            const char *_p = p;
-            while (_p + SIZEOF_SIZE_T <= end) {
-                size_t value = *(const size_t *) _p;
-                if (value & ASCII_CHAR_MASK)
-                    break;
-                _p += SIZEOF_SIZE_T;
-            }
-            p = _p;
-            if (_p == end)
-                break;
-        }
-        if ((unsigned char)*p & 0x80)
-            break;
-        ++p;
-    }
-    memcpy(dest, start, p - start);
-    return p - start;
+    Py_ssize_t pos = find_first_nonascii((const unsigned char*)start,
+                                         (const unsigned char*)end);
+    memcpy(dest, start, pos);
+    return pos;
 }
-
 static int
 unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
                          const char *starts, const char *s, const char *end,
@@ -5188,27 +5358,69 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
         return get_latin1_char((unsigned char)s[0]);
     }
 
-    // fast path: try ASCII string.
-    const char *starts = s;
-    const char *end = s + size;
-    PyObject *u = PyUnicode_New(size, 127);
-    if (u == NULL) {
+    // It is not clear whether this check is strictly necessary, but there is
+    // a test case that requires size=PY_SSIZE_T_MAX to raise MemoryError.
+    if (PY_SSIZE_T_MAX - sizeof(PyCompactUnicodeObject) < (size_t)size) {
+        PyErr_NoMemory();
         return NULL;
     }
-    Py_ssize_t decoded = ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
-    if (decoded == size) {
+
+    const char *starts = s;
+    const char *end = s + size;
+
+    Py_ssize_t pos = find_first_nonascii((const unsigned char*)starts, (const unsigned char*)end);
+    if (pos == size) {  // fast path: ASCII string.
+        PyObject *u = PyUnicode_New(size, 127);
+        if (u == NULL) {
+            return NULL;
+        }
+        memcpy(PyUnicode_1BYTE_DATA(u), s, size);
         if (consumed) {
             *consumed = size;
         }
        return u;
     }
-    s += decoded;
-    size -= decoded;
+
+    int maxchr = 127;
+    Py_ssize_t maxsize = size;
+
+    unsigned char ch = (unsigned char)(s[pos]);
+    // An error handler other than strict may remove or replace invalid bytes.
+    // consumed != NULL allows 1-3 trailing bytes to remain undecoded.
+    // 0x80 <= ch < 0xc2 is an invalid start byte that causes UnicodeDecodeError.
+    // Otherwise, inspect the input and choose maxchr and maxsize to reduce
+    // reallocation and copying.
+    if (error_handler == _Py_ERROR_STRICT && !consumed && ch >= 0xc2) {
+        // We only count the number of code points and do not determine the exact
+        // maxchr, because writing fast and portable SIMD code to find maxchr is
+        // difficult. If a reallocation to a larger maxchar does occur, knowing the
+        // exact number of code points means we no longer allocate several times
+        // the required amount of memory.
+        maxsize = utf8_count_codepoints((const unsigned char *)s, (const unsigned char *)end);
+        if (ch < 0xc4) { // latin1
+            maxchr = 0xff;
+        }
+        else if (ch < 0xf0) { // ucs2
+            maxchr = 0xffff;
+        }
+        else { // ucs4
+            maxchr = 0x10ffff;
+        }
+    }
+    PyObject *u = PyUnicode_New(maxsize, maxchr);
+    if (!u) {
+        return NULL;
+    }
 
     // Use _PyUnicodeWriter after fast path is failed.
     _PyUnicodeWriter writer;
     _PyUnicodeWriter_InitWithBuffer(&writer, u);
-    writer.pos = decoded;
+    if (maxchr <= 255) {
+        memcpy(PyUnicode_1BYTE_DATA(u), s, pos);
+        s += pos;
+        size -= pos;
+        writer.pos = pos;
+    }
 
     if (unicode_decode_utf8_impl(&writer, starts, s, end,
                                  error_handler, errors,
@@ -5268,7 +5480,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
                              const char *errors,
                              Py_ssize_t *consumed)
 {
-    return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
+    return unicode_decode_utf8(s, size,
+                               errors ? _Py_ERROR_UNKNOWN : _Py_ERROR_STRICT,
+                               errors, consumed);
 }
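
The core trick in find_first_nonascii() above is to test a whole word at a time: ASCII bytes have the top bit clear, so AND-ing eight (or four) bytes with ASCII_CHAR_MASK leaves a non-zero value exactly when a non-ASCII byte is present, and a count-trailing-zeros then pinpoints its offset. The standalone sketch below is not part of the patch; it assumes a 64-bit little-endian target with GCC or Clang (__builtin_ctzll) and omits the alignment, endianness and short-tail handling the real function performs.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MASK_HI 0x8080808080808080ULL   /* same role as ASCII_CHAR_MASK */

static size_t
first_nonascii_sketch(const unsigned char *s, size_t n)
{
    size_t i = 0;
    /* Check eight bytes per iteration: a byte is non-ASCII iff its top
     * bit is set, so AND-ing the whole word with MASK_HI flags them all. */
    for (; i + 8 <= n; i += 8) {
        uint64_t w;
        memcpy(&w, s + i, 8);               /* unaligned-safe load */
        uint64_t u = w & MASK_HI;
        if (u) {
            /* The lowest surviving bit sits at bit 8*k + 7 of byte k
             * (little endian), so (ctz(u) - 7) / 8 recovers k. */
            return i + (__builtin_ctzll(u) - 7) / 8;
        }
    }
    for (; i < n; i++) {                    /* scalar tail */
        if (s[i] & 0x80) {
            return i;
        }
    }
    return n;                               /* all ASCII */
}

int main(void)
{
    const unsigned char data[] = "abcd\xc3\xa9" "fghijkl";
    /* prints 4: the index of the 0xc3 lead byte */
    printf("%zu\n", first_nonascii_sketch(data, sizeof(data) - 1));
    return 0;
}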
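
utf8_count_codepoints() relies on the fact that every UTF-8 code point contributes exactly one non-continuation byte: bytes of the form 0xxxxxxx or 11xxxxxx start a code point, 10xxxxxx bytes never do, and ((~b >> 7) | (b >> 6)) & 1 evaluates to 1 precisely for start bytes. The sketch below is not part of the patch; it applies the same per-byte predicate word-at-a-time but substitutes a multiply-based horizontal sum for the batched VECTOR_00FF reduction used above, and assumes 64-bit words with unaligned loads via memcpy.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static size_t
count_codepoints_sketch(const unsigned char *s, size_t n)
{
    const uint64_t ones = 0x0101010101010101ULL;   /* VECTOR_0101 for 8-byte words */
    size_t count = 0, i = 0;

    for (; i + 8 <= n; i += 8) {
        uint64_t v;
        memcpy(&v, s + i, 8);
        /* Low bit of each byte becomes 1 for a start byte (0xxxxxxx or
         * 11xxxxxx) and 0 for a continuation byte (10xxxxxx). */
        uint64_t starts = ((~v >> 7) | (v >> 6)) & ones;
        /* Horizontal sum of eight 0/1 bytes: the top byte of the product
         * holds their total (at most 8, so no carry is lost). */
        count += (starts * ones) >> 56;
    }
    for (; i < n; i++) {
        unsigned int ch = s[i];
        count += ((~ch >> 7) | (ch >> 6)) & 1;     /* scalar_utf8_start_char */
    }
    return count;
}

int main(void)
{
    /* "naïve café": 10 code points encoded in 12 UTF-8 bytes */
    const char *text = "na\xc3\xafve caf\xc3\xa9";
    printf("%zu\n", count_codepoints_sketch((const unsigned char *)text, strlen(text)));
    return 0;
}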
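
The maxchr guess in unicode_decode_utf8() follows from the structure of UTF-8 lead bytes: a 0xC2 or 0xC3 lead byte can only introduce U+0080..U+00FF, lead bytes up to 0xEF stay at or below U+FFFF, and only 0xF0..0xF4 can reach beyond the BMP. The helper below is a hypothetical illustration of that mapping, not code from the patch; the guess is only a bound derived from the first non-ASCII byte, so a later, wider character can still force the writer to widen the buffer, which is why the exact code point count is precomputed.

#include <stdio.h>

static unsigned int
maxchar_guess_from_lead(unsigned char lead)
{
    if (lead < 0xc2) {
        return 0;          /* 0x80..0xC1 can never start a valid sequence */
    }
    if (lead < 0xc4) {
        return 0xff;       /* C2/C3 lead bytes decode to U+0080..U+00FF (Latin-1) */
    }
    if (lead < 0xf0) {
        return 0xffff;     /* C4..EF lead bytes decode to at most U+FFFF (UCS-2) */
    }
    return 0x10ffff;       /* F0..F4 lead bytes may decode beyond U+FFFF (UCS-4) */
}

int main(void)
{
    printf("%#x %#x %#x\n",
           maxchar_guess_from_lead(0xc3),   /* 0xff */
           maxchar_guess_from_lead(0xe3),   /* 0xffff */
           maxchar_guess_from_lead(0xf0));  /* 0x10ffff */
    return 0;
}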