Issue #13417: speed up utf-8 decoding by around 2x for the non-fully-ASCII case.
This almost catches up with pre-PEP 393 performance, when decoding needed only one pass.
This commit is contained in:
parent
7fe601c5bf
commit
0a3229de6b
|
@ -645,6 +645,7 @@ BYTESTR_DEPS = \
|
|||
|
||||
UNICODE_DEPS = $(BYTESTR_DEPS) \
|
||||
$(srcdir)/Objects/stringlib/asciilib.h \
|
||||
$(srcdir)/Objects/stringlib/codecs.h \
|
||||
$(srcdir)/Objects/stringlib/ucs1lib.h \
|
||||
$(srcdir)/Objects/stringlib/ucs2lib.h \
|
||||
$(srcdir)/Objects/stringlib/ucs4lib.h \
|
||||
|
|
|
@ -0,0 +1,156 @@
|
|||
/* stringlib: codec implementations */
|
||||
|
||||
#if STRINGLIB_IS_UNICODE
|
||||
|
||||
/* Mask to check or force alignment of a pointer to C 'long' boundaries */
|
||||
#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
|
||||
|
||||
/* Mask to quickly check whether a C 'long' contains a
|
||||
non-ASCII, UTF8-encoded char. */
|
||||
#if (SIZEOF_LONG == 8)
|
||||
# define ASCII_CHAR_MASK 0x8080808080808080L
|
||||
#elif (SIZEOF_LONG == 4)
|
||||
# define ASCII_CHAR_MASK 0x80808080L
|
||||
#else
|
||||
# error C 'long' size should be either 4 or 8!
|
||||
#endif
|
||||
|
||||
Py_LOCAL_INLINE(int)
|
||||
STRINGLIB(utf8_try_decode)(const char *start, const char *end,
|
||||
STRINGLIB_CHAR *dest,
|
||||
const char **src_pos, Py_ssize_t *dest_index)
|
||||
{
|
||||
int ret;
|
||||
Py_ssize_t n;
|
||||
const char *s = start;
|
||||
const char *aligned_end = (const char *) ((size_t) end & ~LONG_PTR_MASK);
|
||||
STRINGLIB_CHAR *p = dest;
|
||||
|
||||
while (s < end) {
|
||||
Py_UCS4 ch = (unsigned char)*s;
|
||||
|
||||
if (ch < 0x80) {
|
||||
/* Fast path for runs of ASCII characters. Given that common UTF-8
|
||||
input will consist of an overwhelming majority of ASCII
|
||||
characters, we try to optimize for this case by checking
|
||||
as many characters as a C 'long' can contain.
|
||||
First, check if we can do an aligned read, as most CPUs have
|
||||
a penalty for unaligned reads.
|
||||
*/
|
||||
if (!((size_t) s & LONG_PTR_MASK)) {
|
||||
/* Help register allocation */
|
||||
register const char *_s = s;
|
||||
register STRINGLIB_CHAR *_p = p;
|
||||
while (_s < aligned_end) {
|
||||
/* Read a whole long at a time (either 4 or 8 bytes),
|
||||
and do a fast unrolled copy if it only contains ASCII
|
||||
characters. */
|
||||
unsigned long value = *(unsigned long *) _s;
|
||||
if (value & ASCII_CHAR_MASK)
|
||||
break;
|
||||
_p[0] = _s[0];
|
||||
_p[1] = _s[1];
|
||||
_p[2] = _s[2];
|
||||
_p[3] = _s[3];
|
||||
#if (SIZEOF_LONG == 8)
|
||||
_p[4] = _s[4];
|
||||
_p[5] = _s[5];
|
||||
_p[6] = _s[6];
|
||||
_p[7] = _s[7];
|
||||
#endif
|
||||
_s += SIZEOF_LONG;
|
||||
_p += SIZEOF_LONG;
|
||||
}
|
||||
s = _s;
|
||||
p = _p;
|
||||
if (s == end)
|
||||
break;
|
||||
ch = (unsigned char)*s;
|
||||
}
|
||||
}
|
||||
|
||||
if (ch < 0x80) {
|
||||
s++;
|
||||
*p++ = ch;
|
||||
continue;
|
||||
}
|
||||
|
||||
n = utf8_code_length[ch];
|
||||
|
||||
if (s + n > end) {
|
||||
/* unexpected end of data: the caller will decide whether
|
||||
it's an error or not */
|
||||
goto _error;
|
||||
}
|
||||
|
||||
switch (n) {
|
||||
case 0:
|
||||
/* invalid start byte */
|
||||
goto _error;
|
||||
case 1:
|
||||
/* internal error */
|
||||
goto _error;
|
||||
case 2:
|
||||
if ((s[1] & 0xc0) != 0x80)
|
||||
/* invalid continuation byte */
|
||||
goto _error;
|
||||
ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
|
||||
assert ((ch > 0x007F) && (ch <= 0x07FF));
|
||||
s += 2;
|
||||
*p++ = ch;
|
||||
break;
|
||||
|
||||
case 3:
|
||||
/* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
|
||||
will result in surrogates in range d800-dfff. Surrogates are
|
||||
not valid UTF-8 so they are rejected.
|
||||
See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
|
||||
(table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
|
||||
if ((s[1] & 0xc0) != 0x80 ||
|
||||
(s[2] & 0xc0) != 0x80 ||
|
||||
((unsigned char)s[0] == 0xE0 &&
|
||||
(unsigned char)s[1] < 0xA0) ||
|
||||
((unsigned char)s[0] == 0xED &&
|
||||
(unsigned char)s[1] > 0x9F)) {
|
||||
/* invalid continuation byte */
|
||||
goto _error;
|
||||
}
|
||||
ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
|
||||
assert ((ch > 0x07FF) && (ch <= 0xFFFF));
|
||||
s += 3;
|
||||
*p++ = ch;
|
||||
break;
|
||||
|
||||
case 4:
|
||||
if ((s[1] & 0xc0) != 0x80 ||
|
||||
(s[2] & 0xc0) != 0x80 ||
|
||||
(s[3] & 0xc0) != 0x80 ||
|
||||
((unsigned char)s[0] == 0xF0 &&
|
||||
(unsigned char)s[1] < 0x90) ||
|
||||
((unsigned char)s[0] == 0xF4 &&
|
||||
(unsigned char)s[1] > 0x8F)) {
|
||||
/* invalid continuation byte */
|
||||
goto _error;
|
||||
}
|
||||
ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
|
||||
((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
|
||||
assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
|
||||
s += 4;
|
||||
*p++ = ch;
|
||||
break;
|
||||
}
|
||||
}
|
||||
ret = 0;
|
||||
goto _ok;
|
||||
_error:
|
||||
ret = -1;
|
||||
_ok:
|
||||
*src_pos = s;
|
||||
*dest_index = p - dest;
|
||||
return ret;
|
||||
}
|
||||
|
||||
#undef LONG_PTR_MASK
|
||||
#undef ASCII_CHAR_MASK
|
||||
|
||||
#endif /* STRINGLIB_IS_UNICODE */
|
|
@ -523,6 +523,7 @@ make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
|
|||
#include "stringlib/fastsearch.h"
|
||||
#include "stringlib/count.h"
|
||||
#include "stringlib/find.h"
|
||||
#include "stringlib/undef.h"
|
||||
|
||||
/* --- Unicode Object ----------------------------------------------------- */
|
||||
|
||||
|
@ -4190,6 +4191,18 @@ PyUnicode_DecodeUTF8(const char *s,
|
|||
return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
|
||||
}
|
||||
|
||||
#include "stringlib/ucs1lib.h"
|
||||
#include "stringlib/codecs.h"
|
||||
#include "stringlib/undef.h"
|
||||
|
||||
#include "stringlib/ucs2lib.h"
|
||||
#include "stringlib/codecs.h"
|
||||
#include "stringlib/undef.h"
|
||||
|
||||
#include "stringlib/ucs4lib.h"
|
||||
#include "stringlib/codecs.h"
|
||||
#include "stringlib/undef.h"
|
||||
|
||||
/* Mask to check or force alignment of a pointer to C 'long' boundaries */
|
||||
#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
|
||||
|
||||
|
@ -4203,33 +4216,41 @@ PyUnicode_DecodeUTF8(const char *s,
|
|||
# error C 'long' size should be either 4 or 8!
|
||||
#endif
|
||||
|
||||
/* Scans a UTF-8 string and returns the maximum character to be expected,
|
||||
the size of the decoded unicode string and if any major errors were
|
||||
encountered.
|
||||
/* Scans a UTF-8 string and returns the maximum character to be expected
|
||||
and the size of the decoded unicode string.
|
||||
|
||||
This function does check basic UTF-8 sanity, it does however NOT CHECK
|
||||
if the string contains surrogates, and if all continuation bytes are
|
||||
within the correct ranges, these checks are performed in
|
||||
This function doesn't check for errors, these checks are performed in
|
||||
PyUnicode_DecodeUTF8Stateful.
|
||||
|
||||
If it sets has_errors to 1, it means the value of unicode_size and max_char
|
||||
will be bogus and you should not rely on useful information in them.
|
||||
*/
|
||||
static Py_UCS4
|
||||
utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
|
||||
Py_ssize_t *unicode_size, Py_ssize_t* consumed,
|
||||
int *has_errors)
|
||||
utf8_max_char_size_and_char_count(const char *s, Py_ssize_t string_size,
|
||||
Py_ssize_t *unicode_size)
|
||||
{
|
||||
Py_ssize_t n;
|
||||
Py_ssize_t char_count = 0;
|
||||
Py_UCS4 max_char = 127, new_max;
|
||||
Py_UCS4 upper_bound;
|
||||
const unsigned char *p = (const unsigned char *)s;
|
||||
const unsigned char *end = p + string_size;
|
||||
const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
|
||||
int err = 0;
|
||||
|
||||
for (; p < end && !err; ++p, ++char_count) {
|
||||
assert(unicode_size != NULL);
|
||||
|
||||
/* By having a cascade of independent loops which fallback onto each
|
||||
other, we minimize the amount of work done in the average loop
|
||||
iteration, and we also maximize the CPU's ability to predict
|
||||
branches correctly (because a given condition will have always the
|
||||
same boolean outcome except perhaps in the last iteration of the
|
||||
corresponding loop).
|
||||
In the general case this brings us rather close to decoding
|
||||
performance pre-PEP 393, despite the two-pass decoding.
|
||||
|
||||
Note that the pure ASCII loop is not duplicated once a non-ASCII
|
||||
character has been encountered. It is actually a pessimization (by
|
||||
a significant factor) to use this loop on text with many non-ASCII
|
||||
characters, and it is important to avoid bad performance on valid
|
||||
utf-8 data (invalid utf-8 being a different can of worms).
|
||||
*/
|
||||
|
||||
/* ASCII */
|
||||
for (; p < end; ++p) {
|
||||
/* Only check value if it's not a ASCII char... */
|
||||
if (*p < 0x80) {
|
||||
/* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
|
||||
|
@ -4249,76 +4270,59 @@ utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
|
|||
break;
|
||||
}
|
||||
}
|
||||
if (*p >= 0x80) {
|
||||
n = utf8_code_length[*p];
|
||||
new_max = max_char;
|
||||
switch (n) {
|
||||
/* invalid start byte */
|
||||
case 0:
|
||||
err = 1;
|
||||
break;
|
||||
case 2:
|
||||
/* Code points between 0x00FF and 0x07FF inclusive.
|
||||
Approximate the upper bound of the code point,
|
||||
if this flips over 255 we can be sure it will be more
|
||||
than 255 and the string will need 2 bytes per code coint,
|
||||
if it stays under or equal to 255, we can be sure 1 byte
|
||||
is enough.
|
||||
((*p & 0b00011111) << 6) | 0b00111111 */
|
||||
upper_bound = ((*p & 0x1F) << 6) | 0x3F;
|
||||
if (max_char < upper_bound)
|
||||
new_max = upper_bound;
|
||||
/* Ensure we track at least that we left ASCII space. */
|
||||
if (new_max < 128)
|
||||
new_max = 128;
|
||||
break;
|
||||
case 3:
|
||||
/* Between 0x0FFF and 0xFFFF inclusive, so values are
|
||||
always > 255 and <= 65535 and will always need 2 bytes. */
|
||||
if (max_char < 65535)
|
||||
new_max = 65535;
|
||||
break;
|
||||
case 4:
|
||||
/* Code point will be above 0xFFFF for sure in this case. */
|
||||
new_max = 65537;
|
||||
break;
|
||||
/* Internal error, this should be caught by the first if */
|
||||
case 1:
|
||||
default:
|
||||
assert(0 && "Impossible case in utf8_max_char_and_size");
|
||||
err = 1;
|
||||
}
|
||||
/* Instead of number of overall bytes for this code point,
|
||||
n contains the number of following bytes: */
|
||||
--n;
|
||||
/* Check if the follow up chars are all valid continuation bytes */
|
||||
if (n >= 1) {
|
||||
const unsigned char *cont;
|
||||
if ((p + n) >= end) {
|
||||
if (consumed == 0)
|
||||
/* incomplete data, non-incremental decoding */
|
||||
err = 1;
|
||||
break;
|
||||
}
|
||||
for (cont = p + 1; cont <= (p + n); ++cont) {
|
||||
if ((*cont & 0xc0) != 0x80) {
|
||||
err = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
p += n;
|
||||
}
|
||||
else
|
||||
err = 1;
|
||||
max_char = new_max;
|
||||
}
|
||||
if (*p < 0x80)
|
||||
++char_count;
|
||||
else
|
||||
goto _ucs1loop;
|
||||
}
|
||||
*unicode_size = char_count;
|
||||
return 127;
|
||||
|
||||
if (unicode_size)
|
||||
*unicode_size = char_count;
|
||||
if (has_errors)
|
||||
*has_errors = err;
|
||||
return max_char;
|
||||
_ucs1loop:
|
||||
for (; p < end; ++p) {
|
||||
if (*p < 0xc4)
|
||||
char_count += ((*p & 0xc0) != 0x80);
|
||||
else
|
||||
goto _ucs2loop;
|
||||
}
|
||||
*unicode_size = char_count;
|
||||
return 255;
|
||||
|
||||
_ucs2loop:
|
||||
for (; p < end; ++p) {
|
||||
if (*p < 0xf0)
|
||||
char_count += ((*p & 0xc0) != 0x80);
|
||||
else
|
||||
goto _ucs4loop;
|
||||
}
|
||||
*unicode_size = char_count;
|
||||
return 65535;
|
||||
|
||||
_ucs4loop:
|
||||
for (; p < end; ++p) {
|
||||
char_count += ((*p & 0xc0) != 0x80);
|
||||
}
|
||||
*unicode_size = char_count;
|
||||
return 65537;
|
||||
}
|
||||
|
||||
/* Called when we encountered some error that wasn't detected in the original
|
||||
scan, e.g. an encoded surrogate character. The original maxchar computation
|
||||
may have been incorrect, so redo it. */
|
||||
static int
|
||||
refit_partial_string(PyObject **unicode, int kind, void *data, Py_ssize_t n)
|
||||
{
|
||||
PyObject *tmp;
|
||||
Py_ssize_t k, maxchar;
|
||||
for (k = 0, maxchar = 0; k < n; k++)
|
||||
maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
|
||||
tmp = PyUnicode_New(PyUnicode_GET_LENGTH(*unicode), maxchar);
|
||||
if (tmp == NULL)
|
||||
return -1;
|
||||
PyUnicode_CopyCharacters(tmp, 0, *unicode, 0, n);
|
||||
Py_DECREF(*unicode);
|
||||
*unicode = tmp;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
|
||||
|
@ -4361,35 +4365,56 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
|
|||
Py_ssize_t i;
|
||||
int kind;
|
||||
void *data;
|
||||
int has_errors;
|
||||
int has_errors = 0;
|
||||
|
||||
if (size == 0) {
|
||||
if (consumed)
|
||||
*consumed = 0;
|
||||
return (PyObject *)PyUnicode_New(0, 0);
|
||||
}
|
||||
maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
|
||||
consumed, &has_errors);
|
||||
if (has_errors)
|
||||
/* maxchar and size computation might be incorrect;
|
||||
code below widens and resizes as necessary. */
|
||||
unicode = PyUnicode_New(size, 127);
|
||||
else
|
||||
unicode = PyUnicode_New(unicode_size, maxchar);
|
||||
maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
|
||||
/* In case of errors, maxchar and size computation might be incorrect;
|
||||
code below refits and resizes as necessary. */
|
||||
unicode = PyUnicode_New(unicode_size, maxchar);
|
||||
if (!unicode)
|
||||
return NULL;
|
||||
/* When the string is ASCII only, just use memcpy and return.
|
||||
unicode_size may be != size if there is an incomplete UTF-8
|
||||
sequence at the end of the ASCII block. */
|
||||
if (!has_errors && maxchar < 128 && size == unicode_size) {
|
||||
if (maxchar < 128 && size == unicode_size) {
|
||||
Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
|
||||
return unicode;
|
||||
}
|
||||
kind = PyUnicode_KIND(unicode);
|
||||
data = PyUnicode_DATA(unicode);
|
||||
|
||||
/* Unpack UTF-8 encoded data */
|
||||
i = 0;
|
||||
e = s + size;
|
||||
switch (kind) {
|
||||
case PyUnicode_1BYTE_KIND:
|
||||
has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
|
||||
break;
|
||||
case PyUnicode_2BYTE_KIND:
|
||||
has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
|
||||
break;
|
||||
case PyUnicode_4BYTE_KIND:
|
||||
has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
|
||||
break;
|
||||
}
|
||||
if (!has_errors) {
|
||||
/* Ensure the unicode size calculation was correct */
|
||||
assert(i == unicode_size);
|
||||
assert(s == e);
|
||||
if (consumed)
|
||||
*consumed = s-starts;
|
||||
return unicode;
|
||||
}
|
||||
/* Fall through to the generic decoding loop for the rest of
|
||||
the string */
|
||||
if (refit_partial_string(&unicode, kind, data, i) < 0)
|
||||
goto onError;
|
||||
|
||||
aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
|
||||
|
||||
while (s < e) {
|
||||
|
@ -4541,19 +4566,8 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
|
|||
|
||||
utf8Error:
|
||||
if (!has_errors) {
|
||||
PyObject *tmp;
|
||||
Py_ssize_t k;
|
||||
/* We encountered some error that wasn't detected in the original scan,
|
||||
e.g. an encoded surrogate character. The original maxchar computation may
|
||||
have been incorrect, so redo it now. */
|
||||
for (k = 0, maxchar = 0; k < i; k++)
|
||||
maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
|
||||
tmp = PyUnicode_New(PyUnicode_GET_LENGTH(unicode), maxchar);
|
||||
if (tmp == NULL)
|
||||
if (refit_partial_string(&unicode, kind, data, i) < 0)
|
||||
goto onError;
|
||||
PyUnicode_CopyCharacters(tmp, 0, unicode, 0, i);
|
||||
Py_DECREF(unicode);
|
||||
unicode = tmp;
|
||||
has_errors = 1;
|
||||
}
|
||||
if (unicode_decode_call_errorhandler(
|
||||
|
|
Loading…
Reference in New Issue