Issue #13417: speed up utf-8 decoding by around 2x for the non-fully-ASCII case.

This almost catches up with pre-PEP 393 performance, when decoding needed
only one pass.
This commit is contained in:
Antoine Pitrou 2011-11-21 20:39:13 +01:00
parent 7fe601c5bf
commit 0a3229de6b
3 changed files with 278 additions and 107 deletions

View File

@ -645,6 +645,7 @@ BYTESTR_DEPS = \
UNICODE_DEPS = $(BYTESTR_DEPS) \
$(srcdir)/Objects/stringlib/asciilib.h \
$(srcdir)/Objects/stringlib/codecs.h \
$(srcdir)/Objects/stringlib/ucs1lib.h \
$(srcdir)/Objects/stringlib/ucs2lib.h \
$(srcdir)/Objects/stringlib/ucs4lib.h \

156
Objects/stringlib/codecs.h Normal file
View File

@ -0,0 +1,156 @@
/* stringlib: codec implementations */
#if STRINGLIB_IS_UNICODE
/* Mask to check or force alignment of a pointer to C 'long' boundaries */
#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
/* Mask to quickly check whether a C 'long' contains a
non-ASCII, UTF8-encoded char. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080L
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080L
#else
# error C 'long' size should be either 4 or 8!
#endif
/* Fast-path UTF-8 decoder: decode the byte range [start, end) into `dest`,
   whose element width is this stringlib's STRINGLIB_CHAR (Py_UCS1/2/4
   depending on which stringlib variant includes this file).  `dest` must
   be large enough for the decoded output — the caller is expected to have
   pre-sized it from a prior scan of the input.

   Returns 0 if the entire input decoded cleanly, -1 on the first invalid
   or truncated sequence.  In BOTH cases *src_pos receives the first
   unconsumed input byte and *dest_index the number of characters already
   written, so on failure the caller can resume with a slower,
   error-handling decoder from that point. */
Py_LOCAL_INLINE(int)
STRINGLIB(utf8_try_decode)(const char *start, const char *end,
STRINGLIB_CHAR *dest,
const char **src_pos, Py_ssize_t *dest_index)
{
int ret;
Py_ssize_t n;
const char *s = start;
/* End of the region that can be read one whole aligned C 'long' at a
   time; past this point only byte-wise reads are safe. */
const char *aligned_end = (const char *) ((size_t) end & ~LONG_PTR_MASK);
STRINGLIB_CHAR *p = dest;
while (s < end) {
Py_UCS4 ch = (unsigned char)*s;
if (ch < 0x80) {
/* Fast path for runs of ASCII characters. Given that common UTF-8
input will consist of an overwhelming majority of ASCII
characters, we try to optimize for this case by checking
as many characters as a C 'long' can contain.
First, check if we can do an aligned read, as most CPUs have
a penalty for unaligned reads.
*/
if (!((size_t) s & LONG_PTR_MASK)) {
/* Help register allocation */
register const char *_s = s;
register STRINGLIB_CHAR *_p = p;
while (_s < aligned_end) {
/* Read a whole long at a time (either 4 or 8 bytes),
and do a fast unrolled copy if it only contains ASCII
characters. */
unsigned long value = *(unsigned long *) _s;
/* If any byte has its high bit set, a non-ASCII byte is
   somewhere in this word: fall back to byte-wise decoding. */
if (value & ASCII_CHAR_MASK)
break;
_p[0] = _s[0];
_p[1] = _s[1];
_p[2] = _s[2];
_p[3] = _s[3];
#if (SIZEOF_LONG == 8)
_p[4] = _s[4];
_p[5] = _s[5];
_p[6] = _s[6];
_p[7] = _s[7];
#endif
_s += SIZEOF_LONG;
_p += SIZEOF_LONG;
}
s = _s;
p = _p;
if (s == end)
break;
/* Re-load the byte that stopped the word loop. */
ch = (unsigned char)*s;
}
}
/* Scalar ASCII copy (also handles the unaligned prefix). */
if (ch < 0x80) {
s++;
*p++ = ch;
continue;
}
/* utf8_code_length (declared elsewhere, presumably by the including
   translation unit) maps a lead byte to the total length of its
   sequence; 0 marks an invalid lead byte, 1 an ASCII byte that was
   already handled above. */
n = utf8_code_length[ch];
if (s + n > end) {
/* unexpected end of data: the caller will decide whether
it's an error or not */
goto _error;
}
switch (n) {
case 0:
/* invalid start byte */
goto _error;
case 1:
/* internal error */
goto _error;
case 2:
if ((s[1] & 0xc0) != 0x80)
/* invalid continuation byte */
goto _error;
ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
assert ((ch > 0x007F) && (ch <= 0x07FF));
s += 2;
*p++ = ch;
break;
case 3:
/* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
will result in surrogates in range d800-dfff. Surrogates are
not valid UTF-8 so they are rejected.
See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
(table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
if ((s[1] & 0xc0) != 0x80 ||
(s[2] & 0xc0) != 0x80 ||
((unsigned char)s[0] == 0xE0 &&
(unsigned char)s[1] < 0xA0) ||
((unsigned char)s[0] == 0xED &&
(unsigned char)s[1] > 0x9F)) {
/* invalid continuation byte */
goto _error;
}
ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
assert ((ch > 0x07FF) && (ch <= 0xFFFF));
s += 3;
*p++ = ch;
break;
case 4:
/* The F0/90 and F4/8F bounds reject overlong encodings below
   U+10000 and code points above U+10FFFF (Unicode table 3-7). */
if ((s[1] & 0xc0) != 0x80 ||
(s[2] & 0xc0) != 0x80 ||
(s[3] & 0xc0) != 0x80 ||
((unsigned char)s[0] == 0xF0 &&
(unsigned char)s[1] < 0x90) ||
((unsigned char)s[0] == 0xF4 &&
(unsigned char)s[1] > 0x8F)) {
/* invalid continuation byte */
goto _error;
}
ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
s += 4;
*p++ = ch;
break;
}
}
ret = 0;
goto _ok;
_error:
ret = -1;
_ok:
/* Report progress regardless of success so the caller can resume. */
*src_pos = s;
*dest_index = p - dest;
return ret;
}
#undef LONG_PTR_MASK
#undef ASCII_CHAR_MASK
#endif /* STRINGLIB_IS_UNICODE */

View File

@ -523,6 +523,7 @@ make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
#include "stringlib/fastsearch.h"
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/undef.h"
/* --- Unicode Object ----------------------------------------------------- */
@ -4190,6 +4191,18 @@ PyUnicode_DecodeUTF8(const char *s,
return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
}
#include "stringlib/ucs1lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"
#include "stringlib/ucs2lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"
#include "stringlib/ucs4lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"
/* Mask to check or force alignment of a pointer to C 'long' boundaries */
#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
@ -4203,33 +4216,41 @@ PyUnicode_DecodeUTF8(const char *s,
# error C 'long' size should be either 4 or 8!
#endif
/* Scans a UTF-8 string and returns the maximum character to be expected,
the size of the decoded unicode string and if any major errors were
encountered.
/* Scans a UTF-8 string and returns the maximum character to be expected
and the size of the decoded unicode string.
This function does check basic UTF-8 sanity, it does however NOT CHECK
if the string contains surrogates, and if all continuation bytes are
within the correct ranges, these checks are performed in
This function doesn't check for errors, these checks are performed in
PyUnicode_DecodeUTF8Stateful.
If it sets has_errors to 1, it means the value of unicode_size and max_char
will be bogus and you should not rely on useful information in them.
*/
static Py_UCS4
utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
Py_ssize_t *unicode_size, Py_ssize_t* consumed,
int *has_errors)
utf8_max_char_size_and_char_count(const char *s, Py_ssize_t string_size,
Py_ssize_t *unicode_size)
{
Py_ssize_t n;
Py_ssize_t char_count = 0;
Py_UCS4 max_char = 127, new_max;
Py_UCS4 upper_bound;
const unsigned char *p = (const unsigned char *)s;
const unsigned char *end = p + string_size;
const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
int err = 0;
for (; p < end && !err; ++p, ++char_count) {
assert(unicode_size != NULL);
/* By having a cascade of independent loops which fallback onto each
other, we minimize the amount of work done in the average loop
iteration, and we also maximize the CPU's ability to predict
branches correctly (because a given condition will have always the
same boolean outcome except perhaps in the last iteration of the
corresponding loop).
In the general case this brings us rather close to decoding
performance pre-PEP 393, despite the two-pass decoding.
Note that the pure ASCII loop is not duplicated once a non-ASCII
character has been encountered. It is actually a pessimization (by
a significant factor) to use this loop on text with many non-ASCII
characters, and it is important to avoid bad performance on valid
utf-8 data (invalid utf-8 being a different can of worms).
*/
/* ASCII */
for (; p < end; ++p) {
/* Only check value if it's not a ASCII char... */
if (*p < 0x80) {
/* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
@ -4249,76 +4270,59 @@ utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
break;
}
}
if (*p >= 0x80) {
n = utf8_code_length[*p];
new_max = max_char;
switch (n) {
/* invalid start byte */
case 0:
err = 1;
break;
case 2:
/* Code points between 0x00FF and 0x07FF inclusive.
Approximate the upper bound of the code point,
if this flips over 255 we can be sure it will be more
than 255 and the string will need 2 bytes per code point,
if it stays under or equal to 255, we can be sure 1 byte
is enough.
((*p & 0b00011111) << 6) | 0b00111111 */
upper_bound = ((*p & 0x1F) << 6) | 0x3F;
if (max_char < upper_bound)
new_max = upper_bound;
/* Ensure we track at least that we left ASCII space. */
if (new_max < 128)
new_max = 128;
break;
case 3:
/* Between 0x0FFF and 0xFFFF inclusive, so values are
always > 255 and <= 65535 and will always need 2 bytes. */
if (max_char < 65535)
new_max = 65535;
break;
case 4:
/* Code point will be above 0xFFFF for sure in this case. */
new_max = 65537;
break;
/* Internal error, this should be caught by the first if */
case 1:
default:
assert(0 && "Impossible case in utf8_max_char_and_size");
err = 1;
}
/* Instead of number of overall bytes for this code point,
n contains the number of following bytes: */
--n;
/* Check if the follow up chars are all valid continuation bytes */
if (n >= 1) {
const unsigned char *cont;
if ((p + n) >= end) {
if (consumed == 0)
/* incomplete data, non-incremental decoding */
err = 1;
break;
}
for (cont = p + 1; cont <= (p + n); ++cont) {
if ((*cont & 0xc0) != 0x80) {
err = 1;
break;
}
}
p += n;
}
else
err = 1;
max_char = new_max;
}
if (*p < 0x80)
++char_count;
else
goto _ucs1loop;
}
*unicode_size = char_count;
return 127;
if (unicode_size)
*unicode_size = char_count;
if (has_errors)
*has_errors = err;
return max_char;
_ucs1loop:
for (; p < end; ++p) {
if (*p < 0xc4)
char_count += ((*p & 0xc0) != 0x80);
else
goto _ucs2loop;
}
*unicode_size = char_count;
return 255;
_ucs2loop:
for (; p < end; ++p) {
if (*p < 0xf0)
char_count += ((*p & 0xc0) != 0x80);
else
goto _ucs4loop;
}
*unicode_size = char_count;
return 65535;
_ucs4loop:
for (; p < end; ++p) {
char_count += ((*p & 0xc0) != 0x80);
}
*unicode_size = char_count;
return 65537;
}
/* Called when we encountered some error that wasn't detected in the
   original scan, e.g. an encoded surrogate character.  The original
   maxchar computation may have been incorrect, so recompute it from the
   `n` characters already decoded (read from `data` with width `kind`)
   and move them into a correctly sized string object.

   On success, *unicode is replaced with the refitted object (the old
   reference is released) and 0 is returned; on failure *unicode is left
   untouched and -1 is returned. */
static int
refit_partial_string(PyObject **unicode, int kind, void *data, Py_ssize_t n)
{
    PyObject *tmp;
    Py_ssize_t k;
    /* PyUnicode_READ yields Py_UCS4 values, so track the maximum in an
       unsigned character type rather than Py_ssize_t. */
    Py_UCS4 maxchar = 0;

    for (k = 0; k < n; k++)
        maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
    tmp = PyUnicode_New(PyUnicode_GET_LENGTH(*unicode), maxchar);
    if (tmp == NULL)
        return -1;
    /* Copy the already decoded prefix; check for failure instead of
       ignoring the return value, and keep *unicode intact on error. */
    if (PyUnicode_CopyCharacters(tmp, 0, *unicode, 0, n) < 0) {
        Py_DECREF(tmp);
        return -1;
    }
    Py_DECREF(*unicode);
    *unicode = tmp;
    return 0;
}
/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
@ -4361,35 +4365,56 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
Py_ssize_t i;
int kind;
void *data;
int has_errors;
int has_errors = 0;
if (size == 0) {
if (consumed)
*consumed = 0;
return (PyObject *)PyUnicode_New(0, 0);
}
maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
consumed, &has_errors);
if (has_errors)
/* maxchar and size computation might be incorrect;
code below widens and resizes as necessary. */
unicode = PyUnicode_New(size, 127);
else
unicode = PyUnicode_New(unicode_size, maxchar);
maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
/* In case of errors, maxchar and size computation might be incorrect;
code below refits and resizes as necessary. */
unicode = PyUnicode_New(unicode_size, maxchar);
if (!unicode)
return NULL;
/* When the string is ASCII only, just use memcpy and return.
unicode_size may be != size if there is an incomplete UTF-8
sequence at the end of the ASCII block. */
if (!has_errors && maxchar < 128 && size == unicode_size) {
if (maxchar < 128 && size == unicode_size) {
Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
return unicode;
}
kind = PyUnicode_KIND(unicode);
data = PyUnicode_DATA(unicode);
/* Unpack UTF-8 encoded data */
i = 0;
e = s + size;
switch (kind) {
case PyUnicode_1BYTE_KIND:
has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
break;
case PyUnicode_2BYTE_KIND:
has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
break;
case PyUnicode_4BYTE_KIND:
has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
break;
}
if (!has_errors) {
/* Ensure the unicode size calculation was correct */
assert(i == unicode_size);
assert(s == e);
if (consumed)
*consumed = s-starts;
return unicode;
}
/* Fall through to the generic decoding loop for the rest of
the string */
if (refit_partial_string(&unicode, kind, data, i) < 0)
goto onError;
aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
while (s < e) {
@ -4541,19 +4566,8 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
utf8Error:
if (!has_errors) {
PyObject *tmp;
Py_ssize_t k;
/* We encountered some error that wasn't detected in the original scan,
e.g. an encoded surrogate character. The original maxchar computation may
have been incorrect, so redo it now. */
for (k = 0, maxchar = 0; k < i; k++)
maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
tmp = PyUnicode_New(PyUnicode_GET_LENGTH(unicode), maxchar);
if (tmp == NULL)
if (refit_partial_string(&unicode, kind, data, i) < 0)
goto onError;
PyUnicode_CopyCharacters(tmp, 0, unicode, 0, i);
Py_DECREF(unicode);
unicode = tmp;
has_errors = 1;
}
if (unicode_decode_call_errorhandler(