Issue #4868: utf-8, utf-16 and latin1 decoding are now 2x to 4x faster. The
common cases are optimized thanks to a dedicated fast path and a moderate
amount of loop unrolling.

This will especially help text I/O (we already register a 30% speedup on large
reads on the io-c branch).
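
As background for the diff below, here is a minimal self-contained C sketch of the two ideas the message mentions: a word-at-a-time test for non-ASCII bytes on aligned input, and an unrolled widening copy when the test passes. It is illustrative only and not taken from the patch; the function name widen_ascii_runs and the unsigned int output are made up for the example, whereas the real code writes Py_UNICODE and still has to handle multi-byte UTF-8 sequences and decoding errors.

#include <stddef.h>

#define LONG_PTR_MASK ((size_t)(sizeof(long) - 1))
/* 0x80 repeated in every byte of an unsigned long, computed portably
   (the patch hard-codes it per word size as ASCII_CHAR_MASK). */
#define ASCII_CHAR_MASK (~0UL / 0xff * 0x80)

/* Widen a byte string into an array of code points, taking a
   word-at-a-time shortcut over aligned, all-ASCII spans.  Returns the
   number of values written. */
static size_t
widen_ascii_runs(const char *s, size_t size, unsigned int *out)
{
    const char *e = s + size;
    /* Last position up to which a whole aligned long can be read. */
    const char *aligned_end = (const char *)((size_t)e & ~LONG_PTR_MASK);
    unsigned int *p = out;

    while (s < e) {
        unsigned char ch = (unsigned char)*s;
        /* Only try the fast path on an aligned pointer: most CPUs pay a
           penalty (or fault) on unaligned word reads. */
        if (ch < 0x80 && !((size_t)s & LONG_PTR_MASK)) {
            while (s < aligned_end) {
                unsigned long data = *(const unsigned long *)s;
                if (data & ASCII_CHAR_MASK)
                    break;          /* some byte has its high bit set */
                /* All bytes are ASCII: copy them; the patch unrolls this
                   loop by hand for 4- and 8-byte longs. */
                for (size_t i = 0; i < sizeof(long); i++)
                    p[i] = (unsigned char)s[i];
                s += sizeof(long);
                p += sizeof(long);
            }
            if (s == e)
                break;
            ch = (unsigned char)*s;
        }
        /* Slow path, one byte at a time (a real decoder would parse
           multi-byte sequences here instead of copying the raw byte). */
        *p++ = ch;
        s++;
    }
    return (size_t)(p - out);
}
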
Antoine Pitrou 2009-01-10 15:40:25 +00:00
parent dd6351e6b1
commit ab868311a5
2 changed files with 211 additions and 24 deletions
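
The UTF-16 part of the diff relies on the heuristic documented in its comment ("Two masks for fast checking of whether a C 'long' may contain UTF16-encoded surrogate characters"). The following sketch, again illustrative rather than taken from the commit, restates that check as a small predicate, deriving the masks portably instead of hard-coding them per word size.

/* 0x8000 in every 16-bit lane: set for any code unit >= 0x8000, which
   includes all surrogates (0xD800..0xDFFF).  Matches the patch's
   FAST_CHAR_MASK values for 32- and 64-bit longs. */
#define FAST_CHAR_MASK (~0UL / 0xffff * 0x8000)
/* The same bit when the input is byteswapped: the high byte of each
   code unit is then stored in the other half of the pair. */
#define SWAPPED_FAST_CHAR_MASK (~0UL / 0xffff * 0x0080)

/* Conservative test: returns nonzero if any 16-bit unit in 'data' could
   be a surrogate (or any other code point >= 0x8000).  A zero result
   guarantees the word holds only BMP characters below 0x8000, so the
   fast copy loop may proceed; rare false positives simply fall back to
   the slow per-character path. */
static int
may_contain_surrogate(unsigned long data, int native_ordering)
{
    unsigned long mask = native_ordering ? FAST_CHAR_MASK
                                         : SWAPPED_FAST_CHAR_MASK;
    return (data & mask) != 0;
}
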

Misc/NEWS

@@ -12,6 +12,10 @@ What's New in Python 3.1 alpha 0
Core and Builtins
-----------------
- Issue #4868: utf-8, utf-16 and latin1 decoding are now 2x to 4x faster. The
common cases are optimized thanks to a dedicated fast path and a moderate
amount of loop unrolling.
- Issue #4074: Change the criteria for doing a full garbage collection (i.e.
collecting the oldest generation) so that allocating lots of objects without
destroying them does not show quadratic performance. Based on a proposal by

Objects/unicodeobject.c

@@ -2001,6 +2001,19 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
}
/* Mask to check or force alignment of a pointer to C 'long' boundaries */
#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
/* Mask to quickly check whether a C 'long' contains a
non-ASCII, UTF8-encoded char. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080L
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080L
#else
# error C 'long' size should be either 4 or 8!
#endif
PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Py_ssize_t size,
const char *errors,
@@ -2011,7 +2024,7 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Py_ssize_t startinpos;
Py_ssize_t endinpos;
Py_ssize_t outpos;
const char *e;
const char *e, *aligned_end;
PyUnicodeObject *unicode;
Py_UNICODE *p;
const char *errmsg = "";
@@ -2032,10 +2045,51 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
/* Unpack UTF-8 encoded data */
p = unicode->str;
e = s + size;
aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
while (s < e) {
Py_UCS4 ch = (unsigned char)*s;
if (ch < 0x80) {
/* Fast path for runs of ASCII characters. Given that common UTF-8
input will consist of an overwhelming majority of ASCII
characters, we try to optimize for this case by checking
as many characters as a C 'long' can contain.
First, check if we can do an aligned read, as most CPUs have
a penalty for unaligned reads.
*/
if (!((size_t) s & LONG_PTR_MASK)) {
/* Help register allocation */
register const char *_s = s;
register Py_UNICODE *_p = p;
while (_s < aligned_end) {
/* Read a whole long at a time (either 4 or 8 bytes),
and do a fast unrolled copy if it only contains ASCII
characters. */
unsigned long data = *(unsigned long *) _s;
if (data & ASCII_CHAR_MASK)
break;
_p[0] = (unsigned char) _s[0];
_p[1] = (unsigned char) _s[1];
_p[2] = (unsigned char) _s[2];
_p[3] = (unsigned char) _s[3];
#if (SIZEOF_LONG == 8)
_p[4] = (unsigned char) _s[4];
_p[5] = (unsigned char) _s[5];
_p[6] = (unsigned char) _s[6];
_p[7] = (unsigned char) _s[7];
#endif
_s += SIZEOF_LONG;
_p += SIZEOF_LONG;
}
s = _s;
p = _p;
if (s == e)
break;
ch = (unsigned char)*s;
}
}
if (ch < 0x80) {
*p++ = (Py_UNICODE)ch;
s++;
@@ -2169,6 +2223,7 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
&starts, &e, &startinpos, &endinpos, &exc, &s,
&unicode, &outpos, &p))
goto onError;
aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
}
if (consumed)
*consumed = s-starts;
@@ -2188,6 +2243,9 @@ onError:
return NULL;
}
#undef ASCII_CHAR_MASK
/* Allocation strategy: if the string is short, convert into a stack buffer
and allocate exactly as much space needed at the end. Else allocate the
maximum possible needed (4 result bytes per Unicode character), and return
@@ -2582,6 +2640,23 @@ PyUnicode_DecodeUTF16(const char *s,
return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
}
/* Two masks for fast checking of whether a C 'long' may contain
UTF16-encoded surrogate characters. This is an efficient heuristic,
assuming that non-surrogate characters with a code point >= 0x8000 are
rare in most input.
FAST_CHAR_MASK is used when the input is in native byte ordering,
SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
*/
#if (SIZEOF_LONG == 8)
# define FAST_CHAR_MASK 0x8000800080008000L
# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
#elif (SIZEOF_LONG == 4)
# define FAST_CHAR_MASK 0x80008000L
# define SWAPPED_FAST_CHAR_MASK 0x00800080L
#else
# error C 'long' size should be either 4 or 8!
#endif
PyObject *
PyUnicode_DecodeUTF16Stateful(const char *s,
Py_ssize_t size,
@@ -2595,8 +2670,9 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
Py_ssize_t outpos;
PyUnicodeObject *unicode;
Py_UNICODE *p;
const unsigned char *q, *e;
const unsigned char *q, *e, *aligned_end;
int bo = 0; /* assume native ordering by default */
int native_ordering = 0;
const char *errmsg = "";
/* Offsets from q for retrieving byte pairs in the right order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
@@ -2618,7 +2694,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
/* Unpack UTF-16 encoded data */
p = unicode->str;
q = (unsigned char *)s;
e = q + size;
e = q + size - 1;
if (byteorder)
bo = *byteorder;
@@ -2662,20 +2738,78 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
ihi = 0;
ilo = 1;
}
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
native_ordering = ilo < ihi;
#else
native_ordering = ilo > ihi;
#endif
aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
while (q < e) {
Py_UNICODE ch;
/* remaining bytes at the end? (size should be even) */
if (e-q<2) {
if (consumed)
break;
errmsg = "truncated data";
startinpos = ((const char *)q)-starts;
endinpos = ((const char *)e)-starts;
goto utf16Error;
/* The remaining input chars are ignored if the callback
chooses to skip the input */
}
/* First check for possible aligned read of a C 'long'. Unaligned
reads are more expensive, better to defer to another iteration. */
if (!((size_t) q & LONG_PTR_MASK)) {
/* Fast path for runs of non-surrogate chars. */
register const unsigned char *_q = q;
Py_UNICODE *_p = p;
if (native_ordering) {
/* Native ordering is simple: as long as the input cannot
possibly contain a surrogate char, do an unrolled copy
of several 16-bit code points to the target object.
The non-surrogate check is done on several input bytes
at a time (as many as a C 'long' can contain). */
while (_q < aligned_end) {
unsigned long data = * (unsigned long *) _q;
if (data & FAST_CHAR_MASK)
break;
_p[0] = ((unsigned short *) _q)[0];
_p[1] = ((unsigned short *) _q)[1];
#if (SIZEOF_LONG == 8)
_p[2] = ((unsigned short *) _q)[2];
_p[3] = ((unsigned short *) _q)[3];
#endif
_q += SIZEOF_LONG;
_p += SIZEOF_LONG / 2;
}
}
else {
/* Byteswapped ordering is similar, but we must decompose
the copy bytewise, and take care of zero'ing out the
upper bytes if the target object is in 32-bit units
(that is, in UCS-4 builds). */
while (_q < aligned_end) {
unsigned long data = * (unsigned long *) _q;
if (data & SWAPPED_FAST_CHAR_MASK)
break;
/* Zero upper bytes in UCS-4 builds */
#if (Py_UNICODE_SIZE > 2)
_p[0] = 0;
_p[1] = 0;
#if (SIZEOF_LONG == 8)
_p[2] = 0;
_p[3] = 0;
#endif
#endif
((unsigned char *) _p)[1] = _q[0];
((unsigned char *) _p)[0] = _q[1];
((unsigned char *) _p)[1 + Py_UNICODE_SIZE] = _q[2];
((unsigned char *) _p)[0 + Py_UNICODE_SIZE] = _q[3];
#if (SIZEOF_LONG == 8)
((unsigned char *) _p)[1 + 2 * Py_UNICODE_SIZE] = _q[4];
((unsigned char *) _p)[0 + 2 * Py_UNICODE_SIZE] = _q[5];
((unsigned char *) _p)[1 + 3 * Py_UNICODE_SIZE] = _q[6];
((unsigned char *) _p)[0 + 3 * Py_UNICODE_SIZE] = _q[7];
#endif
_q += SIZEOF_LONG;
_p += SIZEOF_LONG / 2;
}
}
p = _p;
q = _q;
if (q >= e)
break;
}
ch = (q[ihi] << 8) | q[ilo];
q += 2;
@@ -2686,10 +2820,10 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
}
/* UTF-16 code pair: */
if (q >= e) {
if (q > e) {
errmsg = "unexpected end of data";
startinpos = (((const char *)q)-2)-starts;
endinpos = ((const char *)e)-starts;
startinpos = (((const char *)q) - 2) - starts;
endinpos = ((const char *)e) + 1 - starts;
goto utf16Error;
}
if (0xD800 <= ch && ch <= 0xDBFF) {
@@ -2718,14 +2852,47 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
/* Fall through to report the error */
utf16Error:
outpos = p-PyUnicode_AS_UNICODE(unicode);
outpos = p - PyUnicode_AS_UNICODE(unicode);
if (unicode_decode_call_errorhandler(
errors, &errorHandler,
"utf16", errmsg,
&starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
&unicode, &outpos, &p))
errors,
&errorHandler,
"utf16", errmsg,
&starts,
(const char **)&e,
&startinpos,
&endinpos,
&exc,
(const char **)&q,
&unicode,
&outpos,
&p))
goto onError;
}
/* remaining byte at the end? (size should be even) */
if (e == q) {
if (!consumed) {
errmsg = "truncated data";
startinpos = ((const char *)q) - starts;
endinpos = ((const char *)e) + 1 - starts;
outpos = p - PyUnicode_AS_UNICODE(unicode);
if (unicode_decode_call_errorhandler(
errors,
&errorHandler,
"utf16", errmsg,
&starts,
(const char **)&e,
&startinpos,
&endinpos,
&exc,
(const char **)&q,
&unicode,
&outpos,
&p))
goto onError;
/* The remaining input chars are ignored if the callback
chooses to skip the input */
}
}
if (byteorder)
*byteorder = bo;
@@ -2748,6 +2915,9 @@ onError:
return NULL;
}
#undef FAST_CHAR_MASK
#undef SWAPPED_FAST_CHAR_MASK
PyObject *
PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Py_ssize_t size,
@@ -3571,6 +3741,7 @@ PyObject *PyUnicode_DecodeLatin1(const char *s,
{
PyUnicodeObject *v;
Py_UNICODE *p;
const char *e, *unrolled_end;
/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
if (size == 1) {
@@ -3584,8 +3755,20 @@ PyObject *PyUnicode_DecodeLatin1(const char *s,
if (size == 0)
return (PyObject *)v;
p = PyUnicode_AS_UNICODE(v);
while (size-- > 0)
*p++ = (unsigned char)*s++;
e = s + size;
/* Unrolling the copy makes it much faster by reducing the looping
overhead. This is similar to what many memcpy() implementations do. */
unrolled_end = e - 4;
while (s < unrolled_end) {
p[0] = (unsigned char) s[0];
p[1] = (unsigned char) s[1];
p[2] = (unsigned char) s[2];
p[3] = (unsigned char) s[3];
s += 4;
p += 4;
}
while (s < e)
*p++ = (unsigned char) *s++;
return (PyObject *)v;
onError: