Issue #14624: UTF-16 decoding is now 3x to 4x faster on various inputs.

Patch by Serhiy Storchaka.
This commit is contained in:
Antoine Pitrou 2012-05-15 23:48:04 +02:00
parent 12ea86adce
commit 63065d761e
3 changed files with 230 additions and 199 deletions

View File

@ -10,6 +10,9 @@ What's New in Python 3.3.0 Alpha 4?
Core and Builtins Core and Builtins
----------------- -----------------
- Issue #14624: UTF-16 decoding is now 3x to 4x faster on various inputs.
Patch by Serhiy Storchaka.
- asdl_seq and asdl_int_seq are now Py_ssize_t sized. - asdl_seq and asdl_int_seq are now Py_ssize_t sized.
- Issue #14133 (PEP 415): Implement suppression of __context__ display with an - Issue #14133 (PEP 415): Implement suppression of __context__ display with an

View File

@ -215,7 +215,6 @@ InvalidContinuation:
goto Return; goto Return;
} }
#undef LONG_PTR_MASK
#undef ASCII_CHAR_MASK #undef ASCII_CHAR_MASK
@ -415,4 +414,152 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
#undef MAX_SHORT_UNICHARS #undef MAX_SHORT_UNICHARS
} }
/* The pattern for constructing UCS2-repeated masks: a C 'long' with the
   value 1 in every 16-bit lane.  Multiplying it by a 16-bit constant
   replicates that constant into each lane of the 'long'. */
#if SIZEOF_LONG == 8
# define UCS2_REPEAT_MASK 0x0001000100010001ul
#elif SIZEOF_LONG == 4
# define UCS2_REPEAT_MASK 0x00010001ul
#else
# error C 'long' size should be either 4 or 8!
#endif
/* The mask for fast checking: if (block & FAST_CHAR_MASK) is non-zero,
   the block cannot be copied by the fast path and is decoded one code
   unit at a time instead. */
#if STRINGLIB_SIZEOF_CHAR == 1
/* The mask for fast checking of whether a C 'long' contains any
   non-ASCII or non-Latin1 UTF16-encoded characters: it keeps, in every
   16-bit lane, the bits that are not set in STRINGLIB_MAX_CHAR.
   NOTE(review): presumably STRINGLIB_MAX_CHAR is 0x7F or 0xFF here --
   confirm against the stringlib headers. */
# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
#else
/* The mask for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters.  It tests only the top bit of each
   16-bit lane, so every code unit >= 0x8000 leaves the fast path.  This
   is an efficient heuristic, assuming that non-surrogate characters with
   a code point >= 0x8000 are rare in most input.
*/
# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * 0x8000u)
#endif
/* The mask for fast byte-swapping: selects the low byte of every 16-bit
   lane. */
#define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu)
/* Swap the two bytes inside every 16-bit lane of 'value'.
   Note that 'value' is evaluated twice. */
#define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) | \
(((value) & STRIPPED_MASK) << 8))
/* Decode UTF-16 code units from the byte range [*inptr, e) into dest,
   writing from index *outpos onwards.

   native_ordering is non-zero when the input bytes are already in the
   machine's byte order and zero when every 16-bit unit must be
   byte-swapped.

   Decoding stops at the first code unit that cannot be stored directly in
   a STRINGLIB_CHAR, or when fewer than two input bytes remain.  On every
   exit path *inptr is set to the first byte that was not consumed and
   *outpos to the index following the last character written.

   Return value:
     0      fewer than two bytes remain; a trailing odd byte, if any, is
            left unconsumed.
     1      a high surrogate was read but no complete code unit follows it
            (unexpected end); *inptr points just past the high surrogate.
     2      a low surrogate appeared without a preceding high surrogate
            (illegal encoding); *inptr points just past it.
     3      a high surrogate was followed by a code unit that is not a low
            surrogate; *inptr points just past both code units.
     other  a decoded code point that does not fit in STRINGLIB_CHAR.  Its
            bytes have been consumed but nothing was written for it.
*/
Py_LOCAL_INLINE(Py_UCS4)
STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
                        STRINGLIB_CHAR *dest, Py_ssize_t *outpos,
                        int native_ordering)
{
    Py_UCS4 ch;
    /* Last long-aligned address not beyond the end of the input; whole
       'long' reads are only performed strictly below it. */
    const unsigned char *aligned_end =
            (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
    const unsigned char *q = *inptr;
    STRINGLIB_CHAR *p = dest + *outpos;
    /* Offsets from q for retrieving byte pairs in the right order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
    int ihi = !!native_ordering, ilo = !native_ordering;
#else
    int ihi = !native_ordering, ilo = !!native_ordering;
#endif
    /* From here on e points at the last input byte, so that (q < e) means
       "at least two bytes remain". */
    --e;

    while (q < e) {
        Py_UCS4 ch2;
        /* First check for possible aligned read of a C 'long'. Unaligned
           reads are more expensive, better to defer to another iteration. */
        if (!((size_t) q & LONG_PTR_MASK)) {
            /* Fast path for runs of in-range non-surrogate chars. */
            register const unsigned char *_q = q;
            while (_q < aligned_end) {
                unsigned long block = * (unsigned long *) _q;
                if (native_ordering) {
                    /* Can use buffer directly */
                    if (block & FAST_CHAR_MASK)
                        break;
                }
                else {
                    /* Need to byte-swap */
                    if (block & SWAB(FAST_CHAR_MASK))
                        break;
#if STRINGLIB_SIZEOF_CHAR == 1
                    /* The check above guarantees that the byte which would
                       end up in the high half of every lane is zero, so a
                       plain shift moves the payload byte into place. */
                    block >>= 8;
#else
                    block = SWAB(block);
#endif
                }
                /* Store the lanes of the block in input (memory) order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
# if SIZEOF_LONG == 4
                p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
                p[1] = (STRINGLIB_CHAR)(block >> 16);
# elif SIZEOF_LONG == 8
                p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
                p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
                p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
                p[3] = (STRINGLIB_CHAR)(block >> 48);
# endif
#else
# if SIZEOF_LONG == 4
                p[0] = (STRINGLIB_CHAR)(block >> 16);
                p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu);
# elif SIZEOF_LONG == 8
                p[0] = (STRINGLIB_CHAR)(block >> 48);
                p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
                p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
                p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu);
# endif
#endif
                _q += SIZEOF_LONG;
                p += SIZEOF_LONG / 2;
            }
            q = _q;
            if (q >= e)
                break;
        }

        /* Slow path: decode a single code unit. */
        ch = (q[ihi] << 8) | q[ilo];
        q += 2;
        if (!Py_UNICODE_IS_SURROGATE(ch)) {
#if STRINGLIB_SIZEOF_CHAR < 2
            if (ch > STRINGLIB_MAX_CHAR)
                /* Out-of-range */
                goto Return;
#endif
            *p++ = (STRINGLIB_CHAR)ch;
            continue;
        }

        /* UTF-16 code pair: */
        /* Classify the code unit before looking at how much input is left:
           a lone low surrogate is ill-formed whatever follows it, so it is
           reported as an illegal encoding even when it is the last code
           unit, rather than as truncated input. */
        if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))
            goto IllegalEncoding;
        if (q >= e)
            goto UnexpectedEnd;
        ch2 = (q[ihi] << 8) | q[ilo];
        q += 2;
        if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))
            goto IllegalSurrogate;
        ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
#if STRINGLIB_SIZEOF_CHAR < 4
        /* Out-of-range */
        goto Return;
#else
        *p++ = (STRINGLIB_CHAR)ch;
#endif
    }
    ch = 0;
Return:
    /* Publish progress on every exit path, including the error ones. */
    *inptr = q;
    *outpos = p - dest;
    return ch;
UnexpectedEnd:
    ch = 1;
    goto Return;
IllegalEncoding:
    ch = 2;
    goto Return;
IllegalSurrogate:
    ch = 3;
    goto Return;
}
/* Remove the helper macros used by utf16_decode so that they do not leak
   into whatever follows.  LONG_PTR_MASK is among them because
   utf16_decode uses it for its alignment checks. */
#undef UCS2_REPEAT_MASK
#undef FAST_CHAR_MASK
#undef STRIPPED_MASK
#undef SWAB
#undef LONG_PTR_MASK
#endif /* STRINGLIB_IS_UNICODE */ #endif /* STRINGLIB_IS_UNICODE */

View File

@ -5195,25 +5195,6 @@ PyUnicode_DecodeUTF16(const char *s,
return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
} }
/* Two masks for fast checking of whether a C 'long' may contain
UTF16-encoded surrogate characters. This is an efficient heuristic,
assuming that non-surrogate characters with a code point >= 0x8000 are
rare in most input.
FAST_CHAR_MASK is used when the input is in native byte ordering,
SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
*/
#if (SIZEOF_LONG == 8)
# define FAST_CHAR_MASK 0x8000800080008000L
# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
# define STRIPPED_MASK 0x00FF00FF00FF00FFL
#elif (SIZEOF_LONG == 4)
# define FAST_CHAR_MASK 0x80008000L
# define SWAPPED_FAST_CHAR_MASK 0x00800080L
# define STRIPPED_MASK 0x00FF00FFL
#else
# error C 'long' size should be either 4 or 8!
#endif
PyObject * PyObject *
PyUnicode_DecodeUTF16Stateful(const char *s, PyUnicode_DecodeUTF16Stateful(const char *s,
Py_ssize_t size, Py_ssize_t size,
@ -5226,30 +5207,15 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
Py_ssize_t endinpos; Py_ssize_t endinpos;
Py_ssize_t outpos; Py_ssize_t outpos;
PyObject *unicode; PyObject *unicode;
const unsigned char *q, *e, *aligned_end; const unsigned char *q, *e;
int bo = 0; /* assume native ordering by default */ int bo = 0; /* assume native ordering by default */
int native_ordering = 0; int native_ordering;
const char *errmsg = ""; const char *errmsg = "";
/* Offsets from q for retrieving byte pairs in the right order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
int ihi = 1, ilo = 0;
#else
int ihi = 0, ilo = 1;
#endif
PyObject *errorHandler = NULL; PyObject *errorHandler = NULL;
PyObject *exc = NULL; PyObject *exc = NULL;
/* Note: size will always be longer than the resulting Unicode
character count */
unicode = PyUnicode_New(size, 127);
if (!unicode)
return NULL;
if (size == 0)
return unicode;
outpos = 0;
q = (unsigned char *)s; q = (unsigned char *)s;
e = q + size - 1; e = q + size;
if (byteorder) if (byteorder)
bo = *byteorder; bo = *byteorder;
@ -5258,155 +5224,98 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
byte order setting accordingly. In native mode, the leading BOM byte order setting accordingly. In native mode, the leading BOM
mark is skipped, in all other modes, it is copied to the output mark is skipped, in all other modes, it is copied to the output
stream as-is (giving a ZWNBSP character). */ stream as-is (giving a ZWNBSP character). */
if (bo == 0) { if (bo == 0 && size >= 2) {
if (size >= 2) { const Py_UCS4 bom = (q[1] << 8) | q[0];
const Py_UCS4 bom = (q[ihi] << 8) | q[ilo]; if (bom == 0xFEFF) {
#ifdef BYTEORDER_IS_LITTLE_ENDIAN q += 2;
if (bom == 0xFEFF) { bo = -1;
q += 2;
bo = -1;
}
else if (bom == 0xFFFE) {
q += 2;
bo = 1;
}
#else
if (bom == 0xFEFF) {
q += 2;
bo = 1;
}
else if (bom == 0xFFFE) {
q += 2;
bo = -1;
}
#endif
} }
else if (bom == 0xFFFE) {
q += 2;
bo = 1;
}
if (byteorder)
*byteorder = bo;
} }
if (bo == -1) { if (q == e) {
/* force LE */ if (consumed)
ihi = 1; *consumed = size;
ilo = 0; Py_INCREF(unicode_empty);
} return unicode_empty;
else if (bo == 1) {
/* force BE */
ihi = 0;
ilo = 1;
} }
#ifdef BYTEORDER_IS_LITTLE_ENDIAN #ifdef BYTEORDER_IS_LITTLE_ENDIAN
native_ordering = ilo < ihi; native_ordering = bo <= 0;
#else #else
native_ordering = ilo > ihi; native_ordering = bo >= 0;
#endif #endif
aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK); /* Note: size will always be longer than the resulting Unicode
while (q < e) { character count */
Py_UCS4 ch; unicode = PyUnicode_New((e - q + 1) / 2, 127);
/* First check for possible aligned read of a C 'long'. Unaligned if (!unicode)
reads are more expensive, better to defer to another iteration. */ return NULL;
if (!((size_t) q & LONG_PTR_MASK)) {
/* Fast path for runs of non-surrogate chars. */ outpos = 0;
register const unsigned char *_q = q; while (1) {
Py_UCS4 ch = 0;
if (e - q >= 2) {
int kind = PyUnicode_KIND(unicode); int kind = PyUnicode_KIND(unicode);
void *data = PyUnicode_DATA(unicode); if (kind == PyUnicode_1BYTE_KIND) {
while (_q < aligned_end) { if (PyUnicode_IS_ASCII(unicode))
unsigned long block = * (unsigned long *) _q; ch = asciilib_utf16_decode(&q, e,
Py_UCS4 maxch; PyUnicode_1BYTE_DATA(unicode), &outpos,
if (native_ordering) { native_ordering);
/* Can use buffer directly */ else
if (block & FAST_CHAR_MASK) ch = ucs1lib_utf16_decode(&q, e,
break; PyUnicode_1BYTE_DATA(unicode), &outpos,
} native_ordering);
else { } else if (kind == PyUnicode_2BYTE_KIND) {
/* Need to byte-swap */ ch = ucs2lib_utf16_decode(&q, e,
if (block & SWAPPED_FAST_CHAR_MASK) PyUnicode_2BYTE_DATA(unicode), &outpos,
break; native_ordering);
block = ((block >> 8) & STRIPPED_MASK) | } else {
((block & STRIPPED_MASK) << 8); assert(kind == PyUnicode_4BYTE_KIND);
} ch = ucs4lib_utf16_decode(&q, e,
maxch = (Py_UCS2)(block & 0xFFFF); PyUnicode_4BYTE_DATA(unicode), &outpos,
#if SIZEOF_LONG == 8 native_ordering);
ch = (Py_UCS2)((block >> 16) & 0xFFFF);
maxch = MAX_MAXCHAR(maxch, ch);
ch = (Py_UCS2)((block >> 32) & 0xFFFF);
maxch = MAX_MAXCHAR(maxch, ch);
ch = (Py_UCS2)(block >> 48);
maxch = MAX_MAXCHAR(maxch, ch);
#else
ch = (Py_UCS2)(block >> 16);
maxch = MAX_MAXCHAR(maxch, ch);
#endif
if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
if (unicode_widen(&unicode, outpos, maxch) < 0)
goto onError;
kind = PyUnicode_KIND(unicode);
data = PyUnicode_DATA(unicode);
}
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
#if SIZEOF_LONG == 8
PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
#else
PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
#endif
#else
#if SIZEOF_LONG == 8
PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
#else
PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
#endif
PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
#endif
_q += SIZEOF_LONG;
} }
q = _q;
if (q >= e)
break;
} }
ch = (q[ihi] << 8) | q[ilo];
q += 2; switch (ch)
{
if (!Py_UNICODE_IS_SURROGATE(ch)) { case 0:
/* remaining byte at the end? (size should be even) */
if (q == e || consumed)
goto End;
errmsg = "truncated data";
startinpos = ((const char *)q) - starts;
endinpos = ((const char *)e) - starts;
break;
/* The remaining input chars are ignored if the callback
chooses to skip the input */
case 1:
errmsg = "unexpected end of data";
startinpos = ((const char *)q) - 2 - starts;
endinpos = ((const char *)e) - starts;
break;
case 2:
errmsg = "illegal encoding";
startinpos = ((const char *)q) - 2 - starts;
endinpos = startinpos + 2;
break;
case 3:
errmsg = "illegal UTF-16 surrogate";
startinpos = ((const char *)q) - 4 - starts;
endinpos = startinpos + 2;
break;
default:
if (unicode_putchar(&unicode, &outpos, ch) < 0) if (unicode_putchar(&unicode, &outpos, ch) < 0)
goto onError; goto onError;
continue; continue;
} }
/* UTF-16 code pair: */
if (q > e) {
errmsg = "unexpected end of data";
startinpos = (((const char *)q) - 2) - starts;
endinpos = ((const char *)e) + 1 - starts;
goto utf16Error;
}
if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
q += 2;
if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
if (unicode_putchar(&unicode, &outpos,
Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
goto onError;
continue;
}
else {
errmsg = "illegal UTF-16 surrogate";
startinpos = (((const char *)q)-4)-starts;
endinpos = startinpos+2;
goto utf16Error;
}
}
errmsg = "illegal encoding";
startinpos = (((const char *)q)-2)-starts;
endinpos = startinpos+2;
/* Fall through to report the error */
utf16Error:
if (unicode_decode_call_errorhandler( if (unicode_decode_call_errorhandler(
errors, errors,
&errorHandler, &errorHandler,
@ -5421,33 +5330,8 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
&outpos)) &outpos))
goto onError; goto onError;
} }
/* remaining byte at the end? (size should be even) */
if (e == q) {
if (!consumed) {
errmsg = "truncated data";
startinpos = ((const char *)q) - starts;
endinpos = ((const char *)e) + 1 - starts;
if (unicode_decode_call_errorhandler(
errors,
&errorHandler,
"utf16", errmsg,
&starts,
(const char **)&e,
&startinpos,
&endinpos,
&exc,
(const char **)&q,
&unicode,
&outpos))
goto onError;
/* The remaining input chars are ignored if the callback
chooses to skip the input */
}
}
if (byteorder)
*byteorder = bo;
End:
if (consumed) if (consumed)
*consumed = (const char *)q-starts; *consumed = (const char *)q-starts;
@ -5466,9 +5350,6 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
return NULL; return NULL;
} }
#undef FAST_CHAR_MASK
#undef SWAPPED_FAST_CHAR_MASK
PyObject * PyObject *
_PyUnicode_EncodeUTF16(PyObject *str, _PyUnicode_EncodeUTF16(PyObject *str,
const char *errors, const char *errors,