Issue #14624: UTF-16 decoding is now 3x to 4x faster on various inputs.

Patch by Serhiy Storchaka.
2012-05-15 23:48:04 +02:00 · 2012-05-15 23:48:04 +02:00 · 63065d761e
parent 12ea86adce
commit 63065d761e
3 changed files with 230 additions and 199 deletions
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -10,6 +10,9 @@ What's New in Python 3.3.0 Alpha 4?
 Core and Builtins
 -----------------
 - Issue #14624: UTF-16 decoding is now 3x to 4x faster on various inputs.
  Patch by Serhiy Storchaka.
 - asdl_seq and asdl_int_seq are now Py_ssize_t sized.
 - Issue #14133 (PEP 415): Implement suppression of __context__ display with an
--- a/Objects/stringlib/codecs.h
+++ b/Objects/stringlib/codecs.h
@ -215,7 +215,6 @@ InvalidContinuation:
    goto Return;
 }
 #undef LONG_PTR_MASK
 #undef ASCII_CHAR_MASK
@ -415,4 +414,152 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
 #undef MAX_SHORT_UNICHARS
 }
 /* The pattern for constructing UCS2-repeated masks.
    Multiplying this constant by a 16-bit value copies that value into
    every 16-bit lane of a C 'long'. */
 #if SIZEOF_LONG == 8
 # define UCS2_REPEAT_MASK 0x0001000100010001ul
 #elif SIZEOF_LONG == 4
 # define UCS2_REPEAT_MASK 0x00010001ul
 #else
 # error C 'long' size should be either 4 or 8!
 #endif
 /* The mask for fast checking.  A non-zero result of (block & FAST_CHAR_MASK)
    makes the decoder leave its word-at-a-time fast path. */
 #if STRINGLIB_SIZEOF_CHAR == 1
 /* The mask for fast checking of whether a C 'long' contains
   non-ASCII or non-Latin1 UTF16-encoded characters: in every 16-bit lane
   it selects the bits that lie above STRINGLIB_MAX_CHAR. */
 # define FAST_CHAR_MASK         (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
 #else
 /* The mask for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.
 */
 # define FAST_CHAR_MASK         (UCS2_REPEAT_MASK * 0x8000u)
 #endif
 /* The mask for fast byte-swapping: selects the low byte of every
    16-bit lane. */
 #define STRIPPED_MASK           (UCS2_REPEAT_MASK * 0x00FFu)
 /* Swap the two bytes inside every 16-bit lane of 'value'.
    Note: 'value' is evaluated twice, so it must be free of side effects. */
 #define SWAB(value)             ((((value) >> 8) & STRIPPED_MASK) | \
                                 (((value) & STRIPPED_MASK) << 8))
 /* Decode UTF-16 code units from the byte range [*inptr, e) and store the
    resulting characters in dest, starting at index *outpos.

    native_ordering is non-zero when the input bytes are in the byte order
    of the host, zero when every code unit has to be byte-swapped.

    On every exit path *inptr is set to the first byte that has not been
    consumed and *outpos to the index following the last character written.

    Return value:
      0  fewer than two bytes of input remain (nothing more can be decoded);
      1  a surrogate was read but fewer than two bytes follow it
         (*inptr is 2 bytes past the start of that surrogate);
      2  the code unit read is a surrogate but not a high surrogate
         (*inptr is 2 bytes past its start);
      3  a high surrogate is not followed by a low surrogate
         (*inptr is 4 bytes past the start of the high surrogate);
      any other value is a fully decoded code point that does not fit into
      STRINGLIB_CHAR; it has been consumed from the input but NOT written
      to dest. */
 Py_LOCAL_INLINE(Py_UCS4)
 STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
                        STRINGLIB_CHAR *dest, Py_ssize_t *outpos,
                        int native_ordering)
 {
    Py_UCS4 ch;
    /* End of input rounded down with LONG_PTR_MASK (defined outside this
       block; presumably SIZEOF_LONG - 1 -- TODO confirm).  Computed from
       the real end, before e is decremented below. */
    const unsigned char *aligned_end =
            (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
    const unsigned char *q = *inptr;
    STRINGLIB_CHAR *p = dest + *outpos;
    /* Offsets from q for retrieving byte pairs in the right order. */
 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
    int ihi = !!native_ordering, ilo = !native_ordering;
 #else
    int ihi = !native_ordering, ilo = !!native_ordering;
 #endif
    /* From here on "q < e" means that at least two bytes (one complete
       code unit) are still available at q. */
    --e;
    while (q < e) {
        Py_UCS4 ch2;
        /* First check for possible aligned read of a C 'long'. Unaligned
           reads are more expensive, better to defer to another iteration. */
        if (!((size_t) q & LONG_PTR_MASK)) {
            /* Fast path for runs of in-range non-surrogate chars. */
            register const unsigned char *_q = q;
            while (_q < aligned_end) {
                unsigned long block = * (unsigned long *) _q;
                if (native_ordering) {
                    /* Can use buffer directly */
                    if (block & FAST_CHAR_MASK)
                        break;
                }
                else {
                    /* Need to byte-swap */
                    if (block & SWAB(FAST_CHAR_MASK))
                        break;
 #if STRINGLIB_SIZEOF_CHAR == 1
                    /* The check above guarantees that the low byte of every
                       16-bit lane is zero here, so a plain shift gives the
                       same result as a full byte swap. */
                    block >>= 8;
 #else
                    block = SWAB(block);
 #endif
                }
                /* Store the 16-bit lanes of the block in the order in which
                   they appear in memory. */
 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
 # if SIZEOF_LONG == 4
                p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
                p[1] = (STRINGLIB_CHAR)(block >> 16);
 # elif SIZEOF_LONG == 8
                p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
                p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
                p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
                p[3] = (STRINGLIB_CHAR)(block >> 48);
 # endif
 #else
 # if SIZEOF_LONG == 4
                p[0] = (STRINGLIB_CHAR)(block >> 16);
                p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu);
 # elif SIZEOF_LONG == 8
                p[0] = (STRINGLIB_CHAR)(block >> 48);
                p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
                p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
                p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu);
 # endif
 #endif
                _q += SIZEOF_LONG;
                p += SIZEOF_LONG / 2;
            }
            q = _q;
            /* The fast path may have consumed everything decodable. */
            if (q >= e)
                break;
        }
        /* Slow path: decode a single code unit byte by byte. */
        ch = (q[ihi] << 8) | q[ilo];
        q += 2;
        if (!Py_UNICODE_IS_SURROGATE(ch)) {
 #if STRINGLIB_SIZEOF_CHAR < 2
            if (ch > STRINGLIB_MAX_CHAR)
                /* Out-of-range */
                goto Return;
 #endif
            *p++ = (STRINGLIB_CHAR)ch;
            continue;
        }
        /* UTF-16 code pair: */
        /* Fewer than two bytes left for the second half of the pair. */
        if (q >= e)
            goto UnexpectedEnd;
        if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))
            goto IllegalEncoding;
        ch2 = (q[ihi] << 8) | q[ilo];
        q += 2;
        if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))
            goto IllegalSurrogate;
        ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
 #if STRINGLIB_SIZEOF_CHAR < 4
        /* Out-of-range */
        /* The joined code point is returned to the caller instead of being
           stored in dest. */
        goto Return;
 #else
        *p++ = (STRINGLIB_CHAR)ch;
 #endif
    }
    /* Normal termination: no further complete code unit is available. */
    ch = 0;
 Return:
    /* Report back how far input and output have advanced. */
    *inptr = q;
    *outpos = p - dest;
    return ch;
 UnexpectedEnd:
    ch = 1;
    goto Return;
 IllegalEncoding:
    ch = 2;
    goto Return;
 IllegalSurrogate:
    ch = 3;
    goto Return;
 }
 /* Drop the helper macros again.  NOTE(review): presumably so that each
    inclusion of this header can redefine them for its own
    STRINGLIB_SIZEOF_CHAR / STRINGLIB_MAX_CHAR -- confirm against the
    including files. */
 #undef UCS2_REPEAT_MASK
 #undef FAST_CHAR_MASK
 #undef STRIPPED_MASK
 #undef SWAB
 #undef LONG_PTR_MASK
 #endif /* STRINGLIB_IS_UNICODE */
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -5195,25 +5195,6 @@ PyUnicode_DecodeUTF16(const char *s,
    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
 }
 /* Two masks for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.
   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
 */
 #if (SIZEOF_LONG == 8)
 # define FAST_CHAR_MASK         0x8000800080008000L
 # define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
 # define STRIPPED_MASK          0x00FF00FF00FF00FFL
 #elif (SIZEOF_LONG == 4)
 # define FAST_CHAR_MASK         0x80008000L
 # define SWAPPED_FAST_CHAR_MASK 0x00800080L
 # define STRIPPED_MASK          0x00FF00FFL
 #else
 # error C 'long' size should be either 4 or 8!
 #endif
 PyObject *
 PyUnicode_DecodeUTF16Stateful(const char *s,
                              Py_ssize_t size,
@ -5226,30 +5207,15 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    PyObject *unicode;
-    const unsigned char *q, *e, *aligned_end;
+    const unsigned char *q, *e;
    int bo = 0;       /* assume native ordering by default */
-    int native_ordering = 0;
+    int native_ordering;
    const char *errmsg = "";
    /* Offsets from q for retrieving byte pairs in the right order. */
 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
    int ihi = 1, ilo = 0;
 #else
    int ihi = 0, ilo = 1;
 #endif
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* Note: size will always be longer than the resulting Unicode
       character count */
    unicode = PyUnicode_New(size, 127);
    if (!unicode)
        return NULL;
    if (size == 0)
        return unicode;
    outpos = 0;
    q = (unsigned char *)s;
-    e = q + size - 1;
+    e = q + size;
    if (byteorder)
        bo = *byteorder;
@ -5258,155 +5224,98 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
-    if (bo == 0) {
+    if (bo == 0 && size >= 2) {
-        if (size >= 2) {
+        const Py_UCS4 bom = (q[1] << 8) | q[0];
-            const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
+        if (bom == 0xFEFF) {
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+            q += 2;
-            if (bom == 0xFEFF) {
+            bo = -1;
                q += 2;
                bo = -1;
            }
            else if (bom == 0xFFFE) {
                q += 2;
                bo = 1;
            }
 #else
            if (bom == 0xFEFF) {
                q += 2;
                bo = 1;
            }
            else if (bom == 0xFFFE) {
                q += 2;
                bo = -1;
            }
 #endif
        }
        else if (bom == 0xFFFE) {
            q += 2;
            bo = 1;
        }
        if (byteorder)
            *byteorder = bo;
    }
-    if (bo == -1) {
+    if (q == e) {
-        /* force LE */
+        if (consumed)
-        ihi = 1;
+            *consumed = size;
-        ilo = 0;
+        Py_INCREF(unicode_empty);
-    }
+        return unicode_empty;
    else if (bo == 1) {
        /* force BE */
        ihi = 0;
        ilo = 1;
    }
 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
-    native_ordering = ilo < ihi;
+    native_ordering = bo <= 0;
 #else
-    native_ordering = ilo > ihi;
+    native_ordering = bo >= 0;
 #endif
-    aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
+    /* Note: size will always be longer than the resulting Unicode
-    while (q < e) {
+       character count */
-        Py_UCS4 ch;
+    unicode = PyUnicode_New((e - q + 1) / 2, 127);
-        /* First check for possible aligned read of a C 'long'. Unaligned
+    if (!unicode)
-           reads are more expensive, better to defer to another iteration. */
+        return NULL;
-        if (!((size_t) q & LONG_PTR_MASK)) {
+
-            /* Fast path for runs of non-surrogate chars. */
+    outpos = 0;
-            register const unsigned char *_q = q;
+    while (1) {
        Py_UCS4 ch = 0;
        if (e - q >= 2) {
            int kind = PyUnicode_KIND(unicode);
-            void *data = PyUnicode_DATA(unicode);
+            if (kind == PyUnicode_1BYTE_KIND) {
-            while (_q < aligned_end) {
+                if (PyUnicode_IS_ASCII(unicode))
-                unsigned long block = * (unsigned long *) _q;
+                    ch = asciilib_utf16_decode(&q, e,
-                Py_UCS4 maxch;
+                            PyUnicode_1BYTE_DATA(unicode), &outpos,
-                if (native_ordering) {
+                            native_ordering);
-                    /* Can use buffer directly */
+                else
-                    if (block & FAST_CHAR_MASK)
+                    ch = ucs1lib_utf16_decode(&q, e,
-                        break;
+                            PyUnicode_1BYTE_DATA(unicode), &outpos,
-                }
+                            native_ordering);
-                else {
+            } else if (kind == PyUnicode_2BYTE_KIND) {
-                    /* Need to byte-swap */
+                ch = ucs2lib_utf16_decode(&q, e,
-                    if (block & SWAPPED_FAST_CHAR_MASK)
+                        PyUnicode_2BYTE_DATA(unicode), &outpos,
-                        break;
+                        native_ordering);
-                    block = ((block >> 8) & STRIPPED_MASK) |
+            } else {
-                            ((block & STRIPPED_MASK) << 8);
+                assert(kind == PyUnicode_4BYTE_KIND);
-                }
+                ch = ucs4lib_utf16_decode(&q, e,
-                maxch = (Py_UCS2)(block & 0xFFFF);
+                        PyUnicode_4BYTE_DATA(unicode), &outpos,
-#if SIZEOF_LONG == 8
+                        native_ordering);
                ch = (Py_UCS2)((block >> 16) & 0xFFFF);
                maxch = MAX_MAXCHAR(maxch, ch);
                ch = (Py_UCS2)((block >> 32) & 0xFFFF);
                maxch = MAX_MAXCHAR(maxch, ch);
                ch = (Py_UCS2)(block >> 48);
                maxch = MAX_MAXCHAR(maxch, ch);
 #else
                ch = (Py_UCS2)(block >> 16);
                maxch = MAX_MAXCHAR(maxch, ch);
 #endif
                if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
                    if (unicode_widen(&unicode, outpos, maxch) < 0)
                        goto onError;
                    kind = PyUnicode_KIND(unicode);
                    data = PyUnicode_DATA(unicode);
                }
 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
 #if SIZEOF_LONG == 8
                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
 #else
                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
 #endif
 #else
 #if SIZEOF_LONG == 8
                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
 #else
                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
 #endif
                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
 #endif
                _q += SIZEOF_LONG;
            }
            q = _q;
            if (q >= e)
                break;
        }
        ch = (q[ihi] << 8) | q[ilo];
-        q += 2;
+        switch (ch)
-
+        {
-        if (!Py_UNICODE_IS_SURROGATE(ch)) {
+        case 0:
            /* remaining byte at the end? (size should be even) */
            if (q == e || consumed)
                goto End;
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
            /* The remaining input chars are ignored if the callback
               chooses to skip the input */
        case 1:
            errmsg = "unexpected end of data";
            startinpos = ((const char *)q) - 2 - starts;
            endinpos = ((const char *)e) - starts;
            break;
        case 2:
            errmsg = "illegal encoding";
            startinpos = ((const char *)q) - 2 - starts;
            endinpos = startinpos + 2;
            break;
        case 3:
            errmsg = "illegal UTF-16 surrogate";
            startinpos = ((const char *)q) - 4 - starts;
            endinpos = startinpos + 2;
            break;
        default:
            if (unicode_putchar(&unicode, &outpos, ch) < 0)
                goto onError;
            continue;
        }
        /* UTF-16 code pair: */
        if (q > e) {
            errmsg = "unexpected end of data";
            startinpos = (((const char *)q) - 2) - starts;
            endinpos = ((const char *)e) + 1 - starts;
            goto utf16Error;
        }
        if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
            Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
            q += 2;
            if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
                if (unicode_putchar(&unicode, &outpos,
                                    Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
                    goto onError;
                continue;
            }
            else {
                errmsg = "illegal UTF-16 surrogate";
                startinpos = (((const char *)q)-4)-starts;
                endinpos = startinpos+2;
                goto utf16Error;
            }
        }
        errmsg = "illegal encoding";
        startinpos = (((const char *)q)-2)-starts;
        endinpos = startinpos+2;
        /* Fall through to report the error */
      utf16Error:
        if (unicode_decode_call_errorhandler(
                errors,
                &errorHandler,
@ -5421,33 +5330,8 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
                &outpos))
            goto onError;
    }
    /* remaining byte at the end? (size should be even) */
    if (e == q) {
        if (!consumed) {
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) + 1 - starts;
            if (unicode_decode_call_errorhandler(
                    errors,
                    &errorHandler,
                    "utf16", errmsg,
                    &starts,
                    (const char **)&e,
                    &startinpos,
                    &endinpos,
                    &exc,
                    (const char **)&q,
                    &unicode,
                    &outpos))
                goto onError;
            /* The remaining input chars are ignored if the callback
               chooses to skip the input */
        }
    }
    if (byteorder)
        *byteorder = bo;
 End:
    if (consumed)
        *consumed = (const char *)q-starts;
@ -5466,9 +5350,6 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
    return NULL;
 }
 #undef FAST_CHAR_MASK
 #undef SWAPPED_FAST_CHAR_MASK
 PyObject *
 _PyUnicode_EncodeUTF16(PyObject *str,
                       const char *errors,