From 0a3229de6b80cfa9e432ef5a9c72548569503075 Mon Sep 17 00:00:00 2001
From: Antoine Pitrou <solipsis@pitrou.net>
Date: Mon, 21 Nov 2011 20:39:13 +0100
Subject: [PATCH] Issue #13417: speed up utf-8 decoding by around 2x for the
 non-fully-ASCII case. This almost catches up with pre-PEP 393 performance,
 when decoding needed only one pass.

---
 Makefile.pre.in            |   1 +
 Objects/stringlib/codecs.h | 156 +++++++++++++++++++++++++
 Objects/unicodeobject.c    | 228 ++++++++++++++++++++-----------------
 3 files changed, 278 insertions(+), 107 deletions(-)
 create mode 100644 Objects/stringlib/codecs.h

diff --git a/Makefile.pre.in b/Makefile.pre.in
index df1c0e704b5..dbf290ed736 100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -645,6 +645,7 @@ BYTESTR_DEPS = \
 
 UNICODE_DEPS = $(BYTESTR_DEPS) \
 		$(srcdir)/Objects/stringlib/asciilib.h \
+		$(srcdir)/Objects/stringlib/codecs.h \
 		$(srcdir)/Objects/stringlib/ucs1lib.h \
 		$(srcdir)/Objects/stringlib/ucs2lib.h \
 		$(srcdir)/Objects/stringlib/ucs4lib.h \
diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h
new file mode 100644
index 00000000000..148bacbaa69
--- /dev/null
+++ b/Objects/stringlib/codecs.h
@@ -0,0 +1,156 @@
+/* stringlib: codec implementations */
+
+#if STRINGLIB_IS_UNICODE
+
+/* Mask to check or force alignment of a pointer to C 'long' boundaries */
+#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
+
+/* Mask to quickly check whether a C 'long' contains a
+   non-ASCII, UTF8-encoded char. */
+#if (SIZEOF_LONG == 8)
+# define ASCII_CHAR_MASK 0x8080808080808080L
+#elif (SIZEOF_LONG == 4)
+# define ASCII_CHAR_MASK 0x80808080L
+#else
+# error C 'long' size should be either 4 or 8!
+#endif
+
+Py_LOCAL_INLINE(int)
+STRINGLIB(utf8_try_decode)(const char *start, const char *end,
+                           STRINGLIB_CHAR *dest,
+                           const char **src_pos, Py_ssize_t *dest_index)
+{
+    int ret;
+    Py_ssize_t n;
+    const char *s = start;
+    const char *aligned_end = (const char *) ((size_t) end & ~LONG_PTR_MASK);
+    STRINGLIB_CHAR *p = dest;
+
+    while (s < end) {
+        Py_UCS4 ch = (unsigned char)*s;
+
+        if (ch < 0x80) {
+            /* Fast path for runs of ASCII characters. Given that common UTF-8
+               input will consist of an overwhelming majority of ASCII
+               characters, we try to optimize for this case by checking
+               as many characters as a C 'long' can contain.
+               First, check if we can do an aligned read, as most CPUs have
+               a penalty for unaligned reads.
+            */
+            if (!((size_t) s & LONG_PTR_MASK)) {
+                /* Help register allocation */
+                register const char *_s = s;
+                register STRINGLIB_CHAR *_p = p;
+                while (_s < aligned_end) {
+                    /* Read a whole long at a time (either 4 or 8 bytes),
+                       and do a fast unrolled copy if it only contains ASCII
+                       characters. */
+                    unsigned long value = *(unsigned long *) _s;
+                    if (value & ASCII_CHAR_MASK)
+                        break;
+                    _p[0] = _s[0];
+                    _p[1] = _s[1];
+                    _p[2] = _s[2];
+                    _p[3] = _s[3];
+#if (SIZEOF_LONG == 8)
+                    _p[4] = _s[4];
+                    _p[5] = _s[5];
+                    _p[6] = _s[6];
+                    _p[7] = _s[7];
+#endif
+                    _s += SIZEOF_LONG;
+                    _p += SIZEOF_LONG;
+                }
+                s = _s;
+                p = _p;
+                if (s == end)
+                    break;
+                ch = (unsigned char)*s;
+            }
+        }
+
+        if (ch < 0x80) {
+            s++;
+            *p++ = ch;
+            continue;
+        }
+
+        n = utf8_code_length[ch];
+
+        if (s + n > end) {
+            /* unexpected end of data: the caller will decide whether
+               it's an error or not */
+            goto _error;
+        }
+
+        switch (n) {
+        case 0:
+            /* invalid start byte */
+            goto _error;
+        case 1:
+            /* internal error */
+            goto _error;
+        case 2:
+            if ((s[1] & 0xc0) != 0x80)
+                /* invalid continuation byte */
+                goto _error;
+            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
+            assert ((ch > 0x007F) && (ch <= 0x07FF));
+            s += 2;
+            *p++ = ch;
+            break;
+
+        case 3:
+            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
+               will result in surrogates in range d800-dfff. Surrogates are
+               not valid UTF-8 so they are rejected.
+               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
+               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
+            if ((s[1] & 0xc0) != 0x80 ||
+                (s[2] & 0xc0) != 0x80 ||
+                ((unsigned char)s[0] == 0xE0 &&
+                 (unsigned char)s[1] < 0xA0) ||
+                ((unsigned char)s[0] == 0xED &&
+                 (unsigned char)s[1] > 0x9F)) {
+                /* invalid continuation byte */
+                goto _error;
+            }
+            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
+            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
+            s += 3;
+            *p++ = ch;
+            break;
+
+        case 4:
+            if ((s[1] & 0xc0) != 0x80 ||
+                (s[2] & 0xc0) != 0x80 ||
+                (s[3] & 0xc0) != 0x80 ||
+                ((unsigned char)s[0] == 0xF0 &&
+                 (unsigned char)s[1] < 0x90) ||
+                ((unsigned char)s[0] == 0xF4 &&
+                 (unsigned char)s[1] > 0x8F)) {
+                /* invalid continuation byte */
+                goto _error;
+            }
+            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
+                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
+            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
+            s += 4;
+            *p++ = ch;
+            break;
+        }
+    }
+    ret = 0;
+    goto _ok;
+_error:
+    ret = -1;
+_ok:
+    *src_pos = s;
+    *dest_index = p - dest;
+    return ret;
+}
+
+#undef LONG_PTR_MASK
+#undef ASCII_CHAR_MASK
+
+#endif /* STRINGLIB_IS_UNICODE */
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 6307a983e51..9dedf0b1b48 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -523,6 +523,7 @@ make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
 #include "stringlib/fastsearch.h"
 #include "stringlib/count.h"
 #include "stringlib/find.h"
+#include "stringlib/undef.h"
 
 /* --- Unicode Object ----------------------------------------------------- */
 
@@ -4190,6 +4191,18 @@ PyUnicode_DecodeUTF8(const char *s,
     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
 }
 
+#include "stringlib/ucs1lib.h"
+#include "stringlib/codecs.h"
+#include "stringlib/undef.h"
+
+#include "stringlib/ucs2lib.h"
+#include "stringlib/codecs.h"
+#include "stringlib/undef.h"
+
+#include "stringlib/ucs4lib.h"
+#include "stringlib/codecs.h"
+#include "stringlib/undef.h"
+
 /* Mask to check or force alignment of a pointer to C 'long' boundaries */
 #define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
 
@@ -4203,33 +4216,41 @@ PyUnicode_DecodeUTF8(const char *s,
 # error C 'long' size should be either 4 or 8!
 #endif
 
-/* Scans a UTF-8 string and returns the maximum character to be expected,
-   the size of the decoded unicode string and if any major errors were
-   encountered.
+/* Scans a UTF-8 string and returns the maximum character to be expected
+   and the size of the decoded unicode string.
 
-   This function does check basic UTF-8 sanity, it does however NOT CHECK
-   if the string contains surrogates, and if all continuation bytes are
-   within the correct ranges, these checks are performed in
+   This function doesn't check for errors, these checks are performed in
    PyUnicode_DecodeUTF8Stateful.
-
-   If it sets has_errors to 1, it means the value of unicode_size and max_char
-   will be bogus and you should not rely on useful information in them.
    */
 static Py_UCS4
-utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
-                                  Py_ssize_t *unicode_size, Py_ssize_t* consumed,
-                                  int *has_errors)
+utf8_max_char_size_and_char_count(const char *s, Py_ssize_t string_size,
+                                  Py_ssize_t *unicode_size)
 {
-    Py_ssize_t n;
     Py_ssize_t char_count = 0;
-    Py_UCS4 max_char = 127, new_max;
-    Py_UCS4 upper_bound;
     const unsigned char *p = (const unsigned char *)s;
     const unsigned char *end = p + string_size;
     const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
-    int err = 0;
 
-    for (; p < end && !err; ++p, ++char_count) {
+    assert(unicode_size != NULL);
+
+    /* By having a cascade of independent loops which fallback onto each
+       other, we minimize the amount of work done in the average loop
+       iteration, and we also maximize the CPU's ability to predict
+       branches correctly (because a given condition will have always the
+       same boolean outcome except perhaps in the last iteration of the
+       corresponding loop).
+       In the general case this brings us rather close to decoding
+       performance pre-PEP 393, despite the two-pass decoding.
+
+       Note that the pure ASCII loop is not duplicated once a non-ASCII
+       character has been encountered. It is actually a pessimization (by
+       a significant factor) to use this loop on text with many non-ASCII
+       characters, and it is important to avoid bad performance on valid
+       utf-8 data (invalid utf-8 being a different can of worms).
+    */
+
+    /* ASCII */
+    for (; p < end; ++p) {
         /* Only check value if it's not a ASCII char... */
         if (*p < 0x80) {
             /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
@@ -4249,76 +4270,59 @@ utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
                     break;
             }
         }
-        if (*p >= 0x80) {
-            n = utf8_code_length[*p];
-            new_max = max_char;
-            switch (n) {
-            /* invalid start byte */
-            case 0:
-                err = 1;
-                break;
-            case 2:
-                /* Code points between 0x00FF and 0x07FF inclusive.
-                   Approximate the upper bound of the code point,
-                   if this flips over 255 we can be sure it will be more
-                   than 255 and the string will need 2 bytes per code coint,
-                   if it stays under or equal to 255, we can be sure 1 byte
-                   is enough.
-                   ((*p & 0b00011111) << 6) | 0b00111111 */
-                upper_bound = ((*p & 0x1F) << 6) | 0x3F;
-                if (max_char < upper_bound)
-                    new_max = upper_bound;
-                /* Ensure we track at least that we left ASCII space. */
-                if (new_max < 128)
-                    new_max = 128;
-                break;
-            case 3:
-                /* Between 0x0FFF and 0xFFFF inclusive, so values are
-                   always > 255 and <= 65535 and will always need 2 bytes. */
-                if (max_char < 65535)
-                    new_max = 65535;
-                break;
-            case 4:
-                /* Code point will be above 0xFFFF for sure in this case. */
-                new_max = 65537;
-                break;
-            /* Internal error, this should be caught by the first if */
-            case 1:
-            default:
-                assert(0 && "Impossible case in utf8_max_char_and_size");
-                err = 1;
-            }
-            /* Instead of number of overall bytes for this code point,
-               n contains the number of following bytes: */
-            --n;
-            /* Check if the follow up chars are all valid continuation bytes */
-            if (n >= 1) {
-                const unsigned char *cont;
-                if ((p + n) >= end) {
-                    if (consumed == 0)
-                        /* incomplete data, non-incremental decoding */
-                        err = 1;
-                    break;
-                }
-                for (cont = p + 1; cont <= (p + n); ++cont) {
-                    if ((*cont & 0xc0) != 0x80) {
-                        err = 1;
-                        break;
-                    }
-                }
-                p += n;
-            }
-            else
-                err = 1;
-            max_char = new_max;
-        }
+        if (*p < 0x80)
+            ++char_count;
+        else
+            goto _ucs1loop;
     }
+    *unicode_size = char_count;
+    return 127;
 
-    if (unicode_size)
-        *unicode_size = char_count;
-    if (has_errors)
-        *has_errors = err;
-    return max_char;
+_ucs1loop:
+    for (; p < end; ++p) {
+        if (*p < 0xc4)
+            char_count += ((*p & 0xc0) != 0x80);
+        else
+            goto _ucs2loop;
+    }
+    *unicode_size = char_count;
+    return 255;
+
+_ucs2loop:
+    for (; p < end; ++p) {
+        if (*p < 0xf0)
+            char_count += ((*p & 0xc0) != 0x80);
+        else
+            goto _ucs4loop;
+    }
+    *unicode_size = char_count;
+    return 65535;
+
+_ucs4loop:
+    for (; p < end; ++p) {
+        char_count += ((*p & 0xc0) != 0x80);
+    }
+    *unicode_size = char_count;
+    return 65537;
+}
+
+/* Called when we encountered some error that wasn't detected in the original
+   scan, e.g. an encoded surrogate character. The original maxchar computation
+   may have been incorrect, so redo it. */
+static int
+refit_partial_string(PyObject **unicode, int kind, void *data, Py_ssize_t n)
+{
+    PyObject *tmp;
+    Py_ssize_t k, maxchar;
+    for (k = 0, maxchar = 0; k < n; k++)
+        maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
+    tmp = PyUnicode_New(PyUnicode_GET_LENGTH(*unicode), maxchar);
+    if (tmp == NULL)
+        return -1;
+    PyUnicode_CopyCharacters(tmp, 0, *unicode, 0, n);
+    Py_DECREF(*unicode);
+    *unicode = tmp;
+    return 0;
 }
 
 /* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
@@ -4361,35 +4365,56 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
     Py_ssize_t i;
     int kind;
     void *data;
-    int has_errors;
+    int has_errors = 0;
 
     if (size == 0) {
         if (consumed)
             *consumed = 0;
         return (PyObject *)PyUnicode_New(0, 0);
     }
-    maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
-                                                consumed, &has_errors);
-    if (has_errors)
-        /* maxchar and size computation might be incorrect;
-           code below widens and resizes as necessary. */
-        unicode = PyUnicode_New(size, 127);
-    else
-        unicode = PyUnicode_New(unicode_size, maxchar);
+    maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
+    /* In case of errors, maxchar and size computation might be incorrect;
+       code below refits and resizes as necessary. */
+    unicode = PyUnicode_New(unicode_size, maxchar);
     if (!unicode)
         return NULL;
     /* When the string is ASCII only, just use memcpy and return.
        unicode_size may be != size if there is an incomplete UTF-8
        sequence at the end of the ASCII block.  */
-    if (!has_errors && maxchar < 128 && size == unicode_size) {
+    if (maxchar < 128 && size == unicode_size) {
         Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
         return unicode;
     }
     kind = PyUnicode_KIND(unicode);
     data = PyUnicode_DATA(unicode);
+
     /* Unpack UTF-8 encoded data */
     i = 0;
     e = s + size;
+    switch (kind) {
+    case PyUnicode_1BYTE_KIND:
+        has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
+        break;
+    case PyUnicode_2BYTE_KIND:
+        has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
+        break;
+    case PyUnicode_4BYTE_KIND:
+        has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
+        break;
+    }
+    if (!has_errors) {
+        /* Ensure the unicode size calculation was correct */
+        assert(i == unicode_size);
+        assert(s == e);
+        if (consumed)
+            *consumed = s-starts;
+        return unicode;
+    }
+    /* Fall through to the generic decoding loop for the rest of
+       the string */
+    if (refit_partial_string(&unicode, kind, data, i) < 0)
+        goto onError;
+
     aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
 
     while (s < e) {
@@ -4541,19 +4566,8 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
 
       utf8Error:
         if (!has_errors) {
-            PyObject *tmp;
-            Py_ssize_t k;
-            /* We encountered some error that wasn't detected in the original scan,
-               e.g. an encoded surrogate character. The original maxchar computation may
-               have been incorrect, so redo it now. */
-            for (k = 0, maxchar = 0; k < i; k++)
-                maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
-            tmp = PyUnicode_New(PyUnicode_GET_LENGTH(unicode), maxchar);
-            if (tmp == NULL)
+            if (refit_partial_string(&unicode, kind, data, i) < 0)
                 goto onError;
-            PyUnicode_CopyCharacters(tmp, 0, unicode, 0, i);
-            Py_DECREF(unicode);
-            unicode = tmp;
             has_errors = 1;
         }
         if (unicode_decode_call_errorhandler(