From ab868311a5282f188a8cf831b021938420fee5c4 Mon Sep 17 00:00:00 2001
From: Antoine Pitrou <solipsis@pitrou.net>
Date: Sat, 10 Jan 2009 15:40:25 +0000
Subject: [PATCH] Issue #4868: utf-8, utf-16 and latin1 decoding are now 2x to
 4x faster. The common cases are optimized thanks to a dedicated fast path and
 a moderate amount of loop unrolling.

This will especially help text I/O (we already register a 30% speedup on large
reads on the io-c branch).
---
 Misc/NEWS               |   4 +
 Objects/unicodeobject.c | 231 +++++++++++++++++++++++++++++++++++-----
 2 files changed, 211 insertions(+), 24 deletions(-)

diff --git a/Misc/NEWS b/Misc/NEWS
index 7832505816e..16e1eddc4df 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -12,6 +12,10 @@ What's New in Python 3.1 alpha 0
 Core and Builtins
 -----------------
 
+- Issue #4868: utf-8, utf-16 and latin1 decoding are now 2x to 4x faster. The
+  common cases are optimized thanks to a dedicated fast path and a moderate
+  amount of loop unrolling.
+
 - Issue #4074: Change the criteria for doing a full garbage collection (i.e.
   collecting the oldest generation) so that allocating lots of objects without
   destroying them does not show quadratic performance. Based on a proposal by
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 8316e91bdb4..bc1612dcf65 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -2001,6 +2001,19 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
 }
 
+/* Mask to check or force alignment of a pointer to C 'long' boundaries */
+#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
+
+/* Mask to quickly check whether a C 'long' contains a
+   non-ASCII, UTF8-encoded char. */
+#if (SIZEOF_LONG == 8)
+# define ASCII_CHAR_MASK 0x8080808080808080L
+#elif (SIZEOF_LONG == 4)
+# define ASCII_CHAR_MASK 0x80808080L
+#else
+# error C 'long' size should be either 4 or 8!
+#endif
+
 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
 			                Py_ssize_t size,
 			                const char *errors,
@@ -2011,7 +2024,7 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
     Py_ssize_t startinpos;
     Py_ssize_t endinpos;
     Py_ssize_t outpos;
-    const char *e;
+    const char *e, *aligned_end;
     PyUnicodeObject *unicode;
     Py_UNICODE *p;
     const char *errmsg = "";
@@ -2032,10 +2045,51 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
     /* Unpack UTF-8 encoded data */
     p = unicode->str;
     e = s + size;
+    aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
 
     while (s < e) {
         Py_UCS4 ch = (unsigned char)*s;
 
+        if (ch < 0x80) {
+            /* Fast path for runs of ASCII characters. Given that common UTF-8
+               input will consist of an overwhelming majority of ASCII
+               characters, we try to optimize for this case by checking
+               as many characters as a C 'long' can contain.
+               First, check if we can do an aligned read, as most CPUs have
+               a penalty for unaligned reads.
+            */
+            if (!((size_t) s & LONG_PTR_MASK)) {
+                /* Help register allocation */
+                register const char *_s = s;
+                register Py_UNICODE *_p = p;
+                while (_s < aligned_end) {
+                    /* Read a whole long at a time (either 4 or 8 bytes),
+                       and do a fast unrolled copy if it only contains ASCII
+                       characters. */
+                    unsigned long data = *(unsigned long *) _s;
+                    if (data & ASCII_CHAR_MASK)
+                        break;
+                    _p[0] = (unsigned char) _s[0];
+                    _p[1] = (unsigned char) _s[1];
+                    _p[2] = (unsigned char) _s[2];
+                    _p[3] = (unsigned char) _s[3];
+#if (SIZEOF_LONG == 8)
+                    _p[4] = (unsigned char) _s[4];
+                    _p[5] = (unsigned char) _s[5];
+                    _p[6] = (unsigned char) _s[6];
+                    _p[7] = (unsigned char) _s[7];
+#endif
+                    _s += SIZEOF_LONG;
+                    _p += SIZEOF_LONG;
+                }
+                s = _s;
+                p = _p;
+                if (s == e)
+                    break;
+                ch = (unsigned char)*s;
+            }
+        }
+
         if (ch < 0x80) {
             *p++ = (Py_UNICODE)ch;
             s++;
@@ -2169,6 +2223,7 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
 	     &starts, &e, &startinpos, &endinpos, &exc, &s,
 	     &unicode, &outpos, &p))
 	goto onError;
+	aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
     }
     if (consumed)
 	*consumed = s-starts;
@@ -2188,6 +2243,9 @@ onError:
     return NULL;
 }
 
+#undef ASCII_CHAR_MASK
+
+
 /* Allocation strategy:  if the string is short, convert into a stack buffer
    and allocate exactly as much space needed at the end.  Else allocate the
    maximum possible needed (4 result bytes per Unicode character), and return
@@ -2582,6 +2640,23 @@ PyUnicode_DecodeUTF16(const char *s,
     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
 }
 
+/* Two masks for fast checking of whether a C 'long' may contain
+   UTF16-encoded surrogate characters. Each mask tests the top bit of every
+   16-bit unit in the long, so any code unit >= 0x8000 leaves the fast path;
+   this is an efficient heuristic, assuming that non-surrogate characters
+   with a code point >= 0x8000 are rare in most input.
+   FAST_CHAR_MASK is used when the input is in native byte ordering,
+   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering. */
+#if (SIZEOF_LONG == 8)
+# define FAST_CHAR_MASK         0x8000800080008000L
+# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
+#elif (SIZEOF_LONG == 4)
+# define FAST_CHAR_MASK         0x80008000L
+# define SWAPPED_FAST_CHAR_MASK 0x00800080L
+#else
+# error C 'long' size should be either 4 or 8!
+#endif
+
 PyObject *
 PyUnicode_DecodeUTF16Stateful(const char *s,
 			      Py_ssize_t size,
@@ -2595,8 +2670,9 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
     Py_ssize_t outpos;
     PyUnicodeObject *unicode;
     Py_UNICODE *p;
-    const unsigned char *q, *e;
+    const unsigned char *q, *e, *aligned_end;
     int bo = 0;       /* assume native ordering by default */
+    int native_ordering = 0;
     const char *errmsg = "";
     /* Offsets from q for retrieving byte pairs in the right order. */
 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
@@ -2618,7 +2694,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
     /* Unpack UTF-16 encoded data */
     p = unicode->str;
     q = (unsigned char *)s;
-    e = q + size;
+    e = q + size - 1;
 
     if (byteorder)
         bo = *byteorder;
@@ -2662,20 +2738,78 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
         ihi = 0;
         ilo = 1;
     }
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+    native_ordering = ilo < ihi;
+#else
+    native_ordering = ilo > ihi;
+#endif
 
+    aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
     while (q < e) {
 	Py_UNICODE ch;
-	/* remaining bytes at the end? (size should be even) */
-	if (e-q<2) {
-	    if (consumed)
-		break;
-	    errmsg = "truncated data";
-	    startinpos = ((const char *)q)-starts;
-	    endinpos = ((const char *)e)-starts;
-	    goto utf16Error;
-	    /* The remaining input chars are ignored if the callback
-	       chooses to skip the input */
-	}
+        /* First check whether an aligned read of a C 'long' is possible.
+           Unaligned reads cost more, so defer to a later iteration. */
+        if (!((size_t) q & LONG_PTR_MASK)) {
+            /* Fast path for runs of non-surrogate chars. */
+            register const unsigned char *_q = q;
+            Py_UNICODE *_p = p;
+            if (native_ordering) {
+                /* Native ordering is simple: as long as the input cannot
+                   possibly contain a surrogate char, do an unrolled copy
+                   of several 16-bit code points to the target object.
+                   The non-surrogate check is done on several input bytes
+                   at a time (as many as a C 'long' can contain). */
+                while (_q < aligned_end) {
+                    unsigned long data = * (unsigned long *) _q;
+                    if (data & FAST_CHAR_MASK)
+                        break;
+                    _p[0] = ((unsigned short *) _q)[0];
+                    _p[1] = ((unsigned short *) _q)[1];
+#if (SIZEOF_LONG == 8)
+                    _p[2] = ((unsigned short *) _q)[2];
+                    _p[3] = ((unsigned short *) _q)[3];
+#endif
+                    _q += SIZEOF_LONG;
+                    _p += SIZEOF_LONG / 2;
+                }
+            }
+            else {
+                /* Byteswapped ordering is similar, but we must decompose
+                   the copy bytewise, and take care of zero'ing out the
+                   upper bytes if the target object is in 32-bit units
+                   (that is, in UCS-4 builds). */
+                while (_q < aligned_end) {
+                    unsigned long data = * (unsigned long *) _q;
+                    if (data & SWAPPED_FAST_CHAR_MASK)
+                        break;
+                    /* Zero upper bytes in UCS-4 builds */
+#if (Py_UNICODE_SIZE > 2)
+                    _p[0] = 0;
+                    _p[1] = 0;
+#if (SIZEOF_LONG == 8)
+                    _p[2] = 0;
+                    _p[3] = 0;
+#endif
+#endif
+                    ((unsigned char *) _p)[1] = _q[0];
+                    ((unsigned char *) _p)[0] = _q[1];
+                    ((unsigned char *) _p)[1 + Py_UNICODE_SIZE] = _q[2];
+                    ((unsigned char *) _p)[0 + Py_UNICODE_SIZE] = _q[3];
+#if (SIZEOF_LONG == 8)
+                    ((unsigned char *) _p)[1 + 2 * Py_UNICODE_SIZE] = _q[4];
+                    ((unsigned char *) _p)[0 + 2 * Py_UNICODE_SIZE] = _q[5];
+                    ((unsigned char *) _p)[1 + 3 * Py_UNICODE_SIZE] = _q[6];
+                    ((unsigned char *) _p)[0 + 3 * Py_UNICODE_SIZE] = _q[7];
+#endif
+                    _q += SIZEOF_LONG;
+                    _p += SIZEOF_LONG / 2;
+                }
+            }
+            p = _p;
+            q = _q;
+            if (q >= e)
+                break;
+        }
 	ch = (q[ihi] << 8) | q[ilo];
 
 	q += 2;
@@ -2686,10 +2820,10 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
 	}
 
 	/* UTF-16 code pair: */
-	if (q >= e) {
+	if (q > e) {
 	    errmsg = "unexpected end of data";
-	    startinpos = (((const char *)q)-2)-starts;
-	    endinpos = ((const char *)e)-starts;
+	    startinpos = (((const char *)q) - 2) - starts;
+	    endinpos = ((const char *)e) + 1 - starts;
 	    goto utf16Error;
 	}
 	if (0xD800 <= ch && ch <= 0xDBFF) {
@@ -2718,14 +2852,47 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
 	/* Fall through to report the error */
 
     utf16Error:
-	outpos = p-PyUnicode_AS_UNICODE(unicode);
+	outpos = p - PyUnicode_AS_UNICODE(unicode);
 	if (unicode_decode_call_errorhandler(
-	         errors, &errorHandler,
-	         "utf16", errmsg,
-	         &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
-	         &unicode, &outpos, &p))
+                errors,
+                &errorHandler,
+                "utf16", errmsg,
+                &starts,
+                (const char **)&e,
+                &startinpos,
+                &endinpos,
+                &exc,
+                (const char **)&q,
+                &unicode,
+                &outpos,
+                &p))
 	    goto onError;
     }
+    /* remaining byte at the end? (size should be even) */
+    if (e == q) {
+        if (!consumed) {
+            errmsg = "truncated data";
+            startinpos = ((const char *)q) - starts;
+            endinpos = ((const char *)e) + 1 - starts;
+            outpos = p - PyUnicode_AS_UNICODE(unicode);
+            if (unicode_decode_call_errorhandler(
+                    errors,
+                    &errorHandler,
+                    "utf16", errmsg,
+                    &starts,
+                    (const char **)&e,
+                    &startinpos,
+                    &endinpos,
+                    &exc,
+                    (const char **)&q,
+                    &unicode,
+                    &outpos,
+                    &p))
+                goto onError;
+            /* The remaining input chars are ignored if the callback
+               chooses to skip the input */
+        }
+    }
 
     if (byteorder)
         *byteorder = bo;
@@ -2748,6 +2915,9 @@ onError:
     return NULL;
 }
 
+#undef FAST_CHAR_MASK
+#undef SWAPPED_FAST_CHAR_MASK
+
 PyObject *
 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
 		      Py_ssize_t size,
@@ -3571,6 +3741,7 @@ PyObject *PyUnicode_DecodeLatin1(const char *s,
 {
     PyUnicodeObject *v;
     Py_UNICODE *p;
+    const char *e, *unrolled_end;
 
     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
     if (size == 1) {
@@ -3584,8 +3755,20 @@ PyObject *PyUnicode_DecodeLatin1(const char *s,
     if (size == 0)
 	return (PyObject *)v;
     p = PyUnicode_AS_UNICODE(v);
-    while (size-- > 0)
-	*p++ = (unsigned char)*s++;
+    e = s + size;
+    /* Unrolling the copy makes it much faster by reducing the looping
+       overhead. This is similar to what many memcpy() implementations do. */
+    unrolled_end = e - 4;
+    while (s < unrolled_end) {
+        p[0] = (unsigned char) s[0];
+        p[1] = (unsigned char) s[1];
+        p[2] = (unsigned char) s[2];
+        p[3] = (unsigned char) s[3];
+        s += 4;
+        p += 4;
+    }
+    while (s < e)
+        *p++ = (unsigned char) *s++;
     return (PyObject *)v;
 
  onError: