From 0d4df752acbaf14164f1e8b2b95ebe3fe288bb82 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 12 May 2015 23:12:45 +0300 Subject: [PATCH] Issue #15027: The UTF-32 encoder is now 3x to 7x faster. --- Doc/whatsnew/3.5.rst | 3 ++ Misc/NEWS | 2 + Objects/stringlib/codecs.h | 87 +++++++++++++++++++++++++++++++ Objects/unicodeobject.c | 102 +++++++++++++++---------------------- 4 files changed, 133 insertions(+), 61 deletions(-) diff --git a/Doc/whatsnew/3.5.rst b/Doc/whatsnew/3.5.rst index 8a1bd91e2c4..90418119ab1 100644 --- a/Doc/whatsnew/3.5.rst +++ b/Doc/whatsnew/3.5.rst @@ -629,6 +629,9 @@ The following performance enhancements have been added: versions 0--2 on typical data, and up to 5x in best cases). (Contributed by Serhiy Storchaka in :issue:`20416` and :issue:`23344`.) +* The UTF-32 encoder is now 3x to 7x faster. (Contributed by Serhiy Storchaka + in :issue:`15027`.) + Build and C API Changes ======================= diff --git a/Misc/NEWS b/Misc/NEWS index 5a589d63449..2aa61d0db6c 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -10,6 +10,8 @@ Release date: 2015-04-24 Core and Builtins ----------------- +- Issue #15027: The UTF-32 encoder is now 3x to 7x faster. + - Issue #20274: When calling a _sqlite.Connection, it now complains if passed any keyword arguments. Previously it silently ignored them. diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h index 2eb2d1412ff..cd5d944d316 100644 --- a/Objects/stringlib/codecs.h +++ b/Objects/stringlib/codecs.h @@ -718,6 +718,93 @@ STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in, return len - (end - in + 1); #endif } + +#if STRINGLIB_SIZEOF_CHAR == 1 +# define SWAB4(CH, tmp) ((CH) << 24) /* high bytes are zero */ +#elif STRINGLIB_SIZEOF_CHAR == 2 +# define SWAB4(CH, tmp) (tmp = (CH), \ + ((tmp & 0x00FFu) << 24) + ((tmp & 0xFF00u) << 8)) + /* high bytes are zero */ +#else +# define SWAB4(CH, tmp) (tmp = (CH), \ + tmp = ((tmp & 0x00FF00FFu) << 8) + ((tmp >> 8) & 0x00FF00FFu), \ + ((tmp & 0x0000FFFFu) << 16) + ((tmp >> 16) & 0x0000FFFFu)) +#endif +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in, + Py_ssize_t len, + PY_UINT32_T **outptr, + int native_ordering) +{ + PY_UINT32_T *out = *outptr; + const STRINGLIB_CHAR *end = in + len; + if (native_ordering) { + const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); + while (in < unrolled_end) { +#if STRINGLIB_SIZEOF_CHAR > 1 + /* check if any character is a surrogate character */ + if (((in[0] ^ 0xd800) & + (in[1] ^ 0xd800) & + (in[2] ^ 0xd800) & + (in[3] ^ 0xd800) & 0xf800) == 0) + break; +#endif + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + in += 4; out += 4; + } + while (in < end) { + Py_UCS4 ch; + ch = *in++; +#if STRINGLIB_SIZEOF_CHAR > 1 + if (Py_UNICODE_IS_SURROGATE(ch)) { + /* reject surrogate characters (U+DC800-U+DFFF) */ + goto fail; + } +#endif + *out++ = ch; + } + } else { + const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); + while (in < unrolled_end) { +#if STRINGLIB_SIZEOF_CHAR > 1 + Py_UCS4 ch1, ch2, ch3, ch4; + /* check if any character is a surrogate character */ + if (((in[0] ^ 0xd800) & + (in[1] ^ 0xd800) & + (in[2] ^ 0xd800) & + (in[3] ^ 0xd800) & 0xf800) == 0) + break; +#endif + out[0] = SWAB4(in[0], ch1); + out[1] = SWAB4(in[1], ch2); + out[2] = SWAB4(in[2], ch3); + out[3] = SWAB4(in[3], ch4); + in += 4; out += 4; + } + while (in < end) { + Py_UCS4 ch = *in++; +#if STRINGLIB_SIZEOF_CHAR > 1 + if (Py_UNICODE_IS_SURROGATE(ch)) { + /* reject surrogate characters (U+DC800-U+DFFF) */ + goto fail; + } +#endif + *out++ = SWAB4(ch, ch); + } + } + *outptr = out; + return len; +#if STRINGLIB_SIZEOF_CHAR > 1 + fail: + *outptr = out; + return len - (end - in + 1); +#endif +} +#undef SWAB4 + #endif #endif /* STRINGLIB_IS_UNICODE */ diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 3225fb3be9d..548cfff6a0a 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -5051,32 +5051,22 @@ _PyUnicode_EncodeUTF32(PyObject *str, const char *errors, int byteorder) { - int kind; - void *data; + enum PyUnicode_Kind kind; + const void *data; Py_ssize_t len; PyObject *v; - unsigned char *p; - Py_ssize_t nsize, i; - /* Offsets from p for storing byte pairs in the right order. */ + PY_UINT32_T *out; #if PY_LITTLE_ENDIAN - int iorder[] = {0, 1, 2, 3}; + int native_ordering = byteorder <= 0; #else - int iorder[] = {3, 2, 1, 0}; + int native_ordering = byteorder >= 0; #endif const char *encoding; + Py_ssize_t nsize, pos; PyObject *errorHandler = NULL; PyObject *exc = NULL; PyObject *rep = NULL; -#define STORECHAR(CH) \ - do { \ - p[iorder[3]] = ((CH) >> 24) & 0xff; \ - p[iorder[2]] = ((CH) >> 16) & 0xff; \ - p[iorder[1]] = ((CH) >> 8) & 0xff; \ - p[iorder[0]] = (CH) & 0xff; \ - p += 4; \ - } while(0) - if (!PyUnicode_Check(str)) { PyErr_BadArgument(); return NULL; @@ -5087,59 +5077,53 @@ _PyUnicode_EncodeUTF32(PyObject *str, data = PyUnicode_DATA(str); len = PyUnicode_GET_LENGTH(str); - nsize = len + (byteorder == 0); - if (nsize > PY_SSIZE_T_MAX / 4) + if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0)) return PyErr_NoMemory(); + nsize = len + (byteorder == 0); v = PyBytes_FromStringAndSize(NULL, nsize * 4); if (v == NULL) return NULL; - p = (unsigned char *)PyBytes_AS_STRING(v); + /* output buffer is 4-bytes aligned */ + assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4)); + out = (PY_UINT32_T *)PyBytes_AS_STRING(v); if (byteorder == 0) - STORECHAR(0xFEFF); + *out++ = 0xFEFF; if (len == 0) - return v; + goto done; - if (byteorder == -1) { - /* force LE */ - iorder[0] = 0; - iorder[1] = 1; - iorder[2] = 2; - iorder[3] = 3; + if (byteorder == -1) encoding = "utf-32-le"; - } - else if (byteorder == 1) { - /* force BE */ - iorder[0] = 3; - iorder[1] = 2; - iorder[2] = 1; - iorder[3] = 0; + else if (byteorder == 1) encoding = "utf-32-be"; - } else encoding = "utf-32"; if (kind == PyUnicode_1BYTE_KIND) { - for (i = 0; i < len; i++) - STORECHAR(PyUnicode_READ(kind, data, i)); - return v; + ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering); + goto done; } - for (i = 0; i < len;) { + pos = 0; + while (pos < len) { Py_ssize_t repsize, moreunits; - Py_UCS4 ch = PyUnicode_READ(kind, data, i); - i++; - assert(ch <= MAX_UNICODE); - if (!Py_UNICODE_IS_SURROGATE(ch)) { - STORECHAR(ch); - continue; + + if (kind == PyUnicode_2BYTE_KIND) { + pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos, + &out, native_ordering); } + else { + assert(kind == PyUnicode_4BYTE_KIND); + pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos, + &out, native_ordering); + } + if (pos == len) + break; rep = unicode_encode_call_errorhandler( errors, &errorHandler, encoding, "surrogates not allowed", - str, &exc, i-1, i, &i); - + str, &exc, pos, pos + 1, &pos); if (!rep) goto error; @@ -5147,7 +5131,7 @@ _PyUnicode_EncodeUTF32(PyObject *str, repsize = PyBytes_GET_SIZE(rep); if (repsize & 3) { raise_encode_exception(&exc, encoding, - str, i - 1, i, + str, pos - 1, pos, "surrogates not allowed"); goto error; } @@ -5160,7 +5144,7 @@ _PyUnicode_EncodeUTF32(PyObject *str, moreunits = repsize = PyUnicode_GET_LENGTH(rep); if (!PyUnicode_IS_ASCII(rep)) { raise_encode_exception(&exc, encoding, - str, i - 1, i, + str, pos - 1, pos, "surrogates not allowed"); goto error; } @@ -5168,7 +5152,7 @@ _PyUnicode_EncodeUTF32(PyObject *str, /* four bytes are reserved for each surrogate */ if (moreunits > 1) { - Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v); + Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v); Py_ssize_t morebytes = 4 * (moreunits - 1); if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) { /* integer overflow */ @@ -5177,20 +5161,16 @@ _PyUnicode_EncodeUTF32(PyObject *str, } if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0) goto error; - p = (unsigned char*) PyBytes_AS_STRING(v) + outpos; + out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos; } if (PyBytes_Check(rep)) { - Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize); - p += repsize; + Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize); + out += moreunits; } else /* rep is unicode */ { - const Py_UCS1 *repdata; assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); - repdata = PyUnicode_1BYTE_DATA(rep); - while (repsize--) { - Py_UCS4 ch = *repdata++; - STORECHAR(ch); - } + ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize, + &out, native_ordering); } Py_CLEAR(rep); @@ -5199,11 +5179,12 @@ _PyUnicode_EncodeUTF32(PyObject *str, /* Cut back to size actually needed. This is necessary for, for example, encoding of a string containing isolated surrogates and the 'ignore' handler is used. */ - nsize = p - (unsigned char*) PyBytes_AS_STRING(v); + nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v); if (nsize != PyBytes_GET_SIZE(v)) _PyBytes_Resize(&v, nsize); Py_XDECREF(errorHandler); Py_XDECREF(exc); + done: return v; error: Py_XDECREF(rep); @@ -5211,7 +5192,6 @@ _PyUnicode_EncodeUTF32(PyObject *str, Py_XDECREF(exc); Py_XDECREF(v); return NULL; -#undef STORECHAR } PyObject *