mirror of https://github.com/python/cpython
Issue #15026: utf-16 encoding is now significantly faster (up to 10x).
Patch by Serhiy Storchaka.
This commit is contained in:
parent
3049f1243e
commit
27f6a3b0bf
|
@ -188,9 +188,9 @@ typedef unsigned char Py_UCS1;
|
|||
(((((Py_UCS4)(high) & 0x03FF) << 10) | \
|
||||
((Py_UCS4)(low) & 0x03FF)) + 0x10000)
|
||||
/* high surrogate = top 10 bits added to D800 */
|
||||
#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800 | (((ch) - 0x10000) >> 10))
|
||||
#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800 - (0x10000 >> 10) + ((ch) >> 10))
|
||||
/* low surrogate = bottom 10 bits added to DC00 */
|
||||
#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 | (((ch) - 0x10000) & 0x3FF))
|
||||
#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 + ((ch) & 0x3FF))
|
||||
|
||||
/* Check if substring matches at given offset. The offset must be
|
||||
valid, and the substring must not be empty. */
|
||||
|
|
|
@ -10,6 +10,9 @@ What's New in Python 3.3.0 Beta 1?
|
|||
Core and Builtins
|
||||
-----------------
|
||||
|
||||
- Issue #15026: utf-16 encoding is now significantly faster (up to 10x).
|
||||
Patch by Serhiy Storchaka.
|
||||
|
||||
- Issue #11022: open() and io.TextIOWrapper are now calling
|
||||
locale.getpreferredencoding(False) instead of locale.getpreferredencoding()
|
||||
in text mode if the encoding is not specified. Don't temporarily change the
|
||||
|
|
|
@ -562,4 +562,68 @@ IllegalSurrogate:
|
|||
#undef STRIPPED_MASK
|
||||
#undef SWAB
|
||||
#undef LONG_PTR_MASK
|
||||
|
||||
|
||||
Py_LOCAL_INLINE(void)
|
||||
STRINGLIB(utf16_encode)(unsigned short *out,
|
||||
const STRINGLIB_CHAR *in,
|
||||
Py_ssize_t len,
|
||||
int native_ordering)
|
||||
{
|
||||
const STRINGLIB_CHAR *end = in + len;
|
||||
#if STRINGLIB_SIZEOF_CHAR == 1
|
||||
# define SWAB2(CH) ((CH) << 8)
|
||||
#else
|
||||
# define SWAB2(CH) (((CH) << 8) | ((CH) >> 8))
|
||||
#endif
|
||||
#if STRINGLIB_MAX_CHAR < 0x10000
|
||||
if (native_ordering) {
|
||||
# if STRINGLIB_SIZEOF_CHAR == 2
|
||||
Py_MEMCPY(out, in, 2 * len);
|
||||
# else
|
||||
_PyUnicode_CONVERT_BYTES(STRINGLIB_CHAR, unsigned short, in, end, out);
|
||||
# endif
|
||||
} else {
|
||||
const STRINGLIB_CHAR *unrolled_end = in + (len & ~ (Py_ssize_t) 3);
|
||||
while (in < unrolled_end) {
|
||||
out[0] = SWAB2(in[0]);
|
||||
out[1] = SWAB2(in[1]);
|
||||
out[2] = SWAB2(in[2]);
|
||||
out[3] = SWAB2(in[3]);
|
||||
in += 4; out += 4;
|
||||
}
|
||||
while (in < end) {
|
||||
*out++ = SWAB2(*in);
|
||||
++in;
|
||||
}
|
||||
}
|
||||
#else
|
||||
if (native_ordering) {
|
||||
while (in < end) {
|
||||
Py_UCS4 ch = *in++;
|
||||
if (ch < 0x10000)
|
||||
*out++ = ch;
|
||||
else {
|
||||
out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
|
||||
out[1] = Py_UNICODE_LOW_SURROGATE(ch);
|
||||
out += 2;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
while (in < end) {
|
||||
Py_UCS4 ch = *in++;
|
||||
if (ch < 0x10000)
|
||||
*out++ = SWAB2((Py_UCS2)ch);
|
||||
else {
|
||||
Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
|
||||
Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
|
||||
out[0] = SWAB2(ch1);
|
||||
out[1] = SWAB2(ch2);
|
||||
out += 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#undef SWAB2
|
||||
}
|
||||
#endif /* STRINGLIB_IS_UNICODE */
|
||||
|
|
|
@ -5359,27 +5359,19 @@ _PyUnicode_EncodeUTF16(PyObject *str,
|
|||
const char *errors,
|
||||
int byteorder)
|
||||
{
|
||||
int kind;
|
||||
void *data;
|
||||
enum PyUnicode_Kind kind;
|
||||
const void *data;
|
||||
Py_ssize_t len;
|
||||
PyObject *v;
|
||||
unsigned char *p;
|
||||
Py_ssize_t nsize, bytesize;
|
||||
Py_ssize_t i, pairs;
|
||||
/* Offsets from p for storing byte pairs in the right order. */
|
||||
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
|
||||
int ihi = 1, ilo = 0;
|
||||
unsigned short *out;
|
||||
Py_ssize_t bytesize;
|
||||
Py_ssize_t pairs;
|
||||
#ifdef WORDS_BIGENDIAN
|
||||
int native_ordering = byteorder >= 0;
|
||||
#else
|
||||
int ihi = 0, ilo = 1;
|
||||
int native_ordering = byteorder <= 0;
|
||||
#endif
|
||||
|
||||
#define STORECHAR(CH) \
|
||||
do { \
|
||||
p[ihi] = ((CH) >> 8) & 0xff; \
|
||||
p[ilo] = (CH) & 0xff; \
|
||||
p += 2; \
|
||||
} while(0)
|
||||
|
||||
if (!PyUnicode_Check(str)) {
|
||||
PyErr_BadArgument();
|
||||
return NULL;
|
||||
|
@ -5391,53 +5383,47 @@ _PyUnicode_EncodeUTF16(PyObject *str,
|
|||
len = PyUnicode_GET_LENGTH(str);
|
||||
|
||||
pairs = 0;
|
||||
if (kind == PyUnicode_4BYTE_KIND)
|
||||
for (i = 0; i < len; i++)
|
||||
if (PyUnicode_READ(kind, data, i) >= 0x10000)
|
||||
if (kind == PyUnicode_4BYTE_KIND) {
|
||||
const Py_UCS4 *in = (const Py_UCS4 *)data;
|
||||
const Py_UCS4 *end = in + len;
|
||||
while (in < end)
|
||||
if (*in++ >= 0x10000)
|
||||
pairs++;
|
||||
/* 2 * (len + pairs + (byteorder == 0)) */
|
||||
if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
|
||||
return PyErr_NoMemory();
|
||||
nsize = len + pairs + (byteorder == 0);
|
||||
bytesize = nsize * 2;
|
||||
if (bytesize / 2 != nsize)
|
||||
}
|
||||
if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
|
||||
return PyErr_NoMemory();
|
||||
bytesize = (len + pairs + (byteorder == 0)) * 2;
|
||||
v = PyBytes_FromStringAndSize(NULL, bytesize);
|
||||
if (v == NULL)
|
||||
return NULL;
|
||||
|
||||
p = (unsigned char *)PyBytes_AS_STRING(v);
|
||||
/* output buffer is 2-bytes aligned */
|
||||
assert(((Py_uintptr_t)PyBytes_AS_STRING(v) & 1) == 0);
|
||||
out = (unsigned short *)PyBytes_AS_STRING(v);
|
||||
if (byteorder == 0)
|
||||
STORECHAR(0xFEFF);
|
||||
*out++ = 0xFEFF;
|
||||
if (len == 0)
|
||||
goto done;
|
||||
|
||||
if (byteorder == -1) {
|
||||
/* force LE */
|
||||
ihi = 1;
|
||||
ilo = 0;
|
||||
switch (kind) {
|
||||
case PyUnicode_1BYTE_KIND: {
|
||||
ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
|
||||
break;
|
||||
}
|
||||
else if (byteorder == 1) {
|
||||
/* force BE */
|
||||
ihi = 0;
|
||||
ilo = 1;
|
||||
case PyUnicode_2BYTE_KIND: {
|
||||
ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
|
||||
break;
|
||||
}
|
||||
|
||||
for (i = 0; i < len; i++) {
|
||||
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
|
||||
Py_UCS4 ch2 = 0;
|
||||
if (ch >= 0x10000) {
|
||||
ch2 = Py_UNICODE_LOW_SURROGATE(ch);
|
||||
ch = Py_UNICODE_HIGH_SURROGATE(ch);
|
||||
}
|
||||
STORECHAR(ch);
|
||||
if (ch2)
|
||||
STORECHAR(ch2);
|
||||
case PyUnicode_4BYTE_KIND: {
|
||||
ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
assert(0);
|
||||
}
|
||||
|
||||
done:
|
||||
return v;
|
||||
#undef STORECHAR
|
||||
}
|
||||
|
||||
PyObject *
|
||||
|
|
Loading…
Reference in New Issue