Issue #14738: Speed-up UTF-8 decoding on non-ASCII data. Patch by Serhiy Storchaka.

This commit is contained in:
Antoine Pitrou 2012-05-10 16:36:02 +02:00
parent fda08b0860
commit ca5f91b888
8 changed files with 336 additions and 572 deletions

View File

@ -10,6 +10,9 @@ What's New in Python 3.3.0 Alpha 4?
Core and Builtins
-----------------
- Issue #14738: Speed-up UTF-8 decoding on non-ASCII data. Patch by Serhiy
Storchaka.
- Issue #14700: Fix two broken and undefined-behaviour-inducing overflow checks
in old-style string formatting.

View File

@ -7,6 +7,7 @@
#define STRINGLIB(F) asciilib_##F
#define STRINGLIB_OBJECT PyUnicodeObject
#define STRINGLIB_SIZEOF_CHAR 1
#define STRINGLIB_MAX_CHAR 0x7Fu
#define STRINGLIB_CHAR Py_UCS1
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U"

View File

@ -15,19 +15,18 @@
# error C 'long' size should be either 4 or 8!
#endif
Py_LOCAL_INLINE(int)
STRINGLIB(utf8_try_decode)(const char *start, const char *end,
STRINGLIB_CHAR *dest,
const char **src_pos, Py_ssize_t *dest_index)
Py_LOCAL_INLINE(Py_UCS4)
STRINGLIB(utf8_decode)(const char **inptr, const char *end,
STRINGLIB_CHAR *dest,
Py_ssize_t *outpos)
{
int ret;
Py_ssize_t n;
const char *s = start;
Py_UCS4 ch;
const char *s = *inptr;
const char *aligned_end = (const char *) ((size_t) end & ~LONG_PTR_MASK);
STRINGLIB_CHAR *p = dest;
STRINGLIB_CHAR *p = dest + *outpos;
while (s < end) {
Py_UCS4 ch = (unsigned char)*s;
ch = (unsigned char)*s;
if (ch < 0x80) {
/* Fast path for runs of ASCII characters. Given that common UTF-8
@ -48,15 +47,33 @@ STRINGLIB(utf8_try_decode)(const char *start, const char *end,
unsigned long value = *(unsigned long *) _s;
if (value & ASCII_CHAR_MASK)
break;
_p[0] = _s[0];
_p[1] = _s[1];
_p[2] = _s[2];
_p[3] = _s[3];
#if (SIZEOF_LONG == 8)
_p[4] = _s[4];
_p[5] = _s[5];
_p[6] = _s[6];
_p[7] = _s[7];
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
_p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
_p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
_p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
_p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
# if SIZEOF_LONG == 8
_p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
_p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
_p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
_p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
# endif
#else
# if SIZEOF_LONG == 8
_p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
_p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
_p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
_p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
_p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
_p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
_p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
_p[7] = (STRINGLIB_CHAR)(value & 0xFFu);
# else
_p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
_p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
_p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
_p[3] = (STRINGLIB_CHAR)(value & 0xFFu);
# endif
#endif
_s += SIZEOF_LONG;
_p += SIZEOF_LONG;
@ -67,87 +84,135 @@ STRINGLIB(utf8_try_decode)(const char *start, const char *end,
break;
ch = (unsigned char)*s;
}
if (ch < 0x80) {
s++;
*p++ = ch;
continue;
}
}
if (ch < 0x80) {
s++;
if (ch < 0xC2) {
/* invalid sequence
\x80-\xBF -- continuation byte
\xC0-\xC1 -- fake 0000-007F */
goto InvalidStart;
}
if (ch < 0xE0) {
/* \xC2\x80-\xDF\xBF -- 0080-07FF */
Py_UCS4 ch2;
if (end - s < 2) {
/* unexpected end of data: the caller will decide whether
it's an error or not */
break;
}
ch2 = (unsigned char)s[1];
if ((ch2 & 0xC0) != 0x80)
/* invalid continuation byte */
goto InvalidContinuation;
ch = (ch << 6) + ch2 -
((0xC0 << 6) + 0x80);
assert ((ch > 0x007F) && (ch <= 0x07FF));
s += 2;
if (STRINGLIB_MAX_CHAR <= 0x007F ||
(STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))
goto Overflow;
*p++ = ch;
continue;
}
n = utf8_code_length[ch];
if (s + n > end) {
/* unexpected end of data: the caller will decide whether
it's an error or not */
goto _error;
}
switch (n) {
case 0:
/* invalid start byte */
goto _error;
case 1:
/* internal error */
goto _error;
case 2:
if ((s[1] & 0xc0) != 0x80)
/* invalid continuation byte */
goto _error;
ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
assert ((ch > 0x007F) && (ch <= 0x07FF));
s += 2;
*p++ = ch;
break;
case 3:
/* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
will result in surrogates in range d800-dfff. Surrogates are
not valid UTF-8 so they are rejected.
See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
(table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
if ((s[1] & 0xc0) != 0x80 ||
(s[2] & 0xc0) != 0x80 ||
((unsigned char)s[0] == 0xE0 &&
(unsigned char)s[1] < 0xA0) ||
((unsigned char)s[0] == 0xED &&
(unsigned char)s[1] > 0x9F)) {
/* invalid continuation byte */
goto _error;
if (ch < 0xF0) {
/* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
Py_UCS4 ch2, ch3;
if (end - s < 3) {
/* unexpected end of data: the caller will decide whether
it's an error or not */
break;
}
ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
ch2 = (unsigned char)s[1];
ch3 = (unsigned char)s[2];
if ((ch2 & 0xC0) != 0x80 ||
(ch3 & 0xC0) != 0x80) {
/* invalid continuation byte */
goto InvalidContinuation;
}
if (ch == 0xE0) {
if (ch2 < 0xA0)
/* invalid sequence
\xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
goto InvalidContinuation;
}
else if (ch == 0xED && ch2 > 0x9F) {
/* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
will result in surrogates in range D800-DFFF. Surrogates are
not valid UTF-8 so they are rejected.
See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
(table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
goto InvalidContinuation;
}
ch = (ch << 12) + (ch2 << 6) + ch3 -
((0xE0 << 12) + (0x80 << 6) + 0x80);
assert ((ch > 0x07FF) && (ch <= 0xFFFF));
s += 3;
if (STRINGLIB_MAX_CHAR <= 0x07FF ||
(STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))
goto Overflow;
*p++ = ch;
break;
case 4:
if ((s[1] & 0xc0) != 0x80 ||
(s[2] & 0xc0) != 0x80 ||
(s[3] & 0xc0) != 0x80 ||
((unsigned char)s[0] == 0xF0 &&
(unsigned char)s[1] < 0x90) ||
((unsigned char)s[0] == 0xF4 &&
(unsigned char)s[1] > 0x8F)) {
/* invalid continuation byte */
goto _error;
}
ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
s += 4;
*p++ = ch;
break;
continue;
}
if (ch < 0xF5) {
/* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
Py_UCS4 ch2, ch3, ch4;
if (end - s < 4) {
/* unexpected end of data: the caller will decide whether
it's an error or not */
break;
}
ch2 = (unsigned char)s[1];
ch3 = (unsigned char)s[2];
ch4 = (unsigned char)s[3];
if ((ch2 & 0xC0) != 0x80 ||
(ch3 & 0xC0) != 0x80 ||
(ch4 & 0xC0) != 0x80) {
/* invalid continuation byte */
goto InvalidContinuation;
}
if (ch == 0xF0) {
if (ch2 < 0x90)
/* invalid sequence
\xF0\x80\x80\x80-\xF0\x80\xBF\xBF -- fake 0000-FFFF */
goto InvalidContinuation;
}
else if (ch == 0xF4 && ch2 > 0x8F) {
/* invalid sequence
\xF4\x90\x80\80- -- 110000- overflow */
goto InvalidContinuation;
}
ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -
((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);
assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));
s += 4;
if (STRINGLIB_MAX_CHAR <= 0xFFFF ||
(STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))
goto Overflow;
*p++ = ch;
continue;
}
goto InvalidStart;
}
ret = 0;
goto _ok;
_error:
ret = -1;
_ok:
*src_pos = s;
*dest_index = p - dest;
return ret;
ch = 0;
Overflow:
Return:
*inptr = s;
*outpos = p - dest;
return ch;
InvalidStart:
ch = 1;
goto Return;
InvalidContinuation:
ch = 2;
goto Return;
}
#undef LONG_PTR_MASK

View File

@ -7,6 +7,7 @@
#define STRINGLIB(F) ucs1lib_##F
#define STRINGLIB_OBJECT PyUnicodeObject
#define STRINGLIB_SIZEOF_CHAR 1
#define STRINGLIB_MAX_CHAR 0xFFu
#define STRINGLIB_CHAR Py_UCS1
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U"

View File

@ -7,6 +7,7 @@
#define STRINGLIB(F) ucs2lib_##F
#define STRINGLIB_OBJECT PyUnicodeObject
#define STRINGLIB_SIZEOF_CHAR 2
#define STRINGLIB_MAX_CHAR 0xFFFFu
#define STRINGLIB_CHAR Py_UCS2
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U"

View File

@ -7,6 +7,7 @@
#define STRINGLIB(F) ucs4lib_##F
#define STRINGLIB_OBJECT PyUnicodeObject
#define STRINGLIB_SIZEOF_CHAR 4
#define STRINGLIB_MAX_CHAR 0x10FFFFu
#define STRINGLIB_CHAR Py_UCS4
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U"

View File

@ -1,6 +1,7 @@
#undef FASTSEARCH
#undef STRINGLIB
#undef STRINGLIB_SIZEOF_CHAR
#undef STRINGLIB_MAX_CHAR
#undef STRINGLIB_CHAR
#undef STRINGLIB_STR
#undef STRINGLIB_LEN

View File

@ -4615,28 +4615,6 @@ PyUnicode_EncodeUTF7(const Py_UNICODE *s,
/* --- UTF-8 Codec -------------------------------------------------------- */
static
char utf8_code_length[256] = {
/* Map UTF-8 encoded prefix byte to sequence length. Zero means
illegal prefix. See RFC 3629 for details */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */
};
PyObject *
PyUnicode_DecodeUTF8(const char *s,
Py_ssize_t size,
@ -4645,6 +4623,10 @@ PyUnicode_DecodeUTF8(const char *s,
return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
}
#include "stringlib/asciilib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"
#include "stringlib/ucs1lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"
@ -4670,310 +4652,60 @@ PyUnicode_DecodeUTF8(const char *s,
# error C 'long' size should be either 4 or 8!
#endif
/* Scans a UTF-8 string and returns the maximum character to be expected
and the size of the decoded unicode string.
This function doesn't check for errors, these checks are performed in
PyUnicode_DecodeUTF8Stateful.
*/
static Py_UCS4
utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{
Py_ssize_t char_count = 0;
const unsigned char *end = p + string_size;
const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
const char *p = start;
const char *aligned_end = (const char *) ((size_t) end & ~LONG_PTR_MASK);
assert(unicode_size != NULL);
/* By having a cascade of independent loops which fallback onto each
other, we minimize the amount of work done in the average loop
iteration, and we also maximize the CPU's ability to predict
branches correctly (because a given condition will have always the
same boolean outcome except perhaps in the last iteration of the
corresponding loop).
In the general case this brings us rather close to decoding
performance pre-PEP 393, despite the two-pass decoding.
Note that the pure ASCII loop is not duplicated once a non-ASCII
character has been encountered. It is actually a pessimization (by
a significant factor) to use this loop on text with many non-ASCII
characters, and it is important to avoid bad performance on valid
utf-8 data (invalid utf-8 being a different can of worms).
*/
/* ASCII */
for (; p < end; ++p) {
/* Only check value if it's not a ASCII char... */
if (*p < 0x80) {
/* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
an explanation. */
if (!((size_t) p & LONG_PTR_MASK)) {
/* Help register allocation */
register const unsigned char *_p = p;
while (_p < aligned_end) {
unsigned long value = *(unsigned long *) _p;
if (value & ASCII_CHAR_MASK)
break;
_p += SIZEOF_LONG;
char_count += SIZEOF_LONG;
}
p = _p;
if (p == end)
break;
}
}
if (*p < 0x80)
++char_count;
else
goto _ucs1loop;
}
*unicode_size = char_count;
return 127;
_ucs1loop:
for (; p < end; ++p) {
if (*p < 0xc4)
char_count += ((*p & 0xc0) != 0x80);
else
goto _ucs2loop;
}
*unicode_size = char_count;
return 255;
_ucs2loop:
for (; p < end; ++p) {
if (*p < 0xf0)
char_count += ((*p & 0xc0) != 0x80);
else
goto _ucs4loop;
}
*unicode_size = char_count;
return 65535;
_ucs4loop:
for (; p < end; ++p) {
char_count += ((*p & 0xc0) != 0x80);
}
*unicode_size = char_count;
return 65537;
}
/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
in case of errors. Implicit parameters: unicode, kind, data, onError.
Potential resizing overallocates, so the result needs to shrink at the end.
*/
#define WRITE_MAYBE_FAIL(index, value) \
do { \
Py_ssize_t pos = index; \
if (pos > PyUnicode_GET_LENGTH(unicode) && \
unicode_resize(&unicode, pos + pos/8) < 0) \
goto onError; \
if (unicode_putchar(&unicode, &pos, value) < 0) \
goto onError; \
} while (0)
static PyObject *
decode_utf8_errors(const char *starts,
Py_ssize_t size,
const char *errors,
Py_ssize_t *consumed,
const char *s,
PyObject *unicode,
Py_ssize_t i)
{
int n;
int k;
Py_ssize_t startinpos;
Py_ssize_t endinpos;
const char *e = starts + size;
const char *aligned_end;
const char *errmsg = "";
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
while (s < e) {
Py_UCS4 ch = (unsigned char)*s;
if (ch < 0x80) {
/* Fast path for runs of ASCII characters. Given that common UTF-8
input will consist of an overwhelming majority of ASCII
characters, we try to optimize for this case by checking
as many characters as a C 'long' can contain.
First, check if we can do an aligned read, as most CPUs have
a penalty for unaligned reads.
*/
if (!((size_t) s & LONG_PTR_MASK)) {
/* Help register allocation */
register const char *_s = s;
register Py_ssize_t _i = i;
while (_s < aligned_end) {
/* Read a whole long at a time (either 4 or 8 bytes),
and do a fast unrolled copy if it only contains ASCII
characters. */
unsigned long value = *(unsigned long *) _s;
if (value & ASCII_CHAR_MASK)
break;
WRITE_MAYBE_FAIL(_i+0, _s[0]);
WRITE_MAYBE_FAIL(_i+1, _s[1]);
WRITE_MAYBE_FAIL(_i+2, _s[2]);
WRITE_MAYBE_FAIL(_i+3, _s[3]);
#if (SIZEOF_LONG == 8)
WRITE_MAYBE_FAIL(_i+4, _s[4]);
WRITE_MAYBE_FAIL(_i+5, _s[5]);
WRITE_MAYBE_FAIL(_i+6, _s[6]);
WRITE_MAYBE_FAIL(_i+7, _s[7]);
#endif
_s += SIZEOF_LONG;
_i += SIZEOF_LONG;
}
s = _s;
i = _i;
if (s == e)
break;
ch = (unsigned char)*s;
}
}
if (ch < 0x80) {
WRITE_MAYBE_FAIL(i++, ch);
s++;
continue;
}
n = utf8_code_length[ch];
if (s + n > e) {
if (consumed)
#if SIZEOF_LONG <= SIZEOF_VOID_P
assert(!((size_t) dest & LONG_PTR_MASK));
if (!((size_t) p & LONG_PTR_MASK)) {
/* Fast path, see in STRINGLIB(utf8_decode) for
an explanation. */
/* Help register allocation */
register const char *_p = p;
register Py_UCS1 * q = dest;
while (_p < aligned_end) {
unsigned long value = *(const unsigned long *) _p;
if (value & ASCII_CHAR_MASK)
break;
else {
errmsg = "unexpected end of data";
startinpos = s-starts;
endinpos = startinpos+1;
for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
endinpos++;
goto utf8Error;
}
*((unsigned long *)q) = value;
_p += SIZEOF_LONG;
q += SIZEOF_LONG;
}
switch (n) {
case 0:
errmsg = "invalid start byte";
startinpos = s-starts;
endinpos = startinpos+1;
goto utf8Error;
case 1:
errmsg = "internal error";
startinpos = s-starts;
endinpos = startinpos+1;
goto utf8Error;
case 2:
if ((s[1] & 0xc0) != 0x80) {
errmsg = "invalid continuation byte";
startinpos = s-starts;
endinpos = startinpos + 1;
goto utf8Error;
}
ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
assert ((ch > 0x007F) && (ch <= 0x07FF));
WRITE_MAYBE_FAIL(i++, ch);
break;
case 3:
/* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
will result in surrogates in range d800-dfff. Surrogates are
not valid UTF-8 so they are rejected.
See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
(table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
if ((s[1] & 0xc0) != 0x80 ||
(s[2] & 0xc0) != 0x80 ||
((unsigned char)s[0] == 0xE0 &&
(unsigned char)s[1] < 0xA0) ||
((unsigned char)s[0] == 0xED &&
(unsigned char)s[1] > 0x9F)) {
errmsg = "invalid continuation byte";
startinpos = s-starts;
endinpos = startinpos + 1;
/* if s[1] first two bits are 1 and 0, then the invalid
continuation byte is s[2], so increment endinpos by 1,
if not, s[1] is invalid and endinpos doesn't need to
be incremented. */
if ((s[1] & 0xC0) == 0x80)
endinpos++;
goto utf8Error;
}
ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
assert ((ch > 0x07FF) && (ch <= 0xFFFF));
WRITE_MAYBE_FAIL(i++, ch);
break;
case 4:
if ((s[1] & 0xc0) != 0x80 ||
(s[2] & 0xc0) != 0x80 ||
(s[3] & 0xc0) != 0x80 ||
((unsigned char)s[0] == 0xF0 &&
(unsigned char)s[1] < 0x90) ||
((unsigned char)s[0] == 0xF4 &&
(unsigned char)s[1] > 0x8F)) {
errmsg = "invalid continuation byte";
startinpos = s-starts;
endinpos = startinpos + 1;
if ((s[1] & 0xC0) == 0x80) {
endinpos++;
if ((s[2] & 0xC0) == 0x80)
endinpos++;
}
goto utf8Error;
}
ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
WRITE_MAYBE_FAIL(i++, ch);
break;
p = _p;
while (p < end) {
if ((unsigned char)*p & 0x80)
break;
*q++ = *p++;
}
s += n;
continue;
utf8Error:
if (unicode_decode_call_errorhandler(
errors, &errorHandler,
"utf-8", errmsg,
&starts, &e, &startinpos, &endinpos, &exc, &s,
&unicode, &i))
goto onError;
/* Update data because unicode_decode_call_errorhandler might have
re-created or resized the unicode object. */
aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
return p - start;
}
if (consumed)
*consumed = s-starts;
/* Adjust length and ready string when it contained errors and
is of the old resizable kind. */
if (unicode_resize(&unicode, i) < 0)
goto onError;
unicode_adjust_maxchar(&unicode);
if (unicode == NULL)
goto onError;
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
assert(_PyUnicode_CheckConsistency(unicode, 1));
return unicode;
onError:
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
Py_XDECREF(unicode);
return NULL;
#endif
while (p < end) {
/* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
for an explanation. */
if (!((size_t) p & LONG_PTR_MASK)) {
/* Help register allocation */
register const char *_p = p;
while (_p < aligned_end) {
unsigned long value = *(unsigned long *) _p;
if (value & ASCII_CHAR_MASK)
break;
_p += SIZEOF_LONG;
}
p = _p;
if (_p == end)
break;
}
if ((unsigned char)*p & 0x80)
break;
++p;
}
memcpy(dest, start, p - start);
return p - start;
}
#undef WRITE_MAYBE_FAIL
PyObject *
PyUnicode_DecodeUTF8Stateful(const char *s,
@ -4981,15 +4713,16 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
const char *errors,
Py_ssize_t *consumed)
{
Py_UCS4 maxchar = 0;
Py_ssize_t unicode_size;
int has_errors = 0;
PyObject *unicode;
int kind;
void *data;
const char *starts = s;
const char *e;
Py_ssize_t i;
const char *end = s + size;
Py_ssize_t outpos;
Py_ssize_t startinpos;
Py_ssize_t endinpos;
const char *errmsg = "";
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
if (size == 0) {
if (consumed)
@ -4998,49 +4731,91 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
return unicode_empty;
}
maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
/* When the string is ASCII only, just use memcpy and return.
unicode_size may be != size if there is an incomplete UTF-8
sequence at the end of the ASCII block. */
if (maxchar < 128 && size == unicode_size) {
/* ASCII is equivalent to the first 128 ordinals in Unicode. */
if (size == 1 && (unsigned char)s[0] < 128) {
if (consumed)
*consumed = size;
return unicode_fromascii((const unsigned char *)s, size);
*consumed = 1;
return get_latin1_char((unsigned char)s[0]);
}
unicode = PyUnicode_New(unicode_size, maxchar);
unicode = PyUnicode_New(size, 127);
if (!unicode)
return NULL;
kind = PyUnicode_KIND(unicode);
data = PyUnicode_DATA(unicode);
/* Unpack UTF-8 encoded data */
i = 0;
e = starts + size;
switch (kind) {
case PyUnicode_1BYTE_KIND:
has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
break;
case PyUnicode_2BYTE_KIND:
has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
break;
case PyUnicode_4BYTE_KIND:
has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
break;
}
if (!has_errors) {
/* Ensure the unicode size calculation was correct */
assert(i == unicode_size);
assert(s == e);
if (consumed)
*consumed = size;
return unicode;
outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode));
s += outpos;
while (s < end) {
Py_UCS4 ch;
int kind = PyUnicode_KIND(unicode);
if (kind == PyUnicode_1BYTE_KIND) {
if (PyUnicode_IS_ASCII(unicode))
ch = asciilib_utf8_decode(&s, end,
PyUnicode_1BYTE_DATA(unicode), &outpos);
else
ch = ucs1lib_utf8_decode(&s, end,
PyUnicode_1BYTE_DATA(unicode), &outpos);
} else if (kind == PyUnicode_2BYTE_KIND) {
ch = ucs2lib_utf8_decode(&s, end,
PyUnicode_2BYTE_DATA(unicode), &outpos);
} else {
assert(kind == PyUnicode_4BYTE_KIND);
ch = ucs4lib_utf8_decode(&s, end,
PyUnicode_4BYTE_DATA(unicode), &outpos);
}
switch (ch) {
case 0:
if (s == end || consumed)
goto End;
errmsg = "unexpected end of data";
startinpos = s - starts;
endinpos = startinpos + 1;
while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
endinpos++;
break;
case 1:
errmsg = "invalid start byte";
startinpos = s - starts;
endinpos = startinpos + 1;
break;
case 2:
errmsg = "invalid continuation byte";
startinpos = s - starts;
endinpos = startinpos + 1;
while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
endinpos++;
break;
default:
if (unicode_putchar(&unicode, &outpos, ch) < 0)
goto onError;
continue;
}
if (unicode_decode_call_errorhandler(
errors, &errorHandler,
"utf-8", errmsg,
&starts, &end, &startinpos, &endinpos, &exc, &s,
&unicode, &outpos))
goto onError;
}
/* In case of errors, maxchar and size computation might be incorrect;
code below refits and resizes as necessary. */
return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
End:
if (unicode_resize(&unicode, outpos) < 0)
goto onError;
if (consumed)
*consumed = s - starts;
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
assert(_PyUnicode_CheckConsistency(unicode, 1));
return unicode;
onError:
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
Py_XDECREF(unicode);
return NULL;
}
#ifdef __APPLE__
@ -5051,9 +4826,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
int n;
const char *e;
wchar_t *unicode, *p;
wchar_t *unicode;
Py_ssize_t outpos;
/* Note: size will always be longer than the resulting Unicode
character count */
@ -5066,86 +4841,33 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
return NULL;
/* Unpack UTF-8 encoded data */
p = unicode;
e = s + size;
outpos = 0;
while (s < e) {
Py_UCS4 ch = (unsigned char)*s;
if (ch < 0x80) {
*p++ = (wchar_t)ch;
s++;
continue;
}
n = utf8_code_length[ch];
if (s + n > e) {
goto surrogateescape;
}
switch (n) {
case 0:
case 1:
goto surrogateescape;
case 2:
if ((s[1] & 0xc0) != 0x80)
goto surrogateescape;
ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
assert ((ch > 0x007F) && (ch <= 0x07FF));
*p++ = (wchar_t)ch;
break;
case 3:
/* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
will result in surrogates in range d800-dfff. Surrogates are
not valid UTF-8 so they are rejected.
See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
(table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
if ((s[1] & 0xc0) != 0x80 ||
(s[2] & 0xc0) != 0x80 ||
((unsigned char)s[0] == 0xE0 &&
(unsigned char)s[1] < 0xA0) ||
((unsigned char)s[0] == 0xED &&
(unsigned char)s[1] > 0x9F)) {
goto surrogateescape;
}
ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
assert ((ch > 0x07FF) && (ch <= 0xFFFF));
*p++ = (wchar_t)ch;
break;
case 4:
if ((s[1] & 0xc0) != 0x80 ||
(s[2] & 0xc0) != 0x80 ||
(s[3] & 0xc0) != 0x80 ||
((unsigned char)s[0] == 0xF0 &&
(unsigned char)s[1] < 0x90) ||
((unsigned char)s[0] == 0xF4 &&
(unsigned char)s[1] > 0x8F)) {
goto surrogateescape;
}
ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
*p++ = (wchar_t)ch;
ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
/* compute and append the two surrogates: */
*p++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
*p++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
assert(0);
#else
assert(Py_UNICODE_IS_SURROGATE(ch));
/* compute and append the two surrogates: */
unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
break;
}
s += n;
continue;
surrogateescape:
*p++ = 0xDC00 + ch;
s++;
else {
if (!ch && s == e)
break;
/* surrogateescape */
unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
}
}
*p = L'\0';
unicode[outpos] = L'\0';
return unicode;
}
@ -6970,17 +6692,13 @@ PyUnicode_DecodeASCII(const char *s,
const char *errors)
{
const char *starts = s;
PyObject *v;
PyObject *unicode;
int kind;
void *data;
Py_ssize_t startinpos;
Py_ssize_t endinpos;
Py_ssize_t outpos;
const char *e;
int has_error;
const unsigned char *p = (const unsigned char *)s;
const unsigned char *end = p + size;
const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
@ -6993,45 +6711,18 @@ PyUnicode_DecodeASCII(const char *s,
if (size == 1 && (unsigned char)s[0] < 128)
return get_latin1_char((unsigned char)s[0]);
has_error = 0;
while (p < end && !has_error) {
/* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
an explanation. */
if (!((size_t) p & LONG_PTR_MASK)) {
/* Help register allocation */
register const unsigned char *_p = p;
while (_p < aligned_end) {
unsigned long value = *(unsigned long *) _p;
if (value & ASCII_CHAR_MASK) {
has_error = 1;
break;
}
_p += SIZEOF_LONG;
}
if (_p == end)
break;
if (has_error)
break;
p = _p;
}
if (*p & 0x80) {
has_error = 1;
break;
}
else {
++p;
}
}
if (!has_error)
return unicode_fromascii((const unsigned char *)s, size);
v = PyUnicode_New(size, 127);
if (v == NULL)
unicode = PyUnicode_New(size, 127);
if (unicode == NULL)
goto onError;
kind = PyUnicode_KIND(v);
data = PyUnicode_DATA(v);
outpos = 0;
e = s + size;
data = PyUnicode_1BYTE_DATA(unicode);
outpos = ascii_decode(s, e, (Py_UCS1 *)data);
if (outpos == size)
return unicode;
s += outpos;
kind = PyUnicode_1BYTE_KIND;
while (s < e) {
register unsigned char c = (unsigned char)*s;
if (c < 128) {
@ -7045,21 +6736,21 @@ PyUnicode_DecodeASCII(const char *s,
errors, &errorHandler,
"ascii", "ordinal not in range(128)",
&starts, &e, &startinpos, &endinpos, &exc, &s,
&v, &outpos))
&unicode, &outpos))
goto onError;
kind = PyUnicode_KIND(v);
data = PyUnicode_DATA(v);
kind = PyUnicode_KIND(unicode);
data = PyUnicode_DATA(unicode);
}
}
if (unicode_resize(&v, outpos) < 0)
if (unicode_resize(&unicode, outpos) < 0)
goto onError;
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
assert(_PyUnicode_CheckConsistency(v, 1));
return v;
assert(_PyUnicode_CheckConsistency(unicode, 1));
return unicode;
onError:
Py_XDECREF(v);
Py_XDECREF(unicode);
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
return NULL;