mirror of https://github.com/python/cpython
Issue #13417: speed up utf-8 decoding by around 2x for the non-fully-ASCII case.
This almost catches up with pre-PEP 393 performance, when decoding needed only one pass.
This commit is contained in:
parent
7fe601c5bf
commit
0a3229de6b
|
@ -645,6 +645,7 @@ BYTESTR_DEPS = \
|
||||||
|
|
||||||
UNICODE_DEPS = $(BYTESTR_DEPS) \
|
UNICODE_DEPS = $(BYTESTR_DEPS) \
|
||||||
$(srcdir)/Objects/stringlib/asciilib.h \
|
$(srcdir)/Objects/stringlib/asciilib.h \
|
||||||
|
$(srcdir)/Objects/stringlib/codecs.h \
|
||||||
$(srcdir)/Objects/stringlib/ucs1lib.h \
|
$(srcdir)/Objects/stringlib/ucs1lib.h \
|
||||||
$(srcdir)/Objects/stringlib/ucs2lib.h \
|
$(srcdir)/Objects/stringlib/ucs2lib.h \
|
||||||
$(srcdir)/Objects/stringlib/ucs4lib.h \
|
$(srcdir)/Objects/stringlib/ucs4lib.h \
|
||||||
|
|
|
@ -0,0 +1,156 @@
|
||||||
|
/* stringlib: codec implementations */
|
||||||
|
|
||||||
|
#if STRINGLIB_IS_UNICODE
|
||||||
|
|
||||||
|
/* Mask to check or force alignment of a pointer to C 'long' boundaries */
|
||||||
|
#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
|
||||||
|
|
||||||
|
/* Mask to quickly check whether a C 'long' contains a
|
||||||
|
non-ASCII, UTF8-encoded char. */
|
||||||
|
#if (SIZEOF_LONG == 8)
|
||||||
|
# define ASCII_CHAR_MASK 0x8080808080808080L
|
||||||
|
#elif (SIZEOF_LONG == 4)
|
||||||
|
# define ASCII_CHAR_MASK 0x80808080L
|
||||||
|
#else
|
||||||
|
# error C 'long' size should be either 4 or 8!
|
||||||
|
#endif
|
||||||
|
|
||||||
|
Py_LOCAL_INLINE(int)
|
||||||
|
STRINGLIB(utf8_try_decode)(const char *start, const char *end,
|
||||||
|
STRINGLIB_CHAR *dest,
|
||||||
|
const char **src_pos, Py_ssize_t *dest_index)
|
||||||
|
{
|
||||||
|
int ret;
|
||||||
|
Py_ssize_t n;
|
||||||
|
const char *s = start;
|
||||||
|
const char *aligned_end = (const char *) ((size_t) end & ~LONG_PTR_MASK);
|
||||||
|
STRINGLIB_CHAR *p = dest;
|
||||||
|
|
||||||
|
while (s < end) {
|
||||||
|
Py_UCS4 ch = (unsigned char)*s;
|
||||||
|
|
||||||
|
if (ch < 0x80) {
|
||||||
|
/* Fast path for runs of ASCII characters. Given that common UTF-8
|
||||||
|
input will consist of an overwhelming majority of ASCII
|
||||||
|
characters, we try to optimize for this case by checking
|
||||||
|
as many characters as a C 'long' can contain.
|
||||||
|
First, check if we can do an aligned read, as most CPUs have
|
||||||
|
a penalty for unaligned reads.
|
||||||
|
*/
|
||||||
|
if (!((size_t) s & LONG_PTR_MASK)) {
|
||||||
|
/* Help register allocation */
|
||||||
|
register const char *_s = s;
|
||||||
|
register STRINGLIB_CHAR *_p = p;
|
||||||
|
while (_s < aligned_end) {
|
||||||
|
/* Read a whole long at a time (either 4 or 8 bytes),
|
||||||
|
and do a fast unrolled copy if it only contains ASCII
|
||||||
|
characters. */
|
||||||
|
unsigned long value = *(unsigned long *) _s;
|
||||||
|
if (value & ASCII_CHAR_MASK)
|
||||||
|
break;
|
||||||
|
_p[0] = _s[0];
|
||||||
|
_p[1] = _s[1];
|
||||||
|
_p[2] = _s[2];
|
||||||
|
_p[3] = _s[3];
|
||||||
|
#if (SIZEOF_LONG == 8)
|
||||||
|
_p[4] = _s[4];
|
||||||
|
_p[5] = _s[5];
|
||||||
|
_p[6] = _s[6];
|
||||||
|
_p[7] = _s[7];
|
||||||
|
#endif
|
||||||
|
_s += SIZEOF_LONG;
|
||||||
|
_p += SIZEOF_LONG;
|
||||||
|
}
|
||||||
|
s = _s;
|
||||||
|
p = _p;
|
||||||
|
if (s == end)
|
||||||
|
break;
|
||||||
|
ch = (unsigned char)*s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ch < 0x80) {
|
||||||
|
s++;
|
||||||
|
*p++ = ch;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
n = utf8_code_length[ch];
|
||||||
|
|
||||||
|
if (s + n > end) {
|
||||||
|
/* unexpected end of data: the caller will decide whether
|
||||||
|
it's an error or not */
|
||||||
|
goto _error;
|
||||||
|
}
|
||||||
|
|
||||||
|
switch (n) {
|
||||||
|
case 0:
|
||||||
|
/* invalid start byte */
|
||||||
|
goto _error;
|
||||||
|
case 1:
|
||||||
|
/* internal error */
|
||||||
|
goto _error;
|
||||||
|
case 2:
|
||||||
|
if ((s[1] & 0xc0) != 0x80)
|
||||||
|
/* invalid continuation byte */
|
||||||
|
goto _error;
|
||||||
|
ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
|
||||||
|
assert ((ch > 0x007F) && (ch <= 0x07FF));
|
||||||
|
s += 2;
|
||||||
|
*p++ = ch;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case 3:
|
||||||
|
/* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
|
||||||
|
will result in surrogates in range d800-dfff. Surrogates are
|
||||||
|
not valid UTF-8 so they are rejected.
|
||||||
|
See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
|
||||||
|
(table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
|
||||||
|
if ((s[1] & 0xc0) != 0x80 ||
|
||||||
|
(s[2] & 0xc0) != 0x80 ||
|
||||||
|
((unsigned char)s[0] == 0xE0 &&
|
||||||
|
(unsigned char)s[1] < 0xA0) ||
|
||||||
|
((unsigned char)s[0] == 0xED &&
|
||||||
|
(unsigned char)s[1] > 0x9F)) {
|
||||||
|
/* invalid continuation byte */
|
||||||
|
goto _error;
|
||||||
|
}
|
||||||
|
ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
|
||||||
|
assert ((ch > 0x07FF) && (ch <= 0xFFFF));
|
||||||
|
s += 3;
|
||||||
|
*p++ = ch;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case 4:
|
||||||
|
if ((s[1] & 0xc0) != 0x80 ||
|
||||||
|
(s[2] & 0xc0) != 0x80 ||
|
||||||
|
(s[3] & 0xc0) != 0x80 ||
|
||||||
|
((unsigned char)s[0] == 0xF0 &&
|
||||||
|
(unsigned char)s[1] < 0x90) ||
|
||||||
|
((unsigned char)s[0] == 0xF4 &&
|
||||||
|
(unsigned char)s[1] > 0x8F)) {
|
||||||
|
/* invalid continuation byte */
|
||||||
|
goto _error;
|
||||||
|
}
|
||||||
|
ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
|
||||||
|
((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
|
||||||
|
assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
|
||||||
|
s += 4;
|
||||||
|
*p++ = ch;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ret = 0;
|
||||||
|
goto _ok;
|
||||||
|
_error:
|
||||||
|
ret = -1;
|
||||||
|
_ok:
|
||||||
|
*src_pos = s;
|
||||||
|
*dest_index = p - dest;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
#undef LONG_PTR_MASK
|
||||||
|
#undef ASCII_CHAR_MASK
|
||||||
|
|
||||||
|
#endif /* STRINGLIB_IS_UNICODE */
|
|
@ -523,6 +523,7 @@ make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
|
||||||
#include "stringlib/fastsearch.h"
|
#include "stringlib/fastsearch.h"
|
||||||
#include "stringlib/count.h"
|
#include "stringlib/count.h"
|
||||||
#include "stringlib/find.h"
|
#include "stringlib/find.h"
|
||||||
|
#include "stringlib/undef.h"
|
||||||
|
|
||||||
/* --- Unicode Object ----------------------------------------------------- */
|
/* --- Unicode Object ----------------------------------------------------- */
|
||||||
|
|
||||||
|
@ -4190,6 +4191,18 @@ PyUnicode_DecodeUTF8(const char *s,
|
||||||
return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
|
return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#include "stringlib/ucs1lib.h"
|
||||||
|
#include "stringlib/codecs.h"
|
||||||
|
#include "stringlib/undef.h"
|
||||||
|
|
||||||
|
#include "stringlib/ucs2lib.h"
|
||||||
|
#include "stringlib/codecs.h"
|
||||||
|
#include "stringlib/undef.h"
|
||||||
|
|
||||||
|
#include "stringlib/ucs4lib.h"
|
||||||
|
#include "stringlib/codecs.h"
|
||||||
|
#include "stringlib/undef.h"
|
||||||
|
|
||||||
/* Mask to check or force alignment of a pointer to C 'long' boundaries */
|
/* Mask to check or force alignment of a pointer to C 'long' boundaries */
|
||||||
#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
|
#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
|
||||||
|
|
||||||
|
@ -4203,33 +4216,41 @@ PyUnicode_DecodeUTF8(const char *s,
|
||||||
# error C 'long' size should be either 4 or 8!
|
# error C 'long' size should be either 4 or 8!
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* Scans a UTF-8 string and returns the maximum character to be expected,
|
/* Scans a UTF-8 string and returns the maximum character to be expected
|
||||||
the size of the decoded unicode string and if any major errors were
|
and the size of the decoded unicode string.
|
||||||
encountered.
|
|
||||||
|
|
||||||
This function does check basic UTF-8 sanity, it does however NOT CHECK
|
This function doesn't check for errors, these checks are performed in
|
||||||
if the string contains surrogates, and if all continuation bytes are
|
|
||||||
within the correct ranges, these checks are performed in
|
|
||||||
PyUnicode_DecodeUTF8Stateful.
|
PyUnicode_DecodeUTF8Stateful.
|
||||||
|
|
||||||
If it sets has_errors to 1, it means the value of unicode_size and max_char
|
|
||||||
will be bogus and you should not rely on useful information in them.
|
|
||||||
*/
|
*/
|
||||||
static Py_UCS4
|
static Py_UCS4
|
||||||
utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
|
utf8_max_char_size_and_char_count(const char *s, Py_ssize_t string_size,
|
||||||
Py_ssize_t *unicode_size, Py_ssize_t* consumed,
|
Py_ssize_t *unicode_size)
|
||||||
int *has_errors)
|
|
||||||
{
|
{
|
||||||
Py_ssize_t n;
|
|
||||||
Py_ssize_t char_count = 0;
|
Py_ssize_t char_count = 0;
|
||||||
Py_UCS4 max_char = 127, new_max;
|
|
||||||
Py_UCS4 upper_bound;
|
|
||||||
const unsigned char *p = (const unsigned char *)s;
|
const unsigned char *p = (const unsigned char *)s;
|
||||||
const unsigned char *end = p + string_size;
|
const unsigned char *end = p + string_size;
|
||||||
const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
|
const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
|
||||||
int err = 0;
|
|
||||||
|
|
||||||
for (; p < end && !err; ++p, ++char_count) {
|
assert(unicode_size != NULL);
|
||||||
|
|
||||||
|
/* By having a cascade of independent loops which fallback onto each
|
||||||
|
other, we minimize the amount of work done in the average loop
|
||||||
|
iteration, and we also maximize the CPU's ability to predict
|
||||||
|
branches correctly (because a given condition will have always the
|
||||||
|
same boolean outcome except perhaps in the last iteration of the
|
||||||
|
corresponding loop).
|
||||||
|
In the general case this brings us rather close to decoding
|
||||||
|
performance pre-PEP 393, despite the two-pass decoding.
|
||||||
|
|
||||||
|
Note that the pure ASCII loop is not duplicated once a non-ASCII
|
||||||
|
character has been encountered. It is actually a pessimization (by
|
||||||
|
a significant factor) to use this loop on text with many non-ASCII
|
||||||
|
characters, and it is important to avoid bad performance on valid
|
||||||
|
utf-8 data (invalid utf-8 being a different can of worms).
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* ASCII */
|
||||||
|
for (; p < end; ++p) {
|
||||||
/* Only check value if it's not a ASCII char... */
|
/* Only check value if it's not a ASCII char... */
|
||||||
if (*p < 0x80) {
|
if (*p < 0x80) {
|
||||||
/* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
|
/* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
|
||||||
|
@ -4249,76 +4270,59 @@ utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (*p >= 0x80) {
|
if (*p < 0x80)
|
||||||
n = utf8_code_length[*p];
|
++char_count;
|
||||||
new_max = max_char;
|
else
|
||||||
switch (n) {
|
goto _ucs1loop;
|
||||||
/* invalid start byte */
|
|
||||||
case 0:
|
|
||||||
err = 1;
|
|
||||||
break;
|
|
||||||
case 2:
|
|
||||||
/* Code points between 0x00FF and 0x07FF inclusive.
|
|
||||||
Approximate the upper bound of the code point,
|
|
||||||
if this flips over 255 we can be sure it will be more
|
|
||||||
than 255 and the string will need 2 bytes per code coint,
|
|
||||||
if it stays under or equal to 255, we can be sure 1 byte
|
|
||||||
is enough.
|
|
||||||
((*p & 0b00011111) << 6) | 0b00111111 */
|
|
||||||
upper_bound = ((*p & 0x1F) << 6) | 0x3F;
|
|
||||||
if (max_char < upper_bound)
|
|
||||||
new_max = upper_bound;
|
|
||||||
/* Ensure we track at least that we left ASCII space. */
|
|
||||||
if (new_max < 128)
|
|
||||||
new_max = 128;
|
|
||||||
break;
|
|
||||||
case 3:
|
|
||||||
/* Between 0x0FFF and 0xFFFF inclusive, so values are
|
|
||||||
always > 255 and <= 65535 and will always need 2 bytes. */
|
|
||||||
if (max_char < 65535)
|
|
||||||
new_max = 65535;
|
|
||||||
break;
|
|
||||||
case 4:
|
|
||||||
/* Code point will be above 0xFFFF for sure in this case. */
|
|
||||||
new_max = 65537;
|
|
||||||
break;
|
|
||||||
/* Internal error, this should be caught by the first if */
|
|
||||||
case 1:
|
|
||||||
default:
|
|
||||||
assert(0 && "Impossible case in utf8_max_char_and_size");
|
|
||||||
err = 1;
|
|
||||||
}
|
|
||||||
/* Instead of number of overall bytes for this code point,
|
|
||||||
n contains the number of following bytes: */
|
|
||||||
--n;
|
|
||||||
/* Check if the follow up chars are all valid continuation bytes */
|
|
||||||
if (n >= 1) {
|
|
||||||
const unsigned char *cont;
|
|
||||||
if ((p + n) >= end) {
|
|
||||||
if (consumed == 0)
|
|
||||||
/* incomplete data, non-incremental decoding */
|
|
||||||
err = 1;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
for (cont = p + 1; cont <= (p + n); ++cont) {
|
|
||||||
if ((*cont & 0xc0) != 0x80) {
|
|
||||||
err = 1;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
p += n;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
err = 1;
|
|
||||||
max_char = new_max;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
*unicode_size = char_count;
|
||||||
|
return 127;
|
||||||
|
|
||||||
if (unicode_size)
|
_ucs1loop:
|
||||||
*unicode_size = char_count;
|
for (; p < end; ++p) {
|
||||||
if (has_errors)
|
if (*p < 0xc4)
|
||||||
*has_errors = err;
|
char_count += ((*p & 0xc0) != 0x80);
|
||||||
return max_char;
|
else
|
||||||
|
goto _ucs2loop;
|
||||||
|
}
|
||||||
|
*unicode_size = char_count;
|
||||||
|
return 255;
|
||||||
|
|
||||||
|
_ucs2loop:
|
||||||
|
for (; p < end; ++p) {
|
||||||
|
if (*p < 0xf0)
|
||||||
|
char_count += ((*p & 0xc0) != 0x80);
|
||||||
|
else
|
||||||
|
goto _ucs4loop;
|
||||||
|
}
|
||||||
|
*unicode_size = char_count;
|
||||||
|
return 65535;
|
||||||
|
|
||||||
|
_ucs4loop:
|
||||||
|
for (; p < end; ++p) {
|
||||||
|
char_count += ((*p & 0xc0) != 0x80);
|
||||||
|
}
|
||||||
|
*unicode_size = char_count;
|
||||||
|
return 65537;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Called when we encountered some error that wasn't detected in the original
|
||||||
|
scan, e.g. an encoded surrogate character. The original maxchar computation
|
||||||
|
may have been incorrect, so redo it. */
|
||||||
|
static int
|
||||||
|
refit_partial_string(PyObject **unicode, int kind, void *data, Py_ssize_t n)
|
||||||
|
{
|
||||||
|
PyObject *tmp;
|
||||||
|
Py_ssize_t k, maxchar;
|
||||||
|
for (k = 0, maxchar = 0; k < n; k++)
|
||||||
|
maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
|
||||||
|
tmp = PyUnicode_New(PyUnicode_GET_LENGTH(*unicode), maxchar);
|
||||||
|
if (tmp == NULL)
|
||||||
|
return -1;
|
||||||
|
PyUnicode_CopyCharacters(tmp, 0, *unicode, 0, n);
|
||||||
|
Py_DECREF(*unicode);
|
||||||
|
*unicode = tmp;
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
|
/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
|
||||||
|
@ -4361,35 +4365,56 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
|
||||||
Py_ssize_t i;
|
Py_ssize_t i;
|
||||||
int kind;
|
int kind;
|
||||||
void *data;
|
void *data;
|
||||||
int has_errors;
|
int has_errors = 0;
|
||||||
|
|
||||||
if (size == 0) {
|
if (size == 0) {
|
||||||
if (consumed)
|
if (consumed)
|
||||||
*consumed = 0;
|
*consumed = 0;
|
||||||
return (PyObject *)PyUnicode_New(0, 0);
|
return (PyObject *)PyUnicode_New(0, 0);
|
||||||
}
|
}
|
||||||
maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
|
maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
|
||||||
consumed, &has_errors);
|
/* In case of errors, maxchar and size computation might be incorrect;
|
||||||
if (has_errors)
|
code below refits and resizes as necessary. */
|
||||||
/* maxchar and size computation might be incorrect;
|
unicode = PyUnicode_New(unicode_size, maxchar);
|
||||||
code below widens and resizes as necessary. */
|
|
||||||
unicode = PyUnicode_New(size, 127);
|
|
||||||
else
|
|
||||||
unicode = PyUnicode_New(unicode_size, maxchar);
|
|
||||||
if (!unicode)
|
if (!unicode)
|
||||||
return NULL;
|
return NULL;
|
||||||
/* When the string is ASCII only, just use memcpy and return.
|
/* When the string is ASCII only, just use memcpy and return.
|
||||||
unicode_size may be != size if there is an incomplete UTF-8
|
unicode_size may be != size if there is an incomplete UTF-8
|
||||||
sequence at the end of the ASCII block. */
|
sequence at the end of the ASCII block. */
|
||||||
if (!has_errors && maxchar < 128 && size == unicode_size) {
|
if (maxchar < 128 && size == unicode_size) {
|
||||||
Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
|
Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
|
||||||
return unicode;
|
return unicode;
|
||||||
}
|
}
|
||||||
kind = PyUnicode_KIND(unicode);
|
kind = PyUnicode_KIND(unicode);
|
||||||
data = PyUnicode_DATA(unicode);
|
data = PyUnicode_DATA(unicode);
|
||||||
|
|
||||||
/* Unpack UTF-8 encoded data */
|
/* Unpack UTF-8 encoded data */
|
||||||
i = 0;
|
i = 0;
|
||||||
e = s + size;
|
e = s + size;
|
||||||
|
switch (kind) {
|
||||||
|
case PyUnicode_1BYTE_KIND:
|
||||||
|
has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
|
||||||
|
break;
|
||||||
|
case PyUnicode_2BYTE_KIND:
|
||||||
|
has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
|
||||||
|
break;
|
||||||
|
case PyUnicode_4BYTE_KIND:
|
||||||
|
has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (!has_errors) {
|
||||||
|
/* Ensure the unicode size calculation was correct */
|
||||||
|
assert(i == unicode_size);
|
||||||
|
assert(s == e);
|
||||||
|
if (consumed)
|
||||||
|
*consumed = s-starts;
|
||||||
|
return unicode;
|
||||||
|
}
|
||||||
|
/* Fall through to the generic decoding loop for the rest of
|
||||||
|
the string */
|
||||||
|
if (refit_partial_string(&unicode, kind, data, i) < 0)
|
||||||
|
goto onError;
|
||||||
|
|
||||||
aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
|
aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
|
||||||
|
|
||||||
while (s < e) {
|
while (s < e) {
|
||||||
|
@ -4541,19 +4566,8 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
|
||||||
|
|
||||||
utf8Error:
|
utf8Error:
|
||||||
if (!has_errors) {
|
if (!has_errors) {
|
||||||
PyObject *tmp;
|
if (refit_partial_string(&unicode, kind, data, i) < 0)
|
||||||
Py_ssize_t k;
|
|
||||||
/* We encountered some error that wasn't detected in the original scan,
|
|
||||||
e.g. an encoded surrogate character. The original maxchar computation may
|
|
||||||
have been incorrect, so redo it now. */
|
|
||||||
for (k = 0, maxchar = 0; k < i; k++)
|
|
||||||
maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
|
|
||||||
tmp = PyUnicode_New(PyUnicode_GET_LENGTH(unicode), maxchar);
|
|
||||||
if (tmp == NULL)
|
|
||||||
goto onError;
|
goto onError;
|
||||||
PyUnicode_CopyCharacters(tmp, 0, unicode, 0, i);
|
|
||||||
Py_DECREF(unicode);
|
|
||||||
unicode = tmp;
|
|
||||||
has_errors = 1;
|
has_errors = 1;
|
||||||
}
|
}
|
||||||
if (unicode_decode_call_errorhandler(
|
if (unicode_decode_call_errorhandler(
|
||||||
|
|
Loading…
Reference in New Issue