Move the slowest UTF-8 decoder to its own subfunction

* Create decode_utf8_errors()
 * Reuse unicode_fromascii()
 * decode_utf8_errors() doesn't refit at the beginning
 * Remove refit_partial_string(), use unicode_adjust_maxchar() instead
This commit is contained in:
Victor Stinner 2011-12-11 20:09:03 +01:00
parent 84def3774d
commit 785938eebd
1 changed files with 98 additions and 128 deletions

View File

@ -1784,7 +1784,7 @@ _PyUnicode_ClearStaticStrings()
static PyObject*
unicode_fromascii(const unsigned char* s, Py_ssize_t size)
{
PyObject *res;
PyObject *unicode;
#ifdef Py_DEBUG
const unsigned char *p;
const unsigned char *end = s + size;
@ -1794,11 +1794,12 @@ unicode_fromascii(const unsigned char* s, Py_ssize_t size)
#endif
if (size == 1)
return get_latin1_char(s[0]);
res = PyUnicode_New(size, 127);
if (!res)
unicode = PyUnicode_New(size, 127);
if (!unicode)
return NULL;
memcpy(PyUnicode_1BYTE_DATA(res), s, size);
return res;
memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
assert(_PyUnicode_CheckConsistency(unicode, 1));
return unicode;
}
static Py_UCS4
@ -4320,126 +4321,38 @@ _ucs4loop:
return 65537;
}
/* Called when we encountered some error that wasn't detected in the original
scan, e.g. an encoded surrogate character. The original maxchar computation
may have been incorrect, so redo it. */
static int
refit_partial_string(PyObject **unicode, int kind, void *data, Py_ssize_t n)
{
PyObject *tmp;
Py_ssize_t k;
Py_UCS4 maxchar;
for (k = 0, maxchar = 0; k < n; k++)
maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
tmp = PyUnicode_New(PyUnicode_GET_LENGTH(*unicode), maxchar);
if (tmp == NULL)
return -1;
PyUnicode_CopyCharacters(tmp, 0, *unicode, 0, n);
Py_DECREF(*unicode);
*unicode = tmp;
return 0;
}
/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
in case of errors. Implicit parameters: unicode, kind, data, has_errors,
onError. Potential resizing overallocates, so the result needs to shrink
at the end.
in case of errors. Implicit parameters: unicode, kind, data, onError.
Potential resizing overallocates, so the result needs to shrink at the end.
*/
#define WRITE_MAYBE_FAIL(index, value) \
do { \
if (has_errors) { \
Py_ssize_t pos = index; \
if (pos > PyUnicode_GET_LENGTH(unicode) && \
unicode_resize(&unicode, pos + pos/8) < 0) \
goto onError; \
if (unicode_putchar(&unicode, &pos, value) < 0) \
goto onError; \
} \
else \
PyUnicode_WRITE(kind, data, index, value); \
#define WRITE_MAYBE_FAIL(index, value) \
do { \
Py_ssize_t pos = index; \
if (pos > PyUnicode_GET_LENGTH(unicode) && \
unicode_resize(&unicode, pos + pos/8) < 0) \
goto onError; \
if (unicode_putchar(&unicode, &pos, value) < 0) \
goto onError; \
} while (0)
PyObject *
PyUnicode_DecodeUTF8Stateful(const char *s,
Py_ssize_t size,
const char *errors,
Py_ssize_t *consumed)
decode_utf8_errors(const char *starts,
Py_ssize_t size,
const char *errors,
Py_ssize_t *consumed,
const char *s,
PyObject *unicode,
Py_ssize_t i)
{
const char *starts = s;
int n;
int k;
Py_ssize_t startinpos;
Py_ssize_t endinpos;
const char *e, *aligned_end;
PyObject *unicode;
const char *e = starts + size;
const char *aligned_end;
const char *errmsg = "";
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
Py_UCS4 maxchar = 0;
Py_ssize_t unicode_size;
Py_ssize_t i;
int kind;
void *data;
int has_errors = 0;
if (size == 0) {
if (consumed)
*consumed = 0;
return (PyObject *)PyUnicode_New(0, 0);
}
maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
/* When the string is ASCII only, just use memcpy and return.
unicode_size may be != size if there is an incomplete UTF-8
sequence at the end of the ASCII block. */
if (maxchar < 128 && size == unicode_size) {
if (consumed)
*consumed = size;
if (size == 1)
return get_latin1_char((unsigned char)s[0]);
unicode = PyUnicode_New(unicode_size, maxchar);
if (!unicode)
return NULL;
Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
assert(_PyUnicode_CheckConsistency(unicode, 1));
return unicode;
}
/* In case of errors, maxchar and size computation might be incorrect;
code below refits and resizes as necessary. */
unicode = PyUnicode_New(unicode_size, maxchar);
if (!unicode)
return NULL;
kind = PyUnicode_KIND(unicode);
data = PyUnicode_DATA(unicode);
/* Unpack UTF-8 encoded data */
i = 0;
e = s + size;
switch (kind) {
case PyUnicode_1BYTE_KIND:
has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
break;
case PyUnicode_2BYTE_KIND:
has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
break;
case PyUnicode_4BYTE_KIND:
has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
break;
}
if (!has_errors) {
/* Ensure the unicode size calculation was correct */
assert(i == unicode_size);
assert(s == e);
if (consumed)
*consumed = s-starts;
return unicode;
}
/* Fall through to the generic decoding loop for the rest of
the string */
if (refit_partial_string(&unicode, kind, data, i) < 0)
goto onError;
aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
@ -4591,11 +4504,6 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
continue;
utf8Error:
if (!has_errors) {
if (refit_partial_string(&unicode, kind, data, i) < 0)
goto onError;
has_errors = 1;
}
if (unicode_decode_call_errorhandler(
errors, &errorHandler,
"utf8", errmsg,
@ -4604,22 +4512,18 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
goto onError;
/* Update data because unicode_decode_call_errorhandler might have
re-created or resized the unicode object. */
data = PyUnicode_DATA(unicode);
kind = PyUnicode_KIND(unicode);
aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
}
/* Ensure the unicode_size calculation above was correct: */
assert(has_errors || i == unicode_size);
if (consumed)
*consumed = s-starts;
/* Adjust length and ready string when it contained errors and
is of the old resizable kind. */
if (has_errors) {
if (PyUnicode_Resize(&unicode, i) < 0)
goto onError;
}
if (unicode_resize(&unicode, i) < 0)
goto onError;
unicode_adjust_maxchar(&unicode);
if (unicode == NULL)
goto onError;
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
@ -4629,12 +4533,78 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
onError:
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
Py_DECREF(unicode);
Py_XDECREF(unicode);
return NULL;
}
#undef WRITE_MAYBE_FAIL
PyObject *
PyUnicode_DecodeUTF8Stateful(const char *s,
Py_ssize_t size,
const char *errors,
Py_ssize_t *consumed)
{
Py_UCS4 maxchar = 0;
Py_ssize_t unicode_size;
int has_errors = 0;
PyObject *unicode;
int kind;
void *data;
const char *starts = s;
const char *e;
Py_ssize_t i;
if (size == 0) {
if (consumed)
*consumed = 0;
return (PyObject *)PyUnicode_New(0, 0);
}
maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
/* When the string is ASCII only, just use memcpy and return.
unicode_size may be != size if there is an incomplete UTF-8
sequence at the end of the ASCII block. */
if (maxchar < 128 && size == unicode_size) {
if (consumed)
*consumed = size;
return unicode_fromascii(s, size);
}
unicode = PyUnicode_New(unicode_size, maxchar);
if (!unicode)
return NULL;
kind = PyUnicode_KIND(unicode);
data = PyUnicode_DATA(unicode);
/* Unpack UTF-8 encoded data */
i = 0;
e = starts + size;
switch (kind) {
case PyUnicode_1BYTE_KIND:
has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
break;
case PyUnicode_2BYTE_KIND:
has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
break;
case PyUnicode_4BYTE_KIND:
has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
break;
}
if (!has_errors) {
/* Ensure the unicode size calculation was correct */
assert(i == unicode_size);
assert(s == e);
if (consumed)
*consumed = size;
return unicode;
}
/* In case of errors, maxchar and size computation might be incorrect;
code below refits and resizes as necessary. */
return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
}
#ifdef __APPLE__
/* Simplified UTF-8 decoder using surrogateescape error handler,