Optimize backslashreplace error handler

Issue #25318: Optimize the backslashreplace and xmlcharrefreplace error
handlers in the UTF-8 encoder. Also optimize the backslashreplace error
handler for the ASCII and Latin1 encoders.

Use the new _PyBytesWriter API to optimize these error handlers for the
encoders. This avoids creating an exception and calling the slow implementation
of the error handler.
This commit is contained in:
Victor Stinner 2015-10-09 01:39:28 +02:00
parent fdfbf78114
commit e7bf86cd7d
2 changed files with 160 additions and 51 deletions

View File

@ -334,7 +334,6 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
i += (endpos - startpos - 1);
break;
case _Py_ERROR_SURROGATEPASS:
for (k=startpos; k<endpos; k++) {
ch = data[k];
@ -345,6 +344,22 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
i += (endpos - startpos - 1);
break;
case _Py_ERROR_BACKSLASHREPLACE:
p = backslashreplace(&writer, max_char_size, p,
unicode, startpos, endpos);
if (p == NULL)
goto error;
i += (endpos - startpos - 1);
break;
case _Py_ERROR_XMLCHARREFREPLACE:
p = xmlcharrefreplace(&writer, max_char_size, p,
unicode, startpos, endpos);
if (p == NULL)
goto error;
i += (endpos - startpos - 1);
break;
case _Py_ERROR_SURROGATEESCAPE:
for (k=startpos; k<endpos; k++) {
ch = data[k];
@ -359,7 +374,6 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
startpos = k;
assert(startpos < endpos);
/* fall through the default handler */
default:
rep = unicode_encode_call_errorhandler(
errors, &error_handler_obj, "utf-8", "surrogates not allowed",

View File

@ -305,9 +305,10 @@ typedef enum {
_Py_ERROR_UNKNOWN=0,
_Py_ERROR_STRICT,
_Py_ERROR_SURROGATEESCAPE,
_Py_ERROR_SURROGATEPASS,
_Py_ERROR_REPLACE,
_Py_ERROR_IGNORE,
_Py_ERROR_BACKSLASHREPLACE,
_Py_ERROR_SURROGATEPASS,
_Py_ERROR_XMLCHARREFREPLACE,
_Py_ERROR_OTHER
} _Py_error_handler;
@ -315,18 +316,18 @@ typedef enum {
static _Py_error_handler
get_error_handler(const char *errors)
{
if (errors == NULL)
return _Py_ERROR_STRICT;
if (strcmp(errors, "strict") == 0)
if (errors == NULL || strcmp(errors, "strict") == 0)
return _Py_ERROR_STRICT;
if (strcmp(errors, "surrogateescape") == 0)
return _Py_ERROR_SURROGATEESCAPE;
if (strcmp(errors, "surrogatepass") == 0)
return _Py_ERROR_SURROGATEPASS;
if (strcmp(errors, "ignore") == 0)
return _Py_ERROR_IGNORE;
if (strcmp(errors, "replace") == 0)
return _Py_ERROR_REPLACE;
if (strcmp(errors, "ignore") == 0)
return _Py_ERROR_IGNORE;
if (strcmp(errors, "backslashreplace") == 0)
return _Py_ERROR_BACKSLASHREPLACE;
if (strcmp(errors, "surrogatepass") == 0)
return _Py_ERROR_SURROGATEPASS;
if (strcmp(errors, "xmlcharrefreplace") == 0)
return _Py_ERROR_XMLCHARREFREPLACE;
return _Py_ERROR_OTHER;
@ -771,6 +772,126 @@ unicode_result_unchanged(PyObject *unicode)
return _PyUnicode_Copy(unicode);
}
/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
   ASCII, Latin1, UTF-8, etc.

   Every character of unicode in the range [collstart, collend) is written
   at str as a backslash escape: 'x' + 2 hex digits for code points below
   0x100, 'u' + 4 hex digits below 0x10000, and 'U' + 8 hex digits otherwise.

   prealloc_per_char is the number of output bytes per replaced character
   that is treated as already reserved in the writer; only the part of the
   replacement exceeding prealloc_per_char * (collend - collstart) is
   requested through _PyBytesWriter_Prepare().

   Return the position just after the last escape written.  Return NULL on
   failure: OverflowError is set here if the replacement size would exceed
   PY_SSIZE_T_MAX, and a NULL result of _PyBytesWriter_Prepare() is
   propagated unchanged. */
static char*
backslashreplace(_PyBytesWriter *writer, Py_ssize_t prealloc_per_char,
                 char *str,
                 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
{
    Py_ssize_t size, i, prealloc;
    Py_UCS4 ch;
    enum PyUnicode_Kind kind;
    void *data;

    assert(PyUnicode_IS_READY(unicode));
    kind = PyUnicode_KIND(unicode);
    data = PyUnicode_DATA(unicode);

    size = 0;
    /* determine replacement size; each increment must match the number of
       bytes produced by the corresponding sprintf() format below */
    for (i = collstart; i < collend; ++i) {
        Py_ssize_t incr;

        ch = PyUnicode_READ(kind, data, i);
        if (ch < 0x100)
            incr = 2+2;
        else if (ch < 0x10000)
            incr = 2+4;
        else {
            assert(ch <= MAX_UNICODE);
            /* 2 prefix bytes + 8 hex digits: "\\U%08x" always emits eight
               digits, so counting only six would under-reserve the buffer */
            incr = 2+8;
        }
        /* check before adding so that the signed sum cannot overflow */
        if (size > PY_SSIZE_T_MAX - incr) {
            PyErr_SetString(PyExc_OverflowError,
                            "encoded result is too long for a Python string");
            return NULL;
        }
        size += incr;
    }

    /* only ask the writer for the bytes not covered by the preallocation */
    prealloc = prealloc_per_char * (collend - collstart);
    if (size > prealloc) {
        str = _PyBytesWriter_Prepare(writer, str, size - prealloc);
        if (str == NULL)
            return NULL;
    }

    /* generate replacement */
    /* NOTE(review): sprintf() also stores a terminating NUL after each
       escape; the final one lands one byte past the counted size.
       Presumably the writer keeps a spare byte for it -- confirm. */
    for (i = collstart; i < collend; ++i) {
        ch = PyUnicode_READ(kind, data, i);
        if (ch < 0x100)
            str += sprintf(str, "\\x%02x", ch);
        else if (ch < 0x10000)
            str += sprintf(str, "\\u%04x", ch);
        else {
            assert(ch <= MAX_UNICODE);
            str += sprintf(str, "\\U%08x", ch);
        }
    }
    return str;
}
/* Implementation of the "xmlcharrefreplace" error handler for 8-bit
   encodings: ASCII, Latin1, UTF-8, etc.

   Each character of unicode in the range [collstart, collend) is written at
   str as a decimal character reference: "&#", the code point in decimal,
   then ";".

   prealloc_per_char is the number of output bytes per replaced character
   that is treated as already reserved in the writer; only the excess over
   prealloc_per_char * (collend - collstart) is requested through
   _PyBytesWriter_Prepare().

   Return the position just after the last reference written.  Return NULL
   on failure: OverflowError is set here if the replacement size would
   exceed PY_SSIZE_T_MAX, and a NULL result of _PyBytesWriter_Prepare() is
   propagated unchanged. */
static char*
xmlcharrefreplace(_PyBytesWriter *writer, Py_ssize_t prealloc_per_char,
                  char *str,
                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
{
    Py_ssize_t pos;
    Py_ssize_t needed = 0;
    Py_ssize_t reserved;
    enum PyUnicode_Kind kind;
    void *data;

    assert(PyUnicode_IS_READY(unicode));
    kind = PyUnicode_KIND(unicode);
    data = PyUnicode_DATA(unicode);

    /* first pass: add up the length of every "&#...;" reference */
    for (pos = collstart; pos < collend; ++pos) {
        Py_UCS4 code = PyUnicode_READ(kind, data, pos);
        Py_ssize_t digits;
        Py_ssize_t width;

        /* number of decimal digits needed for the code point */
        if (code < 10)
            digits = 1;
        else if (code < 100)
            digits = 2;
        else if (code < 1000)
            digits = 3;
        else if (code < 10000)
            digits = 4;
        else if (code < 100000)
            digits = 5;
        else if (code < 1000000)
            digits = 6;
        else {
            assert(code <= MAX_UNICODE);
            digits = 7;
        }

        /* "&#" prefix + digits + ";" suffix */
        width = 2 + digits + 1;

        /* test before adding so that the signed sum cannot overflow */
        if (needed > PY_SSIZE_T_MAX - width) {
            PyErr_SetString(PyExc_OverflowError,
                            "encoded result is too long for a Python string");
            return NULL;
        }
        needed += width;
    }

    /* grow the writer only by what the preallocation does not cover */
    reserved = prealloc_per_char * (collend - collstart);
    if (needed > reserved) {
        str = _PyBytesWriter_Prepare(writer, str, needed - reserved);
        if (str == NULL)
            return NULL;
    }

    /* second pass: emit the references */
    pos = collstart;
    while (pos < collend) {
        str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, pos));
        ++pos;
    }
    return str;
}
/* --- Bloom Filters ----------------------------------------------------- */
/* stuff to implement simple "bloom filters" for Unicode characters.
@ -6713,7 +6834,6 @@ unicode_encode_ucs1(PyObject *unicode,
++pos;
}
else {
Py_ssize_t requiredsize;
PyObject *repunicode;
Py_ssize_t repsize, newpos, i;
/* startpos for collecting unencodable chars */
@ -6744,42 +6864,19 @@ unicode_encode_ucs1(PyObject *unicode,
pos = collend;
break;
case _Py_ERROR_XMLCHARREFREPLACE:
requiredsize = 0;
/* determine replacement size */
for (i = collstart; i < collend; ++i) {
Py_ssize_t incr;
ch = PyUnicode_READ(kind, data, i);
if (ch < 10)
incr = 2+1+1;
else if (ch < 100)
incr = 2+2+1;
else if (ch < 1000)
incr = 2+3+1;
else if (ch < 10000)
incr = 2+4+1;
else if (ch < 100000)
incr = 2+5+1;
else if (ch < 1000000)
incr = 2+6+1;
else {
assert(ch <= MAX_UNICODE);
incr = 2+7+1;
}
if (requiredsize > PY_SSIZE_T_MAX - incr)
goto overflow;
requiredsize += incr;
}
str = _PyBytesWriter_Prepare(&writer, str, requiredsize-1);
case _Py_ERROR_BACKSLASHREPLACE:
str = backslashreplace(&writer, 1, str,
unicode, collstart, collend);
if (str == NULL)
goto onError;
pos = collend;
break;
/* generate replacement */
for (i = collstart; i < collend; ++i) {
str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
}
case _Py_ERROR_XMLCHARREFREPLACE:
str = xmlcharrefreplace(&writer, 1, str,
unicode, collstart, collend);
if (str == NULL)
goto onError;
pos = collend;
break;
@ -6810,9 +6907,11 @@ unicode_encode_ucs1(PyObject *unicode,
if (PyBytes_Check(repunicode)) {
/* Directly copy bytes result to output. */
repsize = PyBytes_Size(repunicode);
str = _PyBytesWriter_Prepare(&writer, str, repsize-1);
if (str == NULL)
goto onError;
if (repsize > 1) {
str = _PyBytesWriter_Prepare(&writer, str, repsize-1);
if (str == NULL)
goto onError;
}
memcpy(str, PyBytes_AsString(repunicode), repsize);
str += repsize;
pos = newpos;
@ -6856,10 +6955,6 @@ unicode_encode_ucs1(PyObject *unicode,
Py_XDECREF(exc);
return _PyBytesWriter_Finish(&writer, str);
overflow:
PyErr_SetString(PyExc_OverflowError,
"encoded result is too long for a Python string");
onError:
_PyBytesWriter_Dealloc(&writer);
Py_XDECREF(error_handler_obj);