From 6bd525b656f75c9752d39d9c4be1e1b29fa67cdb Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Fri, 9 Oct 2015 13:10:05 +0200 Subject: [PATCH] Optimize error handlers of ASCII and Latin1 encoders when the replacement string is pure ASCII: use _PyBytesWriter_WriteBytes(), don't check individual characters. Cleanup unicode_encode_ucs1(): * Rename repunicode to rep * Clear rep object on error * Factorize code between bytes and unicode path --- Objects/stringlib/codecs.h | 18 ++++------ Objects/unicodeobject.c | 74 +++++++++++++++++++++----------------- 2 files changed, 48 insertions(+), 44 deletions(-) diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h index 7e8d928e20b..2beb604f116 100644 --- a/Objects/stringlib/codecs.h +++ b/Objects/stringlib/codecs.h @@ -311,7 +311,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode, #if STRINGLIB_SIZEOF_CHAR > 1 else if (Py_UNICODE_IS_SURROGATE(ch)) { Py_ssize_t startpos, endpos, newpos; - Py_ssize_t repsize, k; + Py_ssize_t k; if (error_handler == _Py_ERROR_UNKNOWN) error_handler = get_error_handler(errors); @@ -392,20 +392,12 @@ STRINGLIB(utf8_encoder)(PyObject *unicode, p = _PyBytesWriter_WriteBytes(&writer, p, PyBytes_AS_STRING(rep), PyBytes_GET_SIZE(rep)); - if (p == NULL) - goto error; } else { /* rep is unicode */ if (PyUnicode_READY(rep) < 0) goto error; - repsize = PyUnicode_GET_LENGTH(rep); - - p = _PyBytesWriter_Prepare(&writer, p, repsize); - if (p == NULL) - goto error; - if (!PyUnicode_IS_ASCII(rep)) { raise_encode_exception(&exc, "utf-8", unicode, @@ -415,9 +407,13 @@ STRINGLIB(utf8_encoder)(PyObject *unicode, } assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); - memcpy(p, PyUnicode_DATA(rep), repsize); - p += repsize; + p = _PyBytesWriter_WriteBytes(&writer, p, + PyUnicode_DATA(rep), + PyUnicode_GET_LENGTH(rep)); } + + if (p == NULL) + goto error; Py_CLEAR(rep); i = newpos; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 23b8cc764d4..35df74714ca 100644 --- 
a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -6599,6 +6599,7 @@ unicode_encode_ucs1(PyObject *unicode, PyObject *error_handler_obj = NULL; PyObject *exc = NULL; _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; + PyObject *rep = NULL; /* output object */ _PyBytesWriter writer; @@ -6627,8 +6628,7 @@ unicode_encode_ucs1(PyObject *unicode, ++pos; } else { - PyObject *repunicode; - Py_ssize_t repsize, newpos, i; + Py_ssize_t newpos, i; /* startpos for collecting unencodable chars */ Py_ssize_t collstart = pos; Py_ssize_t collend = collstart + 1; @@ -6694,52 +6694,59 @@ unicode_encode_ucs1(PyObject *unicode, /* fallback to general error handling */ default: - repunicode = unicode_encode_call_errorhandler(errors, &error_handler_obj, - encoding, reason, unicode, &exc, - collstart, collend, &newpos); - if (repunicode == NULL || (PyUnicode_Check(repunicode) && - PyUnicode_READY(repunicode) == -1)) + rep = unicode_encode_call_errorhandler(errors, &error_handler_obj, + encoding, reason, unicode, &exc, + collstart, collend, &newpos); + if (rep == NULL) goto onError; /* substract preallocated bytes */ writer.min_size -= 1; - if (PyBytes_Check(repunicode)) { + if (PyBytes_Check(rep)) { /* Directly copy bytes result to output. */ str = _PyBytesWriter_WriteBytes(&writer, str, - PyBytes_AS_STRING(repunicode), - PyBytes_GET_SIZE(repunicode)); + PyBytes_AS_STRING(rep), + PyBytes_GET_SIZE(rep)); if (str == NULL) goto onError; - - pos = newpos; - Py_DECREF(repunicode); - break; } + else { + assert(PyUnicode_Check(rep)); - /* need more space? 
(at least enough for what we - have+the replacement+the rest of the string, so - we won't have to check space for encodable characters) */ - repsize = PyUnicode_GET_LENGTH(repunicode); - - str = _PyBytesWriter_Prepare(&writer, str, repsize); - if (str == NULL) - goto onError; - - /* check if there is anything unencodable in the replacement - and copy it to the output */ - for (i = 0; repsize-->0; ++i, ++str) { - ch = PyUnicode_READ_CHAR(repunicode, i); - if (ch >= limit) { - raise_encode_exception(&exc, encoding, unicode, - pos, pos+1, reason); - Py_DECREF(repunicode); + if (PyUnicode_READY(rep) < 0) goto onError; + + if (PyUnicode_IS_ASCII(rep)) { + /* Fast path: all characters are smaller than limit */ + assert(limit >= 128); + assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); + str = _PyBytesWriter_WriteBytes(&writer, str, + PyUnicode_DATA(rep), + PyUnicode_GET_LENGTH(rep)); + } + else { + Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep); + + str = _PyBytesWriter_Prepare(&writer, str, repsize); + if (str == NULL) + goto onError; + + /* check if there is anything unencodable in the + replacement and copy it to the output */ + for (i = 0; repsize-->0; ++i, ++str) { + ch = PyUnicode_READ_CHAR(rep, i); + if (ch >= limit) { + raise_encode_exception(&exc, encoding, unicode, + pos, pos+1, reason); + goto onError; + } + *str = (char)ch; + } } - *str = (char)ch; } pos = newpos; - Py_DECREF(repunicode); + Py_CLEAR(rep); } /* If overallocation was disabled, ensure that it was the last @@ -6753,6 +6760,7 @@ unicode_encode_ucs1(PyObject *unicode, return _PyBytesWriter_Finish(&writer, str); onError: + Py_XDECREF(rep); _PyBytesWriter_Dealloc(&writer); Py_XDECREF(error_handler_obj); Py_XDECREF(exc);