bpo-37348: optimize decoding ASCII string (GH-14283)
`_PyUnicode_Writer` is a relatively complex structure. Initializing it is significant overhead when decoding short ASCII string.
This commit is contained in:
parent
b3ca7972c8
commit
770847a7db
|
@ -0,0 +1,2 @@
|
|||
Optimized decoding short ASCII string with UTF-8 and ascii codecs.
|
||||
``b"foo".decode()`` is about 15% faster. Patch by Inada Naoki.
|
|
@ -265,6 +265,8 @@ unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
|
|||
/* Forward declaration */
|
||||
static inline int
|
||||
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
|
||||
static inline void
|
||||
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
|
||||
static PyObject *
|
||||
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
|
||||
const char *errors);
|
||||
|
@ -4877,16 +4879,6 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
|
|||
_Py_error_handler error_handler, const char *errors,
|
||||
Py_ssize_t *consumed)
|
||||
{
|
||||
_PyUnicodeWriter writer;
|
||||
const char *starts = s;
|
||||
const char *end = s + size;
|
||||
|
||||
Py_ssize_t startinpos;
|
||||
Py_ssize_t endinpos;
|
||||
const char *errmsg = "";
|
||||
PyObject *error_handler_obj = NULL;
|
||||
PyObject *exc = NULL;
|
||||
|
||||
if (size == 0) {
|
||||
if (consumed)
|
||||
*consumed = 0;
|
||||
|
@ -4900,13 +4892,29 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
|
|||
return get_latin1_char((unsigned char)s[0]);
|
||||
}
|
||||
|
||||
_PyUnicodeWriter_Init(&writer);
|
||||
writer.min_length = size;
|
||||
if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
|
||||
goto onError;
|
||||
const char *starts = s;
|
||||
const char *end = s + size;
|
||||
|
||||
// fast path: try ASCII string.
|
||||
PyObject *u = PyUnicode_New(size, 127);
|
||||
if (u == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
s += ascii_decode(s, end, PyUnicode_DATA(u));
|
||||
if (s == end) {
|
||||
return u;
|
||||
}
|
||||
|
||||
// Use _PyUnicodeWriter after fast path is failed.
|
||||
_PyUnicodeWriter writer;
|
||||
_PyUnicodeWriter_InitWithBuffer(&writer, u);
|
||||
writer.pos = s - starts;
|
||||
|
||||
Py_ssize_t startinpos, endinpos;
|
||||
const char *errmsg = "";
|
||||
PyObject *error_handler_obj = NULL;
|
||||
PyObject *exc = NULL;
|
||||
|
||||
writer.pos = ascii_decode(s, end, writer.data);
|
||||
s += writer.pos;
|
||||
while (s < end) {
|
||||
Py_UCS4 ch;
|
||||
int kind = writer.kind;
|
||||
|
@ -6451,7 +6459,7 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
|
|||
length after conversion to the true value. (But decoding error
|
||||
handler might have to resize the string) */
|
||||
_PyUnicodeWriter_Init(&writer);
|
||||
writer.min_length = size;
|
||||
writer.min_length = size;
|
||||
if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
|
||||
goto onError;
|
||||
}
|
||||
|
@ -6975,13 +6983,7 @@ PyUnicode_DecodeASCII(const char *s,
|
|||
const char *errors)
|
||||
{
|
||||
const char *starts = s;
|
||||
_PyUnicodeWriter writer;
|
||||
int kind;
|
||||
void *data;
|
||||
Py_ssize_t startinpos;
|
||||
Py_ssize_t endinpos;
|
||||
Py_ssize_t outpos;
|
||||
const char *e;
|
||||
const char *e = s + size;
|
||||
PyObject *error_handler_obj = NULL;
|
||||
PyObject *exc = NULL;
|
||||
_Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
|
||||
|
@ -6993,20 +6995,25 @@ PyUnicode_DecodeASCII(const char *s,
|
|||
if (size == 1 && (unsigned char)s[0] < 128)
|
||||
return get_latin1_char((unsigned char)s[0]);
|
||||
|
||||
_PyUnicodeWriter_Init(&writer);
|
||||
writer.min_length = size;
|
||||
if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
|
||||
// Shortcut for simple case
|
||||
PyObject *u = PyUnicode_New(size, 127);
|
||||
if (u == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_DATA(u));
|
||||
if (outpos == size) {
|
||||
return u;
|
||||
}
|
||||
|
||||
e = s + size;
|
||||
data = writer.data;
|
||||
outpos = ascii_decode(s, e, (Py_UCS1 *)data);
|
||||
_PyUnicodeWriter writer;
|
||||
_PyUnicodeWriter_InitWithBuffer(&writer, u);
|
||||
writer.pos = outpos;
|
||||
if (writer.pos == size)
|
||||
return _PyUnicodeWriter_Finish(&writer);
|
||||
|
||||
s += writer.pos;
|
||||
kind = writer.kind;
|
||||
s += outpos;
|
||||
int kind = writer.kind;
|
||||
void *data = writer.data;
|
||||
Py_ssize_t startinpos, endinpos;
|
||||
|
||||
while (s < e) {
|
||||
unsigned char c = (unsigned char)*s;
|
||||
if (c < 128) {
|
||||
|
@ -13506,6 +13513,16 @@ _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
|
|||
assert(writer->kind <= PyUnicode_1BYTE_KIND);
|
||||
}
|
||||
|
||||
// Initialize _PyUnicodeWriter with initial buffer
|
||||
static inline void
|
||||
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
|
||||
{
|
||||
memset(writer, 0, sizeof(*writer));
|
||||
writer->buffer = buffer;
|
||||
_PyUnicodeWriter_Update(writer);
|
||||
writer->min_length = writer->size;
|
||||
}
|
||||
|
||||
int
|
||||
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
|
||||
Py_ssize_t length, Py_UCS4 maxchar)
|
||||
|
|
Loading…
Reference in New Issue