bpo-39087: Optimize PyUnicode_AsUTF8AndSize() (GH-18327)
Avoid using temporary bytes object.
This commit is contained in:
parent
0c6e3aa67b
commit
02a4d57263
|
@ -0,0 +1,2 @@
|
|||
Optimize :c:func:`PyUnicode_AsUTF8` and :c:func:`PyUnicode_AsUTF8AndSize`
|
||||
slightly when they need to create internal UTF-8 cache.
|
|
@ -256,8 +256,9 @@ InvalidContinuation3:
|
|||
/* UTF-8 encoder specialized for a Unicode kind to avoid the slow
|
||||
PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
|
||||
UCS-1 strings don't need to handle surrogates for example. */
|
||||
Py_LOCAL_INLINE(PyObject *)
|
||||
STRINGLIB(utf8_encoder)(PyObject *unicode,
|
||||
Py_LOCAL_INLINE(char *)
|
||||
STRINGLIB(utf8_encoder)(_PyBytesWriter *writer,
|
||||
PyObject *unicode,
|
||||
STRINGLIB_CHAR *data,
|
||||
Py_ssize_t size,
|
||||
_Py_error_handler error_handler,
|
||||
|
@ -277,17 +278,16 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
|
|||
#else /* STRINGLIB_SIZEOF_CHAR == 4 */
|
||||
const Py_ssize_t max_char_size = 4;
|
||||
#endif
|
||||
_PyBytesWriter writer;
|
||||
|
||||
assert(size >= 0);
|
||||
_PyBytesWriter_Init(&writer);
|
||||
|
||||
if (size > PY_SSIZE_T_MAX / max_char_size) {
|
||||
/* integer overflow */
|
||||
return PyErr_NoMemory();
|
||||
PyErr_NoMemory();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
p = _PyBytesWriter_Alloc(&writer, size * max_char_size);
|
||||
_PyBytesWriter_Init(writer);
|
||||
p = _PyBytesWriter_Alloc(writer, size * max_char_size);
|
||||
if (p == NULL)
|
||||
return NULL;
|
||||
|
||||
|
@ -323,7 +323,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
|
|||
endpos++;
|
||||
|
||||
/* Only overallocate the buffer if it's not the last write */
|
||||
writer.overallocate = (endpos < size);
|
||||
writer->overallocate = (endpos < size);
|
||||
|
||||
switch (error_handler)
|
||||
{
|
||||
|
@ -347,8 +347,8 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
|
|||
|
||||
case _Py_ERROR_BACKSLASHREPLACE:
|
||||
/* subtract preallocated bytes */
|
||||
writer.min_size -= max_char_size * (endpos - startpos);
|
||||
p = backslashreplace(&writer, p,
|
||||
writer->min_size -= max_char_size * (endpos - startpos);
|
||||
p = backslashreplace(writer, p,
|
||||
unicode, startpos, endpos);
|
||||
if (p == NULL)
|
||||
goto error;
|
||||
|
@ -357,8 +357,8 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
|
|||
|
||||
case _Py_ERROR_XMLCHARREFREPLACE:
|
||||
/* subtract preallocated bytes */
|
||||
writer.min_size -= max_char_size * (endpos - startpos);
|
||||
p = xmlcharrefreplace(&writer, p,
|
||||
writer->min_size -= max_char_size * (endpos - startpos);
|
||||
p = xmlcharrefreplace(writer, p,
|
||||
unicode, startpos, endpos);
|
||||
if (p == NULL)
|
||||
goto error;
|
||||
|
@ -387,10 +387,10 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
|
|||
goto error;
|
||||
|
||||
/* subtract preallocated bytes */
|
||||
writer.min_size -= max_char_size * (newpos - startpos);
|
||||
writer->min_size -= max_char_size * (newpos - startpos);
|
||||
|
||||
if (PyBytes_Check(rep)) {
|
||||
p = _PyBytesWriter_WriteBytes(&writer, p,
|
||||
p = _PyBytesWriter_WriteBytes(writer, p,
|
||||
PyBytes_AS_STRING(rep),
|
||||
PyBytes_GET_SIZE(rep));
|
||||
}
|
||||
|
@ -406,7 +406,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
|
|||
goto error;
|
||||
}
|
||||
|
||||
p = _PyBytesWriter_WriteBytes(&writer, p,
|
||||
p = _PyBytesWriter_WriteBytes(writer, p,
|
||||
PyUnicode_DATA(rep),
|
||||
PyUnicode_GET_LENGTH(rep));
|
||||
}
|
||||
|
@ -420,7 +420,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
|
|||
|
||||
/* If overallocation was disabled, ensure that it was the last
|
||||
write. Otherwise, we missed an optimization */
|
||||
assert(writer.overallocate || i == size);
|
||||
assert(writer->overallocate || i == size);
|
||||
}
|
||||
else
|
||||
#if STRINGLIB_SIZEOF_CHAR > 2
|
||||
|
@ -449,14 +449,13 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
|
|||
Py_XDECREF(error_handler_obj);
|
||||
Py_XDECREF(exc);
|
||||
#endif
|
||||
return _PyBytesWriter_Finish(&writer, p);
|
||||
return p;
|
||||
|
||||
#if STRINGLIB_SIZEOF_CHAR > 1
|
||||
error:
|
||||
Py_XDECREF(rep);
|
||||
Py_XDECREF(error_handler_obj);
|
||||
Py_XDECREF(exc);
|
||||
_PyBytesWriter_Dealloc(&writer);
|
||||
return NULL;
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -3991,11 +3991,11 @@ PyUnicode_FSDecoder(PyObject* arg, void* addr)
|
|||
}
|
||||
|
||||
|
||||
static int unicode_fill_utf8(PyObject *unicode);
|
||||
|
||||
const char *
|
||||
PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
|
||||
{
|
||||
PyObject *bytes;
|
||||
|
||||
if (!PyUnicode_Check(unicode)) {
|
||||
PyErr_BadArgument();
|
||||
return NULL;
|
||||
|
@ -4004,21 +4004,9 @@ PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
|
|||
return NULL;
|
||||
|
||||
if (PyUnicode_UTF8(unicode) == NULL) {
|
||||
assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
|
||||
bytes = _PyUnicode_AsUTF8String(unicode, NULL);
|
||||
if (bytes == NULL)
|
||||
return NULL;
|
||||
_PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
|
||||
if (_PyUnicode_UTF8(unicode) == NULL) {
|
||||
PyErr_NoMemory();
|
||||
Py_DECREF(bytes);
|
||||
if (unicode_fill_utf8(unicode) == -1) {
|
||||
return NULL;
|
||||
}
|
||||
_PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
|
||||
memcpy(_PyUnicode_UTF8(unicode),
|
||||
PyBytes_AS_STRING(bytes),
|
||||
_PyUnicode_UTF8_LENGTH(unicode) + 1);
|
||||
Py_DECREF(bytes);
|
||||
}
|
||||
|
||||
if (psize)
|
||||
|
@ -5381,10 +5369,6 @@ static PyObject *
|
|||
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
|
||||
const char *errors)
|
||||
{
|
||||
enum PyUnicode_Kind kind;
|
||||
void *data;
|
||||
Py_ssize_t size;
|
||||
|
||||
if (!PyUnicode_Check(unicode)) {
|
||||
PyErr_BadArgument();
|
||||
return NULL;
|
||||
|
@ -5397,9 +5381,12 @@ unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
|
|||
return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
|
||||
PyUnicode_UTF8_LENGTH(unicode));
|
||||
|
||||
kind = PyUnicode_KIND(unicode);
|
||||
data = PyUnicode_DATA(unicode);
|
||||
size = PyUnicode_GET_LENGTH(unicode);
|
||||
enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
|
||||
void *data = PyUnicode_DATA(unicode);
|
||||
Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
|
||||
|
||||
_PyBytesWriter writer;
|
||||
char *end;
|
||||
|
||||
switch (kind) {
|
||||
default:
|
||||
|
@ -5407,12 +5394,73 @@ unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
|
|||
case PyUnicode_1BYTE_KIND:
|
||||
/* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
|
||||
assert(!PyUnicode_IS_ASCII(unicode));
|
||||
return ucs1lib_utf8_encoder(unicode, data, size, error_handler, errors);
|
||||
end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
|
||||
break;
|
||||
case PyUnicode_2BYTE_KIND:
|
||||
return ucs2lib_utf8_encoder(unicode, data, size, error_handler, errors);
|
||||
end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
|
||||
break;
|
||||
case PyUnicode_4BYTE_KIND:
|
||||
return ucs4lib_utf8_encoder(unicode, data, size, error_handler, errors);
|
||||
end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
|
||||
break;
|
||||
}
|
||||
|
||||
if (end == NULL) {
|
||||
_PyBytesWriter_Dealloc(&writer);
|
||||
return NULL;
|
||||
}
|
||||
return _PyBytesWriter_Finish(&writer, end);
|
||||
}
|
||||
|
||||
static int
|
||||
unicode_fill_utf8(PyObject *unicode)
|
||||
{
|
||||
/* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
|
||||
assert(!PyUnicode_IS_ASCII(unicode));
|
||||
|
||||
enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
|
||||
void *data = PyUnicode_DATA(unicode);
|
||||
Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
|
||||
|
||||
_PyBytesWriter writer;
|
||||
char *end;
|
||||
|
||||
switch (kind) {
|
||||
default:
|
||||
Py_UNREACHABLE();
|
||||
case PyUnicode_1BYTE_KIND:
|
||||
end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
|
||||
_Py_ERROR_STRICT, NULL);
|
||||
break;
|
||||
case PyUnicode_2BYTE_KIND:
|
||||
end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
|
||||
_Py_ERROR_STRICT, NULL);
|
||||
break;
|
||||
case PyUnicode_4BYTE_KIND:
|
||||
end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
|
||||
_Py_ERROR_STRICT, NULL);
|
||||
break;
|
||||
}
|
||||
if (end == NULL) {
|
||||
_PyBytesWriter_Dealloc(&writer);
|
||||
return -1;
|
||||
}
|
||||
|
||||
char *start = writer.use_small_buffer ? writer.small_buffer :
|
||||
PyBytes_AS_STRING(writer.buffer);
|
||||
Py_ssize_t len = end - start;
|
||||
|
||||
char *cache = PyObject_MALLOC(len + 1);
|
||||
if (cache == NULL) {
|
||||
_PyBytesWriter_Dealloc(&writer);
|
||||
PyErr_NoMemory();
|
||||
return -1;
|
||||
}
|
||||
_PyUnicode_UTF8(unicode) = cache;
|
||||
_PyUnicode_UTF8_LENGTH(unicode) = len;
|
||||
memcpy(cache, start, len);
|
||||
cache[len] = '\0';
|
||||
_PyBytesWriter_Dealloc(&writer);
|
||||
return 0;
|
||||
}
|
||||
|
||||
PyObject *
|
||||
|
|
Loading…
Reference in New Issue