mirror of https://github.com/python/cpython
Optimize bytes.fromhex() and bytearray.fromhex()
Issue #25401: Optimize bytes.fromhex() and bytearray.fromhex(): they are now between 2x and 3.5x faster. Changes: * Use a fast-path working on a char* string for ASCII string * Use a slow-path for non-ASCII string * Replace slow hex_digit_to_int() function with a O(1) lookup in _PyLong_DigitValue precomputed table * Use _PyBytesWriter API to handle the buffer * Add unit tests to check the error position in error messages
This commit is contained in:
parent
ebcf9edc05
commit
2bf8993db9
|
@ -161,6 +161,9 @@ Optimizations
|
||||||
* ``bytearray % args`` is now between 2.5 and 5 times faster. (Contributed by
|
* ``bytearray % args`` is now between 2.5 and 5 times faster. (Contributed by
|
||||||
Victor Stinner in :issue:`25399`).
|
Victor Stinner in :issue:`25399`).
|
||||||
|
|
||||||
|
* Optimize :meth:`bytes.fromhex` and :meth:`bytearray.fromhex`: they are now
|
||||||
|
between 2x and 3.5x faster. (Contributed by Victor Stinner in :issue:`25401`).
|
||||||
|
|
||||||
|
|
||||||
Build and C API Changes
|
Build and C API Changes
|
||||||
=======================
|
=======================
|
||||||
|
|
|
@ -67,6 +67,9 @@ PyAPI_FUNC(PyObject*) _PyBytes_FormatEx(
|
||||||
Py_ssize_t format_len,
|
Py_ssize_t format_len,
|
||||||
PyObject *args,
|
PyObject *args,
|
||||||
int use_bytearray);
|
int use_bytearray);
|
||||||
|
PyAPI_FUNC(PyObject*) _PyBytes_FromHex(
|
||||||
|
PyObject *string,
|
||||||
|
int use_bytearray);
|
||||||
#endif
|
#endif
|
||||||
PyAPI_FUNC(PyObject *) PyBytes_DecodeEscape(const char *, Py_ssize_t,
|
PyAPI_FUNC(PyObject *) PyBytes_DecodeEscape(const char *, Py_ssize_t,
|
||||||
const char *, Py_ssize_t,
|
const char *, Py_ssize_t,
|
||||||
|
|
|
@ -65,7 +65,7 @@ PyAPI_FUNC(PyObject *) PyLong_GetInfo(void);
|
||||||
# error "void* different in size from int, long and long long"
|
# error "void* different in size from int, long and long long"
|
||||||
#endif /* SIZEOF_VOID_P */
|
#endif /* SIZEOF_VOID_P */
|
||||||
|
|
||||||
/* Used by Python/mystrtoul.c. */
|
/* Used by Python/mystrtoul.c and _PyBytes_FromHex(). */
|
||||||
#ifndef Py_LIMITED_API
|
#ifndef Py_LIMITED_API
|
||||||
PyAPI_DATA(unsigned char) _PyLong_DigitValue[256];
|
PyAPI_DATA(unsigned char) _PyLong_DigitValue[256];
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -301,6 +301,20 @@ class BaseBytesTest:
|
||||||
self.assertRaises(ValueError, self.type2test.fromhex, '\x00')
|
self.assertRaises(ValueError, self.type2test.fromhex, '\x00')
|
||||||
self.assertRaises(ValueError, self.type2test.fromhex, '12 \x00 34')
|
self.assertRaises(ValueError, self.type2test.fromhex, '12 \x00 34')
|
||||||
|
|
||||||
|
for data, pos in (
|
||||||
|
# invalid first hexadecimal character
|
||||||
|
('12 x4 56', 3),
|
||||||
|
# invalid second hexadecimal character
|
||||||
|
('12 3x 56', 4),
|
||||||
|
# two invalid hexadecimal characters
|
||||||
|
('12 xy 56', 3),
|
||||||
|
# test non-ASCII string
|
||||||
|
('12 3\xff 56', 4),
|
||||||
|
):
|
||||||
|
with self.assertRaises(ValueError) as cm:
|
||||||
|
self.type2test.fromhex(data)
|
||||||
|
self.assertIn('at position %s' % pos, str(cm.exception))
|
||||||
|
|
||||||
def test_hex(self):
|
def test_hex(self):
|
||||||
self.assertRaises(TypeError, self.type2test.hex)
|
self.assertRaises(TypeError, self.type2test.hex)
|
||||||
self.assertRaises(TypeError, self.type2test.hex, 1)
|
self.assertRaises(TypeError, self.type2test.hex, 1)
|
||||||
|
|
|
@ -10,6 +10,9 @@ Release date: XXXX-XX-XX
|
||||||
Core and Builtins
|
Core and Builtins
|
||||||
-----------------
|
-----------------
|
||||||
|
|
||||||
|
- Issue #25401: Optimize bytes.fromhex() and bytearray.fromhex(): they are now
|
||||||
|
between 2x and 3.5x faster.
|
||||||
|
|
||||||
- Issue #25399: Optimize bytearray % args using the new private _PyBytesWriter
|
- Issue #25399: Optimize bytearray % args using the new private _PyBytesWriter
|
||||||
API. Formatting is now between 2.5 and 5 times faster.
|
API. Formatting is now between 2.5 and 5 times faster.
|
||||||
|
|
||||||
|
|
|
@ -2823,48 +2823,7 @@ static PyObject *
|
||||||
bytearray_fromhex_impl(PyObject*cls, PyObject *string)
|
bytearray_fromhex_impl(PyObject*cls, PyObject *string)
|
||||||
/*[clinic end generated code: output=df3da60129b3700c input=907bbd2d34d9367a]*/
|
/*[clinic end generated code: output=df3da60129b3700c input=907bbd2d34d9367a]*/
|
||||||
{
|
{
|
||||||
PyObject *newbytes;
|
return _PyBytes_FromHex(string, 1);
|
||||||
char *buf;
|
|
||||||
Py_ssize_t hexlen, byteslen, i, j;
|
|
||||||
int top, bot;
|
|
||||||
void *data;
|
|
||||||
unsigned int kind;
|
|
||||||
|
|
||||||
assert(PyUnicode_Check(string));
|
|
||||||
if (PyUnicode_READY(string))
|
|
||||||
return NULL;
|
|
||||||
kind = PyUnicode_KIND(string);
|
|
||||||
data = PyUnicode_DATA(string);
|
|
||||||
hexlen = PyUnicode_GET_LENGTH(string);
|
|
||||||
|
|
||||||
byteslen = hexlen/2; /* This overestimates if there are spaces */
|
|
||||||
newbytes = PyByteArray_FromStringAndSize(NULL, byteslen);
|
|
||||||
if (!newbytes)
|
|
||||||
return NULL;
|
|
||||||
buf = PyByteArray_AS_STRING(newbytes);
|
|
||||||
for (i = j = 0; i < hexlen; i += 2) {
|
|
||||||
/* skip over spaces in the input */
|
|
||||||
while (PyUnicode_READ(kind, data, i) == ' ')
|
|
||||||
i++;
|
|
||||||
if (i >= hexlen)
|
|
||||||
break;
|
|
||||||
top = hex_digit_to_int(PyUnicode_READ(kind, data, i));
|
|
||||||
bot = hex_digit_to_int(PyUnicode_READ(kind, data, i+1));
|
|
||||||
if (top == -1 || bot == -1) {
|
|
||||||
PyErr_Format(PyExc_ValueError,
|
|
||||||
"non-hexadecimal number found in "
|
|
||||||
"fromhex() arg at position %zd", i);
|
|
||||||
goto error;
|
|
||||||
}
|
|
||||||
buf[j++] = (top << 4) + bot;
|
|
||||||
}
|
|
||||||
if (PyByteArray_Resize(newbytes, j) < 0)
|
|
||||||
goto error;
|
|
||||||
return newbytes;
|
|
||||||
|
|
||||||
error:
|
|
||||||
Py_DECREF(newbytes);
|
|
||||||
return NULL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
PyDoc_STRVAR(hex__doc__,
|
PyDoc_STRVAR(hex__doc__,
|
||||||
|
|
|
@ -30,6 +30,10 @@ static PyBytesObject *nullstring;
|
||||||
*/
|
*/
|
||||||
#define PyBytesObject_SIZE (offsetof(PyBytesObject, ob_sval) + 1)
|
#define PyBytesObject_SIZE (offsetof(PyBytesObject, ob_sval) + 1)
|
||||||
|
|
||||||
|
/* Forward declaration */
|
||||||
|
Py_LOCAL_INLINE(Py_ssize_t) _PyBytesWriter_GetSize(_PyBytesWriter *writer,
|
||||||
|
char *str);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
For PyBytes_FromString(), the parameter `str' points to a null-terminated
|
For PyBytes_FromString(), the parameter `str' points to a null-terminated
|
||||||
string containing exactly `size' bytes.
|
string containing exactly `size' bytes.
|
||||||
|
@ -3078,22 +3082,6 @@ bytes_splitlines_impl(PyBytesObject*self, int keepends)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int
|
|
||||||
hex_digit_to_int(Py_UCS4 c)
|
|
||||||
{
|
|
||||||
if (c >= 128)
|
|
||||||
return -1;
|
|
||||||
if (Py_ISDIGIT(c))
|
|
||||||
return c - '0';
|
|
||||||
else {
|
|
||||||
if (Py_ISUPPER(c))
|
|
||||||
c = Py_TOLOWER(c);
|
|
||||||
if (c >= 'a' && c <= 'f')
|
|
||||||
return c - 'a' + 10;
|
|
||||||
}
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*[clinic input]
|
/*[clinic input]
|
||||||
@classmethod
|
@classmethod
|
||||||
bytes.fromhex
|
bytes.fromhex
|
||||||
|
@ -3111,47 +3099,83 @@ static PyObject *
|
||||||
bytes_fromhex_impl(PyTypeObject *type, PyObject *string)
|
bytes_fromhex_impl(PyTypeObject *type, PyObject *string)
|
||||||
/*[clinic end generated code: output=0973acc63661bb2e input=bf4d1c361670acd3]*/
|
/*[clinic end generated code: output=0973acc63661bb2e input=bf4d1c361670acd3]*/
|
||||||
{
|
{
|
||||||
PyObject *newstring;
|
return _PyBytes_FromHex(string, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
PyObject*
|
||||||
|
_PyBytes_FromHex(PyObject *string, int use_bytearray)
|
||||||
|
{
|
||||||
char *buf;
|
char *buf;
|
||||||
Py_ssize_t hexlen, byteslen, i, j;
|
Py_ssize_t hexlen, invalid_char;
|
||||||
int top, bot;
|
unsigned int top, bot;
|
||||||
void *data;
|
Py_UCS1 *str, *end;
|
||||||
unsigned int kind;
|
_PyBytesWriter writer;
|
||||||
|
|
||||||
|
_PyBytesWriter_Init(&writer);
|
||||||
|
writer.use_bytearray = use_bytearray;
|
||||||
|
|
||||||
assert(PyUnicode_Check(string));
|
assert(PyUnicode_Check(string));
|
||||||
if (PyUnicode_READY(string))
|
if (PyUnicode_READY(string))
|
||||||
return NULL;
|
return NULL;
|
||||||
kind = PyUnicode_KIND(string);
|
|
||||||
data = PyUnicode_DATA(string);
|
|
||||||
hexlen = PyUnicode_GET_LENGTH(string);
|
hexlen = PyUnicode_GET_LENGTH(string);
|
||||||
|
|
||||||
byteslen = hexlen/2; /* This overestimates if there are spaces */
|
if (!PyUnicode_IS_ASCII(string)) {
|
||||||
newstring = PyBytes_FromStringAndSize(NULL, byteslen);
|
void *data = PyUnicode_DATA(string);
|
||||||
if (!newstring)
|
unsigned int kind = PyUnicode_KIND(string);
|
||||||
|
Py_ssize_t i;
|
||||||
|
|
||||||
|
/* search for the first non-ASCII character */
|
||||||
|
for (i = 0; i < hexlen; i++) {
|
||||||
|
if (PyUnicode_READ(kind, data, i) >= 128)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
invalid_char = i;
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
|
||||||
|
assert(PyUnicode_KIND(string) == PyUnicode_1BYTE_KIND);
|
||||||
|
str = PyUnicode_1BYTE_DATA(string);
|
||||||
|
|
||||||
|
/* This overestimates if there are spaces */
|
||||||
|
buf = _PyBytesWriter_Alloc(&writer, hexlen / 2);
|
||||||
|
if (buf == NULL)
|
||||||
return NULL;
|
return NULL;
|
||||||
buf = PyBytes_AS_STRING(newstring);
|
|
||||||
for (i = j = 0; i < hexlen; i += 2) {
|
end = str + hexlen;
|
||||||
|
while (str < end) {
|
||||||
/* skip over spaces in the input */
|
/* skip over spaces in the input */
|
||||||
while (PyUnicode_READ(kind, data, i) == ' ')
|
if (*str == ' ') {
|
||||||
i++;
|
do {
|
||||||
if (i >= hexlen)
|
str++;
|
||||||
break;
|
} while (*str == ' ');
|
||||||
top = hex_digit_to_int(PyUnicode_READ(kind, data, i));
|
if (str >= end)
|
||||||
bot = hex_digit_to_int(PyUnicode_READ(kind, data, i+1));
|
break;
|
||||||
if (top == -1 || bot == -1) {
|
}
|
||||||
PyErr_Format(PyExc_ValueError,
|
|
||||||
"non-hexadecimal number found in "
|
top = _PyLong_DigitValue[*str];
|
||||||
"fromhex() arg at position %zd", i);
|
if (top >= 16) {
|
||||||
|
invalid_char = str - PyUnicode_1BYTE_DATA(string);
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
buf[j++] = (top << 4) + bot;
|
str++;
|
||||||
|
|
||||||
|
bot = _PyLong_DigitValue[*str];
|
||||||
|
if (bot >= 16) {
|
||||||
|
invalid_char = str - PyUnicode_1BYTE_DATA(string);
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
str++;
|
||||||
|
|
||||||
|
*buf++ = (unsigned char)((top << 4) + bot);
|
||||||
}
|
}
|
||||||
if (j != byteslen && _PyBytes_Resize(&newstring, j) < 0)
|
|
||||||
goto error;
|
return _PyBytesWriter_Finish(&writer, buf);
|
||||||
return newstring;
|
|
||||||
|
|
||||||
error:
|
error:
|
||||||
Py_XDECREF(newstring);
|
PyErr_Format(PyExc_ValueError,
|
||||||
|
"non-hexadecimal number found in "
|
||||||
|
"fromhex() arg at position %zd", invalid_char);
|
||||||
|
_PyBytesWriter_Dealloc(&writer);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3888,7 +3912,7 @@ _PyBytesWriter_AsString(_PyBytesWriter *writer)
|
||||||
}
|
}
|
||||||
|
|
||||||
Py_LOCAL_INLINE(Py_ssize_t)
|
Py_LOCAL_INLINE(Py_ssize_t)
|
||||||
_PyBytesWriter_GetPos(_PyBytesWriter *writer, char *str)
|
_PyBytesWriter_GetSize(_PyBytesWriter *writer, char *str)
|
||||||
{
|
{
|
||||||
char *start = _PyBytesWriter_AsString(writer);
|
char *start = _PyBytesWriter_AsString(writer);
|
||||||
assert(str != NULL);
|
assert(str != NULL);
|
||||||
|
@ -3963,7 +3987,7 @@ _PyBytesWriter_Prepare(_PyBytesWriter *writer, void *str, Py_ssize_t size)
|
||||||
allocated += allocated / OVERALLOCATE_FACTOR;
|
allocated += allocated / OVERALLOCATE_FACTOR;
|
||||||
}
|
}
|
||||||
|
|
||||||
pos = _PyBytesWriter_GetPos(writer, str);
|
pos = _PyBytesWriter_GetSize(writer, str);
|
||||||
if (!writer->use_small_buffer) {
|
if (!writer->use_small_buffer) {
|
||||||
if (writer->use_bytearray) {
|
if (writer->use_bytearray) {
|
||||||
if (PyByteArray_Resize(writer->buffer, allocated))
|
if (PyByteArray_Resize(writer->buffer, allocated))
|
||||||
|
@ -4041,33 +4065,33 @@ _PyBytesWriter_Alloc(_PyBytesWriter *writer, Py_ssize_t size)
|
||||||
PyObject *
|
PyObject *
|
||||||
_PyBytesWriter_Finish(_PyBytesWriter *writer, void *str)
|
_PyBytesWriter_Finish(_PyBytesWriter *writer, void *str)
|
||||||
{
|
{
|
||||||
Py_ssize_t pos;
|
Py_ssize_t size;
|
||||||
PyObject *result;
|
PyObject *result;
|
||||||
|
|
||||||
_PyBytesWriter_CheckConsistency(writer, str);
|
_PyBytesWriter_CheckConsistency(writer, str);
|
||||||
|
|
||||||
pos = _PyBytesWriter_GetPos(writer, str);
|
size = _PyBytesWriter_GetSize(writer, str);
|
||||||
if (pos == 0 && !writer->use_bytearray) {
|
if (size == 0 && !writer->use_bytearray) {
|
||||||
Py_CLEAR(writer->buffer);
|
Py_CLEAR(writer->buffer);
|
||||||
/* Get the empty byte string singleton */
|
/* Get the empty byte string singleton */
|
||||||
result = PyBytes_FromStringAndSize(NULL, 0);
|
result = PyBytes_FromStringAndSize(NULL, 0);
|
||||||
}
|
}
|
||||||
else if (writer->use_small_buffer) {
|
else if (writer->use_small_buffer) {
|
||||||
result = PyBytes_FromStringAndSize(writer->small_buffer, pos);
|
result = PyBytes_FromStringAndSize(writer->small_buffer, size);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
result = writer->buffer;
|
result = writer->buffer;
|
||||||
writer->buffer = NULL;
|
writer->buffer = NULL;
|
||||||
|
|
||||||
if (pos != writer->allocated) {
|
if (size != writer->allocated) {
|
||||||
if (writer->use_bytearray) {
|
if (writer->use_bytearray) {
|
||||||
if (PyByteArray_Resize(result, pos)) {
|
if (PyByteArray_Resize(result, size)) {
|
||||||
Py_DECREF(result);
|
Py_DECREF(result);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
if (_PyBytes_Resize(&result, pos)) {
|
if (_PyBytes_Resize(&result, size)) {
|
||||||
assert(result == NULL);
|
assert(result == NULL);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue