mirror of https://github.com/python/cpython
Optimize bytes.fromhex() and bytearray.fromhex()
Issue #25401: Optimize bytes.fromhex() and bytearray.fromhex(): they are now between 2x and 3.5x faster. Changes: * Use a fast-path working on a char* string for ASCII string * Use a slow-path for non-ASCII string * Replace slow hex_digit_to_int() function with a O(1) lookup in _PyLong_DigitValue precomputed table * Use _PyBytesWriter API to handle the buffer * Add unit tests to check the error position in error messages
This commit is contained in:
parent
ebcf9edc05
commit
2bf8993db9
|
@ -161,6 +161,9 @@ Optimizations
|
|||
* ``bytearray % args`` is now between 2.5 and 5 times faster. (Contributed by
|
||||
Victor Stinner in :issue:`25399`).
|
||||
|
||||
* Optimize :meth:`bytes.fromhex` and :meth:`bytearray.fromhex`: they are now
|
||||
between 2x and 3.5x faster. (Contributed by Victor Stinner in :issue:`25401`).
|
||||
|
||||
|
||||
Build and C API Changes
|
||||
=======================
|
||||
|
|
|
@ -67,6 +67,9 @@ PyAPI_FUNC(PyObject*) _PyBytes_FormatEx(
|
|||
Py_ssize_t format_len,
|
||||
PyObject *args,
|
||||
int use_bytearray);
|
||||
PyAPI_FUNC(PyObject*) _PyBytes_FromHex(
|
||||
PyObject *string,
|
||||
int use_bytearray);
|
||||
#endif
|
||||
PyAPI_FUNC(PyObject *) PyBytes_DecodeEscape(const char *, Py_ssize_t,
|
||||
const char *, Py_ssize_t,
|
||||
|
|
|
@ -65,7 +65,7 @@ PyAPI_FUNC(PyObject *) PyLong_GetInfo(void);
|
|||
# error "void* different in size from int, long and long long"
|
||||
#endif /* SIZEOF_VOID_P */
|
||||
|
||||
/* Used by Python/mystrtoul.c. */
|
||||
/* Used by Python/mystrtoul.c and _PyBytes_FromHex(). */
|
||||
#ifndef Py_LIMITED_API
|
||||
PyAPI_DATA(unsigned char) _PyLong_DigitValue[256];
|
||||
#endif
|
||||
|
|
|
@ -301,6 +301,20 @@ class BaseBytesTest:
|
|||
self.assertRaises(ValueError, self.type2test.fromhex, '\x00')
|
||||
self.assertRaises(ValueError, self.type2test.fromhex, '12 \x00 34')
|
||||
|
||||
for data, pos in (
|
||||
# invalid first hexadecimal character
|
||||
('12 x4 56', 3),
|
||||
# invalid second hexadecimal character
|
||||
('12 3x 56', 4),
|
||||
# two invalid hexadecimal characters
|
||||
('12 xy 56', 3),
|
||||
# test non-ASCII string
|
||||
('12 3\xff 56', 4),
|
||||
):
|
||||
with self.assertRaises(ValueError) as cm:
|
||||
self.type2test.fromhex(data)
|
||||
self.assertIn('at position %s' % pos, str(cm.exception))
|
||||
|
||||
def test_hex(self):
|
||||
self.assertRaises(TypeError, self.type2test.hex)
|
||||
self.assertRaises(TypeError, self.type2test.hex, 1)
|
||||
|
|
|
@ -10,6 +10,9 @@ Release date: XXXX-XX-XX
|
|||
Core and Builtins
|
||||
-----------------
|
||||
|
||||
- Issue #25401: Optimize bytes.fromhex() and bytearray.fromhex(): they are now
|
||||
between 2x and 3.5x faster.
|
||||
|
||||
- Issue #25399: Optimize bytearray % args using the new private _PyBytesWriter
|
||||
API. Formatting is now between 2.5 and 5 times faster.
|
||||
|
||||
|
|
|
@ -2823,48 +2823,7 @@ static PyObject *
|
|||
bytearray_fromhex_impl(PyObject*cls, PyObject *string)
|
||||
/*[clinic end generated code: output=df3da60129b3700c input=907bbd2d34d9367a]*/
|
||||
{
|
||||
PyObject *newbytes;
|
||||
char *buf;
|
||||
Py_ssize_t hexlen, byteslen, i, j;
|
||||
int top, bot;
|
||||
void *data;
|
||||
unsigned int kind;
|
||||
|
||||
assert(PyUnicode_Check(string));
|
||||
if (PyUnicode_READY(string))
|
||||
return NULL;
|
||||
kind = PyUnicode_KIND(string);
|
||||
data = PyUnicode_DATA(string);
|
||||
hexlen = PyUnicode_GET_LENGTH(string);
|
||||
|
||||
byteslen = hexlen/2; /* This overestimates if there are spaces */
|
||||
newbytes = PyByteArray_FromStringAndSize(NULL, byteslen);
|
||||
if (!newbytes)
|
||||
return NULL;
|
||||
buf = PyByteArray_AS_STRING(newbytes);
|
||||
for (i = j = 0; i < hexlen; i += 2) {
|
||||
/* skip over spaces in the input */
|
||||
while (PyUnicode_READ(kind, data, i) == ' ')
|
||||
i++;
|
||||
if (i >= hexlen)
|
||||
break;
|
||||
top = hex_digit_to_int(PyUnicode_READ(kind, data, i));
|
||||
bot = hex_digit_to_int(PyUnicode_READ(kind, data, i+1));
|
||||
if (top == -1 || bot == -1) {
|
||||
PyErr_Format(PyExc_ValueError,
|
||||
"non-hexadecimal number found in "
|
||||
"fromhex() arg at position %zd", i);
|
||||
goto error;
|
||||
}
|
||||
buf[j++] = (top << 4) + bot;
|
||||
}
|
||||
if (PyByteArray_Resize(newbytes, j) < 0)
|
||||
goto error;
|
||||
return newbytes;
|
||||
|
||||
error:
|
||||
Py_DECREF(newbytes);
|
||||
return NULL;
|
||||
return _PyBytes_FromHex(string, 1);
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(hex__doc__,
|
||||
|
|
|
@ -30,6 +30,10 @@ static PyBytesObject *nullstring;
|
|||
*/
|
||||
#define PyBytesObject_SIZE (offsetof(PyBytesObject, ob_sval) + 1)
|
||||
|
||||
/* Forward declaration */
|
||||
Py_LOCAL_INLINE(Py_ssize_t) _PyBytesWriter_GetSize(_PyBytesWriter *writer,
|
||||
char *str);
|
||||
|
||||
/*
|
||||
For PyBytes_FromString(), the parameter `str' points to a null-terminated
|
||||
string containing exactly `size' bytes.
|
||||
|
@ -3078,22 +3082,6 @@ bytes_splitlines_impl(PyBytesObject*self, int keepends)
|
|||
);
|
||||
}
|
||||
|
||||
static int
|
||||
hex_digit_to_int(Py_UCS4 c)
|
||||
{
|
||||
if (c >= 128)
|
||||
return -1;
|
||||
if (Py_ISDIGIT(c))
|
||||
return c - '0';
|
||||
else {
|
||||
if (Py_ISUPPER(c))
|
||||
c = Py_TOLOWER(c);
|
||||
if (c >= 'a' && c <= 'f')
|
||||
return c - 'a' + 10;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
/*[clinic input]
|
||||
@classmethod
|
||||
bytes.fromhex
|
||||
|
@ -3111,47 +3099,83 @@ static PyObject *
|
|||
bytes_fromhex_impl(PyTypeObject *type, PyObject *string)
|
||||
/*[clinic end generated code: output=0973acc63661bb2e input=bf4d1c361670acd3]*/
|
||||
{
|
||||
PyObject *newstring;
|
||||
return _PyBytes_FromHex(string, 0);
|
||||
}
|
||||
|
||||
PyObject*
|
||||
_PyBytes_FromHex(PyObject *string, int use_bytearray)
|
||||
{
|
||||
char *buf;
|
||||
Py_ssize_t hexlen, byteslen, i, j;
|
||||
int top, bot;
|
||||
void *data;
|
||||
unsigned int kind;
|
||||
Py_ssize_t hexlen, invalid_char;
|
||||
unsigned int top, bot;
|
||||
Py_UCS1 *str, *end;
|
||||
_PyBytesWriter writer;
|
||||
|
||||
_PyBytesWriter_Init(&writer);
|
||||
writer.use_bytearray = use_bytearray;
|
||||
|
||||
assert(PyUnicode_Check(string));
|
||||
if (PyUnicode_READY(string))
|
||||
return NULL;
|
||||
kind = PyUnicode_KIND(string);
|
||||
data = PyUnicode_DATA(string);
|
||||
hexlen = PyUnicode_GET_LENGTH(string);
|
||||
|
||||
byteslen = hexlen/2; /* This overestimates if there are spaces */
|
||||
newstring = PyBytes_FromStringAndSize(NULL, byteslen);
|
||||
if (!newstring)
|
||||
return NULL;
|
||||
buf = PyBytes_AS_STRING(newstring);
|
||||
for (i = j = 0; i < hexlen; i += 2) {
|
||||
/* skip over spaces in the input */
|
||||
while (PyUnicode_READ(kind, data, i) == ' ')
|
||||
i++;
|
||||
if (i >= hexlen)
|
||||
if (!PyUnicode_IS_ASCII(string)) {
|
||||
void *data = PyUnicode_DATA(string);
|
||||
unsigned int kind = PyUnicode_KIND(string);
|
||||
Py_ssize_t i;
|
||||
|
||||
/* search for the first non-ASCII character */
|
||||
for (i = 0; i < hexlen; i++) {
|
||||
if (PyUnicode_READ(kind, data, i) >= 128)
|
||||
break;
|
||||
top = hex_digit_to_int(PyUnicode_READ(kind, data, i));
|
||||
bot = hex_digit_to_int(PyUnicode_READ(kind, data, i+1));
|
||||
if (top == -1 || bot == -1) {
|
||||
PyErr_Format(PyExc_ValueError,
|
||||
"non-hexadecimal number found in "
|
||||
"fromhex() arg at position %zd", i);
|
||||
}
|
||||
invalid_char = i;
|
||||
goto error;
|
||||
}
|
||||
buf[j++] = (top << 4) + bot;
|
||||
|
||||
assert(PyUnicode_KIND(string) == PyUnicode_1BYTE_KIND);
|
||||
str = PyUnicode_1BYTE_DATA(string);
|
||||
|
||||
/* This overestimates if there are spaces */
|
||||
buf = _PyBytesWriter_Alloc(&writer, hexlen / 2);
|
||||
if (buf == NULL)
|
||||
return NULL;
|
||||
|
||||
end = str + hexlen;
|
||||
while (str < end) {
|
||||
/* skip over spaces in the input */
|
||||
if (*str == ' ') {
|
||||
do {
|
||||
str++;
|
||||
} while (*str == ' ');
|
||||
if (str >= end)
|
||||
break;
|
||||
}
|
||||
if (j != byteslen && _PyBytes_Resize(&newstring, j) < 0)
|
||||
|
||||
top = _PyLong_DigitValue[*str];
|
||||
if (top >= 16) {
|
||||
invalid_char = str - PyUnicode_1BYTE_DATA(string);
|
||||
goto error;
|
||||
return newstring;
|
||||
}
|
||||
str++;
|
||||
|
||||
bot = _PyLong_DigitValue[*str];
|
||||
if (bot >= 16) {
|
||||
invalid_char = str - PyUnicode_1BYTE_DATA(string);
|
||||
goto error;
|
||||
}
|
||||
str++;
|
||||
|
||||
*buf++ = (unsigned char)((top << 4) + bot);
|
||||
}
|
||||
|
||||
return _PyBytesWriter_Finish(&writer, buf);
|
||||
|
||||
error:
|
||||
Py_XDECREF(newstring);
|
||||
PyErr_Format(PyExc_ValueError,
|
||||
"non-hexadecimal number found in "
|
||||
"fromhex() arg at position %zd", invalid_char);
|
||||
_PyBytesWriter_Dealloc(&writer);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
@ -3888,7 +3912,7 @@ _PyBytesWriter_AsString(_PyBytesWriter *writer)
|
|||
}
|
||||
|
||||
Py_LOCAL_INLINE(Py_ssize_t)
|
||||
_PyBytesWriter_GetPos(_PyBytesWriter *writer, char *str)
|
||||
_PyBytesWriter_GetSize(_PyBytesWriter *writer, char *str)
|
||||
{
|
||||
char *start = _PyBytesWriter_AsString(writer);
|
||||
assert(str != NULL);
|
||||
|
@ -3963,7 +3987,7 @@ _PyBytesWriter_Prepare(_PyBytesWriter *writer, void *str, Py_ssize_t size)
|
|||
allocated += allocated / OVERALLOCATE_FACTOR;
|
||||
}
|
||||
|
||||
pos = _PyBytesWriter_GetPos(writer, str);
|
||||
pos = _PyBytesWriter_GetSize(writer, str);
|
||||
if (!writer->use_small_buffer) {
|
||||
if (writer->use_bytearray) {
|
||||
if (PyByteArray_Resize(writer->buffer, allocated))
|
||||
|
@ -4041,33 +4065,33 @@ _PyBytesWriter_Alloc(_PyBytesWriter *writer, Py_ssize_t size)
|
|||
PyObject *
|
||||
_PyBytesWriter_Finish(_PyBytesWriter *writer, void *str)
|
||||
{
|
||||
Py_ssize_t pos;
|
||||
Py_ssize_t size;
|
||||
PyObject *result;
|
||||
|
||||
_PyBytesWriter_CheckConsistency(writer, str);
|
||||
|
||||
pos = _PyBytesWriter_GetPos(writer, str);
|
||||
if (pos == 0 && !writer->use_bytearray) {
|
||||
size = _PyBytesWriter_GetSize(writer, str);
|
||||
if (size == 0 && !writer->use_bytearray) {
|
||||
Py_CLEAR(writer->buffer);
|
||||
/* Get the empty byte string singleton */
|
||||
result = PyBytes_FromStringAndSize(NULL, 0);
|
||||
}
|
||||
else if (writer->use_small_buffer) {
|
||||
result = PyBytes_FromStringAndSize(writer->small_buffer, pos);
|
||||
result = PyBytes_FromStringAndSize(writer->small_buffer, size);
|
||||
}
|
||||
else {
|
||||
result = writer->buffer;
|
||||
writer->buffer = NULL;
|
||||
|
||||
if (pos != writer->allocated) {
|
||||
if (size != writer->allocated) {
|
||||
if (writer->use_bytearray) {
|
||||
if (PyByteArray_Resize(result, pos)) {
|
||||
if (PyByteArray_Resize(result, size)) {
|
||||
Py_DECREF(result);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (_PyBytes_Resize(&result, pos)) {
|
||||
if (_PyBytes_Resize(&result, size)) {
|
||||
assert(result == NULL);
|
||||
return NULL;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue