Optimize bytes.fromhex() and bytearray.fromhex()

Issue #25401: Optimize bytes.fromhex() and bytearray.fromhex(): they are now
between 2x and 3.5x faster. Changes:

* Use a fast path working on a char* string for ASCII strings
* Use a slow path for non-ASCII strings
* Replace the slow hex_digit_to_int() function with an O(1) lookup in
  the precomputed _PyLong_DigitValue table
* Use _PyBytesWriter API to handle the buffer
* Add unit tests to check the error position in error messages
This commit is contained in:
Victor Stinner 2015-10-14 11:25:33 +02:00
parent ebcf9edc05
commit 2bf8993db9
7 changed files with 101 additions and 95 deletions

View File

@ -161,6 +161,9 @@ Optimizations
* ``bytearray % args`` is now between 2.5 and 5 times faster. (Contributed by * ``bytearray % args`` is now between 2.5 and 5 times faster. (Contributed by
Victor Stinner in :issue:`25399`). Victor Stinner in :issue:`25399`).
* Optimize :meth:`bytes.fromhex` and :meth:`bytearray.fromhex`: they are now
between 2x and 3.5x faster. (Contributed by Victor Stinner in :issue:`25401`).
Build and C API Changes Build and C API Changes
======================= =======================

View File

@ -67,6 +67,9 @@ PyAPI_FUNC(PyObject*) _PyBytes_FormatEx(
Py_ssize_t format_len, Py_ssize_t format_len,
PyObject *args, PyObject *args,
int use_bytearray); int use_bytearray);
PyAPI_FUNC(PyObject*) _PyBytes_FromHex(
PyObject *string,
int use_bytearray);
#endif #endif
PyAPI_FUNC(PyObject *) PyBytes_DecodeEscape(const char *, Py_ssize_t, PyAPI_FUNC(PyObject *) PyBytes_DecodeEscape(const char *, Py_ssize_t,
const char *, Py_ssize_t, const char *, Py_ssize_t,

View File

@ -65,7 +65,7 @@ PyAPI_FUNC(PyObject *) PyLong_GetInfo(void);
# error "void* different in size from int, long and long long" # error "void* different in size from int, long and long long"
#endif /* SIZEOF_VOID_P */ #endif /* SIZEOF_VOID_P */
/* Used by Python/mystrtoul.c. */ /* Used by Python/mystrtoul.c and _PyBytes_FromHex(). */
#ifndef Py_LIMITED_API #ifndef Py_LIMITED_API
PyAPI_DATA(unsigned char) _PyLong_DigitValue[256]; PyAPI_DATA(unsigned char) _PyLong_DigitValue[256];
#endif #endif

View File

@ -301,6 +301,20 @@ class BaseBytesTest:
self.assertRaises(ValueError, self.type2test.fromhex, '\x00') self.assertRaises(ValueError, self.type2test.fromhex, '\x00')
self.assertRaises(ValueError, self.type2test.fromhex, '12 \x00 34') self.assertRaises(ValueError, self.type2test.fromhex, '12 \x00 34')
for data, pos in (
# invalid first hexadecimal character
('12 x4 56', 3),
# invalid second hexadecimal character
('12 3x 56', 4),
# two invalid hexadecimal characters
('12 xy 56', 3),
# test non-ASCII string
('12 3\xff 56', 4),
):
with self.assertRaises(ValueError) as cm:
self.type2test.fromhex(data)
self.assertIn('at position %s' % pos, str(cm.exception))
def test_hex(self): def test_hex(self):
self.assertRaises(TypeError, self.type2test.hex) self.assertRaises(TypeError, self.type2test.hex)
self.assertRaises(TypeError, self.type2test.hex, 1) self.assertRaises(TypeError, self.type2test.hex, 1)

View File

@ -10,6 +10,9 @@ Release date: XXXX-XX-XX
Core and Builtins Core and Builtins
----------------- -----------------
- Issue #25401: Optimize bytes.fromhex() and bytearray.fromhex(): they are now
between 2x and 3.5x faster.
- Issue #25399: Optimize bytearray % args using the new private _PyBytesWriter - Issue #25399: Optimize bytearray % args using the new private _PyBytesWriter
API. Formatting is now between 2.5 and 5 times faster. API. Formatting is now between 2.5 and 5 times faster.

View File

@ -2823,48 +2823,7 @@ static PyObject *
bytearray_fromhex_impl(PyObject*cls, PyObject *string) bytearray_fromhex_impl(PyObject*cls, PyObject *string)
/*[clinic end generated code: output=df3da60129b3700c input=907bbd2d34d9367a]*/ /*[clinic end generated code: output=df3da60129b3700c input=907bbd2d34d9367a]*/
{ {
PyObject *newbytes; return _PyBytes_FromHex(string, 1);
char *buf;
Py_ssize_t hexlen, byteslen, i, j;
int top, bot;
void *data;
unsigned int kind;
assert(PyUnicode_Check(string));
if (PyUnicode_READY(string))
return NULL;
kind = PyUnicode_KIND(string);
data = PyUnicode_DATA(string);
hexlen = PyUnicode_GET_LENGTH(string);
byteslen = hexlen/2; /* This overestimates if there are spaces */
newbytes = PyByteArray_FromStringAndSize(NULL, byteslen);
if (!newbytes)
return NULL;
buf = PyByteArray_AS_STRING(newbytes);
for (i = j = 0; i < hexlen; i += 2) {
/* skip over spaces in the input */
while (PyUnicode_READ(kind, data, i) == ' ')
i++;
if (i >= hexlen)
break;
top = hex_digit_to_int(PyUnicode_READ(kind, data, i));
bot = hex_digit_to_int(PyUnicode_READ(kind, data, i+1));
if (top == -1 || bot == -1) {
PyErr_Format(PyExc_ValueError,
"non-hexadecimal number found in "
"fromhex() arg at position %zd", i);
goto error;
}
buf[j++] = (top << 4) + bot;
}
if (PyByteArray_Resize(newbytes, j) < 0)
goto error;
return newbytes;
error:
Py_DECREF(newbytes);
return NULL;
} }
PyDoc_STRVAR(hex__doc__, PyDoc_STRVAR(hex__doc__,

View File

@ -30,6 +30,10 @@ static PyBytesObject *nullstring;
*/ */
#define PyBytesObject_SIZE (offsetof(PyBytesObject, ob_sval) + 1) #define PyBytesObject_SIZE (offsetof(PyBytesObject, ob_sval) + 1)
/* Forward declaration */
Py_LOCAL_INLINE(Py_ssize_t) _PyBytesWriter_GetSize(_PyBytesWriter *writer,
char *str);
/* /*
For PyBytes_FromString(), the parameter `str' points to a null-terminated For PyBytes_FromString(), the parameter `str' points to a null-terminated
string containing exactly `size' bytes. string containing exactly `size' bytes.
@ -3078,22 +3082,6 @@ bytes_splitlines_impl(PyBytesObject*self, int keepends)
); );
} }
static int
hex_digit_to_int(Py_UCS4 c)
{
if (c >= 128)
return -1;
if (Py_ISDIGIT(c))
return c - '0';
else {
if (Py_ISUPPER(c))
c = Py_TOLOWER(c);
if (c >= 'a' && c <= 'f')
return c - 'a' + 10;
}
return -1;
}
/*[clinic input] /*[clinic input]
@classmethod @classmethod
bytes.fromhex bytes.fromhex
@ -3111,47 +3099,83 @@ static PyObject *
bytes_fromhex_impl(PyTypeObject *type, PyObject *string) bytes_fromhex_impl(PyTypeObject *type, PyObject *string)
/*[clinic end generated code: output=0973acc63661bb2e input=bf4d1c361670acd3]*/ /*[clinic end generated code: output=0973acc63661bb2e input=bf4d1c361670acd3]*/
{ {
PyObject *newstring; return _PyBytes_FromHex(string, 0);
}
PyObject*
_PyBytes_FromHex(PyObject *string, int use_bytearray)
{
char *buf; char *buf;
Py_ssize_t hexlen, byteslen, i, j; Py_ssize_t hexlen, invalid_char;
int top, bot; unsigned int top, bot;
void *data; Py_UCS1 *str, *end;
unsigned int kind; _PyBytesWriter writer;
_PyBytesWriter_Init(&writer);
writer.use_bytearray = use_bytearray;
assert(PyUnicode_Check(string)); assert(PyUnicode_Check(string));
if (PyUnicode_READY(string)) if (PyUnicode_READY(string))
return NULL; return NULL;
kind = PyUnicode_KIND(string);
data = PyUnicode_DATA(string);
hexlen = PyUnicode_GET_LENGTH(string); hexlen = PyUnicode_GET_LENGTH(string);
byteslen = hexlen/2; /* This overestimates if there are spaces */ if (!PyUnicode_IS_ASCII(string)) {
newstring = PyBytes_FromStringAndSize(NULL, byteslen); void *data = PyUnicode_DATA(string);
if (!newstring) unsigned int kind = PyUnicode_KIND(string);
Py_ssize_t i;
/* search for the first non-ASCII character */
for (i = 0; i < hexlen; i++) {
if (PyUnicode_READ(kind, data, i) >= 128)
break;
}
invalid_char = i;
goto error;
}
assert(PyUnicode_KIND(string) == PyUnicode_1BYTE_KIND);
str = PyUnicode_1BYTE_DATA(string);
/* This overestimates if there are spaces */
buf = _PyBytesWriter_Alloc(&writer, hexlen / 2);
if (buf == NULL)
return NULL; return NULL;
buf = PyBytes_AS_STRING(newstring);
for (i = j = 0; i < hexlen; i += 2) { end = str + hexlen;
while (str < end) {
/* skip over spaces in the input */ /* skip over spaces in the input */
while (PyUnicode_READ(kind, data, i) == ' ') if (*str == ' ') {
i++; do {
if (i >= hexlen) str++;
break; } while (*str == ' ');
top = hex_digit_to_int(PyUnicode_READ(kind, data, i)); if (str >= end)
bot = hex_digit_to_int(PyUnicode_READ(kind, data, i+1)); break;
if (top == -1 || bot == -1) { }
PyErr_Format(PyExc_ValueError,
"non-hexadecimal number found in " top = _PyLong_DigitValue[*str];
"fromhex() arg at position %zd", i); if (top >= 16) {
invalid_char = str - PyUnicode_1BYTE_DATA(string);
goto error; goto error;
} }
buf[j++] = (top << 4) + bot; str++;
bot = _PyLong_DigitValue[*str];
if (bot >= 16) {
invalid_char = str - PyUnicode_1BYTE_DATA(string);
goto error;
}
str++;
*buf++ = (unsigned char)((top << 4) + bot);
} }
if (j != byteslen && _PyBytes_Resize(&newstring, j) < 0)
goto error; return _PyBytesWriter_Finish(&writer, buf);
return newstring;
error: error:
Py_XDECREF(newstring); PyErr_Format(PyExc_ValueError,
"non-hexadecimal number found in "
"fromhex() arg at position %zd", invalid_char);
_PyBytesWriter_Dealloc(&writer);
return NULL; return NULL;
} }
@ -3888,7 +3912,7 @@ _PyBytesWriter_AsString(_PyBytesWriter *writer)
} }
Py_LOCAL_INLINE(Py_ssize_t) Py_LOCAL_INLINE(Py_ssize_t)
_PyBytesWriter_GetPos(_PyBytesWriter *writer, char *str) _PyBytesWriter_GetSize(_PyBytesWriter *writer, char *str)
{ {
char *start = _PyBytesWriter_AsString(writer); char *start = _PyBytesWriter_AsString(writer);
assert(str != NULL); assert(str != NULL);
@ -3963,7 +3987,7 @@ _PyBytesWriter_Prepare(_PyBytesWriter *writer, void *str, Py_ssize_t size)
allocated += allocated / OVERALLOCATE_FACTOR; allocated += allocated / OVERALLOCATE_FACTOR;
} }
pos = _PyBytesWriter_GetPos(writer, str); pos = _PyBytesWriter_GetSize(writer, str);
if (!writer->use_small_buffer) { if (!writer->use_small_buffer) {
if (writer->use_bytearray) { if (writer->use_bytearray) {
if (PyByteArray_Resize(writer->buffer, allocated)) if (PyByteArray_Resize(writer->buffer, allocated))
@ -4041,33 +4065,33 @@ _PyBytesWriter_Alloc(_PyBytesWriter *writer, Py_ssize_t size)
PyObject * PyObject *
_PyBytesWriter_Finish(_PyBytesWriter *writer, void *str) _PyBytesWriter_Finish(_PyBytesWriter *writer, void *str)
{ {
Py_ssize_t pos; Py_ssize_t size;
PyObject *result; PyObject *result;
_PyBytesWriter_CheckConsistency(writer, str); _PyBytesWriter_CheckConsistency(writer, str);
pos = _PyBytesWriter_GetPos(writer, str); size = _PyBytesWriter_GetSize(writer, str);
if (pos == 0 && !writer->use_bytearray) { if (size == 0 && !writer->use_bytearray) {
Py_CLEAR(writer->buffer); Py_CLEAR(writer->buffer);
/* Get the empty byte string singleton */ /* Get the empty byte string singleton */
result = PyBytes_FromStringAndSize(NULL, 0); result = PyBytes_FromStringAndSize(NULL, 0);
} }
else if (writer->use_small_buffer) { else if (writer->use_small_buffer) {
result = PyBytes_FromStringAndSize(writer->small_buffer, pos); result = PyBytes_FromStringAndSize(writer->small_buffer, size);
} }
else { else {
result = writer->buffer; result = writer->buffer;
writer->buffer = NULL; writer->buffer = NULL;
if (pos != writer->allocated) { if (size != writer->allocated) {
if (writer->use_bytearray) { if (writer->use_bytearray) {
if (PyByteArray_Resize(result, pos)) { if (PyByteArray_Resize(result, size)) {
Py_DECREF(result); Py_DECREF(result);
return NULL; return NULL;
} }
} }
else { else {
if (_PyBytes_Resize(&result, pos)) { if (_PyBytes_Resize(&result, size)) {
assert(result == NULL); assert(result == NULL);
return NULL; return NULL;
} }