diff --git a/Doc/whatsnew/3.6.rst b/Doc/whatsnew/3.6.rst index d1e5045dd90..edacea12e6a 100644 --- a/Doc/whatsnew/3.6.rst +++ b/Doc/whatsnew/3.6.rst @@ -161,6 +161,9 @@ Optimizations * ``bytearray % args`` is now between 2.5 and 5 times faster. (Contributed by Victor Stinner in :issue:`25399`). +* Optimize :meth:`bytes.fromhex` and :meth:`bytearray.fromhex`: they are now + between 2x and 3.5x faster. (Contributed by Victor Stinner in :issue:`25401`). + Build and C API Changes ======================= diff --git a/Include/bytesobject.h b/Include/bytesobject.h index b5b37efd251..4046c1cf85b 100644 --- a/Include/bytesobject.h +++ b/Include/bytesobject.h @@ -67,6 +67,9 @@ PyAPI_FUNC(PyObject*) _PyBytes_FormatEx( Py_ssize_t format_len, PyObject *args, int use_bytearray); +PyAPI_FUNC(PyObject*) _PyBytes_FromHex( + PyObject *string, + int use_bytearray); #endif PyAPI_FUNC(PyObject *) PyBytes_DecodeEscape(const char *, Py_ssize_t, const char *, Py_ssize_t, diff --git a/Include/longobject.h b/Include/longobject.h index ab92495a23f..9574f05a2d3 100644 --- a/Include/longobject.h +++ b/Include/longobject.h @@ -65,7 +65,7 @@ PyAPI_FUNC(PyObject *) PyLong_GetInfo(void); # error "void* different in size from int, long and long long" #endif /* SIZEOF_VOID_P */ -/* Used by Python/mystrtoul.c. */ +/* Used by Python/mystrtoul.c and _PyBytes_FromHex(). */ #ifndef Py_LIMITED_API PyAPI_DATA(unsigned char) _PyLong_DigitValue[256]; #endif diff --git a/Lib/test/test_bytes.py b/Lib/test/test_bytes.py index 947d959e84e..0fe33b5cb35 100644 --- a/Lib/test/test_bytes.py +++ b/Lib/test/test_bytes.py @@ -301,6 +301,20 @@ class BaseBytesTest: self.assertRaises(ValueError, self.type2test.fromhex, '\x00') self.assertRaises(ValueError, self.type2test.fromhex, '12 \x00 34') + for data, pos in ( + # invalid first hexadecimal character + ('12 x4 56', 3), + # invalid second hexadecimal character + ('12 3x 56', 4), + # two invalid hexadecimal characters + ('12 xy 56', 3), + # test non-ASCII string + ('12 3\xff 56', 4), + ): + with self.assertRaises(ValueError) as cm: + self.type2test.fromhex(data) + self.assertIn('at position %s' % pos, str(cm.exception)) + def test_hex(self): self.assertRaises(TypeError, self.type2test.hex) self.assertRaises(TypeError, self.type2test.hex, 1) diff --git a/Misc/NEWS b/Misc/NEWS index 97e5e7740b8..312cde02c5f 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -10,6 +10,9 @@ Release date: XXXX-XX-XX Core and Builtins ----------------- +- Issue #25401: Optimize bytes.fromhex() and bytearray.fromhex(): they are now + between 2x and 3.5x faster. + - Issue #25399: Optimize bytearray % args using the new private _PyBytesWriter API. Formatting is now between 2.5 and 5 times faster. diff --git a/Objects/bytearrayobject.c b/Objects/bytearrayobject.c index e535bce8d73..b270fcccc67 100644 --- a/Objects/bytearrayobject.c +++ b/Objects/bytearrayobject.c @@ -2823,48 +2823,7 @@ static PyObject * bytearray_fromhex_impl(PyObject*cls, PyObject *string) /*[clinic end generated code: output=df3da60129b3700c input=907bbd2d34d9367a]*/ { - PyObject *newbytes; - char *buf; - Py_ssize_t hexlen, byteslen, i, j; - int top, bot; - void *data; - unsigned int kind; - - assert(PyUnicode_Check(string)); - if (PyUnicode_READY(string)) - return NULL; - kind = PyUnicode_KIND(string); - data = PyUnicode_DATA(string); - hexlen = PyUnicode_GET_LENGTH(string); - - byteslen = hexlen/2; /* This overestimates if there are spaces */ - newbytes = PyByteArray_FromStringAndSize(NULL, byteslen); - if (!newbytes) - return NULL; - buf = PyByteArray_AS_STRING(newbytes); - for (i = j = 0; i < hexlen; i += 2) { - /* skip over spaces in the input */ - while (PyUnicode_READ(kind, data, i) == ' ') - i++; - if (i >= hexlen) - break; - top = hex_digit_to_int(PyUnicode_READ(kind, data, i)); - bot = hex_digit_to_int(PyUnicode_READ(kind, data, i+1)); - if (top == -1 || bot == -1) { - PyErr_Format(PyExc_ValueError, - "non-hexadecimal number found in " - "fromhex() arg at position %zd", i); - goto error; - } - buf[j++] = (top << 4) + bot; - } - if (PyByteArray_Resize(newbytes, j) < 0) - goto error; - return newbytes; - - error: - Py_DECREF(newbytes); - return NULL; + return _PyBytes_FromHex(string, 1); } PyDoc_STRVAR(hex__doc__, diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c index 20b11fb375f..2d4cf4b9ec0 100644 --- a/Objects/bytesobject.c +++ b/Objects/bytesobject.c @@ -30,6 +30,10 @@ static PyBytesObject *nullstring; */ #define PyBytesObject_SIZE (offsetof(PyBytesObject, ob_sval) + 1) +/* Forward declaration */ +Py_LOCAL_INLINE(Py_ssize_t) _PyBytesWriter_GetSize(_PyBytesWriter *writer, + char *str); + /* For PyBytes_FromString(), the parameter `str' points to a null-terminated string containing exactly `size' bytes. @@ -3078,22 +3082,6 @@ bytes_splitlines_impl(PyBytesObject*self, int keepends) ); } -static int -hex_digit_to_int(Py_UCS4 c) -{ - if (c >= 128) - return -1; - if (Py_ISDIGIT(c)) - return c - '0'; - else { - if (Py_ISUPPER(c)) - c = Py_TOLOWER(c); - if (c >= 'a' && c <= 'f') - return c - 'a' + 10; - } - return -1; -} - /*[clinic input] @classmethod bytes.fromhex @@ -3111,47 +3099,83 @@ static PyObject * bytes_fromhex_impl(PyTypeObject *type, PyObject *string) /*[clinic end generated code: output=0973acc63661bb2e input=bf4d1c361670acd3]*/ { - PyObject *newstring; + return _PyBytes_FromHex(string, 0); +} + +PyObject* +_PyBytes_FromHex(PyObject *string, int use_bytearray) +{ char *buf; - Py_ssize_t hexlen, byteslen, i, j; - int top, bot; - void *data; - unsigned int kind; + Py_ssize_t hexlen, invalid_char; + unsigned int top, bot; + Py_UCS1 *str, *end; + _PyBytesWriter writer; + + _PyBytesWriter_Init(&writer); + writer.use_bytearray = use_bytearray; assert(PyUnicode_Check(string)); if (PyUnicode_READY(string)) return NULL; - kind = PyUnicode_KIND(string); - data = PyUnicode_DATA(string); hexlen = PyUnicode_GET_LENGTH(string); - byteslen = hexlen/2; /* This overestimates if there are spaces */ - newstring = PyBytes_FromStringAndSize(NULL, byteslen); - if (!newstring) + if (!PyUnicode_IS_ASCII(string)) { + void *data = PyUnicode_DATA(string); + unsigned int kind = PyUnicode_KIND(string); + Py_ssize_t i; + + /* search for the first non-ASCII character */ + for (i = 0; i < hexlen; i++) { + if (PyUnicode_READ(kind, data, i) >= 128) + break; + } + invalid_char = i; + goto error; + } + + assert(PyUnicode_KIND(string) == PyUnicode_1BYTE_KIND); + str = PyUnicode_1BYTE_DATA(string); + + /* This overestimates if there are spaces */ + buf = _PyBytesWriter_Alloc(&writer, hexlen / 2); + if (buf == NULL) return NULL; - buf = PyBytes_AS_STRING(newstring); - for (i = j = 0; i < hexlen; i += 2) { + + end = str + hexlen; + while (str < end) { /* skip over spaces in the input */ - while (PyUnicode_READ(kind, data, i) == ' ') - i++; - if (i >= hexlen) - break; - top = hex_digit_to_int(PyUnicode_READ(kind, data, i)); - bot = hex_digit_to_int(PyUnicode_READ(kind, data, i+1)); - if (top == -1 || bot == -1) { - PyErr_Format(PyExc_ValueError, - "non-hexadecimal number found in " - "fromhex() arg at position %zd", i); + if (*str == ' ') { + do { + str++; + } while (*str == ' '); + if (str >= end) + break; + } + + top = _PyLong_DigitValue[*str]; + if (top >= 16) { + invalid_char = str - PyUnicode_1BYTE_DATA(string); goto error; } - buf[j++] = (top << 4) + bot; + str++; + + bot = _PyLong_DigitValue[*str]; + if (bot >= 16) { + invalid_char = str - PyUnicode_1BYTE_DATA(string); + goto error; + } + str++; + + *buf++ = (unsigned char)((top << 4) + bot); } - if (j != byteslen && _PyBytes_Resize(&newstring, j) < 0) - goto error; - return newstring; + + return _PyBytesWriter_Finish(&writer, buf); error: - Py_XDECREF(newstring); + PyErr_Format(PyExc_ValueError, + "non-hexadecimal number found in " + "fromhex() arg at position %zd", invalid_char); + _PyBytesWriter_Dealloc(&writer); return NULL; } @@ -3888,7 +3912,7 @@ _PyBytesWriter_AsString(_PyBytesWriter *writer) } Py_LOCAL_INLINE(Py_ssize_t) -_PyBytesWriter_GetPos(_PyBytesWriter *writer, char *str) +_PyBytesWriter_GetSize(_PyBytesWriter *writer, char *str) { char *start = _PyBytesWriter_AsString(writer); assert(str != NULL); @@ -3963,7 +3987,7 @@ _PyBytesWriter_Prepare(_PyBytesWriter *writer, void *str, Py_ssize_t size) allocated += allocated / OVERALLOCATE_FACTOR; } - pos = _PyBytesWriter_GetPos(writer, str); + pos = _PyBytesWriter_GetSize(writer, str); if (!writer->use_small_buffer) { if (writer->use_bytearray) { if (PyByteArray_Resize(writer->buffer, allocated)) @@ -4041,33 +4065,33 @@ _PyBytesWriter_Alloc(_PyBytesWriter *writer, Py_ssize_t size) PyObject * _PyBytesWriter_Finish(_PyBytesWriter *writer, void *str) { - Py_ssize_t pos; + Py_ssize_t size; PyObject *result; _PyBytesWriter_CheckConsistency(writer, str); - pos = _PyBytesWriter_GetPos(writer, str); - if (pos == 0 && !writer->use_bytearray) { + size = _PyBytesWriter_GetSize(writer, str); + if (size == 0 && !writer->use_bytearray) { Py_CLEAR(writer->buffer); /* Get the empty byte string singleton */ result = PyBytes_FromStringAndSize(NULL, 0); } else if (writer->use_small_buffer) { - result = PyBytes_FromStringAndSize(writer->small_buffer, pos); + result = PyBytes_FromStringAndSize(writer->small_buffer, size); } else { result = writer->buffer; writer->buffer = NULL; - if (pos != writer->allocated) { + if (size != writer->allocated) { if (writer->use_bytearray) { - if (PyByteArray_Resize(result, pos)) { + if (PyByteArray_Resize(result, size)) { Py_DECREF(result); return NULL; } } else { - if (_PyBytes_Resize(&result, pos)) { + if (_PyBytes_Resize(&result, size)) { assert(result == NULL); return NULL; }