Optimize bytes.fromhex() and bytearray.fromhex()

Issue #25401: Optimize bytes.fromhex() and bytearray.fromhex(): they are now between 2x and 3.5x faster. Changes: * Use a fast-path working on a char* string for ASCII string * Use a slow-path for non-ASCII string * Replace slow hex_digit_to_int() function with a O(1) lookup in _PyLong_DigitValue precomputed table * Use _PyBytesWriter API to handle the buffer * Add unit tests to check the error position in error messages
2015-10-14 11:25:33 +02:00 · 2015-10-14 11:25:33 +02:00 · 2bf8993db9
parent ebcf9edc05
commit 2bf8993db9
7 changed files with 101 additions and 95 deletions
--- a/Doc/whatsnew/3.6.rst
+++ b/Doc/whatsnew/3.6.rst
@ -161,6 +161,9 @@ Optimizations
 * ``bytearray % args`` is now between 2.5 and 5 times faster. (Contributed by
  Victor Stinner in :issue:`25399`).
 * Optimize :meth:`bytes.fromhex` and :meth:`bytearray.fromhex`: they are now
  between 2x and 3.5x faster. (Contributed by Victor Stinner in :issue:`25401`).
 Build and C API Changes
 =======================
--- a/Include/bytesobject.h
+++ b/Include/bytesobject.h
@ -67,6 +67,9 @@ PyAPI_FUNC(PyObject*) _PyBytes_FormatEx(
    Py_ssize_t format_len,
    PyObject *args,
    int use_bytearray);
 PyAPI_FUNC(PyObject*) _PyBytes_FromHex(
    PyObject *string,
    int use_bytearray);
 #endif
 PyAPI_FUNC(PyObject *) PyBytes_DecodeEscape(const char *, Py_ssize_t,
 						   const char *, Py_ssize_t,
--- a/Include/longobject.h
+++ b/Include/longobject.h
@ -65,7 +65,7 @@ PyAPI_FUNC(PyObject *) PyLong_GetInfo(void);
 #  error "void* different in size from int, long and long long"
 #endif /* SIZEOF_VOID_P */
-/* Used by Python/mystrtoul.c. */
+/* Used by Python/mystrtoul.c and _PyBytes_FromHex(). */
 #ifndef Py_LIMITED_API
 PyAPI_DATA(unsigned char) _PyLong_DigitValue[256];
 #endif
--- a/Lib/test/test_bytes.py
+++ b/Lib/test/test_bytes.py
@ -301,6 +301,20 @@ class BaseBytesTest:
        self.assertRaises(ValueError, self.type2test.fromhex, '\x00')
        self.assertRaises(ValueError, self.type2test.fromhex, '12   \x00   34')
        for data, pos in (
            # invalid first hexadecimal character
            ('12 x4 56', 3),
            # invalid second hexadecimal character
            ('12 3x 56', 4),
            # two invalid hexadecimal characters
            ('12 xy 56', 3),
            # test non-ASCII string
            ('12 3\xff 56', 4),
        ):
            with self.assertRaises(ValueError) as cm:
                self.type2test.fromhex(data)
            self.assertIn('at position %s' % pos, str(cm.exception))
    def test_hex(self):
        self.assertRaises(TypeError, self.type2test.hex)
        self.assertRaises(TypeError, self.type2test.hex, 1)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -10,6 +10,9 @@ Release date: XXXX-XX-XX
 Core and Builtins
 -----------------
 - Issue #25401: Optimize bytes.fromhex() and bytearray.fromhex(): they are now
  between 2x and 3.5x faster.
 - Issue #25399: Optimize bytearray % args using the new private _PyBytesWriter
  API. Formatting is now between 2.5 and 5 times faster.
--- a/Objects/bytearrayobject.c
+++ b/Objects/bytearrayobject.c
@ -2823,48 +2823,7 @@ static PyObject *
 bytearray_fromhex_impl(PyObject*cls, PyObject *string)
 /*[clinic end generated code: output=df3da60129b3700c input=907bbd2d34d9367a]*/
 {
-    PyObject *newbytes;
+    return _PyBytes_FromHex(string, 1);
    char *buf;
    Py_ssize_t hexlen, byteslen, i, j;
    int top, bot;
    void *data;
    unsigned int kind;
    assert(PyUnicode_Check(string));
    if (PyUnicode_READY(string))
        return NULL;
    kind = PyUnicode_KIND(string);
    data = PyUnicode_DATA(string);
    hexlen = PyUnicode_GET_LENGTH(string);
    byteslen = hexlen/2; /* This overestimates if there are spaces */
    newbytes = PyByteArray_FromStringAndSize(NULL, byteslen);
    if (!newbytes)
        return NULL;
    buf = PyByteArray_AS_STRING(newbytes);
    for (i = j = 0; i < hexlen; i += 2) {
        /* skip over spaces in the input */
        while (PyUnicode_READ(kind, data, i) == ' ')
            i++;
        if (i >= hexlen)
            break;
        top = hex_digit_to_int(PyUnicode_READ(kind, data, i));
        bot = hex_digit_to_int(PyUnicode_READ(kind, data, i+1));
        if (top == -1 || bot == -1) {
            PyErr_Format(PyExc_ValueError,
                         "non-hexadecimal number found in "
                         "fromhex() arg at position %zd", i);
            goto error;
        }
        buf[j++] = (top << 4) + bot;
    }
    if (PyByteArray_Resize(newbytes, j) < 0)
        goto error;
    return newbytes;
  error:
    Py_DECREF(newbytes);
    return NULL;
 }
 PyDoc_STRVAR(hex__doc__,
--- a/Objects/bytesobject.c
+++ b/Objects/bytesobject.c
@ -30,6 +30,10 @@ static PyBytesObject *nullstring;
 */
 #define PyBytesObject_SIZE (offsetof(PyBytesObject, ob_sval) + 1)
 /* Forward declaration */
 Py_LOCAL_INLINE(Py_ssize_t) _PyBytesWriter_GetSize(_PyBytesWriter *writer,
                                                   char *str);
 /*
   For PyBytes_FromString(), the parameter `str' points to a null-terminated
   string containing exactly `size' bytes.
@ -3078,22 +3082,6 @@ bytes_splitlines_impl(PyBytesObject*self, int keepends)
        );
 }
 static int
 hex_digit_to_int(Py_UCS4 c)
 {
    if (c >= 128)
        return -1;
    if (Py_ISDIGIT(c))
        return c - '0';
    else {
        if (Py_ISUPPER(c))
            c = Py_TOLOWER(c);
        if (c >= 'a' && c <= 'f')
            return c - 'a' + 10;
    }
    return -1;
 }
 /*[clinic input]
@classmethod
 bytes.fromhex
@ -3111,47 +3099,83 @@ static PyObject *
 bytes_fromhex_impl(PyTypeObject *type, PyObject *string)
 /*[clinic end generated code: output=0973acc63661bb2e input=bf4d1c361670acd3]*/
 {
-    PyObject *newstring;
+    return _PyBytes_FromHex(string, 0);
 }
 PyObject*
 _PyBytes_FromHex(PyObject *string, int use_bytearray)
 {
    char *buf;
-    Py_ssize_t hexlen, byteslen, i, j;
+    Py_ssize_t hexlen, invalid_char;
-    int top, bot;
+    unsigned int top, bot;
-    void *data;
+    Py_UCS1 *str, *end;
-    unsigned int kind;
+    _PyBytesWriter writer;
    _PyBytesWriter_Init(&writer);
    writer.use_bytearray = use_bytearray;
    assert(PyUnicode_Check(string));
    if (PyUnicode_READY(string))
        return NULL;
    kind = PyUnicode_KIND(string);
    data = PyUnicode_DATA(string);
    hexlen = PyUnicode_GET_LENGTH(string);
-    byteslen = hexlen/2; /* This overestimates if there are spaces */
+    if (!PyUnicode_IS_ASCII(string)) {
-    newstring = PyBytes_FromStringAndSize(NULL, byteslen);
+        void *data = PyUnicode_DATA(string);
-    if (!newstring)
+        unsigned int kind = PyUnicode_KIND(string);
        Py_ssize_t i;
        /* search for the first non-ASCII character */
        for (i = 0; i < hexlen; i++) {
            if (PyUnicode_READ(kind, data, i) >= 128)
                break;
        }
        invalid_char = i;
        goto error;
    }
    assert(PyUnicode_KIND(string) == PyUnicode_1BYTE_KIND);
    str = PyUnicode_1BYTE_DATA(string);
    /* This overestimates if there are spaces */
    buf = _PyBytesWriter_Alloc(&writer, hexlen / 2);
    if (buf == NULL)
        return NULL;
-    buf = PyBytes_AS_STRING(newstring);
+
-    for (i = j = 0; i < hexlen; i += 2) {
+    end = str + hexlen;
    while (str < end) {
        /* skip over spaces in the input */
-        while (PyUnicode_READ(kind, data, i) == ' ')
+        if (*str == ' ') {
-            i++;
+            do {
-        if (i >= hexlen)
+                str++;
-            break;
+            } while (*str == ' ');
-        top = hex_digit_to_int(PyUnicode_READ(kind, data, i));
+            if (str >= end)
-        bot = hex_digit_to_int(PyUnicode_READ(kind, data, i+1));
+                break;
-        if (top == -1 || bot == -1) {
+        }
-            PyErr_Format(PyExc_ValueError,
+
-                         "non-hexadecimal number found in "
+        top = _PyLong_DigitValue[*str];
-                         "fromhex() arg at position %zd", i);
+        if (top >= 16) {
            invalid_char = str - PyUnicode_1BYTE_DATA(string);
            goto error;
        }
-        buf[j++] = (top << 4) + bot;
+        str++;
        bot = _PyLong_DigitValue[*str];
        if (bot >= 16) {
            invalid_char = str - PyUnicode_1BYTE_DATA(string);
            goto error;
        }
        str++;
        *buf++ = (unsigned char)((top << 4) + bot);
    }
-    if (j != byteslen && _PyBytes_Resize(&newstring, j) < 0)
+
-        goto error;
+    return _PyBytesWriter_Finish(&writer, buf);
    return newstring;
  error:
-    Py_XDECREF(newstring);
+    PyErr_Format(PyExc_ValueError,
                 "non-hexadecimal number found in "
                 "fromhex() arg at position %zd", invalid_char);
    _PyBytesWriter_Dealloc(&writer);
    return NULL;
 }
@ -3888,7 +3912,7 @@ _PyBytesWriter_AsString(_PyBytesWriter *writer)
 }
 Py_LOCAL_INLINE(Py_ssize_t)
-_PyBytesWriter_GetPos(_PyBytesWriter *writer, char *str)
+_PyBytesWriter_GetSize(_PyBytesWriter *writer, char *str)
 {
    char *start = _PyBytesWriter_AsString(writer);
    assert(str != NULL);
@ -3963,7 +3987,7 @@ _PyBytesWriter_Prepare(_PyBytesWriter *writer, void *str, Py_ssize_t size)
        allocated += allocated / OVERALLOCATE_FACTOR;
    }
-    pos = _PyBytesWriter_GetPos(writer, str);
+    pos = _PyBytesWriter_GetSize(writer, str);
    if (!writer->use_small_buffer) {
        if (writer->use_bytearray) {
            if (PyByteArray_Resize(writer->buffer, allocated))
@ -4041,33 +4065,33 @@ _PyBytesWriter_Alloc(_PyBytesWriter *writer, Py_ssize_t size)
 PyObject *
 _PyBytesWriter_Finish(_PyBytesWriter *writer, void *str)
 {
-    Py_ssize_t pos;
+    Py_ssize_t size;
    PyObject *result;
    _PyBytesWriter_CheckConsistency(writer, str);
-    pos = _PyBytesWriter_GetPos(writer, str);
+    size = _PyBytesWriter_GetSize(writer, str);
-    if (pos == 0 && !writer->use_bytearray) {
+    if (size == 0 && !writer->use_bytearray) {
        Py_CLEAR(writer->buffer);
        /* Get the empty byte string singleton */
        result = PyBytes_FromStringAndSize(NULL, 0);
    }
    else if (writer->use_small_buffer) {
-        result = PyBytes_FromStringAndSize(writer->small_buffer, pos);
+        result = PyBytes_FromStringAndSize(writer->small_buffer, size);
    }
    else {
        result = writer->buffer;
        writer->buffer = NULL;
-        if (pos != writer->allocated) {
+        if (size != writer->allocated) {
            if (writer->use_bytearray) {
-                if (PyByteArray_Resize(result, pos)) {
+                if (PyByteArray_Resize(result, size)) {
                    Py_DECREF(result);
                    return NULL;
                }
            }
            else {
-                if (_PyBytes_Resize(&result, pos)) {
+                if (_PyBytes_Resize(&result, size)) {
                    assert(result == NULL);
                    return NULL;
                }