diff --git a/Misc/NEWS.d/next/Library/2018-09-21-13-23-29.bpo-34749.B0k819.rst b/Misc/NEWS.d/next/Library/2018-09-21-13-23-29.bpo-34749.B0k819.rst new file mode 100644 index 00000000000..5a5e5b492c0 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2018-09-21-13-23-29.bpo-34749.B0k819.rst @@ -0,0 +1,2 @@ +:func:`binascii.a2b_base64` is now up to 2 times faster. Patch by Sergey +Fedoseev. diff --git a/Modules/binascii.c b/Modules/binascii.c index 1c7dc35882d..94b0732c12c 100644 --- a/Modules/binascii.c +++ b/Modules/binascii.c @@ -130,7 +130,7 @@ static const unsigned char table_a2b_hqx[256] = { static const unsigned char table_b2a_hqx[] = "!\"#$%&'()*+,-012345689@ABCDEFGHIJKLMNPQRSTUVXYZ[`abcdefhijklmpqr"; -static const char table_a2b_base64[] = { +static const unsigned char table_a2b_base64[] = { -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63, @@ -138,7 +138,16 @@ static const char table_a2b_base64[] = { -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 11,12,13,14, 15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1, -1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40, - 41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1 + 41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1, + + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, }; #define BASE64_PAD '=' @@ -413,32 +422,6 @@ binascii_b2a_uu_impl(PyObject *module, Py_buffer *data, int backtick) return _PyBytesWriter_Finish(&writer, ascii_data); } - -static int -binascii_find_valid(const unsigned char *s, Py_ssize_t slen, int num) -{ - /* Finds & returns the (num+1)th - ** valid character for base64, or -1 if none. - */ - - int ret = -1; - unsigned char c, b64val; - - while ((slen > 0) && (ret == -1)) { - c = *s; - b64val = table_a2b_base64[c & 0x7f]; - if ( ((c <= 0x7f) && (b64val != (unsigned char)-1)) ) { - if (num == 0) - ret = *s; - num--; - } - - s++; - slen--; - } - return ret; -} - /*[clinic input] binascii.a2b_base64 @@ -452,88 +435,74 @@ static PyObject * binascii_a2b_base64_impl(PyObject *module, Py_buffer *data) /*[clinic end generated code: output=0628223f19fd3f9b input=5872acf6e1cac243]*/ { - const unsigned char *ascii_data; - unsigned char *bin_data; - unsigned char *bin_data_start; - int leftbits = 0; - unsigned char this_ch; - unsigned int leftchar = 0; - Py_ssize_t ascii_len, bin_len; - int quad_pos = 0; - _PyBytesWriter writer; - binascii_state *state; + assert(data->len >= 0); - ascii_data = data->buf; - ascii_len = data->len; - - assert(ascii_len >= 0); - - if (ascii_len > PY_SSIZE_T_MAX - 3) - return PyErr_NoMemory(); - - bin_len = ((ascii_len+3)/4)*3; /* Upper bound, corrected later */ - - _PyBytesWriter_Init(&writer); + const unsigned char *ascii_data = data->buf; + size_t ascii_len = data->len; /* Allocate the buffer */ - bin_data = _PyBytesWriter_Alloc(&writer, bin_len); + Py_ssize_t bin_len = ((ascii_len+3)/4)*3; /* Upper bound, corrected later */ + _PyBytesWriter writer; + _PyBytesWriter_Init(&writer); + unsigned char *bin_data = _PyBytesWriter_Alloc(&writer, bin_len); if (bin_data == NULL) return NULL; - bin_data_start = bin_data; + unsigned char *bin_data_start = bin_data; - for( ; ascii_len > 0; ascii_len--, ascii_data++) { - this_ch = *ascii_data; - - if (this_ch > 0x7f || - this_ch == '\r' || this_ch == '\n' || this_ch == ' ') - continue; + int quad_pos = 0; + unsigned char leftchar = 0; + int pads = 0; + for (size_t i = 0; i < ascii_len; i++) { + unsigned char this_ch = ascii_data[i]; /* Check for pad sequences and ignore ** the invalid ones. */ if (this_ch == BASE64_PAD) { - if ( (quad_pos < 2) || - ((quad_pos == 2) && - (binascii_find_valid(ascii_data, ascii_len, 1) - != BASE64_PAD)) ) - { - continue; - } - else { + if (quad_pos >= 2 && quad_pos + ++pads >= 4) { /* A pad sequence means no more input. ** We've already interpreted the data ** from the quad at this point. */ - leftbits = 0; - break; + goto done; } + continue; } - this_ch = table_a2b_base64[*ascii_data]; - if ( this_ch == (unsigned char) -1 ) + this_ch = table_a2b_base64[this_ch]; + if (this_ch >= 64) { continue; + } + pads = 0; - /* - ** Shift it in on the low end, and see if there's - ** a byte ready for output. - */ - quad_pos = (quad_pos + 1) & 0x03; - leftchar = (leftchar << 6) | (this_ch); - leftbits += 6; - - if ( leftbits >= 8 ) { - leftbits -= 8; - *bin_data++ = (leftchar >> leftbits) & 0xff; - leftchar &= ((1 << leftbits) - 1); + switch (quad_pos) { + case 0: + quad_pos = 1; + leftchar = this_ch; + break; + case 1: + quad_pos = 2; + *bin_data++ = (leftchar << 2) | (this_ch >> 4); + leftchar = this_ch & 0x0f; + break; + case 2: + quad_pos = 3; + *bin_data++ = (leftchar << 4) | (this_ch >> 2); + leftchar = this_ch & 0x03; + break; + case 3: + quad_pos = 0; + *bin_data++ = (leftchar << 6) | (this_ch); + leftchar = 0; + break; } } - if (leftbits != 0) { - state = PyModule_GetState(module); + if (quad_pos != 0) { + binascii_state *state = PyModule_GetState(module); if (state == NULL) { - return NULL; - } - if (leftbits == 6) { + /* error already set, from PyModule_GetState */ + } else if (quad_pos == 1) { /* ** There is exactly one extra valid, non-padding, base64 character. ** This is an invalid length, as there is no possible input that @@ -551,6 +520,7 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data) return NULL; } +done: return _PyBytesWriter_Finish(&writer, bin_data); }