bpo-34749: Improved performance of binascii.a2b_base64(). (GH-9444)

https://bugs.python.org/issue34749
This commit is contained in:
Sergey Fedoseev 2019-07-14 17:15:32 +05:00 committed by Miss Islington (bot)
parent 0d4f4352ef
commit 1c5e68e714
2 changed files with 58 additions and 86 deletions

View File

@ -0,0 +1,2 @@
:func:`binascii.a2b_base64` is now up to 2 times faster. Patch by Sergey
Fedoseev.

View File

@ -130,7 +130,7 @@ static const unsigned char table_a2b_hqx[256] = {
static const unsigned char table_b2a_hqx[] = static const unsigned char table_b2a_hqx[] =
"!\"#$%&'()*+,-012345689@ABCDEFGHIJKLMNPQRSTUVXYZ[`abcdefhijklmpqr"; "!\"#$%&'()*+,-012345689@ABCDEFGHIJKLMNPQRSTUVXYZ[`abcdefhijklmpqr";
static const char table_a2b_base64[] = { static const unsigned char table_a2b_base64[] = {
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63,
@ -138,7 +138,16 @@ static const char table_a2b_base64[] = {
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 11,12,13,14, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 11,12,13,14,
15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1, 15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
-1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40, -1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1 41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
}; };
#define BASE64_PAD '=' #define BASE64_PAD '='
@ -413,32 +422,6 @@ binascii_b2a_uu_impl(PyObject *module, Py_buffer *data, int backtick)
return _PyBytesWriter_Finish(&writer, ascii_data); return _PyBytesWriter_Finish(&writer, ascii_data);
} }
static int
binascii_find_valid(const unsigned char *s, Py_ssize_t slen, int num)
{
    /* Scan at most slen bytes starting at s, skipping the first num
    ** valid characters, and return the next valid one as an int.
    ** A byte is "valid" when it is 7-bit (<= 0x7f) and its entry in
    ** table_a2b_base64 is not (unsigned char)-1.
    ** Returns -1 when the input is exhausted before such a character
    ** is found (including when slen <= 0).
    */
    for (Py_ssize_t pos = 0; pos < slen; pos++) {
        const unsigned char ch = s[pos];

        /* Bytes with the high bit set are never valid. */
        if (ch > 0x7f) {
            continue;
        }
        /* An entry of (unsigned char)-1 marks a byte that is not valid. */
        if ((unsigned char)table_a2b_base64[ch] == (unsigned char)-1) {
            continue;
        }
        /* ch is valid: either it is the one asked for, or one more to skip. */
        if (num == 0) {
            return ch;
        }
        num--;
    }
    return -1;
}
/*[clinic input] /*[clinic input]
binascii.a2b_base64 binascii.a2b_base64
@ -452,88 +435,74 @@ static PyObject *
binascii_a2b_base64_impl(PyObject *module, Py_buffer *data) binascii_a2b_base64_impl(PyObject *module, Py_buffer *data)
/*[clinic end generated code: output=0628223f19fd3f9b input=5872acf6e1cac243]*/ /*[clinic end generated code: output=0628223f19fd3f9b input=5872acf6e1cac243]*/
{ {
const unsigned char *ascii_data; assert(data->len >= 0);
unsigned char *bin_data;
unsigned char *bin_data_start;
int leftbits = 0;
unsigned char this_ch;
unsigned int leftchar = 0;
Py_ssize_t ascii_len, bin_len;
int quad_pos = 0;
_PyBytesWriter writer;
binascii_state *state;
ascii_data = data->buf; const unsigned char *ascii_data = data->buf;
ascii_len = data->len; size_t ascii_len = data->len;
assert(ascii_len >= 0);
if (ascii_len > PY_SSIZE_T_MAX - 3)
return PyErr_NoMemory();
bin_len = ((ascii_len+3)/4)*3; /* Upper bound, corrected later */
_PyBytesWriter_Init(&writer);
/* Allocate the buffer */ /* Allocate the buffer */
bin_data = _PyBytesWriter_Alloc(&writer, bin_len); Py_ssize_t bin_len = ((ascii_len+3)/4)*3; /* Upper bound, corrected later */
_PyBytesWriter writer;
_PyBytesWriter_Init(&writer);
unsigned char *bin_data = _PyBytesWriter_Alloc(&writer, bin_len);
if (bin_data == NULL) if (bin_data == NULL)
return NULL; return NULL;
bin_data_start = bin_data; unsigned char *bin_data_start = bin_data;
for( ; ascii_len > 0; ascii_len--, ascii_data++) { int quad_pos = 0;
this_ch = *ascii_data; unsigned char leftchar = 0;
int pads = 0;
if (this_ch > 0x7f || for (size_t i = 0; i < ascii_len; i++) {
this_ch == '\r' || this_ch == '\n' || this_ch == ' ') unsigned char this_ch = ascii_data[i];
continue;
/* Check for pad sequences and ignore /* Check for pad sequences and ignore
** the invalid ones. ** the invalid ones.
*/ */
if (this_ch == BASE64_PAD) { if (this_ch == BASE64_PAD) {
if ( (quad_pos < 2) || if (quad_pos >= 2 && quad_pos + ++pads >= 4) {
((quad_pos == 2) &&
(binascii_find_valid(ascii_data, ascii_len, 1)
!= BASE64_PAD)) )
{
continue;
}
else {
/* A pad sequence means no more input. /* A pad sequence means no more input.
** We've already interpreted the data ** We've already interpreted the data
** from the quad at this point. ** from the quad at this point.
*/ */
leftbits = 0; goto done;
break;
} }
continue;
} }
this_ch = table_a2b_base64[*ascii_data]; this_ch = table_a2b_base64[this_ch];
if ( this_ch == (unsigned char) -1 ) if (this_ch >= 64) {
continue; continue;
}
pads = 0;
/* switch (quad_pos) {
** Shift it in on the low end, and see if there's case 0:
** a byte ready for output. quad_pos = 1;
*/ leftchar = this_ch;
quad_pos = (quad_pos + 1) & 0x03; break;
leftchar = (leftchar << 6) | (this_ch); case 1:
leftbits += 6; quad_pos = 2;
*bin_data++ = (leftchar << 2) | (this_ch >> 4);
if ( leftbits >= 8 ) { leftchar = this_ch & 0x0f;
leftbits -= 8; break;
*bin_data++ = (leftchar >> leftbits) & 0xff; case 2:
leftchar &= ((1 << leftbits) - 1); quad_pos = 3;
*bin_data++ = (leftchar << 4) | (this_ch >> 2);
leftchar = this_ch & 0x03;
break;
case 3:
quad_pos = 0;
*bin_data++ = (leftchar << 6) | (this_ch);
leftchar = 0;
break;
} }
} }
if (leftbits != 0) { if (quad_pos != 0) {
state = PyModule_GetState(module); binascii_state *state = PyModule_GetState(module);
if (state == NULL) { if (state == NULL) {
return NULL; /* error already set, from PyModule_GetState */
} } else if (quad_pos == 1) {
if (leftbits == 6) {
/* /*
** There is exactly one extra valid, non-padding, base64 character. ** There is exactly one extra valid, non-padding, base64 character.
** This is an invalid length, as there is no possible input that ** This is an invalid length, as there is no possible input that
@ -551,6 +520,7 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data)
return NULL; return NULL;
} }
done:
return _PyBytesWriter_Finish(&writer, bin_data); return _PyBytesWriter_Finish(&writer, bin_data);
} }