bpo-34749: Improved performance of binascii.a2b_base64(). (GH-9444)

https://bugs.python.org/issue34749
This commit is contained in:
Sergey Fedoseev 2019-07-14 17:15:32 +05:00 committed by Miss Islington (bot)
parent 0d4f4352ef
commit 1c5e68e714
2 changed files with 58 additions and 86 deletions

View File

@ -0,0 +1,2 @@
:func:`binascii.a2b_base64` is now up to 2 times faster. Patch by Sergey
Fedoseev.

View File

@ -130,7 +130,7 @@ static const unsigned char table_a2b_hqx[256] = {
static const unsigned char table_b2a_hqx[] =
"!\"#$%&'()*+,-012345689@ABCDEFGHIJKLMNPQRSTUVXYZ[`abcdefhijklmpqr";
static const char table_a2b_base64[] = {
static const unsigned char table_a2b_base64[] = {
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63,
@ -138,7 +138,16 @@ static const char table_a2b_base64[] = {
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 11,12,13,14,
15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
-1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1
41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
};
#define BASE64_PAD '='
@ -413,32 +422,6 @@ binascii_b2a_uu_impl(PyObject *module, Py_buffer *data, int backtick)
return _PyBytesWriter_Finish(&writer, ascii_data);
}
static int
binascii_find_valid(const unsigned char *s, Py_ssize_t slen, int num)
{
/* Finds & returns the (num+1)th
** valid character for base64, or -1 if none.
*/
int ret = -1;
unsigned char c, b64val;
while ((slen > 0) && (ret == -1)) {
c = *s;
b64val = table_a2b_base64[c & 0x7f];
if ( ((c <= 0x7f) && (b64val != (unsigned char)-1)) ) {
if (num == 0)
ret = *s;
num--;
}
s++;
slen--;
}
return ret;
}
/*[clinic input]
binascii.a2b_base64
@ -452,88 +435,74 @@ static PyObject *
binascii_a2b_base64_impl(PyObject *module, Py_buffer *data)
/*[clinic end generated code: output=0628223f19fd3f9b input=5872acf6e1cac243]*/
{
const unsigned char *ascii_data;
unsigned char *bin_data;
unsigned char *bin_data_start;
int leftbits = 0;
unsigned char this_ch;
unsigned int leftchar = 0;
Py_ssize_t ascii_len, bin_len;
int quad_pos = 0;
_PyBytesWriter writer;
binascii_state *state;
assert(data->len >= 0);
ascii_data = data->buf;
ascii_len = data->len;
assert(ascii_len >= 0);
if (ascii_len > PY_SSIZE_T_MAX - 3)
return PyErr_NoMemory();
bin_len = ((ascii_len+3)/4)*3; /* Upper bound, corrected later */
_PyBytesWriter_Init(&writer);
const unsigned char *ascii_data = data->buf;
size_t ascii_len = data->len;
/* Allocate the buffer */
bin_data = _PyBytesWriter_Alloc(&writer, bin_len);
Py_ssize_t bin_len = ((ascii_len+3)/4)*3; /* Upper bound, corrected later */
_PyBytesWriter writer;
_PyBytesWriter_Init(&writer);
unsigned char *bin_data = _PyBytesWriter_Alloc(&writer, bin_len);
if (bin_data == NULL)
return NULL;
bin_data_start = bin_data;
unsigned char *bin_data_start = bin_data;
for( ; ascii_len > 0; ascii_len--, ascii_data++) {
this_ch = *ascii_data;
if (this_ch > 0x7f ||
this_ch == '\r' || this_ch == '\n' || this_ch == ' ')
continue;
int quad_pos = 0;
unsigned char leftchar = 0;
int pads = 0;
for (size_t i = 0; i < ascii_len; i++) {
unsigned char this_ch = ascii_data[i];
/* Check for pad sequences and ignore
** the invalid ones.
*/
if (this_ch == BASE64_PAD) {
if ( (quad_pos < 2) ||
((quad_pos == 2) &&
(binascii_find_valid(ascii_data, ascii_len, 1)
!= BASE64_PAD)) )
{
continue;
}
else {
if (quad_pos >= 2 && quad_pos + ++pads >= 4) {
/* A pad sequence means no more input.
** We've already interpreted the data
** from the quad at this point.
*/
leftbits = 0;
break;
goto done;
}
continue;
}
this_ch = table_a2b_base64[*ascii_data];
if ( this_ch == (unsigned char) -1 )
this_ch = table_a2b_base64[this_ch];
if (this_ch >= 64) {
continue;
}
pads = 0;
/*
** Shift it in on the low end, and see if there's
** a byte ready for output.
*/
quad_pos = (quad_pos + 1) & 0x03;
leftchar = (leftchar << 6) | (this_ch);
leftbits += 6;
if ( leftbits >= 8 ) {
leftbits -= 8;
*bin_data++ = (leftchar >> leftbits) & 0xff;
leftchar &= ((1 << leftbits) - 1);
switch (quad_pos) {
case 0:
quad_pos = 1;
leftchar = this_ch;
break;
case 1:
quad_pos = 2;
*bin_data++ = (leftchar << 2) | (this_ch >> 4);
leftchar = this_ch & 0x0f;
break;
case 2:
quad_pos = 3;
*bin_data++ = (leftchar << 4) | (this_ch >> 2);
leftchar = this_ch & 0x03;
break;
case 3:
quad_pos = 0;
*bin_data++ = (leftchar << 6) | (this_ch);
leftchar = 0;
break;
}
}
if (leftbits != 0) {
state = PyModule_GetState(module);
if (quad_pos != 0) {
binascii_state *state = PyModule_GetState(module);
if (state == NULL) {
return NULL;
}
if (leftbits == 6) {
/* error already set, from PyModule_GetState */
} else if (quad_pos == 1) {
/*
** There is exactly one extra valid, non-padding, base64 character.
** This is an invalid length, as there is no possible input that
@ -551,6 +520,7 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data)
return NULL;
}
done:
return _PyBytesWriter_Finish(&writer, bin_data);
}