Add fast-path in PyUnicode_DecodeCharmap() for pure 8-bit encodings:

cp037, cp500 and iso8859_1 codecs
This commit is contained in:
Victor Stinner 2013-04-09 21:53:09 +02:00
parent 0f344b6e05
commit 03c3e35d42
4 changed files with 26 additions and 4 deletions

View File

@ -301,7 +301,6 @@ decoding_table = (
'\xd9' # 0xFD -> LATIN CAPITAL LETTER U WITH GRAVE '\xd9' # 0xFD -> LATIN CAPITAL LETTER U WITH GRAVE
'\xda' # 0xFE -> LATIN CAPITAL LETTER U WITH ACUTE '\xda' # 0xFE -> LATIN CAPITAL LETTER U WITH ACUTE
'\x9f' # 0xFF -> CONTROL '\x9f' # 0xFF -> CONTROL
'\ufffe' ## Widen to UCS2 for optimization
) )
### Encoding table ### Encoding table

View File

@ -301,7 +301,6 @@ decoding_table = (
'\xd9' # 0xFD -> LATIN CAPITAL LETTER U WITH GRAVE '\xd9' # 0xFD -> LATIN CAPITAL LETTER U WITH GRAVE
'\xda' # 0xFE -> LATIN CAPITAL LETTER U WITH ACUTE '\xda' # 0xFE -> LATIN CAPITAL LETTER U WITH ACUTE
'\x9f' # 0xFF -> CONTROL '\x9f' # 0xFF -> CONTROL
'\ufffe' ## Widen to UCS2 for optimization
) )
### Encoding table ### Encoding table

View File

@ -301,7 +301,6 @@ decoding_table = (
'\xfd' # 0xFD -> LATIN SMALL LETTER Y WITH ACUTE '\xfd' # 0xFD -> LATIN SMALL LETTER Y WITH ACUTE
'\xfe' # 0xFE -> LATIN SMALL LETTER THORN (Icelandic) '\xfe' # 0xFE -> LATIN SMALL LETTER THORN (Icelandic)
'\xff' # 0xFF -> LATIN SMALL LETTER Y WITH DIAERESIS '\xff' # 0xFF -> LATIN SMALL LETTER Y WITH DIAERESIS
'\ufffe' ## Widen to UCS2 for optimization
) )
### Encoding table ### Encoding table

View File

@ -7281,6 +7281,7 @@ PyUnicode_DecodeCharmap(const char *s,
enum PyUnicode_Kind mapkind; enum PyUnicode_Kind mapkind;
void *mapdata; void *mapdata;
Py_UCS4 x; Py_UCS4 x;
unsigned char ch;
if (PyUnicode_READY(mapping) == -1) if (PyUnicode_READY(mapping) == -1)
return NULL; return NULL;
@ -7288,8 +7289,32 @@ PyUnicode_DecodeCharmap(const char *s,
maplen = PyUnicode_GET_LENGTH(mapping); maplen = PyUnicode_GET_LENGTH(mapping);
mapdata = PyUnicode_DATA(mapping); mapdata = PyUnicode_DATA(mapping);
mapkind = PyUnicode_KIND(mapping); mapkind = PyUnicode_KIND(mapping);
if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
/* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
* is disabled in encoding aliases, latin1 is preferred because
* its implementation is faster. */
Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
Py_UCS1 *outdata = (Py_UCS1 *)writer.data;
Py_UCS4 maxchar = writer.maxchar;
assert (writer.kind == PyUnicode_1BYTE_KIND);
while (s < e) {
ch = *s;
x = mapdata_ucs1[ch];
if (x > maxchar) {
if (_PyUnicodeWriter_PrepareInternal(&writer, 1, 0xff) == -1)
goto onError;
maxchar = writer.maxchar;
outdata = (Py_UCS1 *)writer.data;
}
outdata[writer.pos] = x;
writer.pos++;
++s;
}
}
while (s < e) { while (s < e) {
unsigned char ch;
if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) { if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
enum PyUnicode_Kind outkind = writer.kind; enum PyUnicode_Kind outkind = writer.kind;
void *outdata = writer.data; void *outdata = writer.data;