Add a fast path in PyUnicode_DecodeCharmap() for pure 8-bit encodings:

cp037, cp500 and iso8859_1 codecs
2013-04-09 21:53:09 +02:00 · 2013-04-09 21:53:09 +02:00 · 03c3e35d42
parent 0f344b6e05
commit 03c3e35d42
4 changed files with 26 additions and 4 deletions
--- a/Lib/encodings/cp037.py
+++ b/Lib/encodings/cp037.py
@ -301,7 +301,6 @@ decoding_table = (
    '\xd9'     #  0xFD -> LATIN CAPITAL LETTER U WITH GRAVE
    '\xda'     #  0xFE -> LATIN CAPITAL LETTER U WITH ACUTE
    '\x9f'     #  0xFF -> CONTROL
    '\ufffe'   ## Widen to UCS2 for optimization
 )
 ### Encoding table
--- a/Lib/encodings/cp500.py
+++ b/Lib/encodings/cp500.py
@ -301,7 +301,6 @@ decoding_table = (
    '\xd9'     #  0xFD -> LATIN CAPITAL LETTER U WITH GRAVE
    '\xda'     #  0xFE -> LATIN CAPITAL LETTER U WITH ACUTE
    '\x9f'     #  0xFF -> CONTROL
    '\ufffe'   ## Widen to UCS2 for optimization
 )
 ### Encoding table
--- a/Lib/encodings/iso8859_1.py
+++ b/Lib/encodings/iso8859_1.py
@ -301,7 +301,6 @@ decoding_table = (
    '\xfd'     #  0xFD -> LATIN SMALL LETTER Y WITH ACUTE
    '\xfe'     #  0xFE -> LATIN SMALL LETTER THORN (Icelandic)
    '\xff'     #  0xFF -> LATIN SMALL LETTER Y WITH DIAERESIS
    '\ufffe'   ## Widen to UCS2 for optimization
 )
 ### Encoding table
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -7281,6 +7281,7 @@ PyUnicode_DecodeCharmap(const char *s,
        enum PyUnicode_Kind mapkind;
        void *mapdata;
        Py_UCS4 x;
        unsigned char ch;
        if (PyUnicode_READY(mapping) == -1)
            return NULL;
@ -7288,8 +7289,32 @@ PyUnicode_DecodeCharmap(const char *s,
        maplen = PyUnicode_GET_LENGTH(mapping);
        mapdata = PyUnicode_DATA(mapping);
        mapkind = PyUnicode_KIND(mapping);
        if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
            /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
             * is disabled in encoding aliases, latin1 is preferred because
             * its implementation is faster. */
            Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
            Py_UCS1 *outdata = (Py_UCS1 *)writer.data;
            Py_UCS4 maxchar = writer.maxchar;
            assert (writer.kind == PyUnicode_1BYTE_KIND);
            while (s < e) {
                ch = *s;
                x = mapdata_ucs1[ch];
                if (x > maxchar) {
                    if (_PyUnicodeWriter_PrepareInternal(&writer, 1, 0xff) == -1)
                        goto onError;
                    maxchar = writer.maxchar;
                    outdata = (Py_UCS1 *)writer.data;
                }
                outdata[writer.pos] = x;
                writer.pos++;
                ++s;
            }
        }
        while (s < e) {
            unsigned char ch;
            if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
                enum PyUnicode_Kind outkind = writer.kind;
                void *outdata = writer.data;