Issue #27959: Adds oem encoding, alias ansi to mbcs, move aliasmbcs to codec lookup

2016-09-06 19:42:27 -07:00 · 2016-09-06 19:42:27 -07:00 · f5aba58480
parent 22d0698d3b
commit f5aba58480
8 changed files with 198 additions and 51 deletions
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@ -1663,7 +1663,7 @@ PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
    const char *string,         /* MBCS encoded string */
-    Py_ssize_t length,              /* size of string */
+    Py_ssize_t length,          /* size of string */
    const char *errors          /* error handling */
    );
--- a/Lib/encodings/init.py
+++ b/Lib/encodings/init.py
@ -29,6 +29,7 @@ Written by Marc-Andre Lemburg (mal@lemburg.com).
 """#"
 import codecs
 import sys
 from . import aliases
 _cache = {}
@ -151,3 +152,12 @@ def search_function(encoding):
 # Register the search_function in the Python codec registry
 codecs.register(search_function)
 if sys.platform == 'win32':
    def _alias_mbcs(encoding):
        """Codec search function mapping the preferred encoding to 'mbcs'.

        Returns the CodecInfo of the 'mbcs' codec when *encoding* equals the
        name reported by _bootlocale.getpreferredencoding(False); returns
        None otherwise so that the codec registry keeps searching (and
        eventually raises LookupError itself).
        """
        try:
            import _bootlocale
            if encoding == _bootlocale.getpreferredencoding(False):
                import encodings.mbcs
                return encodings.mbcs.getregentry()
        except ImportError:
            # The imports above can fail (for instance while the interpreter
            # is shutting down).  A search function must not turn an unknown
            # encoding into an ImportError, so report "not found" instead.
            pass
        return None
    codecs.register(_alias_mbcs)
--- a/Lib/encodings/aliases.py
+++ b/Lib/encodings/aliases.py
@ -458,6 +458,7 @@ aliases = {
    'macturkish'         : 'mac_turkish',
    # mbcs codec
    'ansi'               : 'mbcs',
    'dbcs'               : 'mbcs',
    # ptcp154 codec
--- a/Lib/encodings/oem.py
+++ b/Lib/encodings/oem.py
@ -0,0 +1,41 @@
 """ Python 'oem' Codec for Windows
 """
 # Import them explicitly to cause an ImportError
 # on non-Windows systems
 from codecs import oem_encode, oem_decode
 # for IncrementalDecoder, IncrementalEncoder, ...
 import codecs
 ### Codec APIs
 # Stateless encoding is exactly the C-level function.
 encode = oem_encode
 def decode(input, errors='strict'):
    """Decode *input* with the 'oem' codec, treating it as complete data."""
    final = True
    return oem_decode(input, errors, final)
 class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder for the 'oem' codec."""
    def encode(self, input, final=False):
        """Encode *input* and return only the encoded bytes.

        *final* is accepted for interface compatibility and is not used.
        """
        result = oem_encode(input, self.errors)
        # oem_encode returns a sequence; its first item is the encoded data.
        return result[0]
 class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental decoder for the 'oem' codec."""
    # oem_decode is a C-level builtin taking (data, errors, final); as a
    # builtin it is not bound as a method, so it receives no implicit self.
    # NOTE(review): the base class is expected to call
    # self._buffer_decode(data, errors, final) -- confirm against codecs.
    _buffer_decode = oem_decode
 class StreamWriter(codecs.StreamWriter):
    """Stream writer for the 'oem' codec."""
    # Plain class-attribute alias: oem_encode is a C-level builtin, so it is
    # not bound as a method and is called without an implicit self.
    encode = oem_encode
 class StreamReader(codecs.StreamReader):
    """Stream reader for the 'oem' codec."""
    # Plain class-attribute alias: oem_decode is a C-level builtin, so it is
    # not bound as a method.  Its optional 'final' argument defaults to
    # False when the caller passes only (data, errors).
    decode = oem_decode
 ### encodings module API
 def getregentry():
    """Return the codecs.CodecInfo entry that describes the 'oem' codec."""
    components = dict(
        encode=encode,
        decode=decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
    return codecs.CodecInfo(name='oem', **components)
--- a/Lib/site.py
+++ b/Lib/site.py
@ -423,21 +423,6 @@ def enablerlcompleter():
    sys.__interactivehook__ = register_readline
 def aliasmbcs():
    """Alias the preferred Windows "cp***" encoding to "mbcs" if unknown.

    Does nothing on platforms other than win32, when the preferred encoding
    does not start with "cp", or when a codec for it can already be found.
    """
    if sys.platform != 'win32':
        return
    import _bootlocale
    import codecs
    preferred = _bootlocale.getpreferredencoding(False)
    if not preferred.startswith('cp'):      # only "cp***" names are aliased
        return
    try:
        codecs.lookup(preferred)
    except LookupError:
        import encodings
        # Mark the name as having no codec module of its own in the cache,
        # then point the alias table at "mbcs".
        encodings._cache[preferred] = encodings._unknown
        encodings.aliases.aliases[preferred] = 'mbcs'
 CONFIG_LINE = r'^(?P<key>(\w|[-_])+)\s*=\s*(?P<value>.*)\s*$'
 def venv(known_paths):
@ -560,7 +545,6 @@ def main():
    setcopyright()
    sethelper()
    enablerlcompleter()
    aliasmbcs()
    execsitecustomize()
    if ENABLE_USER_SITE:
        execusercustomize()
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@ -8,11 +8,6 @@ import encodings
 from test import support
 # True only on Windows whose reported major version is 6 or greater;
 # the version query is short-circuited away on every other platform.
 VISTA_OR_LATER = sys.platform == 'win32' and sys.getwindowsversion().major >= 6
 try:
    import ctypes
 except ImportError:
@ -841,18 +836,13 @@ class CP65001Test(ReadTest, unittest.TestCase):
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict',  b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
            ('\udc80', 'strict', None),
            ('\udc80', 'ignore', b''),
            ('\udc80', 'replace', b'?'),
            ('\udc80', 'backslashreplace', b'\\udc80'),
            ('\udc80', 'namereplace', b'\\udc80'),
            ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
        ]
        if VISTA_OR_LATER:
            tests.extend((
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
                ('\udc80', 'backslashreplace', b'\\udc80'),
                ('\udc80', 'namereplace', b'\\udc80'),
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ))
        else:
            tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))
        for text, errors, expected in tests:
            if expected is not None:
                try:
@ -879,17 +869,10 @@ class CP65001Test(ReadTest, unittest.TestCase):
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xed\xb2\x80]', 'strict', None),
            (b'[\xed\xb2\x80]', 'ignore', '[]'),
            (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
        ]
        if VISTA_OR_LATER:
            tests.extend((
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ))
        else:
            tests.extend((
                (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
            ))
        for raw, errors, expected in tests:
            if expected is not None:
                try:
@ -904,7 +887,6 @@ class CP65001Test(ReadTest, unittest.TestCase):
                self.assertRaises(UnicodeDecodeError,
                    raw.decode, 'cp65001', errors)
    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
@ -921,7 +903,6 @@ class CP65001Test(ReadTest, unittest.TestCase):
        self.assertEqual("[\uDC80]".encode("cp65001", "replace"),
                         b'[?]')
    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
@ -1951,6 +1932,8 @@ all_unicode_encodings = [
 # "mbcs" and "oem" are only added to the tested encodings when the codecs
 # module actually exposes their C-level encode functions.
 if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
 if hasattr(codecs, "oem_encode"):
    all_unicode_encodings.append("oem")
 # The following encoding is not tested, because it's not supposed
 # to work:
@ -3119,11 +3102,10 @@ class CodePageTest(unittest.TestCase):
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        ))
-        if VISTA_OR_LATER:
+        self.check_encode(self.CP_UTF8, (
-            self.check_encode(self.CP_UTF8, (
+            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
-                ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
+            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
-                ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
+        ))
            ))
    def test_incremental(self):
        decoded = codecs.code_page_decode(932, b'\x82', 'strict', False)
@ -3144,6 +3126,20 @@ class CodePageTest(unittest.TestCase):
                                          False)
        self.assertEqual(decoded, ('abc', 3))
    def test_mbcs_alias(self):
        """Looking up the 'default' code page falls back to mbcs."""
        # With no more specific codec available for the (faked) preferred
        # encoding name, the lookup is expected to resolve to 'mbcs'.
        import _bootlocale
        saved = _bootlocale.getpreferredencoding
        _bootlocale.getpreferredencoding = lambda *a: 'cp123'
        try:
            info = codecs.lookup('cp123')
            self.assertEqual(info.name, 'mbcs')
        finally:
            # Always restore the real function, even if the lookup fails.
            _bootlocale.getpreferredencoding = saved
 class ASCIITest(unittest.TestCase):
    def test_encode(self):
--- a/Modules/_codecsmodule.c
+++ b/Modules/_codecsmodule.c
@ -625,6 +625,25 @@ _codecs_mbcs_decode_impl(PyObject *module, Py_buffer *data,
    return codec_tuple(decoded, consumed);
 }
 /*[clinic input]
 _codecs.oem_decode
    data: Py_buffer
    errors: str(accept={str, NoneType}) = NULL
    final: int(c_default="0") = False
    /
 [clinic start generated code]*/
 static PyObject *
 _codecs_oem_decode_impl(PyObject *module, Py_buffer *data,
                         const char *errors, int final)
 /*[clinic end generated code: output=da1617612f3fcad8 input=95b8a92c446b03cd]*/
 {
    /* Decode the buffer using the CP_OEMCP code page.  When 'final' is set
       no consumed-count pointer is handed to the decoder, so the reported
       count stays at the full buffer length. */
    Py_ssize_t consumed = data->len;
    Py_ssize_t *consumed_out = final ? NULL : &consumed;
    PyObject *decoded;

    decoded = PyUnicode_DecodeCodePageStateful(CP_OEMCP, data->buf,
                                               data->len, errors,
                                               consumed_out);
    return codec_tuple(decoded, consumed);
 }
 /*[clinic input]
 _codecs.code_page_decode
    codepage: int
@ -970,6 +989,21 @@ _codecs_mbcs_encode_impl(PyObject *module, PyObject *str, const char *errors)
                       PyUnicode_GET_LENGTH(str));
 }
 /*[clinic input]
 _codecs.oem_encode
    str: unicode
    errors: str(accept={str, NoneType}) = NULL
    /
 [clinic start generated code]*/
 static PyObject *
 _codecs_oem_encode_impl(PyObject *module, PyObject *str, const char *errors)
 /*[clinic end generated code: output=65d5982c737de649 input=3fc5f0028aad3cda]*/
 {
    /* Encode 'str' using the CP_OEMCP code page and pair the result with
       the length of the input string. */
    PyObject *encoded = PyUnicode_EncodeCodePage(CP_OEMCP, str, errors);

    return codec_tuple(encoded, PyUnicode_GET_LENGTH(str));
 }
 /*[clinic input]
 _codecs.code_page_encode
    code_page: int
@ -1075,6 +1109,8 @@ static PyMethodDef _codecs_functions[] = {
    _CODECS_READBUFFER_ENCODE_METHODDEF
    _CODECS_MBCS_ENCODE_METHODDEF
    _CODECS_MBCS_DECODE_METHODDEF
    _CODECS_OEM_ENCODE_METHODDEF
    _CODECS_OEM_DECODE_METHODDEF
    _CODECS_CODE_PAGE_ENCODE_METHODDEF
    _CODECS_CODE_PAGE_DECODE_METHODDEF
    _CODECS_REGISTER_ERROR_METHODDEF
--- a/Modules/clinic/_codecsmodule.c.h
+++ b/Modules/clinic/_codecsmodule.c.h
@ -805,6 +805,45 @@ exit:
 #if defined(HAVE_MBCS)
 PyDoc_STRVAR(_codecs_oem_decode__doc__,
 "oem_decode($module, data, errors=None, final=False, /)\n"
 "--\n"
 "\n");
 #define _CODECS_OEM_DECODE_METHODDEF    \
    {"oem_decode", (PyCFunction)_codecs_oem_decode, METH_VARARGS, _codecs_oem_decode__doc__},
 static PyObject *
 _codecs_oem_decode_impl(PyObject *module, Py_buffer *data,
                        const char *errors, int final);
 static PyObject *
 _codecs_oem_decode(PyObject *module, PyObject *args)
 {
    PyObject *return_value = NULL;
    Py_buffer data = {NULL, NULL};
    const char *errors = NULL;
    int final = 0;
    if (!PyArg_ParseTuple(args, "y*|zi:oem_decode",
        &data, &errors, &final)) {
        goto exit;
    }
    return_value = _codecs_oem_decode_impl(module, &data, errors, final);
 exit:
    /* Cleanup for data */
    if (data.obj) {
       PyBuffer_Release(&data);
    }
    return return_value;
 }
 #endif /* defined(HAVE_MBCS) */
 #if defined(HAVE_MBCS)
 PyDoc_STRVAR(_codecs_code_page_decode__doc__,
 "code_page_decode($module, codepage, data, errors=None, final=False, /)\n"
 "--\n"
@ -1346,6 +1385,38 @@ exit:
 #if defined(HAVE_MBCS)
 PyDoc_STRVAR(_codecs_oem_encode__doc__,
 "oem_encode($module, str, errors=None, /)\n"
 "--\n"
 "\n");
 #define _CODECS_OEM_ENCODE_METHODDEF    \
    {"oem_encode", (PyCFunction)_codecs_oem_encode, METH_VARARGS, _codecs_oem_encode__doc__},
 static PyObject *
 _codecs_oem_encode_impl(PyObject *module, PyObject *str, const char *errors);
 static PyObject *
 _codecs_oem_encode(PyObject *module, PyObject *args)
 {
    PyObject *return_value = NULL;
    PyObject *str;
    const char *errors = NULL;
    if (!PyArg_ParseTuple(args, "U|z:oem_encode",
        &str, &errors)) {
        goto exit;
    }
    return_value = _codecs_oem_encode_impl(module, str, errors);
 exit:
    return return_value;
 }
 #endif /* defined(HAVE_MBCS) */
 #if defined(HAVE_MBCS)
 PyDoc_STRVAR(_codecs_code_page_encode__doc__,
 "code_page_encode($module, code_page, str, errors=None, /)\n"
 "--\n"
@ -1446,6 +1517,10 @@ exit:
    #define _CODECS_MBCS_DECODE_METHODDEF
 #endif /* !defined(_CODECS_MBCS_DECODE_METHODDEF) */
 #ifndef _CODECS_OEM_DECODE_METHODDEF
    #define _CODECS_OEM_DECODE_METHODDEF
 #endif /* !defined(_CODECS_OEM_DECODE_METHODDEF) */
 #ifndef _CODECS_CODE_PAGE_DECODE_METHODDEF
    #define _CODECS_CODE_PAGE_DECODE_METHODDEF
 #endif /* !defined(_CODECS_CODE_PAGE_DECODE_METHODDEF) */
@ -1454,7 +1529,11 @@ exit:
    #define _CODECS_MBCS_ENCODE_METHODDEF
 #endif /* !defined(_CODECS_MBCS_ENCODE_METHODDEF) */
 #ifndef _CODECS_OEM_ENCODE_METHODDEF
    #define _CODECS_OEM_ENCODE_METHODDEF
 #endif /* !defined(_CODECS_OEM_ENCODE_METHODDEF) */
 #ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF
    #define _CODECS_CODE_PAGE_ENCODE_METHODDEF
 #endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */
-/*[clinic end generated code: output=0221e4eece62c905 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=7874e2d559d49368 input=a9049054013a1b77]*/