Issue #27959: Adds oem encoding, alias ansi to mbcs, move aliasmbcs to codec lookup

This commit is contained in:
Steve Dower 2016-09-06 19:42:27 -07:00
parent 22d0698d3b
commit f5aba58480
8 changed files with 198 additions and 51 deletions

View File

@ -1663,7 +1663,7 @@ PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS( PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
const char *string, /* MBCS encoded string */ const char *string, /* MBCS encoded string */
Py_ssize_t length, /* size of string */ Py_ssize_t length, /* size of string */
const char *errors /* error handling */ const char *errors /* error handling */
); );

View File

@ -29,6 +29,7 @@ Written by Marc-Andre Lemburg (mal@lemburg.com).
"""#" """#"
import codecs import codecs
import sys
from . import aliases from . import aliases
_cache = {} _cache = {}
@ -151,3 +152,12 @@ def search_function(encoding):
# Register the search_function in the Python codec registry # Register the search_function in the Python codec registry
codecs.register(search_function) codecs.register(search_function)
if sys.platform == 'win32':
    def _alias_mbcs(encoding):
        """Codec search function that maps the preferred encoding to mbcs.

        Returns the CodecInfo of the 'mbcs' codec when *encoding* equals
        _bootlocale.getpreferredencoding(False), and None otherwise so
        the codec registry can treat the name as not found.

        This function is registered after search_function, so it acts
        only as a fallback for names the regular lookup did not resolve.
        """
        try:
            import _bootlocale
            if encoding == _bootlocale.getpreferredencoding(False):
                import encodings.mbcs
                return encodings.mbcs.getregentry()
        except ImportError:
            # Imports may fail (for example while the interpreter is
            # shutting down).  A search function should report "not
            # found" rather than leak ImportError out of codecs.lookup().
            pass
        return None

    codecs.register(_alias_mbcs)

View File

@ -458,6 +458,7 @@ aliases = {
'macturkish' : 'mac_turkish', 'macturkish' : 'mac_turkish',
# mbcs codec # mbcs codec
'ansi' : 'mbcs',
'dbcs' : 'mbcs', 'dbcs' : 'mbcs',
# ptcp154 codec # ptcp154 codec

41
Lib/encodings/oem.py Normal file
View File

@ -0,0 +1,41 @@
""" Python 'oem' Codec for Windows
"""
# Import them explicitly to cause an ImportError
# on non-Windows systems
from codecs import oem_encode, oem_decode
# for IncrementalDecoder, IncrementalEncoder, ...
import codecs
### Codec APIs
# Stateless encoder: the C-level function already has the right signature.
encode = oem_encode


def decode(input, errors='strict'):
    """Decode *input* with the OEM code page in one shot.

    The input is always treated as complete, so oem_decode is called
    with its third (final) argument set to True.
    """
    is_final = True
    return oem_decode(input, errors, is_final)
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder for the 'oem' codec."""

    def encode(self, input, final=False):
        """Return *input* encoded with oem_encode using self.errors.

        *final* is accepted for interface compatibility; it is not used.
        """
        result = oem_encode(input, self.errors)
        # oem_encode returns a tuple; only its first item is handed back.
        return result[0]
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental decoder for the 'oem' codec."""

    # The buffered base class drives decoding through this hook;
    # oem_decode is used for it directly.
    _buffer_decode = oem_decode
class StreamWriter(codecs.StreamWriter):
    """Stream writer for the 'oem' codec."""

    # Encoding is delegated straight to the C-level oem_encode.
    encode = oem_encode
class StreamReader(codecs.StreamReader):
    """Stream reader for the 'oem' codec."""

    # Decoding is delegated straight to the C-level oem_decode.
    decode = oem_decode
### encodings module API
def getregentry():
    """Return the codecs.CodecInfo describing the 'oem' codec.

    The entry bundles this module's stateless encode/decode callables
    with its incremental and stream classes.
    """
    entry = codecs.CodecInfo(
        name='oem',
        encode=encode,
        decode=decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
    return entry

View File

@ -423,21 +423,6 @@ def enablerlcompleter():
sys.__interactivehook__ = register_readline sys.__interactivehook__ = register_readline
def aliasmbcs():
    """Alias an unsupported Windows default "cp***" encoding to "mbcs".

    On Windows, some default encodings are not provided by Python even
    though they are always available as "mbcs" in each locale.  When
    the preferred encoding starts with "cp" and cannot be looked up,
    it is registered as an alias of "mbcs".  Does nothing on other
    platforms.
    """
    if sys.platform != 'win32':
        return

    import _bootlocale
    import codecs

    enc = _bootlocale.getpreferredencoding(False)
    if not enc.startswith('cp'):  # only "cp***" names are aliased
        return

    try:
        codecs.lookup(enc)
    except LookupError:
        import encodings
        # Mark the cached entry as unknown and add the alias to mbcs.
        encodings._cache[enc] = encodings._unknown
        encodings.aliases.aliases[enc] = 'mbcs'
CONFIG_LINE = r'^(?P<key>(\w|[-_])+)\s*=\s*(?P<value>.*)\s*$' CONFIG_LINE = r'^(?P<key>(\w|[-_])+)\s*=\s*(?P<value>.*)\s*$'
def venv(known_paths): def venv(known_paths):
@ -560,7 +545,6 @@ def main():
setcopyright() setcopyright()
sethelper() sethelper()
enablerlcompleter() enablerlcompleter()
aliasmbcs()
execsitecustomize() execsitecustomize()
if ENABLE_USER_SITE: if ENABLE_USER_SITE:
execusercustomize() execusercustomize()

View File

@ -8,11 +8,6 @@ import encodings
from test import support from test import support
# True only on Windows whose reported major version is at least 6;
# sys.getwindowsversion() is never called on other platforms because
# the `and` short-circuits.
VISTA_OR_LATER = (sys.platform == 'win32'
                  and sys.getwindowsversion().major >= 6)
try: try:
import ctypes import ctypes
except ImportError: except ImportError:
@ -841,18 +836,13 @@ class CP65001Test(ReadTest, unittest.TestCase):
('abc', 'strict', b'abc'), ('abc', 'strict', b'abc'),
('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'), ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'), ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
('\udc80', 'strict', None),
('\udc80', 'ignore', b''),
('\udc80', 'replace', b'?'),
('\udc80', 'backslashreplace', b'\\udc80'),
('\udc80', 'namereplace', b'\\udc80'),
('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
] ]
if VISTA_OR_LATER:
tests.extend((
('\udc80', 'strict', None),
('\udc80', 'ignore', b''),
('\udc80', 'replace', b'?'),
('\udc80', 'backslashreplace', b'\\udc80'),
('\udc80', 'namereplace', b'\\udc80'),
('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
))
else:
tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))
for text, errors, expected in tests: for text, errors, expected in tests:
if expected is not None: if expected is not None:
try: try:
@ -879,17 +869,10 @@ class CP65001Test(ReadTest, unittest.TestCase):
(b'[\xff]', 'ignore', '[]'), (b'[\xff]', 'ignore', '[]'),
(b'[\xff]', 'replace', '[\ufffd]'), (b'[\xff]', 'replace', '[\ufffd]'),
(b'[\xff]', 'surrogateescape', '[\udcff]'), (b'[\xff]', 'surrogateescape', '[\udcff]'),
(b'[\xed\xb2\x80]', 'strict', None),
(b'[\xed\xb2\x80]', 'ignore', '[]'),
(b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
] ]
if VISTA_OR_LATER:
tests.extend((
(b'[\xed\xb2\x80]', 'strict', None),
(b'[\xed\xb2\x80]', 'ignore', '[]'),
(b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
))
else:
tests.extend((
(b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
))
for raw, errors, expected in tests: for raw, errors, expected in tests:
if expected is not None: if expected is not None:
try: try:
@ -904,7 +887,6 @@ class CP65001Test(ReadTest, unittest.TestCase):
self.assertRaises(UnicodeDecodeError, self.assertRaises(UnicodeDecodeError,
raw.decode, 'cp65001', errors) raw.decode, 'cp65001', errors)
@unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
def test_lone_surrogates(self): def test_lone_surrogates(self):
self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001") self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001") self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
@ -921,7 +903,6 @@ class CP65001Test(ReadTest, unittest.TestCase):
self.assertEqual("[\uDC80]".encode("cp65001", "replace"), self.assertEqual("[\uDC80]".encode("cp65001", "replace"),
b'[?]') b'[?]')
@unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
def test_surrogatepass_handler(self): def test_surrogatepass_handler(self):
self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"), self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"),
b"abc\xed\xa0\x80def") b"abc\xed\xa0\x80def")
@ -1951,6 +1932,8 @@ all_unicode_encodings = [
if hasattr(codecs, "mbcs_encode"): if hasattr(codecs, "mbcs_encode"):
all_unicode_encodings.append("mbcs") all_unicode_encodings.append("mbcs")
if hasattr(codecs, "oem_encode"):
all_unicode_encodings.append("oem")
# The following encoding is not tested, because it's not supposed # The following encoding is not tested, because it's not supposed
# to work: # to work:
@ -3119,11 +3102,10 @@ class CodePageTest(unittest.TestCase):
(b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'), (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
(b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'), (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
)) ))
if VISTA_OR_LATER: self.check_encode(self.CP_UTF8, (
self.check_encode(self.CP_UTF8, ( ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'), ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'), ))
))
def test_incremental(self): def test_incremental(self):
decoded = codecs.code_page_decode(932, b'\x82', 'strict', False) decoded = codecs.code_page_decode(932, b'\x82', 'strict', False)
@ -3144,6 +3126,20 @@ class CodePageTest(unittest.TestCase):
False) False)
self.assertEqual(decoded, ('abc', 3)) self.assertEqual(decoded, ('abc', 3))
def test_mbcs_alias(self):
    """Looking up the 'default' code page falls back to mbcs.

    _bootlocale.getpreferredencoding is temporarily replaced so that it
    reports 'cp123', a code page with no more specific codec available;
    codecs.lookup('cp123') must then return the mbcs codec.
    """
    import _bootlocale

    saved = _bootlocale.getpreferredencoding
    _bootlocale.getpreferredencoding = lambda *a: 'cp123'
    try:
        info = codecs.lookup('cp123')
        self.assertEqual(info.name, 'mbcs')
    finally:
        # Always restore the real function, even if the assertion fails.
        _bootlocale.getpreferredencoding = saved
class ASCIITest(unittest.TestCase): class ASCIITest(unittest.TestCase):
def test_encode(self): def test_encode(self):

View File

@ -625,6 +625,25 @@ _codecs_mbcs_decode_impl(PyObject *module, Py_buffer *data,
return codec_tuple(decoded, consumed); return codec_tuple(decoded, consumed);
} }
/*[clinic input]
_codecs.oem_decode
data: Py_buffer
errors: str(accept={str, NoneType}) = NULL
final: int(c_default="0") = False
/
[clinic start generated code]*/
static PyObject *
_codecs_oem_decode_impl(PyObject *module, Py_buffer *data,
const char *errors, int final)
/*[clinic end generated code: output=da1617612f3fcad8 input=95b8a92c446b03cd]*/
{
    /* Decode `data` using the OEM code page (CP_OEMCP).
     *
     * `consumed` starts as the full buffer length.  Its address is only
     * handed to the decoder when `final` is false; when `final` is true
     * NULL is passed instead, so `consumed` keeps the full length.
     */
    Py_ssize_t consumed = data->len;
    Py_ssize_t *consumed_ptr = NULL;
    PyObject *decoded;

    if (!final) {
        consumed_ptr = &consumed;
    }
    decoded = PyUnicode_DecodeCodePageStateful(CP_OEMCP,
            data->buf, data->len, errors, consumed_ptr);
    return codec_tuple(decoded, consumed);
}
/*[clinic input] /*[clinic input]
_codecs.code_page_decode _codecs.code_page_decode
codepage: int codepage: int
@ -970,6 +989,21 @@ _codecs_mbcs_encode_impl(PyObject *module, PyObject *str, const char *errors)
PyUnicode_GET_LENGTH(str)); PyUnicode_GET_LENGTH(str));
} }
/*[clinic input]
_codecs.oem_encode
str: unicode
errors: str(accept={str, NoneType}) = NULL
/
[clinic start generated code]*/
static PyObject *
_codecs_oem_encode_impl(PyObject *module, PyObject *str, const char *errors)
/*[clinic end generated code: output=65d5982c737de649 input=3fc5f0028aad3cda]*/
{
    /* Encode `str` using the OEM code page (CP_OEMCP) and pair the
     * result with the length of `str` via codec_tuple.
     * NOTE(review): a NULL `encoded` is passed through unchecked, as in
     * the original; presumably codec_tuple handles it -- confirm. */
    PyObject *encoded = PyUnicode_EncodeCodePage(CP_OEMCP, str, errors);

    return codec_tuple(encoded, PyUnicode_GET_LENGTH(str));
}
/*[clinic input] /*[clinic input]
_codecs.code_page_encode _codecs.code_page_encode
code_page: int code_page: int
@ -1075,6 +1109,8 @@ static PyMethodDef _codecs_functions[] = {
_CODECS_READBUFFER_ENCODE_METHODDEF _CODECS_READBUFFER_ENCODE_METHODDEF
_CODECS_MBCS_ENCODE_METHODDEF _CODECS_MBCS_ENCODE_METHODDEF
_CODECS_MBCS_DECODE_METHODDEF _CODECS_MBCS_DECODE_METHODDEF
_CODECS_OEM_ENCODE_METHODDEF
_CODECS_OEM_DECODE_METHODDEF
_CODECS_CODE_PAGE_ENCODE_METHODDEF _CODECS_CODE_PAGE_ENCODE_METHODDEF
_CODECS_CODE_PAGE_DECODE_METHODDEF _CODECS_CODE_PAGE_DECODE_METHODDEF
_CODECS_REGISTER_ERROR_METHODDEF _CODECS_REGISTER_ERROR_METHODDEF

View File

@ -805,6 +805,45 @@ exit:
#if defined(HAVE_MBCS) #if defined(HAVE_MBCS)
PyDoc_STRVAR(_codecs_oem_decode__doc__,
"oem_decode($module, data, errors=None, final=False, /)\n"
"--\n"
"\n");
#define _CODECS_OEM_DECODE_METHODDEF \
{"oem_decode", (PyCFunction)_codecs_oem_decode, METH_VARARGS, _codecs_oem_decode__doc__},
static PyObject *
_codecs_oem_decode_impl(PyObject *module, Py_buffer *data,
const char *errors, int final);
static PyObject *
_codecs_oem_decode(PyObject *module, PyObject *args)
{
PyObject *return_value = NULL;
Py_buffer data = {NULL, NULL};
const char *errors = NULL;
int final = 0;
if (!PyArg_ParseTuple(args, "y*|zi:oem_decode",
&data, &errors, &final)) {
goto exit;
}
return_value = _codecs_oem_decode_impl(module, &data, errors, final);
exit:
/* Cleanup for data */
if (data.obj) {
PyBuffer_Release(&data);
}
return return_value;
}
#endif /* defined(HAVE_MBCS) */
#if defined(HAVE_MBCS)
PyDoc_STRVAR(_codecs_code_page_decode__doc__, PyDoc_STRVAR(_codecs_code_page_decode__doc__,
"code_page_decode($module, codepage, data, errors=None, final=False, /)\n" "code_page_decode($module, codepage, data, errors=None, final=False, /)\n"
"--\n" "--\n"
@ -1346,6 +1385,38 @@ exit:
#if defined(HAVE_MBCS) #if defined(HAVE_MBCS)
PyDoc_STRVAR(_codecs_oem_encode__doc__,
"oem_encode($module, str, errors=None, /)\n"
"--\n"
"\n");
#define _CODECS_OEM_ENCODE_METHODDEF \
{"oem_encode", (PyCFunction)_codecs_oem_encode, METH_VARARGS, _codecs_oem_encode__doc__},
static PyObject *
_codecs_oem_encode_impl(PyObject *module, PyObject *str, const char *errors);
static PyObject *
_codecs_oem_encode(PyObject *module, PyObject *args)
{
PyObject *return_value = NULL;
PyObject *str;
const char *errors = NULL;
if (!PyArg_ParseTuple(args, "U|z:oem_encode",
&str, &errors)) {
goto exit;
}
return_value = _codecs_oem_encode_impl(module, str, errors);
exit:
return return_value;
}
#endif /* defined(HAVE_MBCS) */
#if defined(HAVE_MBCS)
PyDoc_STRVAR(_codecs_code_page_encode__doc__, PyDoc_STRVAR(_codecs_code_page_encode__doc__,
"code_page_encode($module, code_page, str, errors=None, /)\n" "code_page_encode($module, code_page, str, errors=None, /)\n"
"--\n" "--\n"
@ -1446,6 +1517,10 @@ exit:
#define _CODECS_MBCS_DECODE_METHODDEF #define _CODECS_MBCS_DECODE_METHODDEF
#endif /* !defined(_CODECS_MBCS_DECODE_METHODDEF) */ #endif /* !defined(_CODECS_MBCS_DECODE_METHODDEF) */
#ifndef _CODECS_OEM_DECODE_METHODDEF
#define _CODECS_OEM_DECODE_METHODDEF
#endif /* !defined(_CODECS_OEM_DECODE_METHODDEF) */
#ifndef _CODECS_CODE_PAGE_DECODE_METHODDEF #ifndef _CODECS_CODE_PAGE_DECODE_METHODDEF
#define _CODECS_CODE_PAGE_DECODE_METHODDEF #define _CODECS_CODE_PAGE_DECODE_METHODDEF
#endif /* !defined(_CODECS_CODE_PAGE_DECODE_METHODDEF) */ #endif /* !defined(_CODECS_CODE_PAGE_DECODE_METHODDEF) */
@ -1454,7 +1529,11 @@ exit:
#define _CODECS_MBCS_ENCODE_METHODDEF #define _CODECS_MBCS_ENCODE_METHODDEF
#endif /* !defined(_CODECS_MBCS_ENCODE_METHODDEF) */ #endif /* !defined(_CODECS_MBCS_ENCODE_METHODDEF) */
#ifndef _CODECS_OEM_ENCODE_METHODDEF
#define _CODECS_OEM_ENCODE_METHODDEF
#endif /* !defined(_CODECS_OEM_ENCODE_METHODDEF) */
#ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF #ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF
#define _CODECS_CODE_PAGE_ENCODE_METHODDEF #define _CODECS_CODE_PAGE_ENCODE_METHODDEF
#endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */ #endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */
/*[clinic end generated code: output=0221e4eece62c905 input=a9049054013a1b77]*/ /*[clinic end generated code: output=7874e2d559d49368 input=a9049054013a1b77]*/