mirror of https://github.com/python/cpython
gh-123803: Support arbitrary code page encodings on Windows (GH-123804)
If the cpXXX encoding is not directly implemented in Python, fall back to use the Windows-specific API codecs.code_page_encode() and codecs.code_page_decode().
This commit is contained in:
parent
8fe1926164
commit
f7ef0203d4
|
@ -1042,6 +1042,10 @@ is meant to be exhaustive. Notice that spelling alternatives that only differ in
|
||||||
case or use a hyphen instead of an underscore are also valid aliases; therefore,
|
case or use a hyphen instead of an underscore are also valid aliases; therefore,
|
||||||
e.g. ``'utf-8'`` is a valid alias for the ``'utf_8'`` codec.
|
e.g. ``'utf-8'`` is a valid alias for the ``'utf_8'`` codec.
|
||||||
|
|
||||||
|
On Windows, ``cpXXX`` codecs are available for all code pages.
|
||||||
|
But only codecs listed in the following table are guaranteed to exist on
|
||||||
|
other platforms.
|
||||||
|
|
||||||
.. impl-detail::
|
.. impl-detail::
|
||||||
|
|
||||||
Some common encodings can bypass the codecs lookup machinery to
|
Some common encodings can bypass the codecs lookup machinery to
|
||||||
|
@ -1307,6 +1311,9 @@ particular, the following variants typically exist:
|
||||||
.. versionchanged:: 3.8
|
.. versionchanged:: 3.8
|
||||||
``cp65001`` is now an alias to ``utf_8``.
|
``cp65001`` is now an alias to ``utf_8``.
|
||||||
|
|
||||||
|
.. versionchanged:: 3.14
|
||||||
|
On Windows, ``cpXXX`` codecs are now available for all code pages.
|
||||||
|
|
||||||
|
|
||||||
Python Specific Encodings
|
Python Specific Encodings
|
||||||
-------------------------
|
-------------------------
|
||||||
|
|
|
@ -194,6 +194,9 @@ Other language changes
|
||||||
They raise an error if the argument is a string.
|
They raise an error if the argument is a string.
|
||||||
(Contributed by Serhiy Storchaka in :gh:`84978`.)
|
(Contributed by Serhiy Storchaka in :gh:`84978`.)
|
||||||
|
|
||||||
|
* All Windows code pages are now supported as "cpXXX" codecs on Windows.
|
||||||
|
(Contributed by Serhiy Storchaka in :gh:`123803`.)
|
||||||
|
|
||||||
* :class:`super` objects are now :mod:`pickleable <pickle>` and
|
* :class:`super` objects are now :mod:`pickleable <pickle>` and
|
||||||
:mod:`copyable <copy>`.
|
:mod:`copyable <copy>`.
|
||||||
(Contributed by Serhiy Storchaka in :gh:`125767`.)
|
(Contributed by Serhiy Storchaka in :gh:`125767`.)
|
||||||
|
|
|
@ -156,19 +156,22 @@ def search_function(encoding):
|
||||||
codecs.register(search_function)
|
codecs.register(search_function)
|
||||||
|
|
||||||
if sys.platform == 'win32':
|
if sys.platform == 'win32':
|
||||||
# bpo-671666, bpo-46668: If Python does not implement a codec for current
|
from ._win_cp_codecs import create_win32_code_page_codec
|
||||||
# Windows ANSI code page, use the "mbcs" codec instead:
|
|
||||||
# WideCharToMultiByte() and MultiByteToWideChar() functions with CP_ACP.
|
|
||||||
# Python does not support custom code pages.
|
|
||||||
def _alias_mbcs(encoding):
|
|
||||||
try:
|
|
||||||
import _winapi
|
|
||||||
ansi_code_page = "cp%s" % _winapi.GetACP()
|
|
||||||
if encoding == ansi_code_page:
|
|
||||||
import encodings.mbcs
|
|
||||||
return encodings.mbcs.getregentry()
|
|
||||||
except ImportError:
|
|
||||||
# Imports may fail while we are shutting down
|
|
||||||
pass
|
|
||||||
|
|
||||||
codecs.register(_alias_mbcs)
|
def win32_code_page_search_function(encoding):
|
||||||
|
encoding = encoding.lower()
|
||||||
|
if not encoding.startswith('cp'):
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
cp = int(encoding[2:])
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
# Test if the code page is supported
|
||||||
|
try:
|
||||||
|
codecs.code_page_encode(cp, 'x')
|
||||||
|
except (OverflowError, OSError):
|
||||||
|
return None
|
||||||
|
|
||||||
|
return create_win32_code_page_codec(cp)
|
||||||
|
|
||||||
|
codecs.register(win32_code_page_search_function)
|
||||||
|
|
|
@ -0,0 +1,36 @@
|
||||||
|
import codecs
|
||||||
|
|
||||||
|
def create_win32_code_page_codec(cp):
|
||||||
|
from codecs import code_page_encode, code_page_decode
|
||||||
|
|
||||||
|
def encode(input, errors='strict'):
|
||||||
|
return code_page_encode(cp, input, errors)
|
||||||
|
|
||||||
|
def decode(input, errors='strict'):
|
||||||
|
return code_page_decode(cp, input, errors, True)
|
||||||
|
|
||||||
|
class IncrementalEncoder(codecs.IncrementalEncoder):
|
||||||
|
def encode(self, input, final=False):
|
||||||
|
return code_page_encode(cp, input, self.errors)[0]
|
||||||
|
|
||||||
|
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
|
||||||
|
def _buffer_decode(self, input, errors, final):
|
||||||
|
return code_page_decode(cp, input, errors, final)
|
||||||
|
|
||||||
|
class StreamWriter(codecs.StreamWriter):
|
||||||
|
def encode(self, input, errors='strict'):
|
||||||
|
return code_page_encode(cp, input, errors)
|
||||||
|
|
||||||
|
class StreamReader(codecs.StreamReader):
|
||||||
|
def decode(self, input, errors, final):
|
||||||
|
return code_page_decode(cp, input, errors, final)
|
||||||
|
|
||||||
|
return codecs.CodecInfo(
|
||||||
|
name=f'cp{cp}',
|
||||||
|
encode=encode,
|
||||||
|
decode=decode,
|
||||||
|
incrementalencoder=IncrementalEncoder,
|
||||||
|
incrementaldecoder=IncrementalDecoder,
|
||||||
|
streamreader=StreamReader,
|
||||||
|
streamwriter=StreamWriter,
|
||||||
|
)
|
|
@ -3256,7 +3256,11 @@ class CodePageTest(unittest.TestCase):
|
||||||
codecs.code_page_decode, self.CP_UTF8, b'\xff', 'strict', True)
|
codecs.code_page_decode, self.CP_UTF8, b'\xff', 'strict', True)
|
||||||
|
|
||||||
def check_decode(self, cp, tests):
|
def check_decode(self, cp, tests):
|
||||||
for raw, errors, expected in tests:
|
for raw, errors, expected, *rest in tests:
|
||||||
|
if rest:
|
||||||
|
altexpected, = rest
|
||||||
|
else:
|
||||||
|
altexpected = expected
|
||||||
if expected is not None:
|
if expected is not None:
|
||||||
try:
|
try:
|
||||||
decoded = codecs.code_page_decode(cp, raw, errors, True)
|
decoded = codecs.code_page_decode(cp, raw, errors, True)
|
||||||
|
@ -3273,8 +3277,21 @@ class CodePageTest(unittest.TestCase):
|
||||||
self.assertRaises(UnicodeDecodeError,
|
self.assertRaises(UnicodeDecodeError,
|
||||||
codecs.code_page_decode, cp, raw, errors, True)
|
codecs.code_page_decode, cp, raw, errors, True)
|
||||||
|
|
||||||
|
if altexpected is not None:
|
||||||
|
decoded = raw.decode(f'cp{cp}', errors)
|
||||||
|
self.assertEqual(decoded, altexpected,
|
||||||
|
'%a.decode("cp%s", %r)=%a != %a'
|
||||||
|
% (raw, cp, errors, decoded, altexpected))
|
||||||
|
else:
|
||||||
|
self.assertRaises(UnicodeDecodeError,
|
||||||
|
raw.decode, f'cp{cp}', errors)
|
||||||
|
|
||||||
def check_encode(self, cp, tests):
|
def check_encode(self, cp, tests):
|
||||||
for text, errors, expected in tests:
|
for text, errors, expected, *rest in tests:
|
||||||
|
if rest:
|
||||||
|
altexpected, = rest
|
||||||
|
else:
|
||||||
|
altexpected = expected
|
||||||
if expected is not None:
|
if expected is not None:
|
||||||
try:
|
try:
|
||||||
encoded = codecs.code_page_encode(cp, text, errors)
|
encoded = codecs.code_page_encode(cp, text, errors)
|
||||||
|
@ -3285,18 +3302,26 @@ class CodePageTest(unittest.TestCase):
|
||||||
'%a.encode("cp%s", %r)=%a != %a'
|
'%a.encode("cp%s", %r)=%a != %a'
|
||||||
% (text, cp, errors, encoded[0], expected))
|
% (text, cp, errors, encoded[0], expected))
|
||||||
self.assertEqual(encoded[1], len(text))
|
self.assertEqual(encoded[1], len(text))
|
||||||
|
|
||||||
|
encoded = text.encode(f'cp{cp}', errors)
|
||||||
|
self.assertEqual(encoded, altexpected,
|
||||||
|
'%a.encode("cp%s", %r)=%a != %a'
|
||||||
|
% (text, cp, errors, encoded, altexpected))
|
||||||
else:
|
else:
|
||||||
self.assertRaises(UnicodeEncodeError,
|
self.assertRaises(UnicodeEncodeError,
|
||||||
codecs.code_page_encode, cp, text, errors)
|
codecs.code_page_encode, cp, text, errors)
|
||||||
|
self.assertRaises(UnicodeEncodeError,
|
||||||
|
text.encode, f'cp{cp}', errors)
|
||||||
|
|
||||||
def test_cp932(self):
|
def test_cp932(self):
|
||||||
self.check_encode(932, (
|
self.check_encode(932, (
|
||||||
('abc', 'strict', b'abc'),
|
('abc', 'strict', b'abc'),
|
||||||
('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
|
('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
|
||||||
|
('\uf8f3', 'strict', b'\xff'),
|
||||||
# test error handlers
|
# test error handlers
|
||||||
('\xff', 'strict', None),
|
('\xff', 'strict', None),
|
||||||
('[\xff]', 'ignore', b'[]'),
|
('[\xff]', 'ignore', b'[]'),
|
||||||
('[\xff]', 'replace', b'[y]'),
|
('[\xff]', 'replace', b'[y]', b'[?]'),
|
||||||
('[\u20ac]', 'replace', b'[?]'),
|
('[\u20ac]', 'replace', b'[?]'),
|
||||||
('[\xff]', 'backslashreplace', b'[\\xff]'),
|
('[\xff]', 'backslashreplace', b'[\\xff]'),
|
||||||
('[\xff]', 'namereplace',
|
('[\xff]', 'namereplace',
|
||||||
|
@ -3310,12 +3335,12 @@ class CodePageTest(unittest.TestCase):
|
||||||
(b'abc', 'strict', 'abc'),
|
(b'abc', 'strict', 'abc'),
|
||||||
(b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
|
(b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
|
||||||
# invalid bytes
|
# invalid bytes
|
||||||
(b'[\xff]', 'strict', None),
|
(b'[\xff]', 'strict', None, '[\uf8f3]'),
|
||||||
(b'[\xff]', 'ignore', '[]'),
|
(b'[\xff]', 'ignore', '[]', '[\uf8f3]'),
|
||||||
(b'[\xff]', 'replace', '[\ufffd]'),
|
(b'[\xff]', 'replace', '[\ufffd]', '[\uf8f3]'),
|
||||||
(b'[\xff]', 'backslashreplace', '[\\xff]'),
|
(b'[\xff]', 'backslashreplace', '[\\xff]', '[\uf8f3]'),
|
||||||
(b'[\xff]', 'surrogateescape', '[\udcff]'),
|
(b'[\xff]', 'surrogateescape', '[\udcff]', '[\uf8f3]'),
|
||||||
(b'[\xff]', 'surrogatepass', None),
|
(b'[\xff]', 'surrogatepass', None, '[\uf8f3]'),
|
||||||
(b'\x81\x00abc', 'strict', None),
|
(b'\x81\x00abc', 'strict', None),
|
||||||
(b'\x81\x00abc', 'ignore', '\x00abc'),
|
(b'\x81\x00abc', 'ignore', '\x00abc'),
|
||||||
(b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
|
(b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
|
||||||
|
@ -3330,7 +3355,7 @@ class CodePageTest(unittest.TestCase):
|
||||||
# test error handlers
|
# test error handlers
|
||||||
('\u0141', 'strict', None),
|
('\u0141', 'strict', None),
|
||||||
('\u0141', 'ignore', b''),
|
('\u0141', 'ignore', b''),
|
||||||
('\u0141', 'replace', b'L'),
|
('\u0141', 'replace', b'L', b'?'),
|
||||||
('\udc98', 'surrogateescape', b'\x98'),
|
('\udc98', 'surrogateescape', b'\x98'),
|
||||||
('\udc98', 'surrogatepass', None),
|
('\udc98', 'surrogatepass', None),
|
||||||
))
|
))
|
||||||
|
@ -3340,6 +3365,59 @@ class CodePageTest(unittest.TestCase):
|
||||||
(b'\xff', 'strict', '\xff'),
|
(b'\xff', 'strict', '\xff'),
|
||||||
))
|
))
|
||||||
|
|
||||||
|
def test_cp708(self):
|
||||||
|
self.check_encode(708, (
|
||||||
|
('abc2%', 'strict', b'abc2%'),
|
||||||
|
('\u060c\u0621\u064a', 'strict', b'\xac\xc1\xea'),
|
||||||
|
('\u2562\xe7\xa0', 'strict', b'\x86\x87\xff'),
|
||||||
|
('\x9a\x9f', 'strict', b'\x9a\x9f'),
|
||||||
|
('\u256b', 'strict', b'\xc0'),
|
||||||
|
# test error handlers
|
||||||
|
('[\u0662]', 'strict', None),
|
||||||
|
('[\u0662]', 'ignore', b'[]'),
|
||||||
|
('[\u0662]', 'replace', b'[?]'),
|
||||||
|
('\udca0', 'surrogateescape', b'\xa0'),
|
||||||
|
('\udca0', 'surrogatepass', None),
|
||||||
|
))
|
||||||
|
self.check_decode(708, (
|
||||||
|
(b'abc2%', 'strict', 'abc2%'),
|
||||||
|
(b'\xac\xc1\xea', 'strict', '\u060c\u0621\u064a'),
|
||||||
|
(b'\x86\x87\xff', 'strict', '\u2562\xe7\xa0'),
|
||||||
|
(b'\x9a\x9f', 'strict', '\x9a\x9f'),
|
||||||
|
(b'\xc0', 'strict', '\u256b'),
|
||||||
|
# test error handlers
|
||||||
|
(b'\xa0', 'strict', None),
|
||||||
|
(b'[\xa0]', 'ignore', '[]'),
|
||||||
|
(b'[\xa0]', 'replace', '[\ufffd]'),
|
||||||
|
(b'[\xa0]', 'backslashreplace', '[\\xa0]'),
|
||||||
|
(b'[\xa0]', 'surrogateescape', '[\udca0]'),
|
||||||
|
(b'[\xa0]', 'surrogatepass', None),
|
||||||
|
))
|
||||||
|
|
||||||
|
def test_cp20106(self):
|
||||||
|
self.check_encode(20106, (
|
||||||
|
('abc', 'strict', b'abc'),
|
||||||
|
('\xa7\xc4\xdf', 'strict', b'@[~'),
|
||||||
|
# test error handlers
|
||||||
|
('@', 'strict', None),
|
||||||
|
('@', 'ignore', b''),
|
||||||
|
('@', 'replace', b'?'),
|
||||||
|
('\udcbf', 'surrogateescape', b'\xbf'),
|
||||||
|
('\udcbf', 'surrogatepass', None),
|
||||||
|
))
|
||||||
|
self.check_decode(20106, (
|
||||||
|
(b'abc', 'strict', 'abc'),
|
||||||
|
(b'@[~', 'strict', '\xa7\xc4\xdf'),
|
||||||
|
(b'\xe1\xfe', 'strict', 'a\xdf'),
|
||||||
|
# test error handlers
|
||||||
|
(b'(\xbf)', 'strict', None),
|
||||||
|
(b'(\xbf)', 'ignore', '()'),
|
||||||
|
(b'(\xbf)', 'replace', '(\ufffd)'),
|
||||||
|
(b'(\xbf)', 'backslashreplace', '(\\xbf)'),
|
||||||
|
(b'(\xbf)', 'surrogateescape', '(\udcbf)'),
|
||||||
|
(b'(\xbf)', 'surrogatepass', None),
|
||||||
|
))
|
||||||
|
|
||||||
def test_cp_utf7(self):
|
def test_cp_utf7(self):
|
||||||
cp = 65000
|
cp = 65000
|
||||||
self.check_encode(cp, (
|
self.check_encode(cp, (
|
||||||
|
@ -3412,17 +3490,15 @@ class CodePageTest(unittest.TestCase):
|
||||||
False)
|
False)
|
||||||
self.assertEqual(decoded, ('abc', 3))
|
self.assertEqual(decoded, ('abc', 3))
|
||||||
|
|
||||||
def test_mbcs_alias(self):
|
def test_mbcs_code_page(self):
|
||||||
# Check that looking up our 'default' codepage will return
|
# Check that the codec for the current Windows (ANSI) code page is
|
||||||
# mbcs when we don't have a more specific one available
|
# always available.
|
||||||
code_page = 99_999
|
try:
|
||||||
name = f'cp{code_page}'
|
from _winapi import GetACP
|
||||||
with mock.patch('_winapi.GetACP', return_value=code_page):
|
except ImportError:
|
||||||
try:
|
self.skipTest('requires _winapi.GetACP')
|
||||||
codec = codecs.lookup(name)
|
cp = GetACP()
|
||||||
self.assertEqual(codec.name, 'mbcs')
|
codecs.lookup(f'cp{cp}')
|
||||||
finally:
|
|
||||||
codecs.unregister(name)
|
|
||||||
|
|
||||||
@support.bigmemtest(size=2**31, memuse=7, dry_run=False)
|
@support.bigmemtest(size=2**31, memuse=7, dry_run=False)
|
||||||
def test_large_input(self, size):
|
def test_large_input(self, size):
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
All Windows code pages are now supported as "cpXXX" codecs on Windows.
|
Loading…
Reference in New Issue