gh-123803: Support arbitrary code page encodings on Windows (GH-123804)

If the cpXXX encoding is not directly implemented in Python, fall back
to using the Windows-specific API codecs.code_page_encode() and
codecs.code_page_decode().
This commit is contained in:
Serhiy Storchaka 2024-11-18 19:45:25 +02:00 committed by GitHub
parent 8fe1926164
commit f7ef0203d4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 162 additions and 36 deletions

View File

@ -1042,6 +1042,10 @@ is meant to be exhaustive. Notice that spelling alternatives that only differ in
case or use a hyphen instead of an underscore are also valid aliases; therefore, case or use a hyphen instead of an underscore are also valid aliases; therefore,
e.g. ``'utf-8'`` is a valid alias for the ``'utf_8'`` codec. e.g. ``'utf-8'`` is a valid alias for the ``'utf_8'`` codec.
On Windows, ``cpXXX`` codecs are available for all code pages.
But only codecs listed in the following table are guaranteed to exist on
other platforms.
.. impl-detail:: .. impl-detail::
Some common encodings can bypass the codecs lookup machinery to Some common encodings can bypass the codecs lookup machinery to
@ -1307,6 +1311,9 @@ particular, the following variants typically exist:
.. versionchanged:: 3.8 .. versionchanged:: 3.8
``cp65001`` is now an alias to ``utf_8``. ``cp65001`` is now an alias to ``utf_8``.
.. versionchanged:: 3.14
On Windows, ``cpXXX`` codecs are now available for all code pages.
Python Specific Encodings Python Specific Encodings
------------------------- -------------------------

View File

@ -194,6 +194,9 @@ Other language changes
They raise an error if the argument is a string. They raise an error if the argument is a string.
(Contributed by Serhiy Storchaka in :gh:`84978`.) (Contributed by Serhiy Storchaka in :gh:`84978`.)
* All Windows code pages are now supported as "cpXXX" codecs on Windows.
(Contributed by Serhiy Storchaka in :gh:`123803`.)
* :class:`super` objects are now :mod:`pickleable <pickle>` and * :class:`super` objects are now :mod:`pickleable <pickle>` and
:mod:`copyable <copy>`. :mod:`copyable <copy>`.
(Contributed by Serhiy Storchaka in :gh:`125767`.) (Contributed by Serhiy Storchaka in :gh:`125767`.)

View File

@ -156,19 +156,22 @@ def search_function(encoding):
codecs.register(search_function) codecs.register(search_function)
if sys.platform == 'win32': if sys.platform == 'win32':
# bpo-671666, bpo-46668: If Python does not implement a codec for current from ._win_cp_codecs import create_win32_code_page_codec
# Windows ANSI code page, use the "mbcs" codec instead:
# WideCharToMultiByte() and MultiByteToWideChar() functions with CP_ACP.
# Python does not support custom code pages.
def _alias_mbcs(encoding):
try:
import _winapi
ansi_code_page = "cp%s" % _winapi.GetACP()
if encoding == ansi_code_page:
import encodings.mbcs
return encodings.mbcs.getregentry()
except ImportError:
# Imports may fail while we are shutting down
pass
codecs.register(_alias_mbcs) def win32_code_page_search_function(encoding):
encoding = encoding.lower()
if not encoding.startswith('cp'):
return None
try:
cp = int(encoding[2:])
except ValueError:
return None
# Test if the code page is supported
try:
codecs.code_page_encode(cp, 'x')
except (OverflowError, OSError):
return None
return create_win32_code_page_codec(cp)
codecs.register(win32_code_page_search_function)

View File

@ -0,0 +1,36 @@
import codecs
def create_win32_code_page_codec(cp):
    """Return a ``codecs.CodecInfo`` for the Windows code page number *cp*.

    Every operation is delegated to the Windows-specific functions
    ``codecs.code_page_encode()`` and ``codecs.code_page_decode()``.
    The resulting codec is named ``cp<cp>`` (for example ``cp708``).

    *cp* is passed through unchanged to those functions; this factory
    does not check that the code page is actually supported.
    """
    # Imported here and captured by the closures and classes below.
    from codecs import code_page_encode, code_page_decode

    def encode(input, errors='strict'):
        # Stateless encoder: returns (bytes, number of characters consumed).
        return code_page_encode(cp, input, errors)

    def decode(input, errors='strict'):
        # Stateless decoder: the input is always treated as complete
        # (final=True).
        return code_page_decode(cp, input, errors, True)

    class IncrementalEncoder(codecs.IncrementalEncoder):
        def encode(self, input, final=False):
            # Only the encoded bytes are returned, not the consumed length.
            return code_page_encode(cp, input, self.errors)[0]

    class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
        def _buffer_decode(self, input, errors, final):
            return code_page_decode(cp, input, errors, final)

    class StreamWriter(codecs.StreamWriter):
        def encode(self, input, errors='strict'):
            return code_page_encode(cp, input, errors)

    class StreamReader(codecs.StreamReader):
        # codecs.StreamReader.read() calls self.decode(data, self.errors)
        # without a *final* argument, so both trailing parameters must be
        # optional; otherwise every read raises TypeError.  The consumed
        # length returned by the decoder tells the base class which bytes
        # to keep buffered for the next read.
        def decode(self, input, errors='strict', final=False):
            return code_page_decode(cp, input, errors, final)

    return codecs.CodecInfo(
        name=f'cp{cp}',
        encode=encode,
        decode=decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )

View File

@ -3256,7 +3256,11 @@ class CodePageTest(unittest.TestCase):
codecs.code_page_decode, self.CP_UTF8, b'\xff', 'strict', True) codecs.code_page_decode, self.CP_UTF8, b'\xff', 'strict', True)
def check_decode(self, cp, tests): def check_decode(self, cp, tests):
for raw, errors, expected in tests: for raw, errors, expected, *rest in tests:
if rest:
altexpected, = rest
else:
altexpected = expected
if expected is not None: if expected is not None:
try: try:
decoded = codecs.code_page_decode(cp, raw, errors, True) decoded = codecs.code_page_decode(cp, raw, errors, True)
@ -3273,8 +3277,21 @@ class CodePageTest(unittest.TestCase):
self.assertRaises(UnicodeDecodeError, self.assertRaises(UnicodeDecodeError,
codecs.code_page_decode, cp, raw, errors, True) codecs.code_page_decode, cp, raw, errors, True)
if altexpected is not None:
decoded = raw.decode(f'cp{cp}', errors)
self.assertEqual(decoded, altexpected,
'%a.decode("cp%s", %r)=%a != %a'
% (raw, cp, errors, decoded, altexpected))
else:
self.assertRaises(UnicodeDecodeError,
raw.decode, f'cp{cp}', errors)
def check_encode(self, cp, tests): def check_encode(self, cp, tests):
for text, errors, expected in tests: for text, errors, expected, *rest in tests:
if rest:
altexpected, = rest
else:
altexpected = expected
if expected is not None: if expected is not None:
try: try:
encoded = codecs.code_page_encode(cp, text, errors) encoded = codecs.code_page_encode(cp, text, errors)
@ -3285,18 +3302,26 @@ class CodePageTest(unittest.TestCase):
'%a.encode("cp%s", %r)=%a != %a' '%a.encode("cp%s", %r)=%a != %a'
% (text, cp, errors, encoded[0], expected)) % (text, cp, errors, encoded[0], expected))
self.assertEqual(encoded[1], len(text)) self.assertEqual(encoded[1], len(text))
encoded = text.encode(f'cp{cp}', errors)
self.assertEqual(encoded, altexpected,
'%a.encode("cp%s", %r)=%a != %a'
% (text, cp, errors, encoded, altexpected))
else: else:
self.assertRaises(UnicodeEncodeError, self.assertRaises(UnicodeEncodeError,
codecs.code_page_encode, cp, text, errors) codecs.code_page_encode, cp, text, errors)
self.assertRaises(UnicodeEncodeError,
text.encode, f'cp{cp}', errors)
def test_cp932(self): def test_cp932(self):
self.check_encode(932, ( self.check_encode(932, (
('abc', 'strict', b'abc'), ('abc', 'strict', b'abc'),
('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'), ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
('\uf8f3', 'strict', b'\xff'),
# test error handlers # test error handlers
('\xff', 'strict', None), ('\xff', 'strict', None),
('[\xff]', 'ignore', b'[]'), ('[\xff]', 'ignore', b'[]'),
('[\xff]', 'replace', b'[y]'), ('[\xff]', 'replace', b'[y]', b'[?]'),
('[\u20ac]', 'replace', b'[?]'), ('[\u20ac]', 'replace', b'[?]'),
('[\xff]', 'backslashreplace', b'[\\xff]'), ('[\xff]', 'backslashreplace', b'[\\xff]'),
('[\xff]', 'namereplace', ('[\xff]', 'namereplace',
@ -3310,12 +3335,12 @@ class CodePageTest(unittest.TestCase):
(b'abc', 'strict', 'abc'), (b'abc', 'strict', 'abc'),
(b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'), (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
# invalid bytes # invalid bytes
(b'[\xff]', 'strict', None), (b'[\xff]', 'strict', None, '[\uf8f3]'),
(b'[\xff]', 'ignore', '[]'), (b'[\xff]', 'ignore', '[]', '[\uf8f3]'),
(b'[\xff]', 'replace', '[\ufffd]'), (b'[\xff]', 'replace', '[\ufffd]', '[\uf8f3]'),
(b'[\xff]', 'backslashreplace', '[\\xff]'), (b'[\xff]', 'backslashreplace', '[\\xff]', '[\uf8f3]'),
(b'[\xff]', 'surrogateescape', '[\udcff]'), (b'[\xff]', 'surrogateescape', '[\udcff]', '[\uf8f3]'),
(b'[\xff]', 'surrogatepass', None), (b'[\xff]', 'surrogatepass', None, '[\uf8f3]'),
(b'\x81\x00abc', 'strict', None), (b'\x81\x00abc', 'strict', None),
(b'\x81\x00abc', 'ignore', '\x00abc'), (b'\x81\x00abc', 'ignore', '\x00abc'),
(b'\x81\x00abc', 'replace', '\ufffd\x00abc'), (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
@ -3330,7 +3355,7 @@ class CodePageTest(unittest.TestCase):
# test error handlers # test error handlers
('\u0141', 'strict', None), ('\u0141', 'strict', None),
('\u0141', 'ignore', b''), ('\u0141', 'ignore', b''),
('\u0141', 'replace', b'L'), ('\u0141', 'replace', b'L', b'?'),
('\udc98', 'surrogateescape', b'\x98'), ('\udc98', 'surrogateescape', b'\x98'),
('\udc98', 'surrogatepass', None), ('\udc98', 'surrogatepass', None),
)) ))
@ -3340,6 +3365,59 @@ class CodePageTest(unittest.TestCase):
(b'\xff', 'strict', '\xff'), (b'\xff', 'strict', '\xff'),
)) ))
def test_cp708(self):
self.check_encode(708, (
('abc2%', 'strict', b'abc2%'),
('\u060c\u0621\u064a', 'strict', b'\xac\xc1\xea'),
('\u2562\xe7\xa0', 'strict', b'\x86\x87\xff'),
('\x9a\x9f', 'strict', b'\x9a\x9f'),
('\u256b', 'strict', b'\xc0'),
# test error handlers
('[\u0662]', 'strict', None),
('[\u0662]', 'ignore', b'[]'),
('[\u0662]', 'replace', b'[?]'),
('\udca0', 'surrogateescape', b'\xa0'),
('\udca0', 'surrogatepass', None),
))
self.check_decode(708, (
(b'abc2%', 'strict', 'abc2%'),
(b'\xac\xc1\xea', 'strict', '\u060c\u0621\u064a'),
(b'\x86\x87\xff', 'strict', '\u2562\xe7\xa0'),
(b'\x9a\x9f', 'strict', '\x9a\x9f'),
(b'\xc0', 'strict', '\u256b'),
# test error handlers
(b'\xa0', 'strict', None),
(b'[\xa0]', 'ignore', '[]'),
(b'[\xa0]', 'replace', '[\ufffd]'),
(b'[\xa0]', 'backslashreplace', '[\\xa0]'),
(b'[\xa0]', 'surrogateescape', '[\udca0]'),
(b'[\xa0]', 'surrogatepass', None),
))
def test_cp20106(self):
self.check_encode(20106, (
('abc', 'strict', b'abc'),
('\xa7\xc4\xdf', 'strict', b'@[~'),
# test error handlers
('@', 'strict', None),
('@', 'ignore', b''),
('@', 'replace', b'?'),
('\udcbf', 'surrogateescape', b'\xbf'),
('\udcbf', 'surrogatepass', None),
))
self.check_decode(20106, (
(b'abc', 'strict', 'abc'),
(b'@[~', 'strict', '\xa7\xc4\xdf'),
(b'\xe1\xfe', 'strict', 'a\xdf'),
# test error handlers
(b'(\xbf)', 'strict', None),
(b'(\xbf)', 'ignore', '()'),
(b'(\xbf)', 'replace', '(\ufffd)'),
(b'(\xbf)', 'backslashreplace', '(\\xbf)'),
(b'(\xbf)', 'surrogateescape', '(\udcbf)'),
(b'(\xbf)', 'surrogatepass', None),
))
def test_cp_utf7(self): def test_cp_utf7(self):
cp = 65000 cp = 65000
self.check_encode(cp, ( self.check_encode(cp, (
@ -3412,17 +3490,15 @@ class CodePageTest(unittest.TestCase):
False) False)
self.assertEqual(decoded, ('abc', 3)) self.assertEqual(decoded, ('abc', 3))
def test_mbcs_alias(self): def test_mbcs_code_page(self):
# Check that looking up our 'default' codepage will return # Check that codec for the current Windows (ANSI) code page is
# mbcs when we don't have a more specific one available # always available.
code_page = 99_999 try:
name = f'cp{code_page}' from _winapi import GetACP
with mock.patch('_winapi.GetACP', return_value=code_page): except ImportError:
try: self.skipTest('requires _winapi.GetACP')
codec = codecs.lookup(name) cp = GetACP()
self.assertEqual(codec.name, 'mbcs') codecs.lookup(f'cp{cp}')
finally:
codecs.unregister(name)
@support.bigmemtest(size=2**31, memuse=7, dry_run=False) @support.bigmemtest(size=2**31, memuse=7, dry_run=False)
def test_large_input(self, size): def test_large_input(self, size):

View File

@ -0,0 +1 @@
All Windows code pages are now supported as "cpXXX" codecs on Windows.