From f7ef0203d44acb21ab1c5ff0c3e15f9727862760 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 18 Nov 2024 19:45:25 +0200 Subject: [PATCH] gh-123803: Support arbitrary code page encodings on Windows (GH-123804) If the cpXXX encoding is not directly implemented in Python, fall back to use the Windows-specific API codecs.code_page_encode() and codecs.code_page_decode(). --- Doc/library/codecs.rst | 7 ++ Doc/whatsnew/3.14.rst | 3 + Lib/encodings/__init__.py | 33 ++--- Lib/encodings/_win_cp_codecs.py | 36 ++++++ Lib/test/test_codecs.py | 118 ++++++++++++++---- ...-09-07-15-16-24.gh-issue-123803.J9VNQU.rst | 1 + 6 files changed, 162 insertions(+), 36 deletions(-) create mode 100644 Lib/encodings/_win_cp_codecs.py create mode 100644 Misc/NEWS.d/next/Windows/2024-09-07-15-16-24.gh-issue-123803.J9VNQU.rst diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst index 2cfd8a1eaee..a129a26190b 100644 --- a/Doc/library/codecs.rst +++ b/Doc/library/codecs.rst @@ -1042,6 +1042,10 @@ is meant to be exhaustive. Notice that spelling alternatives that only differ in case or use a hyphen instead of an underscore are also valid aliases; therefore, e.g. ``'utf-8'`` is a valid alias for the ``'utf_8'`` codec. +On Windows, ``cpXXX`` codecs are available for all code pages. +But only codecs listed in the following table are guarantead to exist on +other platforms. + .. impl-detail:: Some common encodings can bypass the codecs lookup machinery to @@ -1307,6 +1311,9 @@ particular, the following variants typically exist: .. versionchanged:: 3.8 ``cp65001`` is now an alias to ``utf_8``. +.. versionchanged:: 3.14 + On Windows, ``cpXXX`` codecs are now available for all code pages. + Python Specific Encodings ------------------------- diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index 958efbe73c1..8196250d784 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -194,6 +194,9 @@ Other language changes They raise an error if the argument is a string. (Contributed by Serhiy Storchaka in :gh:`84978`.) +* All Windows code pages are now supported as "cpXXX" codecs on Windows. + (Contributed by Serhiy Storchaka in :gh:`123803`.) + * :class:`super` objects are now :mod:`pickleable ` and :mod:`copyable `. (Contributed by Serhiy Storchaka in :gh:`125767`.) diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py index f9075b8f0d9..298177eb800 100644 --- a/Lib/encodings/__init__.py +++ b/Lib/encodings/__init__.py @@ -156,19 +156,22 @@ def search_function(encoding): codecs.register(search_function) if sys.platform == 'win32': - # bpo-671666, bpo-46668: If Python does not implement a codec for current - # Windows ANSI code page, use the "mbcs" codec instead: - # WideCharToMultiByte() and MultiByteToWideChar() functions with CP_ACP. - # Python does not support custom code pages. - def _alias_mbcs(encoding): - try: - import _winapi - ansi_code_page = "cp%s" % _winapi.GetACP() - if encoding == ansi_code_page: - import encodings.mbcs - return encodings.mbcs.getregentry() - except ImportError: - # Imports may fail while we are shutting down - pass + from ._win_cp_codecs import create_win32_code_page_codec - codecs.register(_alias_mbcs) + def win32_code_page_search_function(encoding): + encoding = encoding.lower() + if not encoding.startswith('cp'): + return None + try: + cp = int(encoding[2:]) + except ValueError: + return None + # Test if the code page is supported + try: + codecs.code_page_encode(cp, 'x') + except (OverflowError, OSError): + return None + + return create_win32_code_page_codec(cp) + + codecs.register(win32_code_page_search_function) diff --git a/Lib/encodings/_win_cp_codecs.py b/Lib/encodings/_win_cp_codecs.py new file mode 100644 index 00000000000..4f8eb886794 --- /dev/null +++ b/Lib/encodings/_win_cp_codecs.py @@ -0,0 +1,36 @@ +import codecs + +def create_win32_code_page_codec(cp): + from codecs import code_page_encode, code_page_decode + + def encode(input, errors='strict'): + return code_page_encode(cp, input, errors) + + def decode(input, errors='strict'): + return code_page_decode(cp, input, errors, True) + + class IncrementalEncoder(codecs.IncrementalEncoder): + def encode(self, input, final=False): + return code_page_encode(cp, input, self.errors)[0] + + class IncrementalDecoder(codecs.BufferedIncrementalDecoder): + def _buffer_decode(self, input, errors, final): + return code_page_decode(cp, input, errors, final) + + class StreamWriter(codecs.StreamWriter): + def encode(self, input, errors='strict'): + return code_page_encode(cp, input, errors) + + class StreamReader(codecs.StreamReader): + def decode(self, input, errors, final): + return code_page_decode(cp, input, errors, final) + + return codecs.CodecInfo( + name=f'cp{cp}', + encode=encode, + decode=decode, + incrementalencoder=IncrementalEncoder, + incrementaldecoder=IncrementalDecoder, + streamreader=StreamReader, + streamwriter=StreamWriter, + ) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 290656f0705..e51f7e0ee12 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -3256,7 +3256,11 @@ class CodePageTest(unittest.TestCase): codecs.code_page_decode, self.CP_UTF8, b'\xff', 'strict', True) def check_decode(self, cp, tests): - for raw, errors, expected in tests: + for raw, errors, expected, *rest in tests: + if rest: + altexpected, = rest + else: + altexpected = expected if expected is not None: try: decoded = codecs.code_page_decode(cp, raw, errors, True) @@ -3273,8 +3277,21 @@ class CodePageTest(unittest.TestCase): self.assertRaises(UnicodeDecodeError, codecs.code_page_decode, cp, raw, errors, True) + if altexpected is not None: + decoded = raw.decode(f'cp{cp}', errors) + self.assertEqual(decoded, altexpected, + '%a.decode("cp%s", %r)=%a != %a' + % (raw, cp, errors, decoded, altexpected)) + else: + self.assertRaises(UnicodeDecodeError, + raw.decode, f'cp{cp}', errors) + def check_encode(self, cp, tests): - for text, errors, expected in tests: + for text, errors, expected, *rest in tests: + if rest: + altexpected, = rest + else: + altexpected = expected if expected is not None: try: encoded = codecs.code_page_encode(cp, text, errors) @@ -3285,18 +3302,26 @@ class CodePageTest(unittest.TestCase): '%a.encode("cp%s", %r)=%a != %a' % (text, cp, errors, encoded[0], expected)) self.assertEqual(encoded[1], len(text)) + + encoded = text.encode(f'cp{cp}', errors) + self.assertEqual(encoded, altexpected, + '%a.encode("cp%s", %r)=%a != %a' + % (text, cp, errors, encoded, altexpected)) else: self.assertRaises(UnicodeEncodeError, codecs.code_page_encode, cp, text, errors) + self.assertRaises(UnicodeEncodeError, + text.encode, f'cp{cp}', errors) def test_cp932(self): self.check_encode(932, ( ('abc', 'strict', b'abc'), ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'), + ('\uf8f3', 'strict', b'\xff'), # test error handlers ('\xff', 'strict', None), ('[\xff]', 'ignore', b'[]'), - ('[\xff]', 'replace', b'[y]'), + ('[\xff]', 'replace', b'[y]', b'[?]'), ('[\u20ac]', 'replace', b'[?]'), ('[\xff]', 'backslashreplace', b'[\\xff]'), ('[\xff]', 'namereplace', @@ -3310,12 +3335,12 @@ class CodePageTest(unittest.TestCase): (b'abc', 'strict', 'abc'), (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'), # invalid bytes - (b'[\xff]', 'strict', None), - (b'[\xff]', 'ignore', '[]'), - (b'[\xff]', 'replace', '[\ufffd]'), - (b'[\xff]', 'backslashreplace', '[\\xff]'), - (b'[\xff]', 'surrogateescape', '[\udcff]'), - (b'[\xff]', 'surrogatepass', None), + (b'[\xff]', 'strict', None, '[\uf8f3]'), + (b'[\xff]', 'ignore', '[]', '[\uf8f3]'), + (b'[\xff]', 'replace', '[\ufffd]', '[\uf8f3]'), + (b'[\xff]', 'backslashreplace', '[\\xff]', '[\uf8f3]'), + (b'[\xff]', 'surrogateescape', '[\udcff]', '[\uf8f3]'), + (b'[\xff]', 'surrogatepass', None, '[\uf8f3]'), (b'\x81\x00abc', 'strict', None), (b'\x81\x00abc', 'ignore', '\x00abc'), (b'\x81\x00abc', 'replace', '\ufffd\x00abc'), @@ -3330,7 +3355,7 @@ class CodePageTest(unittest.TestCase): # test error handlers ('\u0141', 'strict', None), ('\u0141', 'ignore', b''), - ('\u0141', 'replace', b'L'), + ('\u0141', 'replace', b'L', b'?'), ('\udc98', 'surrogateescape', b'\x98'), ('\udc98', 'surrogatepass', None), )) @@ -3340,6 +3365,59 @@ class CodePageTest(unittest.TestCase): (b'\xff', 'strict', '\xff'), )) + def test_cp708(self): + self.check_encode(708, ( + ('abc2%', 'strict', b'abc2%'), + ('\u060c\u0621\u064a', 'strict', b'\xac\xc1\xea'), + ('\u2562\xe7\xa0', 'strict', b'\x86\x87\xff'), + ('\x9a\x9f', 'strict', b'\x9a\x9f'), + ('\u256b', 'strict', b'\xc0'), + # test error handlers + ('[\u0662]', 'strict', None), + ('[\u0662]', 'ignore', b'[]'), + ('[\u0662]', 'replace', b'[?]'), + ('\udca0', 'surrogateescape', b'\xa0'), + ('\udca0', 'surrogatepass', None), + )) + self.check_decode(708, ( + (b'abc2%', 'strict', 'abc2%'), + (b'\xac\xc1\xea', 'strict', '\u060c\u0621\u064a'), + (b'\x86\x87\xff', 'strict', '\u2562\xe7\xa0'), + (b'\x9a\x9f', 'strict', '\x9a\x9f'), + (b'\xc0', 'strict', '\u256b'), + # test error handlers + (b'\xa0', 'strict', None), + (b'[\xa0]', 'ignore', '[]'), + (b'[\xa0]', 'replace', '[\ufffd]'), + (b'[\xa0]', 'backslashreplace', '[\\xa0]'), + (b'[\xa0]', 'surrogateescape', '[\udca0]'), + (b'[\xa0]', 'surrogatepass', None), + )) + + def test_cp20106(self): + self.check_encode(20106, ( + ('abc', 'strict', b'abc'), + ('\xa7\xc4\xdf', 'strict', b'@[~'), + # test error handlers + ('@', 'strict', None), + ('@', 'ignore', b''), + ('@', 'replace', b'?'), + ('\udcbf', 'surrogateescape', b'\xbf'), + ('\udcbf', 'surrogatepass', None), + )) + self.check_decode(20106, ( + (b'abc', 'strict', 'abc'), + (b'@[~', 'strict', '\xa7\xc4\xdf'), + (b'\xe1\xfe', 'strict', 'a\xdf'), + # test error handlers + (b'(\xbf)', 'strict', None), + (b'(\xbf)', 'ignore', '()'), + (b'(\xbf)', 'replace', '(\ufffd)'), + (b'(\xbf)', 'backslashreplace', '(\\xbf)'), + (b'(\xbf)', 'surrogateescape', '(\udcbf)'), + (b'(\xbf)', 'surrogatepass', None), + )) + def test_cp_utf7(self): cp = 65000 self.check_encode(cp, ( @@ -3412,17 +3490,15 @@ class CodePageTest(unittest.TestCase): False) self.assertEqual(decoded, ('abc', 3)) - def test_mbcs_alias(self): - # Check that looking up our 'default' codepage will return - # mbcs when we don't have a more specific one available - code_page = 99_999 - name = f'cp{code_page}' - with mock.patch('_winapi.GetACP', return_value=code_page): - try: - codec = codecs.lookup(name) - self.assertEqual(codec.name, 'mbcs') - finally: - codecs.unregister(name) + def test_mbcs_code_page(self): + # Check that codec for the current Windows (ANSII) code page is + # always available. + try: + from _winapi import GetACP + except ImportError: + self.skipTest('requires _winapi.GetACP') + cp = GetACP() + codecs.lookup(f'cp{cp}') @support.bigmemtest(size=2**31, memuse=7, dry_run=False) def test_large_input(self, size): diff --git a/Misc/NEWS.d/next/Windows/2024-09-07-15-16-24.gh-issue-123803.J9VNQU.rst b/Misc/NEWS.d/next/Windows/2024-09-07-15-16-24.gh-issue-123803.J9VNQU.rst new file mode 100644 index 00000000000..3ad4d12eb0d --- /dev/null +++ b/Misc/NEWS.d/next/Windows/2024-09-07-15-16-24.gh-issue-123803.J9VNQU.rst @@ -0,0 +1 @@ +All Windows code pages are now supported as "cpXXX" codecs on Windows.