bpo-36312: Fix decoders for some code pages. (GH-12369)

(cherry picked from commit c1e2c288f4)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
2019-03-20 21:31:57 -07:00 · 2019-03-20 21:31:57 -07:00 · 74829b7323
parent 65b9849f0f
commit 74829b7323
3 changed files with 27 additions and 5 deletions
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@ -3159,6 +3159,15 @@ class CodePageTest(unittest.TestCase):
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        ))
    def test_code_page_decode_flags(self):
        # Issue #36312: For some code pages (e.g. UTF-7) flags for
        # MultiByteToWideChar() must be set to 0.  Decoding must still
        # succeed for them: plain ASCII input is expected to come back
        # unchanged and to be reported as fully consumed (3 bytes).
        for cp in (50220, 50221, 50222, 50225, 50227, 50229,
                   *range(57002, 57011+1), 65000):  # 57002..57011 inclusive
            self.assertEqual(codecs.code_page_decode(cp, b'abc'), ('abc', 3))
        # Code page 42 is checked separately because its expected output
        # differs: for this input each byte comes back as U+F000 + the byte
        # value (b'a' == 0x61 -> U+F061) instead of the ASCII character.
        self.assertEqual(codecs.code_page_decode(42, b'abc'),
                         ('\uf061\uf062\uf063', 3))
    def test_incremental(self):
        decoded = codecs.code_page_decode(932, b'\x82', 'strict', False)
        self.assertEqual(decoded, ('', 0))
--- a/Misc/NEWS.d/next/Windows/2019-03-16-16-51-17.bpo-36312.Niwm-T.rst
+++ b/Misc/NEWS.d/next/Windows/2019-03-16-16-51-17.bpo-36312.Niwm-T.rst
@ -0,0 +1,2 @@
 Fixed decoders for the following code pages: 50220, 50221, 50222, 50225,
 50227, 50229, 57002 through 57011, 65000 and 42.
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -7123,15 +7123,21 @@ decode_code_page_strict(UINT code_page,
                        const char *in,
                        int insize)
 {
-    const DWORD flags = decode_code_page_flags(code_page);
+    DWORD flags = MB_ERR_INVALID_CHARS;
    wchar_t *out;
    DWORD outsize;
    /* First get the size of the result */
    assert(insize > 0);
-    outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
-    if (outsize <= 0)
-        goto error;
+    while ((outsize = MultiByteToWideChar(code_page, flags,
+                                          in, insize, NULL, 0)) <= 0)
+    {
        if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
            goto error;
        }
        /* For some code pages (e.g. UTF-7) flags must be set to 0. */
        flags = 0;
    }
    if (*v == NULL) {
        /* Create unicode object */
@ -7177,7 +7183,7 @@ decode_code_page_errors(UINT code_page,
 {
    const char *startin = in;
    const char *endin = in + size;
-    const DWORD flags = decode_code_page_flags(code_page);
+    DWORD flags = MB_ERR_INVALID_CHARS;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "No mapping for the Unicode character exists "
@ -7248,6 +7254,11 @@ decode_code_page_errors(UINT code_page,
            if (outsize > 0)
                break;
            err = GetLastError();
            if (err == ERROR_INVALID_FLAGS && flags) {
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
                flags = 0;
                continue;
            }
            if (err != ERROR_NO_UNICODE_TRANSLATION
                && err != ERROR_INSUFFICIENT_BUFFER)
            {
@ -0,0 +1,2 @@
 Fixed decoders for the following code pages: 50220, 50221, 50222, 50225,
 50227, 50229, 57002 through 57011, 65000 and 42.