Issue #12281: Rewrite the MBCS codec to handle correctly replace and ignore

error handlers on all Windows versions. The MBCS codec is now supporting all error handlers, instead of only replace to encode and ignore to decode.
2011-10-18 21:21:00 +02:00 · 2011-10-18 21:21:00 +02:00 · 3a50e7056e
parent 1e73a2467f
commit 3a50e7056e
8 changed files with 893 additions and 152 deletions
--- a/Doc/library/codecs.rst
+++ b/Doc/library/codecs.rst
@ -1280,12 +1280,13 @@ functions can be used directly if desired.
 .. module:: encodings.mbcs
   :synopsis: Windows ANSI codepage

-Encode operand according to the ANSI codepage (CP_ACP). This codec only
-supports ``'strict'`` and ``'replace'`` error handlers to encode, and
-``'strict'`` and ``'ignore'`` error handlers to decode.
+Encode operand according to the ANSI codepage (CP_ACP).

 Availability: Windows only.

+.. versionchanged:: 3.3
+   Support any error handler.
+
 .. versionchanged:: 3.2
   Before 3.2, the *errors* argument was ignored; ``'replace'`` was always used
   to encode, and ``'ignore'`` to decode.
--- a/Doc/whatsnew/3.3.rst
+++ b/Doc/whatsnew/3.3.rst
@ -197,6 +197,11 @@ The :mod:`array` module supports the :c:type:`long long` type using ``q`` and
 codecs
 ------

+The :mod:`~encodings.mbcs` codec has been rewritten to correctly handle the
+``replace`` and ``ignore`` error handlers on all Windows versions. The
+:mod:`~encodings.mbcs` codec now supports all error handlers, instead of
+only ``replace`` to encode and ``ignore`` to decode.
+
 Multibyte CJK decoders now resynchronize faster. They only ignore the first
 byte of an invalid byte sequence. For example, ``b'\xff\n'.decode('gb2312',
 'replace')`` now returns a ``\n`` after the replacement character.
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@ -1466,6 +1466,14 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
    Py_ssize_t *consumed        /* bytes consumed */
    );

+PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
+    int code_page,              /* code page number */
+    const char *string,         /* encoded string */
+    Py_ssize_t length,          /* size of string */
+    const char *errors,         /* error handling */
+    Py_ssize_t *consumed        /* bytes consumed */
+    );
+
 PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
    PyObject *unicode           /* Unicode object */
    );
@ -1473,11 +1481,17 @@ PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
 #ifndef Py_LIMITED_API
 PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
    const Py_UNICODE *data,     /* Unicode char buffer */
-    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
+    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
    const char *errors          /* error handling */
    );
 #endif

+PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
+    int code_page,              /* code page number */
+    PyObject *unicode,          /* Unicode object */
+    const char *errors          /* error handling */
+    );
+
 #endif /* HAVE_MBCS */

 /* --- Decimal Encoder ---------------------------------------------------- */
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@ -1744,6 +1744,203 @@ class TransformCodecTest(unittest.TestCase):
            self.assertEqual(sout, b"\x80")


+class CodePageTest(unittest.TestCase):
+    CP_UTF8 = 65001
+    vista_or_later = (sys.getwindowsversion().major >= 6)
+
+    def test_invalid_code_page(self):
+        self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a')
+        self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a')
+        self.assertRaises(WindowsError, codecs.code_page_encode, 123, 'a')
+        self.assertRaises(WindowsError, codecs.code_page_decode, 123, b'a')
+
+    def test_code_page_name(self):
+        self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
+            codecs.code_page_encode, 932, '\xff')
+        self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
+            codecs.code_page_decode, 932, b'\x81\x00')
+        self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
+            codecs.code_page_decode, self.CP_UTF8, b'\xff')
+
+    def check_decode(self, cp, tests):
+        for raw, errors, expected in tests:
+            if expected is not None:
+                try:
+                    decoded = codecs.code_page_decode(cp, raw, errors)
+                except UnicodeDecodeError as err:
+                    self.fail('Unable to decode %a from "cp%s" with '
+                              'errors=%r: %s' % (raw, cp, errors, err))
+                self.assertEqual(decoded[0], expected,
+                    '%a.decode("cp%s", %r)=%a != %a'
+                    % (raw, cp, errors, decoded[0], expected))
+                # assert 0 <= decoded[1] <= len(raw)
+                self.assertGreaterEqual(decoded[1], 0)
+                self.assertLessEqual(decoded[1], len(raw))
+            else:
+                self.assertRaises(UnicodeDecodeError,
+                    codecs.code_page_decode, cp, raw, errors)
+
+    def check_encode(self, cp, tests):
+        for text, errors, expected in tests:
+            if expected is not None:
+                try:
+                    encoded = codecs.code_page_encode(cp, text, errors)
+                except UnicodeEncodeError as err:
+                    self.fail('Unable to encode %a to "cp%s" with '
+                              'errors=%r: %s' % (text, cp, errors, err))
+                self.assertEqual(encoded[0], expected,
+                    '%a.encode("cp%s", %r)=%a != %a'
+                    % (text, cp, errors, encoded[0], expected))
+                self.assertEqual(encoded[1], len(text))
+            else:
+                self.assertRaises(UnicodeEncodeError,
+                    codecs.code_page_encode, cp, text, errors)
+
+    def test_cp932(self):
+        self.check_encode(932, (
+            ('abc', 'strict', b'abc'),
+            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
+            # not encodable
+            ('\xff', 'strict', None),
+            ('[\xff]', 'ignore', b'[]'),
+            ('[\xff]', 'replace', b'[y]'),
+            ('[\u20ac]', 'replace', b'[?]'),
+        ))
+        tests = [
+            (b'abc', 'strict', 'abc'),
+            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
+            # invalid bytes
+            (b'\xff', 'strict', None),
+            (b'\xff', 'ignore', ''),
+            (b'\xff', 'replace', '\ufffd'),
+            (b'\x81\x00abc', 'strict', None),
+            (b'\x81\x00abc', 'ignore', '\x00abc'),
+        ]
+        if self.vista_or_later:
+            tests.append((b'\x81\x00abc', 'replace', '\ufffd\x00abc'))
+        else:
+            tests.append((b'\x81\x00abc', 'replace', '\x00\x00abc'))
+        self.check_decode(932, tests)
+
+    def test_cp1252(self):
+        self.check_encode(1252, (
+            ('abc', 'strict', b'abc'),
+            ('\xe9\u20ac', 'strict',  b'\xe9\x80'),
+            ('\xff', 'strict', b'\xff'),
+            ('\u0141', 'strict', None),
+            ('\u0141', 'ignore', b''),
+            ('\u0141', 'replace', b'L'),
+        ))
+        self.check_decode(1252, (
+            (b'abc', 'strict', 'abc'),
+            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
+            (b'\xff', 'strict', '\xff'),
+        ))
+
+    def test_cp_utf7(self):
+        cp = 65000
+        self.check_encode(cp, (
+            ('abc', 'strict', b'abc'),
+            ('\xe9\u20ac', 'strict',  b'+AOkgrA-'),
+            ('\U0010ffff', 'strict',  b'+2//f/w-'),
+            ('\udc80', 'strict', b'+3IA-'),
+            ('\ufffd', 'strict', b'+//0-'),
+        ))
+        self.check_decode(cp, (
+            (b'abc', 'strict', 'abc'),
+            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
+            (b'+2//f/w-', 'strict', '\U0010ffff'),
+            (b'+3IA-', 'strict', '\udc80'),
+            (b'+//0-', 'strict', '\ufffd'),
+            # invalid bytes
+            (b'[+/]', 'strict', '[]'),
+            (b'[\xff]', 'strict', '[\xff]'),
+        ))
+
+    def test_cp_utf8(self):
+        cp = self.CP_UTF8
+
+        tests = [
+            ('abc', 'strict', b'abc'),
+            ('\xe9\u20ac', 'strict',  b'\xc3\xa9\xe2\x82\xac'),
+            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
+        ]
+        if self.vista_or_later:
+            tests.append(('\udc80', 'strict', None))
+            tests.append(('\udc80', 'ignore', b''))
+            tests.append(('\udc80', 'replace', b'?'))
+        else:
+            tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))
+        self.check_encode(cp, tests)
+
+        tests = [
+            (b'abc', 'strict', 'abc'),
+            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
+            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
+            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
+            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
+            # invalid bytes
+            (b'[\xff]', 'strict', None),
+            (b'[\xff]', 'ignore', '[]'),
+            (b'[\xff]', 'replace', '[\ufffd]'),
+        ]
+        if self.vista_or_later:
+            tests.extend((
+                (b'[\xed\xb2\x80]', 'strict', None),
+                (b'[\xed\xb2\x80]', 'ignore', '[]'),
+                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
+            ))
+        else:
+            tests.extend((
+                (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
+            ))
+        self.check_decode(cp, tests)
+
+    def test_error_handlers(self):
+        self.check_encode(932, (
+            ('\xff', 'backslashreplace', b'\\xff'),
+            ('\xff', 'xmlcharrefreplace', b'&#255;'),
+        ))
+        self.check_decode(932, (
+            (b'\xff', 'surrogateescape', '\udcff'),
+        ))
+        if self.vista_or_later:
+            self.check_encode(self.CP_UTF8, (
+                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
+            ))
+
+    def test_multibyte_encoding(self):
+        self.check_decode(932, (
+            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
+            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
+        ))
+        self.check_decode(self.CP_UTF8, (
+            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
+            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
+        ))
+        if self.vista_or_later:
+            self.check_encode(self.CP_UTF8, (
+                ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
+                ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
+            ))
+
+    def test_incremental(self):
+        decoded = codecs.code_page_decode(932,
+                                          b'\xe9\x80\xe9', 'strict',
+                                          False)
+        self.assertEqual(decoded, ('\u9a3e', 2))
+
+        decoded = codecs.code_page_decode(932,
+                                          b'\xe9\x80\xe9\x80', 'strict',
+                                          False)
+        self.assertEqual(decoded, ('\u9a3e\u9a3e', 4))
+
+        decoded = codecs.code_page_decode(932,
+                                          b'abc', 'strict',
+                                          False)
+        self.assertEqual(decoded, ('abc', 3))
+
+
 def test_main():
    support.run_unittest(
        UTF32Test,
@ -1772,6 +1969,7 @@ def test_main():
        SurrogateEscapeTest,
        BomTest,
        TransformCodecTest,
+        CodePageTest,
    )


--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -10,6 +10,10 @@ What's New in Python 3.3 Alpha 1?
 Core and Builtins
 -----------------

+- Issue #12281: Rewrite the MBCS codec to correctly handle the replace and
+  ignore error handlers on all Windows versions. The MBCS codec now supports
+  all error handlers, instead of only replace to encode and ignore to decode.
+
 - Issue #13188: When called without an explicit traceback argument,
  generator.throw() now gets the traceback from the passed exception's
  ``__traceback__`` attribute.  Patch by Petri Lehtinen.
--- a/Modules/_codecsmodule.c
+++ b/Modules/_codecsmodule.c
@ -612,6 +612,31 @@ mbcs_decode(PyObject *self,
    return codec_tuple(decoded, consumed);
 }

+/* _codecs.code_page_decode(code_page, data[, errors[, final]])
+
+   Decode a bytes-like object with the given Windows code page and return
+   the codec_tuple() of the decoded string and the consumed byte count.
+   When 'final' is true no consumed pointer is handed to the decoder and the
+   whole buffer length is reported as consumed. */
+static PyObject *
+code_page_decode(PyObject *self,
+                 PyObject *args)
+{
+    int code_page;
+    Py_buffer view;
+    const char *errors = NULL;
+    int final = 0;
+    Py_ssize_t consumed;
+    Py_ssize_t *pconsumed;
+    PyObject *result;
+
+    if (!PyArg_ParseTuple(args, "iy*|zi:code_page_decode",
+                          &code_page, &view, &errors, &final))
+        return NULL;
+
+    /* Default used as-is when 'final' is set */
+    consumed = view.len;
+    pconsumed = final ? NULL : &consumed;
+
+    result = PyUnicode_DecodeCodePageStateful(code_page,
+                                              view.buf, view.len, errors,
+                                              pconsumed);
+    /* The buffer must be released on both the success and error paths */
+    PyBuffer_Release(&view);
+    if (result == NULL)
+        return NULL;
+    return codec_tuple(result, consumed);
+}
+
 #endif /* HAVE_MBCS */

 /* --- Encoder ------------------------------------------------------------ */
@ -1011,6 +1036,29 @@ mbcs_encode(PyObject *self,
    return v;
 }

+/* _codecs.code_page_encode(code_page, str[, errors])
+
+   Encode a string with the given Windows code page and return the
+   codec_tuple() of the encoded result and the length of the input string. */
+static PyObject *
+code_page_encode(PyObject *self,
+                 PyObject *args)
+{
+    int code_page;
+    PyObject *str;
+    const char *errors = NULL;
+    PyObject *encoded;
+    PyObject *result;
+
+    if (!PyArg_ParseTuple(args, "iO|z:code_page_encode",
+                          &code_page, &str, &errors))
+        return NULL;
+
+    /* From here on 'str' is a reference owned by this function */
+    str = PyUnicode_FromObject(str);
+    if (str == NULL)
+        return NULL;
+
+    encoded = PyUnicode_EncodeCodePage(code_page, str, errors);
+    /* 'encoded' is passed on unchecked, exactly like the other encoders of
+       this module do */
+    result = codec_tuple(encoded, PyUnicode_GET_LENGTH(str));
+    Py_DECREF(str);
+    return result;
+}
+
 #endif /* HAVE_MBCS */

 /* --- Error handler registry --------------------------------------------- */
@ -1101,6 +1149,8 @@ static PyMethodDef _codecs_functions[] = {
 #ifdef HAVE_MBCS
    {"mbcs_encode",             mbcs_encode,                    METH_VARARGS},
    {"mbcs_decode",             mbcs_decode,                    METH_VARARGS},
+    {"code_page_encode",        code_page_encode,               METH_VARARGS},
+    {"code_page_decode",        code_page_decode,               METH_VARARGS},
 #endif
    {"register_error",          register_error,                 METH_VARARGS,
        register_error__doc__},
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -429,6 +429,10 @@ _PyUnicode_CheckConsistency(void *op, int check_content)
 }
 #endif

+#ifdef HAVE_MBCS
+static OSVERSIONINFOEX winver;
+#endif
+
 /* --- Bloom Filters ----------------------------------------------------- */

 /* stuff to implement simple "bloom filters" for Unicode characters.
@ -6896,113 +6900,285 @@ PyUnicode_AsASCIIString(PyObject *unicode)
 #define NEED_RETRY
 #endif

-/* XXX This code is limited to "true" double-byte encodings, as
-   a) it assumes an incomplete character consists of a single byte, and
-   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
-   encodings, see IsDBCSLeadByteEx documentation. */
+#ifndef WC_ERR_INVALID_CHARS
+#  define WC_ERR_INVALID_CHARS 0x0080
+#endif
+
+static char*
+code_page_name(UINT code_page, PyObject **obj)
+{
+    *obj = NULL;
+    if (code_page == CP_ACP)
+        return "mbcs";
+    if (code_page == CP_UTF7)
+        return "CP_UTF7";
+    if (code_page == CP_UTF8)
+        return "CP_UTF8";
+
+    *obj = PyBytes_FromFormat("cp%u", code_page);
+    if (*obj == NULL)
+        return NULL;
+    return PyBytes_AS_STRING(*obj);
+}

 static int
-is_dbcs_lead_byte(const char *s, int offset)
+is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
 {
    const char *curr = s + offset;
+    const char *prev;

-    if (IsDBCSLeadByte(*curr)) {
-        const char *prev = CharPrev(s, curr);
-        return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
+    if (!IsDBCSLeadByteEx(code_page, *curr))
+        return 0;
+
+    prev = CharPrevExA(code_page, s, curr, 0);
+    if (prev == curr)
+        return 1;
+    /* FIXME: This code is limited to "true" double-byte encodings,
+       as it assumes an incomplete character consists of a single
+       byte. */
+    if (curr - prev == 2)
+        return 1;
+    if (!IsDBCSLeadByteEx(code_page, *prev))
+        return 1;
+    return 0;
+}
+
+/* Return the MultiByteToWideChar() flags used to decode from code_page:
+   MB_ERR_INVALID_CHARS for every code page except CP_UTF7. */
+static DWORD
+decode_code_page_flags(UINT code_page)
+{
+    /* The CP_UTF7 decoder only supports flags=0 */
+    return (code_page == CP_UTF7) ? 0 : MB_ERR_INVALID_CHARS;
+}
+
+/*
+ * Decode a byte string from a Windows code page into a unicode object in
+ * strict mode. The decoded characters are stored in a new object if *v is
+ * NULL, or appended to *v otherwise.
+ *
+ * Returns the consumed size (insize) on success, returns -2 on a decode
+ * error (ERROR_NO_UNICODE_TRANSLATION, no exception set), or returns -1 on
+ * any other error (a WindowsError is raised when a Windows call failed).
+ */
+static int
+decode_code_page_strict(UINT code_page,
+                        PyUnicodeObject **v,
+                        const char *in,
+                        int insize)
+{
+    const DWORD flags = decode_code_page_flags(code_page);
+    Py_UNICODE *dest;
+    DWORD nchars;
+
+    assert(insize > 0);
+
+    /* First pass: only compute the number of characters of the result */
+    nchars = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
+    if (nchars == 0)
+        goto error;
+
+    if (*v != NULL) {
+        /* Append to the existing unicode object */
+        Py_ssize_t oldlen = PyUnicode_GET_SIZE(*v);
+        if (PyUnicode_Resize((PyObject**)v, oldlen + nchars) < 0)
+            return -1;
+        dest = PyUnicode_AS_UNICODE(*v) + oldlen;
+    }
+    else {
+        /* Start a fresh unicode object */
+        *v = _PyUnicode_New(nchars);
+        if (*v == NULL)
+            return -1;
+        dest = PyUnicode_AS_UNICODE(*v);
+    }
+
+    /* Second pass: the real conversion */
+    nchars = MultiByteToWideChar(code_page, flags, in, insize, dest, nchars);
+    if (nchars == 0)
+        goto error;
+    return insize;
+
+error:
+    /* GetLastError() is read first, before any other call can change it */
+    if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
+        PyErr_SetFromWindowsErr(0);
+        return -1;
+    }
+    return -2;
+}
+
+/*
+ * Decode a byte string from a code page into a unicode object with an error
+ * handler. The input is decoded one character at a time; each undecodable
+ * byte is passed to the error handler named by 'errors'. With errors=NULL
+ * or "strict", a UnicodeDecodeError is raised immediately.
+ *
+ * The decoded characters are stored in a new object if *v is NULL, or
+ * appended to *v otherwise.
+ *
+ * Returns the consumed size on success, or raises a WindowsError or
+ * UnicodeDecodeError exception and returns -1 on error.
+ */
+static int
+decode_code_page_errors(UINT code_page,
+                        PyUnicodeObject **v,
+                        const char *in,
+                        int size,
+                        const char *errors)
+{
+    const char *startin = in;
+    const char *endin = in + size;
+    const DWORD flags = decode_code_page_flags(code_page);
+    /* Ideally, we should get reason from FormatMessage. This is the Windows
+       2000 English version of the message. */
+    const char *reason = "No mapping for the Unicode character exists "
+                         "in the target code page.";
+    /* each step cannot decode more than 1 character, but a character can be
+       represented as a surrogate pair */
+    wchar_t buffer[2], *startout, *out;
+    int insize, outsize;
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;
+    PyObject *encoding_obj = NULL;
+    char *encoding;
+    DWORD err;
+    int ret = -1;
+
+    assert(size > 0);
+
+    encoding = code_page_name(code_page, &encoding_obj);
+    if (encoding == NULL)
+        return -1;
+
+    if (errors == NULL || strcmp(errors, "strict") == 0) {
+        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
+           UnicodeDecodeError. */
+        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
+        if (exc != NULL) {
+            PyCodec_StrictErrors(exc);
+            Py_CLEAR(exc);
+        }
+        goto error;
+    }
+
+    /* Reserve room for the worst case: Py_ARRAY_LENGTH(buffer) output
+       characters per input byte. The object is shrunk at the end. */
+    if (*v == NULL) {
+        /* Create unicode object */
+        if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
+            PyErr_NoMemory();
+            goto error;
+        }
+        *v = _PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
+        if (*v == NULL)
+            goto error;
+        startout = PyUnicode_AS_UNICODE(*v);
+    }
+    else {
+        /* Extend unicode object */
+        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
+        if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
+            PyErr_NoMemory();
+            goto error;
+        }
+        if (PyUnicode_Resize((PyObject**)v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
+            goto error;
+        startout = PyUnicode_AS_UNICODE(*v) + n;
+    }
+
+    /* Decode the byte string character per character */
+    out = startout;
+    while (in < endin)
+    {
+        /* Decode a character: retry with one more input byte until the
+           conversion succeeds or the sequence is too long */
+        insize = 1;
+        do
+        {
+            outsize = MultiByteToWideChar(code_page, flags,
+                                          in, insize,
+                                          buffer, Py_ARRAY_LENGTH(buffer));
+            if (outsize > 0)
+                break;
+            err = GetLastError();
+            if (err != ERROR_NO_UNICODE_TRANSLATION
+                && err != ERROR_INSUFFICIENT_BUFFER)
+            {
+                PyErr_SetFromWindowsErr(0);
+                goto error;
+            }
+            insize++;
+        }
+        /* 4=maximum length of a UTF-8 sequence */
+        while (insize <= 4 && (in + insize) <= endin);
+
+        if (outsize <= 0) {
+            Py_ssize_t startinpos, endinpos, outpos;
+
+            /* Report a single undecodable byte to the error handler */
+            startinpos = in - startin;
+            endinpos = startinpos + 1;
+            outpos = out - PyUnicode_AS_UNICODE(*v);
+            if (unicode_decode_call_errorhandler(
+                    errors, &errorHandler,
+                    encoding, reason,
+                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
+                    v, &outpos, &out))
+            {
+                goto error;
+            }
+        }
+        else {
+            in += insize;
+            memcpy(out, buffer, outsize * sizeof(wchar_t));
+            out += outsize;
+        }
+    }
+
+    /* write a NUL character at the end */
+    *out = 0;
+
+    /* Shrink the unicode object to the number of characters written */
+    outsize = out - startout;
+    assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
+    if (PyUnicode_Resize((PyObject**)v, outsize) < 0)
+        goto error;
+    /* The loop above only exits once the whole input has been processed:
+       report it as consumed, like decode_code_page_strict() does, instead
+       of returning 0. */
+    ret = size;
+
+error:
+    Py_XDECREF(encoding_obj);
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+    return ret;
+}
+
+/*
+ * Decode a byte string from a Windows code page into unicode object. If
+ * 'final' is set, converts trailing lead-byte too.
+ *
+ * Returns consumed size if succeed, or raise a WindowsError or
+ * UnicodeDecodeError exception and returns -1 on error.
+ */
+static int
+decode_code_page(UINT code_page,
+                 PyUnicodeObject **v,
+                 const char *s,  int size,
+                 int final, const char *errors)
+{
+    int done;
+
+    /* Skip trailing lead-byte unless 'final' is set */
+    if (size == 0) {
+        if (*v == NULL) {
+            Py_INCREF(unicode_empty);
+            *v = (PyUnicodeObject*)unicode_empty;
+            if (*v == NULL)
+                return -1;
        }
        return 0;
    }

-/*
- * Decode MBCS string into unicode object. If 'final' is set, converts
- * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
- */
-static int
-decode_mbcs(PyUnicodeObject **v,
-            const char *s, /* MBCS string */
-            int size, /* sizeof MBCS string */
-            int final,
-            const char *errors)
-{
-    Py_UNICODE *p;
-    Py_ssize_t n;
-    DWORD usize;
-    DWORD flags;
-
-    assert(size >= 0);
-
-    /* check and handle 'errors' arg */
-    if (errors==NULL || strcmp(errors, "strict")==0)
-        flags = MB_ERR_INVALID_CHARS;
-    else if (strcmp(errors, "ignore")==0)
-        flags = 0;
-    else {
-        PyErr_Format(PyExc_ValueError,
-                     "mbcs encoding does not support errors='%s'",
-                     errors);
-        return -1;
-    }
-
-    /* Skip trailing lead-byte unless 'final' is set */
-    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
+    if (!final && is_dbcs_lead_byte(code_page, s, size - 1))
        --size;

-    /* First get the size of the result */
-    if (size > 0) {
-        usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
-        if (usize==0)
-            goto mbcs_decode_error;
-    } else
-        usize = 0;
-
-    if (*v == NULL) {
-        /* Create unicode object */
-        *v = _PyUnicode_New(usize);
-        if (*v == NULL)
-            return -1;
-        n = 0;
-    }
-    else {
-        /* Extend unicode object */
-        n = PyUnicode_GET_SIZE(*v);
-        if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
-            return -1;
+    done = decode_code_page_strict(code_page, v, s, size);
+    if (done == -2)
+        done = decode_code_page_errors(code_page, v, s, size, errors);
+    return done;
 }

-    /* Do the conversion */
-    if (usize > 0) {
-        p = PyUnicode_AS_UNICODE(*v) + n;
-        if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
-            goto mbcs_decode_error;
-        }
-    }
-    return size;
-
-mbcs_decode_error:
-    /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
-       we raise a UnicodeDecodeError - else it is a 'generic'
-       windows error
-     */
-    if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
-        /* Ideally, we should get reason from FormatMessage - this
-           is the Windows 2000 English version of the message
-        */
-        PyObject *exc = NULL;
-        const char *reason = "No mapping for the Unicode character exists "
-                             "in the target multi-byte code page.";
-        make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
-        if (exc != NULL) {
-            PyCodec_StrictErrors(exc);
-            Py_DECREF(exc);
-        }
-    } else {
-        PyErr_SetFromWindowsErrWithFilename(0, NULL);
-    }
-    return -1;
-}
-
-PyObject *
-PyUnicode_DecodeMBCSStateful(const char *s,
+static PyObject *
+decode_code_page_stateful(int code_page,
+                          const char *s,
                          Py_ssize_t size,
                          const char *errors,
                          Py_ssize_t *consumed)
@ -7010,16 +7186,21 @@ PyUnicode_DecodeMBCSStateful(const char *s,
    PyUnicodeObject *v = NULL;
    int done;

+    if (code_page < 0) {
+        PyErr_SetString(PyExc_ValueError, "invalid code page number");
+        return NULL;
+    }
+
    if (consumed)
        *consumed = 0;

 #ifdef NEED_RETRY
  retry:
    if (size > INT_MAX)
-        done = decode_mbcs(&v, s, INT_MAX, 0, errors);
+        done = decode_code_page(code_page, &v, s, INT_MAX, 0, errors);
    else
 #endif
-        done = decode_mbcs(&v, s, (int)size, !consumed, errors);
+        done = decode_code_page(code_page, &v, s, (int)size, !consumed, errors);

    if (done < 0) {
        Py_XDECREF(v);
@ -7036,6 +7217,7 @@ PyUnicode_DecodeMBCSStateful(const char *s,
        goto retry;
    }
 #endif
+
 #ifndef DONT_MAKE_RESULT_READY
    if (_PyUnicode_READY_REPLACE(&v)) {
        Py_DECREF(v);
@ -7046,6 +7228,25 @@ PyUnicode_DecodeMBCSStateful(const char *s,
    return (PyObject *)v;
 }

+/* Public entry point: decode 'size' bytes of 's' using the Windows code page
+   'code_page'. All the work is delegated to decode_code_page_stateful(),
+   which receives the arguments unchanged. */
+PyObject *
+PyUnicode_DecodeCodePageStateful(int code_page,
+                                 const char *s,
+                                 Py_ssize_t size,
+                                 const char *errors,
+                                 Py_ssize_t *consumed)
+{
+    PyObject *decoded;
+
+    decoded = decode_code_page_stateful(code_page, s, size, errors, consumed);
+    return decoded;
+}
+
+/* Decode 'size' bytes of 's' with the "mbcs" codec: same as decoding with
+   decode_code_page_stateful() using the CP_ACP code page. */
+PyObject *
+PyUnicode_DecodeMBCSStateful(const char *s,
+                             Py_ssize_t size,
+                             const char *errors,
+                             Py_ssize_t *consumed)
+{
+    PyObject *decoded;
+
+    decoded = decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
+    return decoded;
+}
+
 PyObject *
 PyUnicode_DecodeMBCS(const char *s,
                     Py_ssize_t size,
@ -7054,105 +7255,342 @@ PyUnicode_DecodeMBCS(const char *s,
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
 }

+static DWORD
+encode_code_page_flags(UINT code_page, const char *errors)
+{
+    if (code_page == CP_UTF8) {
+        if (winver.dwMajorVersion >= 6)
+            /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
+               and later */
+            return WC_ERR_INVALID_CHARS;
+        else
+            /* CP_UTF8 only supports flags=0 on Windows older than Vista */
+            return 0;
+    }
+    else if (code_page == CP_UTF7) {
+        /* CP_UTF7 only supports flags=0 */
+        return 0;
+    }
+    else {
+        if (errors != NULL && strcmp(errors, "replace") == 0)
+            return 0;
+        else
+            return WC_NO_BEST_FIT_CHARS;
+    }
+}
+
 /*
- * Convert unicode into string object (MBCS).
- * Returns 0 if succeed, -1 otherwise.
+ * Encode a Unicode string to a Windows code page into a byte string in strict
+ * mode.
+ *
+ * Returns 0 on success, returns -2 on encode error (no exception is set), or
+ * sets an exception (e.g. WindowsError) and returns -1 on any other error.
 */
 static int
-encode_mbcs(PyObject **repr,
-            const Py_UNICODE *p, /* unicode */
-            int size, /* size of unicode */
+encode_code_page_strict(UINT code_page, PyObject **outbytes,
+                        const Py_UNICODE *p, const int size,
                        const char* errors)
 {
    BOOL usedDefaultChar = FALSE;
-    BOOL *pusedDefaultChar;
-    int mbcssize;
-    Py_ssize_t n;
+    BOOL *pusedDefaultChar = &usedDefaultChar;
+    int outsize;
    PyObject *exc = NULL;
-    DWORD flags;
+    const DWORD flags = encode_code_page_flags(code_page, NULL);
+    char *out;

-    assert(size >= 0);
+    assert(size > 0);

-    /* check and handle 'errors' arg */
-    if (errors==NULL || strcmp(errors, "strict")==0) {
-        flags = WC_NO_BEST_FIT_CHARS;
+    if (code_page != CP_UTF8 && code_page != CP_UTF7)
        pusedDefaultChar = &usedDefaultChar;
-    } else if (strcmp(errors, "replace")==0) {
-        flags = 0;
+    else
        pusedDefaultChar = NULL;
-    } else {
-         PyErr_Format(PyExc_ValueError,
-                      "mbcs encoding does not support errors='%s'",
-                      errors);
-         return -1;
-    }

    /* First get the size of the result */
-    if (size > 0) {
-        mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
+    outsize = WideCharToMultiByte(code_page, flags,
+                                  p, size,
+                                  NULL, 0,
                                  NULL, pusedDefaultChar);
-        if (mbcssize == 0) {
-            PyErr_SetFromWindowsErrWithFilename(0, NULL);
-            return -1;
-        }
+    if (outsize <= 0)
+        goto error;
    /* If we used a default char, then we failed! */
    if (pusedDefaultChar && *pusedDefaultChar)
-            goto mbcs_encode_error;
-    } else {
-        mbcssize = 0;
-    }
+        return -2;

-    if (*repr == NULL) {
+    if (*outbytes == NULL) {
        /* Create string object */
-        *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
-        if (*repr == NULL)
+        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
+        if (*outbytes == NULL)
            return -1;
-        n = 0;
+        out = PyBytes_AS_STRING(*outbytes);
    }
    else {
        /* Extend string object */
-        n = PyBytes_Size(*repr);
-        if (_PyBytes_Resize(repr, n + mbcssize) < 0)
+        const Py_ssize_t n = PyBytes_Size(*outbytes);
+        if (outsize > PY_SSIZE_T_MAX - n) {
+            PyErr_NoMemory();
            return -1;
        }
+        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
+            return -1;
+        out = PyBytes_AS_STRING(*outbytes) + n;
+    }

    /* Do the conversion */
-    if (size > 0) {
-        char *s = PyBytes_AS_STRING(*repr) + n;
-        if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
-                                     NULL, pusedDefaultChar)) {
-            PyErr_SetFromWindowsErrWithFilename(0, NULL);
-            return -1;
-        }
+    outsize = WideCharToMultiByte(code_page, flags,
+                                  p, size,
+                                  out, outsize,
+                                  NULL, pusedDefaultChar);
+    if (outsize <= 0)
+        goto error;
    if (pusedDefaultChar && *pusedDefaultChar)
-            goto mbcs_encode_error;
-    }
+        return -2;
    return 0;

-mbcs_encode_error:
-    raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
-    Py_XDECREF(exc);
+error:
+    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
+        return -2;
+    PyErr_SetFromWindowsErr(0);
    return -1;
 }

-PyObject *
-PyUnicode_EncodeMBCS(const Py_UNICODE *p,
-                     Py_ssize_t size,
+/*
+ * Encode a Unicode string to a Windows code page into a byte string using an
+ * error handler.
+ *
+ * Returns 0 on success, or sets an exception (e.g. UnicodeEncodeError or
+ * WindowsError) and returns -1 on error.
+ */
+static int
+encode_code_page_errors(UINT code_page, PyObject **outbytes,
+                        const Py_UNICODE *in, const int insize,
                        const char* errors)
 {
-    PyObject *repr = NULL;
+    const DWORD flags = encode_code_page_flags(code_page, errors);
+    const Py_UNICODE *startin = in;
+    const Py_UNICODE *endin = in + insize;
+    /* Ideally, we should get reason from FormatMessage. This is the Windows
+       2000 English version of the message. */
+    const char *reason = "invalid character";
+    /* 4=maximum length of a UTF-8 sequence */
+    char buffer[4];
+    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
+    Py_ssize_t outsize;
+    char *out;
+    int charsize;
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;
+    PyObject *encoding_obj = NULL;
+    char *encoding;
+    int err;
+    Py_ssize_t startpos, newpos, newoutsize;
+    PyObject *rep;
+    int ret = -1;
+
+    assert(insize > 0);
+
+    encoding = code_page_name(code_page, &encoding_obj);
+    if (encoding == NULL)
+        return -1;
+
+    if (errors == NULL || strcmp(errors, "strict") == 0) {
+        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
+           then we raise a UnicodeEncodeError. */
+        make_encode_exception(&exc, encoding, in, insize, 0, 0, reason);
+        if (exc != NULL) {
+            PyCodec_StrictErrors(exc);
+            Py_DECREF(exc);
+        }
+        Py_XDECREF(encoding_obj);
+        return -1;
+    }
+
+    if (code_page != CP_UTF8 && code_page != CP_UTF7)
+        pusedDefaultChar = &usedDefaultChar;
+    else
+        pusedDefaultChar = NULL;
+
+    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
+        PyErr_NoMemory();
+        goto error;
+    }
+    outsize = insize * Py_ARRAY_LENGTH(buffer);
+
+    if (*outbytes == NULL) {
+        /* Create string object */
+        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
+        if (*outbytes == NULL)
+            goto error;
+        out = PyBytes_AS_STRING(*outbytes);
+    }
+    else {
+        /* Extend string object */
+        Py_ssize_t n = PyBytes_Size(*outbytes);
+        if (n > PY_SSIZE_T_MAX - outsize) {
+            PyErr_NoMemory();
+            goto error;
+        }
+        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
+            goto error;
+        out = PyBytes_AS_STRING(*outbytes) + n;
+    }
+
+    /* Encode the string character per character */
+    while (in < endin)
+    {
+        if ((in + 2) <= endin
+            && 0xD800 <= in[0] && in[0] <= 0xDBFF
+            && 0xDC00 <= in[1] && in[1] <= 0xDFFF)
+            charsize = 2;
+        else
+            charsize = 1;
+
+        outsize = WideCharToMultiByte(code_page, flags,
+                                      in, charsize,
+                                      buffer, Py_ARRAY_LENGTH(buffer),
+                                      NULL, pusedDefaultChar);
+        if (outsize > 0) {
+            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
+            {
+                in += charsize;
+                memcpy(out, buffer, outsize);
+                out += outsize;
+                continue;
+            }
+        }
+        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
+            PyErr_SetFromWindowsErr(0);
+            goto error;
+        }
+
+        charsize = Py_MAX(charsize - 1, 1);
+        startpos = in - startin;
+        rep = unicode_encode_call_errorhandler(
+                  errors, &errorHandler, encoding, reason,
+                  startin, insize, &exc,
+                  startpos, startpos + charsize, &newpos);
+        if (rep == NULL)
+            goto error;
+        in = startin + newpos;
+
+        if (PyBytes_Check(rep)) {
+            outsize = PyBytes_GET_SIZE(rep);
+            if (outsize != 1) {
+                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
+                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
+                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
+                    Py_DECREF(rep);
+                    goto error;
+                }
+                out = PyBytes_AS_STRING(*outbytes) + offset;
+            }
+            memcpy(out, PyBytes_AS_STRING(rep), outsize);
+            out += outsize;
+        }
+        else {
+            Py_ssize_t i;
+            enum PyUnicode_Kind kind;
+            void *data;
+
+            if (PyUnicode_READY(rep) < 0) {
+                Py_DECREF(rep);
+                goto error;
+            }
+
+            outsize = PyUnicode_GET_LENGTH(rep);
+            if (outsize != 1) {
+                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
+                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
+                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
+                    Py_DECREF(rep);
+                    goto error;
+                }
+                out = PyBytes_AS_STRING(*outbytes) + offset;
+            }
+            kind = PyUnicode_KIND(rep);
+            data = PyUnicode_DATA(rep);
+            for (i=0; i < outsize; i++) {
+                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
+                if (ch > 127) {
+                    raise_encode_exception(&exc,
+                        encoding,
+                        startin, insize,
+                        startpos, startpos + charsize,
+                        "unable to encode error handler result to ASCII");
+                    Py_DECREF(rep);
+                    goto error;
+                }
+                *out = (unsigned char)ch;
+                out++;
+            }
+        }
+        Py_DECREF(rep);
+    }
+    /* write a NUL byte */
+    *out = 0;
+    outsize = out - PyBytes_AS_STRING(*outbytes);
+    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
+    if (_PyBytes_Resize(outbytes, outsize) < 0)
+        goto error;
+    ret = 0;
+
+error:
+    Py_XDECREF(encoding_obj);
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+    return ret;
+}
+
+/*
+ * Encode a Unicode string to a Windows code page into a byte string.
+ *
+ * Returns 0 on success, or sets an exception (e.g. WindowsError) and returns
+ * -1 on error.
+ */
+static int
+encode_code_page_chunk(UINT code_page, PyObject **outbytes,
+                       const Py_UNICODE *p, int size,
+                       const char* errors)
+{
+    int done;
+
+    if (size == 0) {
+        if (*outbytes == NULL) {
+            *outbytes = PyBytes_FromStringAndSize(NULL, 0);
+            if (*outbytes == NULL)
+                return -1;
+        }
+        return 0;
+    }
+
+    done = encode_code_page_strict(code_page, outbytes, p, size, errors);
+    if (done == -2)
+        done = encode_code_page_errors(code_page, outbytes, p, size, errors);
+    return done;
+}
+
+static PyObject *
+encode_code_page(int code_page,
+                 const Py_UNICODE *p, Py_ssize_t size,
+                 const char *errors)
+{
+    PyObject *outbytes = NULL;
    int ret;

+    if (code_page < 0) {
+        PyErr_SetString(PyExc_ValueError, "invalid code page number");
+        return NULL;
+    }
+
 #ifdef NEED_RETRY
  retry:
    if (size > INT_MAX)
-        ret = encode_mbcs(&repr, p, INT_MAX, errors);
+        ret = encode_code_page_chunk(code_page, &outbytes, p, INT_MAX, errors);
    else
 #endif
-        ret = encode_mbcs(&repr, p, (int)size, errors);
+        ret = encode_code_page_chunk(code_page, &outbytes, p, (int)size, errors);

    if (ret < 0) {
-        Py_XDECREF(repr);
+        Py_XDECREF(outbytes);
        return NULL;
    }

@ -7164,7 +7602,28 @@ PyUnicode_EncodeMBCS(const Py_UNICODE *p,
    }
 #endif

-    return repr;
+    return outbytes;
+}
+
+PyObject *
+PyUnicode_EncodeMBCS(const Py_UNICODE *p,
+                     Py_ssize_t size,
+                     const char *errors)
+{
+    return encode_code_page(CP_ACP, p, size, errors);
+}
+
+PyObject *
+PyUnicode_EncodeCodePage(int code_page,
+                         PyObject *unicode,
+                         const char *errors)
+{
+    const Py_UNICODE *p;
+    Py_ssize_t size;
+    p = PyUnicode_AsUnicodeAndSize(unicode, &size);
+    if (p == NULL)
+        return NULL;
+    return encode_code_page(code_page, p, size, errors);
 }

 PyObject *
@ -13434,7 +13893,7 @@ PyTypeObject PyUnicode_Type = {

 /* Initialize the Unicode implementation */

-void _PyUnicode_Init(void)
+int _PyUnicode_Init(void)
 {
    int i;

@ -13467,6 +13926,15 @@ void _PyUnicode_Init(void)
        Py_ARRAY_LENGTH(linebreak));

    PyType_Ready(&EncodingMapType);
+
+#ifdef HAVE_MBCS
+    winver.dwOSVersionInfoSize = sizeof(winver);
+    if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
+        PyErr_SetFromWindowsErr(0);
+        return -1;
+    }
+#endif
+    return 0;
 }

 /* Finalize the Unicode implementation */
--- a/Python/pythonrun.c
+++ b/Python/pythonrun.c
@ -67,7 +67,7 @@ static void initsigs(void);
 static void call_py_exitfuncs(void);
 static void wait_for_thread_shutdown(void);
 static void call_ll_exitfuncs(void);
-extern void _PyUnicode_Init(void);
+extern int _PyUnicode_Init(void);
 extern void _PyUnicode_Fini(void);
 extern int _PyLong_Init(void);
 extern void PyLong_Fini(void);
@ -261,7 +261,8 @@ Py_InitializeEx(int install_sigs)
        Py_FatalError("Py_Initialize: can't make modules_reloading dictionary");

    /* Init Unicode implementation; relies on the codec registry */
-    _PyUnicode_Init();
+    if (_PyUnicode_Init() < 0)
+        Py_FatalError("Py_Initialize: can't initialize unicode");

    bimod = _PyBuiltin_Init();
    if (bimod == NULL)