Issue #12892: The utf-16* and utf-32* codecs now reject (lone) surrogates.

The utf-16* and utf-32* encoders no longer allow surrogate code points (U+D800-U+DFFF) to be encoded. The utf-32* decoders no longer decode byte sequences that correspond to surrogate code points. The surrogatepass error handler now works with the utf-16* and utf-32* codecs. Based on patches by Victor Stinner and Kang-Hao (Kenny) Lu.
2013-11-19 11:32:41 +02:00 · 2013-11-19 11:32:41 +02:00 · 58cf607d13
parent a938bcfe95
commit 58cf607d13
8 changed files with 639 additions and 78 deletions
--- a/Doc/library/codecs.rst
+++ b/Doc/library/codecs.rst
@ -365,18 +365,23 @@ and implemented by all standard Python codecs:
 |                         | in :pep:`383`.                                |
 +-------------------------+-----------------------------------------------+
-In addition, the following error handlers are specific to a single codec:
+In addition, the following error handlers are specific to Unicode encoding
 schemes:
-+-------------------+---------+-------------------------------------------+
+-------------------+------------------------+-------------------------------------------+
-| Value             | Codec   | Meaning                                   |
+| Value             | Codec                  | Meaning                                   |
-+===================+=========+===========================================+
+===================+========================+===========================================+
-|``'surrogatepass'``| utf-8   | Allow encoding and decoding of surrogate  |
+|``'surrogatepass'``| utf-8, utf-16, utf-32, | Allow encoding and decoding of surrogate  |
-|                   |         | codes in UTF-8.                           |
+|                   | utf-16-be, utf-16-le,  | codes in all the Unicode encoding schemes.|
-+-------------------+---------+-------------------------------------------+
+|                   | utf-32-be, utf-32-le   |                                           |
 +-------------------+------------------------+-------------------------------------------+
 .. versionadded:: 3.1
   The ``'surrogateescape'`` and ``'surrogatepass'`` error handlers.
 .. versionchanged:: 3.4
   The ``'surrogatepass'`` error handler now works with utf-16\* and utf-32\* codecs.
 The set of allowed values can be extended via :meth:`register_error`.
@ -1167,6 +1172,12 @@ particular, the following variants typically exist:
 | utf_8_sig       |                                | all languages                  |
 +-----------------+--------------------------------+--------------------------------+
 .. versionchanged:: 3.4
   The utf-16\* and utf-32\* encoders no longer allow surrogate code points
   (U+D800--U+DFFF) to be encoded.  The utf-32\* decoders no longer decode
   byte sequences that correspond to surrogate code points.
 Python Specific Encodings
 -------------------------
--- a/Doc/whatsnew/3.4.rst
+++ b/Doc/whatsnew/3.4.rst
@ -253,6 +253,13 @@ Some smaller changes made to the core Python language are:
  ``__main__.__file__`` when a script has been executed directly using
  a relative path (Contributed by Brett Cannon in :issue:`18416`).
 * Now all the UTF-\* codecs (except UTF-7) reject surrogates during both
  encoding and decoding unless the ``surrogatepass`` error handler is used,
  with the exception of the UTF-16 decoder that accepts valid surrogate pairs,
  and the UTF-16 encoder that produces them while encoding non-BMP characters.
  Contributed by Victor Stinner, Kang-Hao (Kenny) Lu and Serhiy Storchaka in
  :issue:`12892`.
 New Modules
 ===========
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@ -300,8 +300,46 @@ class ReadTest(MixInCheckStateHandling):
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")
    ill_formed_sequence_replace = "\ufffd"
    def test_lone_surrogates(self):
        # Shared lone-surrogate check.  It reads three class attributes:
        # `encoding`, `ill_formed_sequence` (the bytes of an encoded U+DC80)
        # and `ill_formed_sequence_replace` (what the "replace" handler is
        # expected to decode those bytes to).

        # Strict encoding of a lone surrogate must fail.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        # For the remaining encode handlers, the expected bytes are obtained
        # by encoding the expected replacement text with the same codec.
        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                         "[&#56448;]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                         "[?]".encode(self.encoding))

        # Whatever the codec emits for an empty string (presumably a BOM, or
        # nothing) is stripped from each encoded piece below and prepended
        # exactly once to the assembled byte sequence.
        bom = "".encode(self.encoding)
        # Put the lone surrogate between BMP and non-BMP neighbours, in both
        # orders, so it is checked next to characters of different widths.
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            # Strict decoding must reject the encoded lone surrogate.
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            # "surrogatepass" must round-trip in both directions.
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            # "ignore" drops the ill-formed bytes; "replace" substitutes the
            # codec-specific replacement text.
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
 class UTF32Test(ReadTest, unittest.TestCase):
    encoding = "utf-32"
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc\x00\x00"
    else:
        ill_formed_sequence = b"\x00\x00\xdc\x80"
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
@ -393,6 +431,7 @@ class UTF32Test(ReadTest, unittest.TestCase):
 class UTF32LETest(ReadTest, unittest.TestCase):
    encoding = "utf-32-le"
    ill_formed_sequence = b"\x80\xdc\x00\x00"
    def test_partial(self):
        self.check_partial(
@ -437,6 +476,7 @@ class UTF32LETest(ReadTest, unittest.TestCase):
 class UTF32BETest(ReadTest, unittest.TestCase):
    encoding = "utf-32-be"
    ill_formed_sequence = b"\x00\x00\xdc\x80"
    def test_partial(self):
        self.check_partial(
@ -482,6 +522,10 @@ class UTF32BETest(ReadTest, unittest.TestCase):
 class UTF16Test(ReadTest, unittest.TestCase):
    encoding = "utf-16"
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc"
    else:
        ill_formed_sequence = b"\xdc\x80"
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'
@ -562,6 +606,7 @@ class UTF16Test(ReadTest, unittest.TestCase):
 class UTF16LETest(ReadTest, unittest.TestCase):
    encoding = "utf-16-le"
    ill_formed_sequence = b"\x80\xdc"
    def test_partial(self):
        self.check_partial(
@ -605,6 +650,7 @@ class UTF16LETest(ReadTest, unittest.TestCase):
 class UTF16BETest(ReadTest, unittest.TestCase):
    encoding = "utf-16-be"
    ill_formed_sequence = b"\xdc\x80"
    def test_partial(self):
        self.check_partial(
@ -648,6 +694,8 @@ class UTF16BETest(ReadTest, unittest.TestCase):
 class UTF8Test(ReadTest, unittest.TestCase):
    encoding = "utf-8"
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3
    def test_partial(self):
        self.check_partial(
@ -677,18 +725,11 @@ class UTF8Test(ReadTest, unittest.TestCase):
                                         u, u.encode(self.encoding))
    def test_lone_surrogates(self):
-        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
+        super().test_lone_surrogates()
-        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")
+        # not sure if this is making sense for
-        self.assertEqual("[\uDC80]".encode("utf-8", "backslashreplace"),
+        # UTF-16 and UTF-32
-                         b'[\\udc80]')
+        self.assertEqual("[\uDC80]".encode('utf-8', "surrogateescape"),
        self.assertEqual("[\uDC80]".encode("utf-8", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("utf-8", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("utf-8", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("utf-8", "replace"),
                         b'[?]')
    def test_surrogatepass_handler(self):
        self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"),
@ -851,6 +892,9 @@ class UTF7Test(ReadTest, unittest.TestCase):
        self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')
        self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0')
    test_lone_surrogates = None
 class UTF16ExTest(unittest.TestCase):
    def test_errors(self):
@ -875,7 +919,7 @@ class ReadBufferTest(unittest.TestCase):
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
-class UTF8SigTest(ReadTest, unittest.TestCase):
+class UTF8SigTest(UTF8Test, unittest.TestCase):
    encoding = "utf-8-sig"
    def test_partial(self):
--- a/Misc/ACKS
+++ b/Misc/ACKS
@ -783,6 +783,7 @@ Ned Jackson Lovely
 Jason Lowe
 Tony Lownds
 Ray Loyzaga
 Kang-Hao (Kenny) Lu
 Lukas Lueg
 Loren Luke
 Fredrik Lundh
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -10,6 +10,12 @@ Projected release date: 2013-11-24
 Core and Builtins
 -----------------
 - Issue #12892: The utf-16* and utf-32* encoders no longer allow surrogate code
  points (U+D800-U+DFFF) to be encoded.  The utf-32* decoders no longer decode
  byte sequences that correspond to surrogate code points.  The surrogatepass
  error handler now works with the utf-16* and utf-32* codecs.  Based on
  patches by Victor Stinner and Kang-Hao (Kenny) Lu.
 - Issue #17806: Added keyword-argument support for "tabsize" to
  str/bytes.expandtabs().
--- a/Objects/stringlib/codecs.h
+++ b/Objects/stringlib/codecs.h
@ -596,26 +596,134 @@ IllegalSurrogate:
 #undef SWAB
-Py_LOCAL_INLINE(void)
+#if STRINGLIB_MAX_CHAR >= 0x80
-STRINGLIB(utf16_encode)(unsigned short *out,
+Py_LOCAL_INLINE(Py_ssize_t)
-                        const STRINGLIB_CHAR *in,
+STRINGLIB(utf16_encode_)(const STRINGLIB_CHAR *in,
                        Py_ssize_t len,
                        unsigned short **outptr,
                        int native_ordering)
 {
    unsigned short *out = *outptr;
    const STRINGLIB_CHAR *end = in + len;
 #if STRINGLIB_SIZEOF_CHAR == 1
 # define SWAB2(CH)  ((CH) << 8)
 #else
 # define SWAB2(CH)  (((CH) << 8) | ((CH) >> 8))
 #endif
 #if STRINGLIB_MAX_CHAR < 0x10000
    if (native_ordering) {
-# if STRINGLIB_SIZEOF_CHAR == 2
+#if STRINGLIB_MAX_CHAR < 0x10000
-        Py_MEMCPY(out, in, 2 * len);
+        const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
-# else
+        while (in < unrolled_end) {
-        _PyUnicode_CONVERT_BYTES(STRINGLIB_CHAR, unsigned short, in, end, out);
+# if STRINGLIB_MAX_CHAR >= 0xd800
            if (((in[0] ^ 0xd800) &
                 (in[1] ^ 0xd800) &
                 (in[2] ^ 0xd800) &
                 (in[3] ^ 0xd800) & 0xf800) == 0)
                break;
 # endif
            out[0] = in[0];
            out[1] = in[1];
            out[2] = in[2];
            out[3] = in[3];
            in += 4; out += 4;
        }
 #endif
        while (in < end) {
            Py_UCS4 ch;
            ch = *in++;
 #if STRINGLIB_MAX_CHAR >= 0xd800
            if (ch < 0xd800)
                *out++ = ch;
            else if (ch < 0xe000)
                /* reject surrogate characters (U+D800-U+DFFF) */
                goto fail;
 # if STRINGLIB_MAX_CHAR >= 0x10000
            else if (ch >= 0x10000) {
                out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
                out[1] = Py_UNICODE_LOW_SURROGATE(ch);
                out += 2;
            }
 # endif
            else
 #endif
                *out++ = ch;
        }
    } else {
 #if STRINGLIB_MAX_CHAR < 0x10000
        const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
        while (in < unrolled_end) {
 # if STRINGLIB_MAX_CHAR >= 0xd800
            if (((in[0] ^ 0xd800) &
                 (in[1] ^ 0xd800) &
                 (in[2] ^ 0xd800) &
                 (in[3] ^ 0xd800) & 0xf800) == 0)
                break;
 # endif
            out[0] = SWAB2(in[0]);
            out[1] = SWAB2(in[1]);
            out[2] = SWAB2(in[2]);
            out[3] = SWAB2(in[3]);
            in += 4; out += 4;
        }
 #endif
        while (in < end) {
            Py_UCS4 ch = *in++;
 #if STRINGLIB_MAX_CHAR >= 0xd800
            if (ch < 0xd800)
                *out++ = SWAB2((Py_UCS2)ch);
            else if (ch < 0xe000)
                /* reject surrogate characters (U+D800-U+DFFF) */
                goto fail;
 # if STRINGLIB_MAX_CHAR >= 0x10000
            else if (ch >= 0x10000) {
                Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
                Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
                out[0] = SWAB2(ch1);
                out[1] = SWAB2(ch2);
                out += 2;
            }
 # endif
            else
 #endif
                *out++ = SWAB2((Py_UCS2)ch);
        }
    }
    *outptr = out;
    return len;
 #if STRINGLIB_MAX_CHAR >= 0xd800
  fail:
 #endif
    *outptr = out;
    return len - (end - in + 1);
 }
 #endif
 #undef SWAB2
 #if STRINGLIB_MAX_CHAR >= 0x80
 Py_LOCAL_INLINE(Py_ssize_t)
 STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in,
                        Py_ssize_t len,
                        unsigned short **outptr,
                        int native_ordering)
 {
    unsigned short *out = *outptr;
    const STRINGLIB_CHAR *end = in + len;
 #if STRINGLIB_SIZEOF_CHAR == 1
    if (native_ordering) {
        const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
        while (in < unrolled_end) {
            out[0] = in[0];
            out[1] = in[1];
            out[2] = in[2];
            out[3] = in[3];
            in += 4; out += 4;
        }
        while (in < end) {
            *out++ = *in++;
        }
    } else {
 # define SWAB2(CH)  ((CH) << 8) /* high byte is zero */
        const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
        while (in < unrolled_end) {
            out[0] = SWAB2(in[0]);
@ -625,37 +733,95 @@ STRINGLIB(utf16_encode)(unsigned short *out,
            in += 4; out += 4;
        }
        while (in < end) {
-            *out++ = SWAB2(*in);
+            Py_UCS4 ch = *in++;
-            ++in;
+            *out++ = SWAB2((Py_UCS2)ch);
        }
 #undef SWAB2
    }
    *outptr = out;
    return len;
 #else
    if (native_ordering) {
 #if STRINGLIB_MAX_CHAR < 0x10000
        const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
        while (in < unrolled_end) {
            /* check if any character is a surrogate character */
            if (((in[0] ^ 0xd800) &
                 (in[1] ^ 0xd800) &
                 (in[2] ^ 0xd800) &
                 (in[3] ^ 0xd800) & 0xf800) == 0)
                break;
            out[0] = in[0];
            out[1] = in[1];
            out[2] = in[2];
            out[3] = in[3];
            in += 4; out += 4;
        }
 #endif
        while (in < end) {
-            Py_UCS4 ch = *in++;
+            Py_UCS4 ch;
-            if (ch < 0x10000)
+            ch = *in++;
            if (ch < 0xd800)
                *out++ = ch;
-            else {
+            else if (ch < 0xe000)
                /* reject surrogate characters (U+D800-U+DFFF) */
                goto fail;
 #if STRINGLIB_MAX_CHAR >= 0x10000
            else if (ch >= 0x10000) {
                out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
                out[1] = Py_UNICODE_LOW_SURROGATE(ch);
                out += 2;
            }
 #endif
            else
                *out++ = ch;
        }
    } else {
 #define SWAB2(CH)  (((CH) << 8) | ((CH) >> 8))
 #if STRINGLIB_MAX_CHAR < 0x10000
        const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
        while (in < unrolled_end) {
            /* check if any character is a surrogate character */
            if (((in[0] ^ 0xd800) &
                 (in[1] ^ 0xd800) &
                 (in[2] ^ 0xd800) &
                 (in[3] ^ 0xd800) & 0xf800) == 0)
                break;
            out[0] = SWAB2(in[0]);
            out[1] = SWAB2(in[1]);
            out[2] = SWAB2(in[2]);
            out[3] = SWAB2(in[3]);
            in += 4; out += 4;
        }
 #endif
        while (in < end) {
            Py_UCS4 ch = *in++;
-            if (ch < 0x10000)
+            if (ch < 0xd800)
                *out++ = SWAB2((Py_UCS2)ch);
-            else {
+            else if (ch < 0xe000)
                /* reject surrogate characters (U+D800-U+DFFF) */
                goto fail;
 #if STRINGLIB_MAX_CHAR >= 0x10000
            else if (ch >= 0x10000) {
                Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
                Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
                out[0] = SWAB2(ch1);
                out[1] = SWAB2(ch2);
                out += 2;
            }
        }
    }
 #endif
            else
                *out++ = SWAB2((Py_UCS2)ch);
        }
 #undef SWAB2
    }
    *outptr = out;
    return len;
  fail:
    *outptr = out;
    return len - (end - in + 1);
 #endif
 }
 #endif
 #endif /* STRINGLIB_IS_UNICODE */
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -4963,6 +4963,7 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;
    int le, bo = 0;       /* assume native ordering by default */
    const char *encoding;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
@ -5002,6 +5003,7 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
 #else
    le = bo <= 0;
 #endif
    encoding = le ? "utf-32-le" : "utf-32-be";
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = (e - q + 3) / 4;
@ -5022,6 +5024,9 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
                    ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
@ -5031,6 +5036,9 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
                    ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
@ -5038,7 +5046,12 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
            writer.pos = pos;
        }
-        if (ch <= maxch) {
+        if (Py_UNICODE_IS_SURROGATE(ch)) {
            errmsg = "codepoint in surrogate code point range(0xd800, 0xe000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }
        else if (ch <= maxch) {
            if (q == e || consumed)
                break;
            /* remaining bytes at the end? (size should be divisible by 4) */
@ -5062,7 +5075,7 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
           chooses to skip the input */
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
-                "utf32", errmsg,
+                encoding, errmsg,
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
                &writer))
            goto onError;
@ -5099,6 +5112,10 @@ _PyUnicode_EncodeUTF32(PyObject *str,
 #else
    int iorder[] = {3, 2, 1, 0};
 #endif
    const char *encoding;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;
 #define STORECHAR(CH)                           \
    do {                                        \
@ -5130,7 +5147,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
    if (byteorder == 0)
        STORECHAR(0xFEFF);
    if (len == 0)
-        goto done;
+        return v;
    if (byteorder == -1) {
        /* force LE */
@ -5138,6 +5155,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
        iorder[1] = 1;
        iorder[2] = 2;
        iorder[3] = 3;
        encoding = "utf-32-le";
    }
    else if (byteorder == 1) {
        /* force BE */
@ -5145,13 +5163,103 @@ _PyUnicode_EncodeUTF32(PyObject *str,
        iorder[1] = 2;
        iorder[2] = 1;
        iorder[3] = 0;
        encoding = "utf-32-be";
    }
    else
        encoding = "utf-32";
    if (kind == PyUnicode_1BYTE_KIND) {
        for (i = 0; i < len; i++)
            STORECHAR(PyUnicode_READ(kind, data, i));
        return v;
    }
-    for (i = 0; i < len; i++)
+    for (i = 0; i < len;) {
-        STORECHAR(PyUnicode_READ(kind, data, i));
+        Py_ssize_t repsize, moreunits;
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        i++;
        assert(ch <= MAX_UNICODE);
        if (!Py_UNICODE_IS_SURROGATE(ch)) {
            STORECHAR(ch);
            continue;
        }
-  done:
+        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, i-1, i, &i);
        if (!rep)
            goto error;
        if (PyBytes_Check(rep)) {
            repsize = PyBytes_GET_SIZE(rep);
            if (repsize & 3) {
                raise_encode_exception(&exc, encoding,
                                       str, i - 1, i,
                                       "surrogates not allowed");
                goto error;
            }
            moreunits = repsize / 4;
        }
        else {
            assert(PyUnicode_Check(rep));
            if (PyUnicode_READY(rep) < 0)
                goto error;
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
                                       str, i - 1, i,
                                       "surrogates not allowed");
                goto error;
            }
        }
        /* four bytes are reserved for each surrogate */
        if (moreunits > 1) {
            Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v);
            Py_ssize_t morebytes = 4 * (moreunits - 1);
            if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
                /* integer overflow */
                PyErr_NoMemory();
                goto error;
            }
            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
                goto error;
            p = (unsigned char*) PyBytes_AS_STRING(v) + outpos;
        }
        if (PyBytes_Check(rep)) {
            Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize);
            p += repsize;
        } else /* rep is unicode */ {
            const Py_UCS1 *repdata;
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            repdata = PyUnicode_1BYTE_DATA(rep);
            while (repsize--) {
                Py_UCS4 ch = *repdata++;
                STORECHAR(ch);
            }
        }
        Py_CLEAR(rep);
    }
    /* Cut back to the size actually needed.  This is necessary, for example,
       when a string containing isolated surrogates is encoded with the
       'ignore' handler. */
    nsize = p - (unsigned char*) PyBytes_AS_STRING(v);
    if (nsize != PyBytes_GET_SIZE(v))
      _PyBytes_Resize(&v, nsize);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return v;
  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
 #undef STORECHAR
 }
@ -5204,6 +5312,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    const char *encoding;
    q = (unsigned char *)s;
    e = q + size;
@ -5237,8 +5346,10 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
 #if PY_LITTLE_ENDIAN
    native_ordering = bo <= 0;
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
 #else
    native_ordering = bo >= 0;
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
 #endif
    /* Note: size will always be longer than the resulting Unicode
@ -5312,7 +5423,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
        if (unicode_decode_call_errorhandler_writer(
                errors,
                &errorHandler,
-                "utf16", errmsg,
+                encoding, errmsg,
                &starts,
                (const char **)&e,
                &startinpos,
@ -5348,13 +5459,17 @@ _PyUnicode_EncodeUTF16(PyObject *str,
    Py_ssize_t len;
    PyObject *v;
    unsigned short *out;
    Py_ssize_t bytesize;
    Py_ssize_t pairs;
 #if PY_BIG_ENDIAN
    int native_ordering = byteorder >= 0;
 #else
    int native_ordering = byteorder <= 0;
 #endif
    const char *encoding;
    Py_ssize_t nsize, pos;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;
    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
@ -5376,8 +5491,8 @@ _PyUnicode_EncodeUTF16(PyObject *str,
    }
    if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
        return PyErr_NoMemory();
-    bytesize = (len + pairs + (byteorder == 0)) * 2;
+    nsize = len + pairs + (byteorder == 0);
-    v = PyBytes_FromStringAndSize(NULL, bytesize);
+    v = PyBytes_FromStringAndSize(NULL, nsize * 2);
    if (v == NULL)
        return NULL;
@ -5389,25 +5504,107 @@ _PyUnicode_EncodeUTF16(PyObject *str,
    if (len == 0)
        goto done;
-    switch (kind) {
+    if (kind == PyUnicode_1BYTE_KIND) {
-    case PyUnicode_1BYTE_KIND: {
+        ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
-        ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
+        goto done;
        break;
    }
    case PyUnicode_2BYTE_KIND: {
        ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
        break;
    }
    case PyUnicode_4BYTE_KIND: {
        ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
        break;
    }
    default:
        assert(0);
    }
    if (byteorder < 0)
        encoding = "utf-16-le";
    else if (byteorder > 0)
        encoding = "utf-16-be";
    else
        encoding = "utf-16";
    pos = 0;
    while (pos < len) {
        Py_ssize_t repsize, moreunits;
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len)
            break;
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &pos);
        if (!rep)
            goto error;
        if (PyBytes_Check(rep)) {
            repsize = PyBytes_GET_SIZE(rep);
            if (repsize & 1) {
                raise_encode_exception(&exc, encoding,
                                       str, pos - 1, pos,
                                       "surrogates not allowed");
                goto error;
            }
            moreunits = repsize / 2;
        }
        else {
            assert(PyUnicode_Check(rep));
            if (PyUnicode_READY(rep) < 0)
                goto error;
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
                                       str, pos - 1, pos,
                                       "surrogates not allowed");
                goto error;
            }
        }
        /* two bytes are reserved for each surrogate */
        if (moreunits > 1) {
            Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
            Py_ssize_t morebytes = 2 * (moreunits - 1);
            if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
                /* integer overflow */
                PyErr_NoMemory();
                goto error;
            }
            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
                goto error;
            out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
        }
        if (PyBytes_Check(rep)) {
            Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
            out += moreunits;
        } else /* rep is unicode */ {
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }
        Py_CLEAR(rep);
    }
    /* Cut back to the size actually needed.  This is necessary, for example,
    when a string containing isolated surrogates is encoded with the 'ignore'
    handler. */
    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
    if (nsize != PyBytes_GET_SIZE(v))
      _PyBytes_Resize(&v, nsize);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
  done:
    return v;
  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
 #undef STORECHAR
 }
 PyObject *
--- a/Python/codecs.c
+++ b/Python/codecs.c
@ -753,6 +753,65 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
    }
 }
 /* Codes returned by get_standard_encoding(). */
 #define ENC_UTF8        0
 #define ENC_UTF16BE     1
 #define ENC_UTF16LE     2
 #define ENC_UTF32BE     3
 #define ENC_UTF32LE     4
 /* Map a codec name to one of the ENC_* codes.
    The "utf" prefix and the "be"/"le" suffix are matched case-insensitively.
    A single '-' or '_' is accepted (but not required) after "utf" and after
    the "16"/"32" width.  A name with no byte-order suffix selects the
    platform's native byte order, as decided by WORDS_BIGENDIAN.
    On return *bytelength is 2 for UTF-16 variants, 4 for UTF-32 variants and
    3 for UTF-8.  The caller uses it as the number of output bytes to reserve
    per code point.
    Any name that is not recognised, including "utf-16"/"utf-32" followed by
    an unknown suffix, is reported as ENC_UTF8 with *bytelength set to 3.
    NOTE(review): callers cannot tell "really utf-8" from "unknown codec";
    kept as is so the return values stay unchanged. */
 static int
 get_standard_encoding(const char *encoding, int *bytelength)
 {
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_' )
            encoding++;
        if (encoding[0] == '1' && encoding[1] == '6') {
            encoding += 2;
            *bytelength = 2;
            if (*encoding == '\0') {
 #ifdef WORDS_BIGENDIAN
                return ENC_UTF16BE;
 #else
                return ENC_UTF16LE;
 #endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            /* Check encoding[0] first: if the name ended right after the
               separator, encoding[1] lies beyond the terminating NUL and
               must not be read. */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF16BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF16LE;
            }
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            encoding += 2;
            *bytelength = 4;
            if (*encoding == '\0') {
 #ifdef WORDS_BIGENDIAN
                return ENC_UTF32BE;
 #else
                return ENC_UTF32LE;
 #endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            /* Same guard as in the UTF-16 branch: never look past the NUL. */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF32BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF32LE;
            }
        }
    }
    /* utf-8, and the fallback for every name not matched above */
    *bytelength = 3;
    return ENC_UTF8;
 }
 /* This handler is declared static until someone demonstrates
   a need to call it directly. */
 static PyObject *
@ -760,24 +819,40 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
 {
    PyObject *restuple;
    PyObject *object;
    PyObject *encode;
    char *encoding;
    int code;
    int bytelength;
    Py_ssize_t i;
    Py_ssize_t start;
    Py_ssize_t end;
    PyObject *res;
    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
-        char *outp;
+        unsigned char *outp;
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
-        res = PyBytes_FromStringAndSize(NULL, 3*(end-start));
+        if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
            Py_DECREF(object);
            return NULL;
        }
        if (!(encoding = PyUnicode_AsUTF8(encode))) {
            Py_DECREF(object);
            Py_DECREF(encode);
            return NULL;
        }
        code = get_standard_encoding(encoding, &bytelength);
        Py_DECREF(encode);
        res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
        if (!res) {
            Py_DECREF(object);
            return NULL;
        }
-        outp = PyBytes_AsString(res);
+        outp = (unsigned char*)PyBytes_AsString(res);
        for (i = start; i < end; i++) {
            /* object is guaranteed to be "ready" */
            Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
@ -788,9 +863,33 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
                Py_DECREF(object);
                return NULL;
            }
-            *outp++ = (char)(0xe0 | (ch >> 12));
+            switch (code) {
-            *outp++ = (char)(0x80 | ((ch >> 6) & 0x3f));
+            case ENC_UTF8:
-            *outp++ = (char)(0x80 | (ch & 0x3f));
+                *outp++ = (unsigned char)(0xe0 | (ch >> 12));
                *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
                *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
                break;
            case ENC_UTF16LE:
                *outp++ = (unsigned char) ch;
                *outp++ = (unsigned char)(ch >> 8);
                break;
            case ENC_UTF16BE:
                *outp++ = (unsigned char)(ch >> 8);
                *outp++ = (unsigned char) ch;
                break;
            case ENC_UTF32LE:
                *outp++ = (unsigned char) ch;
                *outp++ = (unsigned char)(ch >> 8);
                *outp++ = (unsigned char)(ch >> 16);
                *outp++ = (unsigned char)(ch >> 24);
                break;
            case ENC_UTF32BE:
                *outp++ = (unsigned char)(ch >> 24);
                *outp++ = (unsigned char)(ch >> 16);
                *outp++ = (unsigned char)(ch >> 8);
                *outp++ = (unsigned char) ch;
                break;
            }
        }
        restuple = Py_BuildValue("(On)", res, end);
        Py_DECREF(res);
@ -802,34 +901,64 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
        Py_UCS4 ch = 0;
        if (PyUnicodeDecodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
            return NULL;
        if (!(p = (unsigned char*)PyBytes_AsString(object))) {
            Py_DECREF(object);
            return NULL;
        }
        if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
            Py_DECREF(object);
            return NULL;
        }
        if (!(encoding = PyUnicode_AsUTF8(encode))) {
            Py_DECREF(object);
            Py_DECREF(encode);
            return NULL;
        }
        code = get_standard_encoding(encoding, &bytelength);
        Py_DECREF(encode);
        /* Try decoding a single surrogate character. If
           there are more, let the codec call us again. */
        p += start;
-        if (PyBytes_GET_SIZE(object) - start >= 3 &&
+        if (PyBytes_GET_SIZE(object) - start >= bytelength) {
-            (p[0] & 0xf0) == 0xe0 &&
+            switch (code) {
-            (p[1] & 0xc0) == 0x80 &&
+            case ENC_UTF8:
-            (p[2] & 0xc0) == 0x80) {
+                if ((p[0] & 0xf0) == 0xe0 &&
-            /* it's a three-byte code */
+                    (p[1] & 0xc0) == 0x80 &&
-            ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
+                    (p[2] & 0xc0) == 0x80) {
-            if (!Py_UNICODE_IS_SURROGATE(ch))
+                    /* it's a three-byte code */
-                /* it's not a surrogate - fail */
+                    ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
-                ch = 0;
+                }
                break;
            case ENC_UTF16LE:
                ch = p[1] << 8 | p[0];
                break;
            case ENC_UTF16BE:
                ch = p[0] << 8 | p[1];
                break;
            case ENC_UTF32LE:
                ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
                break;
            case ENC_UTF32BE:
                ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
                break;
            }
        }
        Py_DECREF(object);
-        if (ch == 0) {
+        if (!Py_UNICODE_IS_SURROGATE(ch)) {
            /* it's not a surrogate - fail */
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
            return NULL;
        }
        res = PyUnicode_FromOrdinal(ch);
        if (res == NULL)
            return NULL;
-        return Py_BuildValue("(Nn)", res, start+3);
+        return Py_BuildValue("(Nn)", res, start + bytelength);
    }
    else {
        wrong_exception_type(exc);