Fix issue #4730: cPickle corrupts high-unicode strings.

Update outdated copy of PyUnicode_EncodeRawUnicodeEscape. Add a test case.
2008-12-27 07:08:47 +00:00 · 2008-12-27 07:08:47 +00:00 · f852bf97ef
parent 034e08ce8d
commit f852bf97ef
2 changed files with 87 additions and 31 deletions
--- a/Lib/test/pickletester.py
+++ b/Lib/test/pickletester.py
@@ -480,14 +480,21 @@ class AbstractPickleTests(unittest.TestCase):
    if have_unicode:
        def test_unicode(self):
-            endcases = [unicode(''), unicode('<\\u>'), unicode('<\\\u1234>'),
+            endcases = [u'', u'<\\u>', u'<\\\\u1234>', u'<\n>',
-                        unicode('<\n>'),  unicode('<\\>')]
+                        u'<\\>', u'<\\\\U00012345>']
            for proto in protocols:
                for u in endcases:
                    p = self.dumps(u, proto)
                    u2 = self.loads(p)
                    self.assertEqual(u2, u)
        # Regression test accompanying the fix for issue #4730: U+12345 lies
        # above U+FFFF, and it must come back unchanged from a
        # dumps()/loads() round trip for every entry in `protocols`.
        def test_unicode_high_plane(self):
            t = u'\U00012345'  # code point beyond the 16-bit range
            for proto in protocols:
                p = self.dumps(t, proto)
                t2 = self.loads(p)
                self.assertEqual(t2, t)
    def test_ints(self):
        import sys
        for proto in protocols:
--- a/Modules/cPickle.c
+++ b/Modules/cPickle.c
@@ -1255,15 +1255,23 @@ save_string(Picklerobject *self, PyObject *args, int doput)
 /* A copy of PyUnicode_EncodeRawUnicodeEscape() that also translates
   backslash and newline characters to \uXXXX escapes. */
 static PyObject *
-modified_EncodeRawUnicodeEscape(const Py_UNICODE *s, int size)
+modified_EncodeRawUnicodeEscape(const Py_UNICODE *s, Py_ssize_t size)
 {
    PyObject *repr;
    char *p;
    char *q;
-	static const char *hexdigit = "0123456789ABCDEF";
+    static const char *hexdigit = "0123456789abcdef";
 #ifdef Py_UNICODE_WIDE
    const Py_ssize_t expandsize = 10;
 #else
    const Py_ssize_t expandsize = 6;
 #endif
-	repr = PyString_FromStringAndSize(NULL, 6 * size);
+    if (size > PY_SSIZE_T_MAX / expandsize)
        return PyErr_NoMemory();
    repr = PyString_FromStringAndSize(NULL, expandsize * size);
    if (repr == NULL)
        return NULL;
    if (size == 0)
@@ -1272,6 +1280,48 @@ modified_EncodeRawUnicodeEscape(const Py_UNICODE *s, int size)
    p = q = PyString_AS_STRING(repr);
    while (size-- > 0) {
        Py_UNICODE ch = *s++;
 #ifdef Py_UNICODE_WIDE
 	/* Map 32-bit characters to '\Uxxxxxxxx' */
 	if (ch >= 0x10000) {
            *p++ = '\\';
            *p++ = 'U';
            *p++ = hexdigit[(ch >> 28) & 0xf];
            *p++ = hexdigit[(ch >> 24) & 0xf];
            *p++ = hexdigit[(ch >> 20) & 0xf];
            *p++ = hexdigit[(ch >> 16) & 0xf];
            *p++ = hexdigit[(ch >> 12) & 0xf];
            *p++ = hexdigit[(ch >> 8) & 0xf];
            *p++ = hexdigit[(ch >> 4) & 0xf];
            *p++ = hexdigit[ch & 15];
        }
        else
 #else
 	/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
 	if (ch >= 0xD800 && ch < 0xDC00) {
 	    Py_UNICODE ch2;
 	    Py_UCS4 ucs;
 	    ch2 = *s++;
 	    size--;
 	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
 		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
 		*p++ = '\\';
 		*p++ = 'U';
 		*p++ = hexdigit[(ucs >> 28) & 0xf];
 		*p++ = hexdigit[(ucs >> 24) & 0xf];
 		*p++ = hexdigit[(ucs >> 20) & 0xf];
 		*p++ = hexdigit[(ucs >> 16) & 0xf];
 		*p++ = hexdigit[(ucs >> 12) & 0xf];
 		*p++ = hexdigit[(ucs >> 8) & 0xf];
 		*p++ = hexdigit[(ucs >> 4) & 0xf];
 		*p++ = hexdigit[ucs & 0xf];
 		continue;
 	    }
 	    /* Fall through: isolated surrogates are copied as-is */
 	    s--;
 	    size++;
 	}
 #endif
 	/* Map 16-bit characters to '\uxxxx' */
 	if (ch >= 256 || ch == '\\' || ch == '\n') {
            *p++ = '\\';
@@ -1290,7 +1340,6 @@ modified_EncodeRawUnicodeEscape(const Py_UNICODE *s, int size)
    return repr;
 }
 static int
 save_unicode(Picklerobject *self, PyObject *args, int doput)
 {