From f852bf97ef45e10f37938434c84b58d65b1b9a7e Mon Sep 17 00:00:00 2001 From: Alexandre Vassalotti Date: Sat, 27 Dec 2008 07:08:47 +0000 Subject: [PATCH] Fix issue #4730: cPickle corrupts high-unicode strings. Update outdated copy of PyUnicode_EncodeRawUnicodeEscape. Add a test case. --- Lib/test/pickletester.py | 11 +++- Modules/cPickle.c | 107 ++++++++++++++++++++++++++++----------- 2 files changed, 87 insertions(+), 31 deletions(-) diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py index bf9bca78f98..bf25245bd1a 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -480,14 +480,21 @@ class AbstractPickleTests(unittest.TestCase): if have_unicode: def test_unicode(self): - endcases = [unicode(''), unicode('<\\u>'), unicode('<\\\u1234>'), - unicode('<\n>'), unicode('<\\>')] + endcases = [u'', u'<\\u>', u'<\\\\u1234>', u'<\n>', + u'<\\>', u'<\\\\U00012345>'] for proto in protocols: for u in endcases: p = self.dumps(u, proto) u2 = self.loads(p) self.assertEqual(u2, u) + def test_unicode_high_plane(self): + t = u'\U00012345' + for proto in protocols: + p = self.dumps(t, proto) + t2 = self.loads(p) + self.assertEqual(t2, t) + def test_ints(self): import sys for proto in protocols: diff --git a/Modules/cPickle.c b/Modules/cPickle.c index f777286a6a7..18baee1c395 100644 --- a/Modules/cPickle.c +++ b/Modules/cPickle.c @@ -1255,41 +1255,90 @@ save_string(Picklerobject *self, PyObject *args, int doput) /* A copy of PyUnicode_EncodeRawUnicodeEscape() that also translates backslash and newline characters to \uXXXX escapes. */ static PyObject * -modified_EncodeRawUnicodeEscape(const Py_UNICODE *s, int size) +modified_EncodeRawUnicodeEscape(const Py_UNICODE *s, Py_ssize_t size) { - PyObject *repr; - char *p; - char *q; + PyObject *repr; + char *p; + char *q; - static const char *hexdigit = "0123456789ABCDEF"; + static const char *hexdigit = "0123456789abcdef"; +#ifdef Py_UNICODE_WIDE + const Py_ssize_t expandsize = 10; +#else + const Py_ssize_t expandsize = 6; +#endif - repr = PyString_FromStringAndSize(NULL, 6 * size); - if (repr == NULL) - return NULL; - if (size == 0) - return repr; + if (size > PY_SSIZE_T_MAX / expandsize) + return PyErr_NoMemory(); - p = q = PyString_AS_STRING(repr); - while (size-- > 0) { - Py_UNICODE ch = *s++; - /* Map 16-bit characters to '\uxxxx' */ - if (ch >= 256 || ch == '\\' || ch == '\n') { - *p++ = '\\'; - *p++ = 'u'; - *p++ = hexdigit[(ch >> 12) & 0xf]; - *p++ = hexdigit[(ch >> 8) & 0xf]; - *p++ = hexdigit[(ch >> 4) & 0xf]; - *p++ = hexdigit[ch & 15]; - } - /* Copy everything else as-is */ - else - *p++ = (char) ch; - } - *p = '\0'; - _PyString_Resize(&repr, p - q); + repr = PyString_FromStringAndSize(NULL, expandsize * size); + if (repr == NULL) + return NULL; + if (size == 0) return repr; -} + p = q = PyString_AS_STRING(repr); + while (size-- > 0) { + Py_UNICODE ch = *s++; +#ifdef Py_UNICODE_WIDE + /* Map 32-bit characters to '\Uxxxxxxxx' */ + if (ch >= 0x10000) { + *p++ = '\\'; + *p++ = 'U'; + *p++ = hexdigit[(ch >> 28) & 0xf]; + *p++ = hexdigit[(ch >> 24) & 0xf]; + *p++ = hexdigit[(ch >> 20) & 0xf]; + *p++ = hexdigit[(ch >> 16) & 0xf]; + *p++ = hexdigit[(ch >> 12) & 0xf]; + *p++ = hexdigit[(ch >> 8) & 0xf]; + *p++ = hexdigit[(ch >> 4) & 0xf]; + *p++ = hexdigit[ch & 15]; + } + else +#else + /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ + if (ch >= 0xD800 && ch < 0xDC00) { + Py_UNICODE ch2; + Py_UCS4 ucs; + + ch2 = *s++; + size--; + if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { + ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; + *p++ = '\\'; + *p++ = 'U'; + *p++ = hexdigit[(ucs >> 28) & 0xf]; + *p++ = hexdigit[(ucs >> 24) & 0xf]; + *p++ = hexdigit[(ucs >> 20) & 0xf]; + *p++ = hexdigit[(ucs >> 16) & 0xf]; + *p++ = hexdigit[(ucs >> 12) & 0xf]; + *p++ = hexdigit[(ucs >> 8) & 0xf]; + *p++ = hexdigit[(ucs >> 4) & 0xf]; + *p++ = hexdigit[ucs & 0xf]; + continue; + } + /* Fall through: isolated surrogates are copied as-is */ + s--; + size++; + } +#endif + /* Map 16-bit characters to '\uxxxx' */ + if (ch >= 256 || ch == '\\' || ch == '\n') { + *p++ = '\\'; + *p++ = 'u'; + *p++ = hexdigit[(ch >> 12) & 0xf]; + *p++ = hexdigit[(ch >> 8) & 0xf]; + *p++ = hexdigit[(ch >> 4) & 0xf]; + *p++ = hexdigit[ch & 15]; + } + /* Copy everything else as-is */ + else + *p++ = (char) ch; + } + *p = '\0'; + _PyString_Resize(&repr, p - q); + return repr; +} static int save_unicode(Picklerobject *self, PyObject *args, int doput)