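Fix issue #4730: cPickle corrupts unicode strings containing characters above U+FFFF (outside the Basic Multilingual Plane).
Update the pickler's outdated copy of PyUnicode_EncodeRawUnicodeEscape so that such characters (and UTF-16 surrogate pairs on narrow builds) are written as '\Uxxxxxxxx' escapes. Add a test case.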
This commit is contained in:
parent
034e08ce8d
commit
f852bf97ef
|
@ -480,14 +480,21 @@ class AbstractPickleTests(unittest.TestCase):
|
||||||
|
|
||||||
if have_unicode:
|
if have_unicode:
|
||||||
def test_unicode(self):
|
def test_unicode(self):
|
||||||
endcases = [unicode(''), unicode('<\\u>'), unicode('<\\\u1234>'),
|
endcases = [u'', u'<\\u>', u'<\\\\u1234>', u'<\n>',
|
||||||
unicode('<\n>'), unicode('<\\>')]
|
u'<\\>', u'<\\\\U00012345>']
|
||||||
for proto in protocols:
|
for proto in protocols:
|
||||||
for u in endcases:
|
for u in endcases:
|
||||||
p = self.dumps(u, proto)
|
p = self.dumps(u, proto)
|
||||||
u2 = self.loads(p)
|
u2 = self.loads(p)
|
||||||
self.assertEqual(u2, u)
|
self.assertEqual(u2, u)
|
||||||
|
|
||||||
|
def test_unicode_high_plane(self):
|
||||||
|
t = u'\U00012345'
|
||||||
|
for proto in protocols:
|
||||||
|
p = self.dumps(t, proto)
|
||||||
|
t2 = self.loads(p)
|
||||||
|
self.assertEqual(t2, t)
|
||||||
|
|
||||||
def test_ints(self):
|
def test_ints(self):
|
||||||
import sys
|
import sys
|
||||||
for proto in protocols:
|
for proto in protocols:
|
||||||
|
|
|
@ -1255,15 +1255,23 @@ save_string(Picklerobject *self, PyObject *args, int doput)
|
||||||
/* A copy of PyUnicode_EncodeRawUnicodeEscape() that also translates
|
/* A copy of PyUnicode_EncodeRawUnicodeEscape() that also translates
|
||||||
backslash and newline characters to \uXXXX escapes. */
|
backslash and newline characters to \uXXXX escapes. */
|
||||||
static PyObject *
|
static PyObject *
|
||||||
modified_EncodeRawUnicodeEscape(const Py_UNICODE *s, int size)
|
modified_EncodeRawUnicodeEscape(const Py_UNICODE *s, Py_ssize_t size)
|
||||||
{
|
{
|
||||||
PyObject *repr;
|
PyObject *repr;
|
||||||
char *p;
|
char *p;
|
||||||
char *q;
|
char *q;
|
||||||
|
|
||||||
static const char *hexdigit = "0123456789ABCDEF";
|
static const char *hexdigit = "0123456789abcdef";
|
||||||
|
#ifdef Py_UNICODE_WIDE
|
||||||
|
const Py_ssize_t expandsize = 10;
|
||||||
|
#else
|
||||||
|
const Py_ssize_t expandsize = 6;
|
||||||
|
#endif
|
||||||
|
|
||||||
repr = PyString_FromStringAndSize(NULL, 6 * size);
|
if (size > PY_SSIZE_T_MAX / expandsize)
|
||||||
|
return PyErr_NoMemory();
|
||||||
|
|
||||||
|
repr = PyString_FromStringAndSize(NULL, expandsize * size);
|
||||||
if (repr == NULL)
|
if (repr == NULL)
|
||||||
return NULL;
|
return NULL;
|
||||||
if (size == 0)
|
if (size == 0)
|
||||||
|
@ -1272,6 +1280,48 @@ modified_EncodeRawUnicodeEscape(const Py_UNICODE *s, int size)
|
||||||
p = q = PyString_AS_STRING(repr);
|
p = q = PyString_AS_STRING(repr);
|
||||||
while (size-- > 0) {
|
while (size-- > 0) {
|
||||||
Py_UNICODE ch = *s++;
|
Py_UNICODE ch = *s++;
|
||||||
|
#ifdef Py_UNICODE_WIDE
|
||||||
|
/* Map 32-bit characters to '\Uxxxxxxxx' */
|
||||||
|
if (ch >= 0x10000) {
|
||||||
|
*p++ = '\\';
|
||||||
|
*p++ = 'U';
|
||||||
|
*p++ = hexdigit[(ch >> 28) & 0xf];
|
||||||
|
*p++ = hexdigit[(ch >> 24) & 0xf];
|
||||||
|
*p++ = hexdigit[(ch >> 20) & 0xf];
|
||||||
|
*p++ = hexdigit[(ch >> 16) & 0xf];
|
||||||
|
*p++ = hexdigit[(ch >> 12) & 0xf];
|
||||||
|
*p++ = hexdigit[(ch >> 8) & 0xf];
|
||||||
|
*p++ = hexdigit[(ch >> 4) & 0xf];
|
||||||
|
*p++ = hexdigit[ch & 15];
|
||||||
|
}
|
||||||
|
else
|
||||||
|
#else
|
||||||
|
/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
|
||||||
|
if (ch >= 0xD800 && ch < 0xDC00) {
|
||||||
|
Py_UNICODE ch2;
|
||||||
|
Py_UCS4 ucs;
|
||||||
|
|
||||||
|
ch2 = *s++;
|
||||||
|
size--;
|
||||||
|
if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
|
||||||
|
ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
|
||||||
|
*p++ = '\\';
|
||||||
|
*p++ = 'U';
|
||||||
|
*p++ = hexdigit[(ucs >> 28) & 0xf];
|
||||||
|
*p++ = hexdigit[(ucs >> 24) & 0xf];
|
||||||
|
*p++ = hexdigit[(ucs >> 20) & 0xf];
|
||||||
|
*p++ = hexdigit[(ucs >> 16) & 0xf];
|
||||||
|
*p++ = hexdigit[(ucs >> 12) & 0xf];
|
||||||
|
*p++ = hexdigit[(ucs >> 8) & 0xf];
|
||||||
|
*p++ = hexdigit[(ucs >> 4) & 0xf];
|
||||||
|
*p++ = hexdigit[ucs & 0xf];
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
/* Fall through: isolated surrogates are copied as-is */
|
||||||
|
s--;
|
||||||
|
size++;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
/* Map 16-bit characters to '\uxxxx' */
|
/* Map 16-bit characters to '\uxxxx' */
|
||||||
if (ch >= 256 || ch == '\\' || ch == '\n') {
|
if (ch >= 256 || ch == '\\' || ch == '\n') {
|
||||||
*p++ = '\\';
|
*p++ = '\\';
|
||||||
|
@ -1290,7 +1340,6 @@ modified_EncodeRawUnicodeEscape(const Py_UNICODE *s, int size)
|
||||||
return repr;
|
return repr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static int
|
static int
|
||||||
save_unicode(Picklerobject *self, PyObject *args, int doput)
|
save_unicode(Picklerobject *self, PyObject *args, int doput)
|
||||||
{
|
{
|
||||||
|
|
Loading…
Reference in New Issue