Issue #8383: pickle and pickletools use the surrogatepass error handler when
encoding and decoding unicode as UTF-8, to support lone surrogates and stay compatible with Python 2.x and 3.0.
This commit is contained in:
parent
3606760670
commit
485fb56eb8
|
@ -499,7 +499,7 @@ class _Pickler:
|
||||||
|
|
||||||
def save_str(self, obj, pack=struct.pack):
|
def save_str(self, obj, pack=struct.pack):
|
||||||
if self.bin:
|
if self.bin:
|
||||||
encoded = obj.encode('utf-8')
|
encoded = obj.encode('utf-8', 'surrogatepass')
|
||||||
n = len(encoded)
|
n = len(encoded)
|
||||||
self.write(BINUNICODE + pack("<i", n) + encoded)
|
self.write(BINUNICODE + pack("<i", n) + encoded)
|
||||||
else:
|
else:
|
||||||
|
@ -966,7 +966,7 @@ class _Unpickler:
|
||||||
|
|
||||||
def load_binunicode(self):
|
def load_binunicode(self):
|
||||||
len = mloads(b'i' + self.read(4))
|
len = mloads(b'i' + self.read(4))
|
||||||
self.append(str(self.read(len), 'utf-8'))
|
self.append(str(self.read(len), 'utf-8', 'surrogatepass'))
|
||||||
dispatch[BINUNICODE[0]] = load_binunicode
|
dispatch[BINUNICODE[0]] = load_binunicode
|
||||||
|
|
||||||
def load_short_binstring(self):
|
def load_short_binstring(self):
|
||||||
|
|
|
@ -469,7 +469,7 @@ def read_unicodestring4(f):
|
||||||
raise ValueError("unicodestring4 byte count < 0: %d" % n)
|
raise ValueError("unicodestring4 byte count < 0: %d" % n)
|
||||||
data = f.read(n)
|
data = f.read(n)
|
||||||
if len(data) == n:
|
if len(data) == n:
|
||||||
return str(data, 'utf-8')
|
return str(data, 'utf-8', 'surrogatepass')
|
||||||
raise ValueError("expected %d bytes in a unicodestring4, but only %d "
|
raise ValueError("expected %d bytes in a unicodestring4, but only %d "
|
||||||
"remain" % (n, len(data)))
|
"remain" % (n, len(data)))
|
||||||
|
|
||||||
|
|
|
@ -515,7 +515,9 @@ class AbstractPickleTests(unittest.TestCase):
|
||||||
|
|
||||||
def test_unicode(self):
|
def test_unicode(self):
|
||||||
endcases = ['', '<\\u>', '<\\\u1234>', '<\n>',
|
endcases = ['', '<\\u>', '<\\\u1234>', '<\n>',
|
||||||
'<\\>', '<\\\U00012345>']
|
'<\\>', '<\\\U00012345>',
|
||||||
|
# surrogates
|
||||||
|
'<\udc80>']
|
||||||
for proto in protocols:
|
for proto in protocols:
|
||||||
for u in endcases:
|
for u in endcases:
|
||||||
p = self.dumps(u, proto)
|
p = self.dumps(u, proto)
|
||||||
|
|
|
@ -312,6 +312,10 @@ C-API
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- Issue #8383: pickle and pickletools use surrogatepass error handler when
|
||||||
|
encoding unicode as UTF-8 to support lone surrogates and stay compatible with
|
||||||
|
Python 2.x and 3.0.
|
||||||
|
|
||||||
- Issue #7585: difflib context and unified diffs now place a tab between
|
- Issue #7585: difflib context and unified diffs now place a tab between
|
||||||
filename and date, conforming to the 'standards' they were originally
|
filename and date, conforming to the 'standards' they were originally
|
||||||
designed to follow. This improves compatibility with patch tools.
|
designed to follow. This improves compatibility with patch tools.
|
||||||
|
|
|
@ -1227,7 +1227,9 @@ save_unicode(PicklerObject *self, PyObject *obj)
|
||||||
if (self->bin) {
|
if (self->bin) {
|
||||||
char pdata[5];
|
char pdata[5];
|
||||||
|
|
||||||
encoded = PyUnicode_AsUTF8String(obj);
|
encoded = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(obj),
|
||||||
|
PyUnicode_GET_SIZE(obj),
|
||||||
|
"surrogatepass");
|
||||||
if (encoded == NULL)
|
if (encoded == NULL)
|
||||||
goto error;
|
goto error;
|
||||||
|
|
||||||
|
@ -3352,7 +3354,7 @@ load_binunicode(UnpicklerObject *self)
|
||||||
if (unpickler_read(self, &s, size) < 0)
|
if (unpickler_read(self, &s, size) < 0)
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
str = PyUnicode_DecodeUTF8(s, size, NULL);
|
str = PyUnicode_DecodeUTF8(s, size, "surrogatepass");
|
||||||
if (str == NULL)
|
if (str == NULL)
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue