mirror of https://github.com/python/cpython
Issue #8383: pickle and pickletools use the surrogatepass error handler when
encoding and decoding Unicode as UTF-8, to support lone surrogates and stay compatible with Python 2.x and 3.0.
This commit is contained in:
parent
3606760670
commit
485fb56eb8
|
@ -499,7 +499,7 @@ class _Pickler:
|
|||
|
||||
def save_str(self, obj, pack=struct.pack):
|
||||
if self.bin:
|
||||
encoded = obj.encode('utf-8')
|
||||
encoded = obj.encode('utf-8', 'surrogatepass')
|
||||
n = len(encoded)
|
||||
self.write(BINUNICODE + pack("<i", n) + encoded)
|
||||
else:
|
||||
|
@ -966,7 +966,7 @@ class _Unpickler:
|
|||
|
||||
def load_binunicode(self):
|
||||
len = mloads(b'i' + self.read(4))
|
||||
self.append(str(self.read(len), 'utf-8'))
|
||||
self.append(str(self.read(len), 'utf-8', 'surrogatepass'))
|
||||
dispatch[BINUNICODE[0]] = load_binunicode
|
||||
|
||||
def load_short_binstring(self):
|
||||
|
|
|
@ -469,7 +469,7 @@ def read_unicodestring4(f):
|
|||
raise ValueError("unicodestring4 byte count < 0: %d" % n)
|
||||
data = f.read(n)
|
||||
if len(data) == n:
|
||||
return str(data, 'utf-8')
|
||||
return str(data, 'utf-8', 'surrogatepass')
|
||||
raise ValueError("expected %d bytes in a unicodestring4, but only %d "
|
||||
"remain" % (n, len(data)))
|
||||
|
||||
|
|
|
@ -515,7 +515,9 @@ class AbstractPickleTests(unittest.TestCase):
|
|||
|
||||
def test_unicode(self):
|
||||
endcases = ['', '<\\u>', '<\\\u1234>', '<\n>',
|
||||
'<\\>', '<\\\U00012345>']
|
||||
'<\\>', '<\\\U00012345>',
|
||||
# surrogates
|
||||
'<\udc80>']
|
||||
for proto in protocols:
|
||||
for u in endcases:
|
||||
p = self.dumps(u, proto)
|
||||
|
|
|
@ -312,6 +312,10 @@ C-API
|
|||
Library
|
||||
-------
|
||||
|
||||
- Issue #8383: pickle and pickletools use surrogatepass error handler when
|
||||
encoding unicode as utf8 to support lone surrogates and stay compatible with
|
||||
Python 2.x and 3.0
|
||||
|
||||
- Issue #7585: difflib context and unified diffs now place a tab between
|
||||
filename and date, conforming to the 'standards' they were originally
|
||||
designed to follow. This improves compatibility with patch tools.
|
||||
|
|
|
@ -1227,7 +1227,9 @@ save_unicode(PicklerObject *self, PyObject *obj)
|
|||
if (self->bin) {
|
||||
char pdata[5];
|
||||
|
||||
encoded = PyUnicode_AsUTF8String(obj);
|
||||
encoded = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(obj),
|
||||
PyUnicode_GET_SIZE(obj),
|
||||
"surrogatepass");
|
||||
if (encoded == NULL)
|
||||
goto error;
|
||||
|
||||
|
@ -3352,7 +3354,7 @@ load_binunicode(UnpicklerObject *self)
|
|||
if (unpickler_read(self, &s, size) < 0)
|
||||
return -1;
|
||||
|
||||
str = PyUnicode_DecodeUTF8(s, size, NULL);
|
||||
str = PyUnicode_DecodeUTF8(s, size, "surrogatepass");
|
||||
if (str == NULL)
|
||||
return -1;
|
||||
|
||||
|
|
Loading…
Reference in New Issue