Issue #8383: pickle and pickletools use the surrogatepass error handler when
encoding unicode as UTF-8, to support lone surrogates and stay compatible with
Python 2.x and 3.0.
This commit is contained in:
Victor Stinner 2010-04-13 11:07:24 +00:00
parent 3606760670
commit 485fb56eb8
5 changed files with 14 additions and 6 deletions

View File

@ -499,7 +499,7 @@ class _Pickler:
def save_str(self, obj, pack=struct.pack): def save_str(self, obj, pack=struct.pack):
if self.bin: if self.bin:
encoded = obj.encode('utf-8') encoded = obj.encode('utf-8', 'surrogatepass')
n = len(encoded) n = len(encoded)
self.write(BINUNICODE + pack("<i", n) + encoded) self.write(BINUNICODE + pack("<i", n) + encoded)
else: else:
@ -966,7 +966,7 @@ class _Unpickler:
def load_binunicode(self): def load_binunicode(self):
len = mloads(b'i' + self.read(4)) len = mloads(b'i' + self.read(4))
self.append(str(self.read(len), 'utf-8')) self.append(str(self.read(len), 'utf-8', 'surrogatepass'))
dispatch[BINUNICODE[0]] = load_binunicode dispatch[BINUNICODE[0]] = load_binunicode
def load_short_binstring(self): def load_short_binstring(self):

View File

@ -469,7 +469,7 @@ def read_unicodestring4(f):
raise ValueError("unicodestring4 byte count < 0: %d" % n) raise ValueError("unicodestring4 byte count < 0: %d" % n)
data = f.read(n) data = f.read(n)
if len(data) == n: if len(data) == n:
return str(data, 'utf-8') return str(data, 'utf-8', 'surrogatepass')
raise ValueError("expected %d bytes in a unicodestring4, but only %d " raise ValueError("expected %d bytes in a unicodestring4, but only %d "
"remain" % (n, len(data))) "remain" % (n, len(data)))

View File

@ -515,7 +515,9 @@ class AbstractPickleTests(unittest.TestCase):
def test_unicode(self): def test_unicode(self):
endcases = ['', '<\\u>', '<\\\u1234>', '<\n>', endcases = ['', '<\\u>', '<\\\u1234>', '<\n>',
'<\\>', '<\\\U00012345>'] '<\\>', '<\\\U00012345>',
# surrogates
'<\udc80>']
for proto in protocols: for proto in protocols:
for u in endcases: for u in endcases:
p = self.dumps(u, proto) p = self.dumps(u, proto)

View File

@ -312,6 +312,10 @@ C-API
Library Library
------- -------
- Issue #8383: pickle and pickletools use the surrogatepass error handler when
encoding unicode as UTF-8, to support lone surrogates and stay compatible with
Python 2.x and 3.0.
- Issue #7585: difflib context and unified diffs now place a tab between - Issue #7585: difflib context and unified diffs now place a tab between
filename and date, conforming to the 'standards' they were originally filename and date, conforming to the 'standards' they were originally
designed to follow. This improves compatibility with patch tools. designed to follow. This improves compatibility with patch tools.

View File

@ -1227,7 +1227,9 @@ save_unicode(PicklerObject *self, PyObject *obj)
if (self->bin) { if (self->bin) {
char pdata[5]; char pdata[5];
encoded = PyUnicode_AsUTF8String(obj); encoded = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(obj),
PyUnicode_GET_SIZE(obj),
"surrogatepass");
if (encoded == NULL) if (encoded == NULL)
goto error; goto error;
@ -3352,7 +3354,7 @@ load_binunicode(UnpicklerObject *self)
if (unpickler_read(self, &s, size) < 0) if (unpickler_read(self, &s, size) < 0)
return -1; return -1;
str = PyUnicode_DecodeUTF8(s, size, NULL); str = PyUnicode_DecodeUTF8(s, size, "surrogatepass");
if (str == NULL) if (str == NULL)
return -1; return -1;