mirror of https://github.com/python/cpython
gh-113028: Correctly memoize str in pickle when escapes added (GH-113436)
This fixes a divergence between the Python and C implementations of pickle for protocol 0, in which pickle.py failed to re-use the first pickled representation of strings containing characters that have to be escaped.
This commit is contained in:
parent
894f0e573d
commit
08398631a0
|
@ -857,13 +857,13 @@ class _Pickler:
|
|||
else:
|
||||
self.write(BINUNICODE + pack("<I", n) + encoded)
|
||||
else:
|
||||
obj = obj.replace("\\", "\\u005c")
|
||||
obj = obj.replace("\0", "\\u0000")
|
||||
obj = obj.replace("\n", "\\u000a")
|
||||
obj = obj.replace("\r", "\\u000d")
|
||||
obj = obj.replace("\x1a", "\\u001a") # EOF on DOS
|
||||
self.write(UNICODE + obj.encode('raw-unicode-escape') +
|
||||
b'\n')
|
||||
# Escape what raw-unicode-escape doesn't, but memoize the original.
|
||||
tmp = obj.replace("\\", "\\u005c")
|
||||
tmp = tmp.replace("\0", "\\u0000")
|
||||
tmp = tmp.replace("\n", "\\u000a")
|
||||
tmp = tmp.replace("\r", "\\u000d")
|
||||
tmp = tmp.replace("\x1a", "\\u001a") # EOF on DOS
|
||||
self.write(UNICODE + tmp.encode('raw-unicode-escape') + b'\n')
|
||||
self.memoize(obj)
|
||||
dispatch[str] = save_str
|
||||
|
||||
|
|
|
@ -1825,6 +1825,14 @@ class AbstractPickleTests:
|
|||
t2 = self.loads(p)
|
||||
self.assert_is_copy(t, t2)
|
||||
|
||||
def test_unicode_memoization(self):
|
||||
# Repeated str is re-used (even when escapes added).
|
||||
for proto in protocols:
|
||||
for s in '', 'xyz', 'xyz\n', 'x\\yz', 'x\xa1yz\r':
|
||||
p = self.dumps((s, s), proto)
|
||||
s1, s2 = self.loads(p)
|
||||
self.assertIs(s1, s2)
|
||||
|
||||
def test_bytes(self):
|
||||
for proto in protocols:
|
||||
for s in b'', b'xyz', b'xyz'*100:
|
||||
|
|
|
@ -0,0 +1,6 @@
|
|||
When a second reference to a string appears in the input to :mod:`pickle`,
|
||||
and the Python implementation is in use,
|
||||
we are guaranteed that a single copy gets pickled
|
||||
and a single object is shared when reloaded.
|
||||
Previously, in protocol 0, when a string contained certain characters
|
||||
(e.g. newline) it resulted in duplicate objects.
|
Loading…
Reference in New Issue