bpo-31993: Do not use memoryview when pickling large strings. (#5154)

PyMemoryView_FromMemory() created a memoryview referring to
the internal data of the string.  When the string was destroyed,
the memoryview was left referring to freed memory.
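The hazard, sketched at the Python level (illustrative: the ChunkRecorder
class and the 64 KiB frame target are assumptions mirroring the test
suite's delayed writer and its FRAME_SIZE_TARGET, not CPython internals):

    import gc
    import pickle

    class ChunkRecorder:
        """File-like writer that keeps every chunk for later re-reading."""
        def __init__(self):
            self.chunks = []
        def write(self, chunk):
            # With the buggy pickler these chunks could be memoryviews
            # over a string's internal buffer.
            self.chunks.append(chunk)

    FRAME_SIZE_TARGET = 64 * 1024
    big = '0123456789abcdef' * (FRAME_SIZE_TARGET // 16 + 1)

    writer = ChunkRecorder()
    pickle.Pickler(writer, protocol=4).dump(big)
    pickled = b"".join(bytes(c) for c in writer.chunks)  # snapshot right away

    del big            # destroy the original string
    gc.collect()
    # Each chunk must own (or keep alive) its memory; re-reading the chunks
    # after the string is gone would touch freed memory with the old code.
    assert b"".join(bytes(c) for c in writer.chunks) == pickled
    assert pickle.loads(pickled) == '0123456789abcdef' * (FRAME_SIZE_TARGET // 16 + 1)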
Author: Serhiy Storchaka
Date:   2018-01-13 00:28:31 +02:00 (committed by GitHub)
Parent: f3031b8a7a
Commit: 5b76bdba07
3 changed files with 36 additions and 35 deletions

Lib/test/pickletester.py

@@ -2169,67 +2169,67 @@ class AbstractPickleTests(unittest.TestCase):
     def test_framed_write_sizes_with_delayed_writer(self):
         class ChunkAccumulator:
             """Accumulate pickler output in a list of raw chunks."""
             def __init__(self):
                 self.chunks = []
             def write(self, chunk):
                 self.chunks.append(chunk)
             def concatenate_chunks(self):
-                # Some chunks can be memoryview instances, we need to convert
-                # them to bytes to be able to call join
-                return b"".join([c.tobytes() if hasattr(c, 'tobytes') else c
-                                 for c in self.chunks])
-
-        small_objects = [(str(i).encode('ascii'), i % 42, {'i': str(i)})
-                         for i in range(int(1e4))]
+                return b"".join(self.chunks)
 
         for proto in range(4, pickle.HIGHEST_PROTOCOL + 1):
+            objects = [(str(i).encode('ascii'), i % 42, {'i': str(i)})
+                       for i in range(int(1e4))]
+            # Add a large unique ASCII string
+            objects.append('0123456789abcdef' *
+                           (self.FRAME_SIZE_TARGET // 16 + 1))
+
             # Protocol 4 packs groups of small objects into frames and issues
             # calls to write only once or twice per frame:
             # The C pickler issues one call to write per-frame (header and
             # contents) while Python pickler issues two calls to write: one for
             # the frame header and one for the frame binary contents.
             writer = ChunkAccumulator()
-            self.pickler(writer, proto).dump(small_objects)
+            self.pickler(writer, proto).dump(objects)
 
             # Actually read the binary content of the chunks after the end
-            # of the call to dump: ant memoryview passed to write should not
+            # of the call to dump: any memoryview passed to write should not
             # be released otherwise this delayed access would not be possible.
             pickled = writer.concatenate_chunks()
             reconstructed = self.loads(pickled)
-            self.assertEqual(reconstructed, small_objects)
+            self.assertEqual(reconstructed, objects)
             self.assertGreater(len(writer.chunks), 1)
 
-            n_frames, remainder = divmod(len(pickled), self.FRAME_SIZE_TARGET)
-            if remainder > 0:
-                n_frames += 1
+            # memoryviews should own the memory.
+            del objects
+            support.gc_collect()
+            self.assertEqual(writer.concatenate_chunks(), pickled)
+
+            n_frames = (len(pickled) - 1) // self.FRAME_SIZE_TARGET + 1
             # There should be at least one call to write per frame
             self.assertGreaterEqual(len(writer.chunks), n_frames)
 
             # but not too many either: there can be one for the proto,
-            # one per-frame header and one per frame for the actual contents.
-            self.assertGreaterEqual(2 * n_frames + 1, len(writer.chunks))
+            # one per-frame header, one per frame for the actual contents,
+            # and two for the header.
+            self.assertLessEqual(len(writer.chunks), 2 * n_frames + 3)
 
-            chunk_sizes = [len(c) for c in writer.chunks[:-1]]
+            chunk_sizes = [len(c) for c in writer.chunks]
             large_sizes = [s for s in chunk_sizes
                            if s >= self.FRAME_SIZE_TARGET]
-            small_sizes = [s for s in chunk_sizes
-                           if s < self.FRAME_SIZE_TARGET]
+            medium_sizes = [s for s in chunk_sizes
+                            if 9 < s < self.FRAME_SIZE_TARGET]
+            small_sizes = [s for s in chunk_sizes if s <= 9]
 
             # Large chunks should not be too large:
             for chunk_size in large_sizes:
-                self.assertGreater(2 * self.FRAME_SIZE_TARGET, chunk_size)
-
-            last_chunk_size = len(writer.chunks[-1])
-            self.assertGreater(2 * self.FRAME_SIZE_TARGET, last_chunk_size)
-
-            # Small chunks (if any) should be very small
-            # (only proto and frame headers)
-            for chunk_size in small_sizes:
-                self.assertGreaterEqual(9, chunk_size)
+                self.assertLess(chunk_size, 2 * self.FRAME_SIZE_TARGET,
+                                chunk_sizes)
+
+            # There shouldn't be too many small chunks: the protocol header,
+            # the frame headers and the large string headers are written
+            # in small chunks.
+            self.assertLessEqual(len(small_sizes),
+                                 len(large_sizes) + len(medium_sizes) + 3,
+                                 chunk_sizes)
 
     def test_nested_names(self):
         global Nested

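As a worked example of the write-count bounds asserted above (illustrative
numbers; 64 * 1024 matches the suite's FRAME_SIZE_TARGET):

    FRAME_SIZE_TARGET = 64 * 1024
    pickled_len = 150 * 1024                               # hypothetical pickle size
    n_frames = (pickled_len - 1) // FRAME_SIZE_TARGET + 1  # ceiling division -> 3

    # Lower bound: at least one call to write per frame (the C pickler
    # emits frame header and contents in a single call).
    min_writes = n_frames                                  # -> 3
    # Upper bound: one header write plus one contents write per frame,
    # plus a few tiny writes (protocol byte, large-string headers).
    max_writes = 2 * n_frames + 3                          # -> 9
    print(n_frames, min_writes, max_writes)                # 3 3 9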
Misc/NEWS.d (changelog)

@@ -693,9 +693,9 @@ ctypes._aix.find_library() Patch by: Michael Felt
 .. nonce: -OMNg8
 .. section: Library
 
-The picklers no longer allocate temporary memory when dumping large
-``bytes`` and ``str`` objects into a file object. Instead the data is
-directly streamed into the underlying file object.
+The pickler now uses less memory when serializing large bytes and str
+objects into a file.  Pickles created with protocol 4 will require less
+memory for unpickling large bytes and str objects.
 
 ..

Modules/_pickle.c

@@ -2184,8 +2184,9 @@ _Pickler_write_bytes(PicklerObject *self,
     /* Stream write the payload into the file without going through the
        output buffer. */
     if (payload == NULL) {
-        payload = mem = PyMemoryView_FromMemory((char *) data, data_size,
-                                                PyBUF_READ);
+        /* TODO: It would be better to use a memoryview with a linked
+           original string if this is possible. */
+        payload = mem = PyBytes_FromStringAndSize(data, data_size);
         if (payload == NULL) {
             return -1;
         }
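A Python-level analogue of the distinction behind this fix (illustrative):

    # memoryview(obj) keeps a reference to its base object, so the buffer
    # cannot be freed while the view is alive -- the behaviour the TODO
    # above wishes for.
    data = bytes(range(16))
    view = memoryview(data)
    del data                 # buffer stays valid: the view owns a reference
    assert view[3] == 3

    # PyMemoryView_FromMemory(), by contrast, wraps raw memory it does not
    # own; the old code's view pointed into the string's internal buffer
    # and dangled once the string was deallocated. Copying the data into a
    # fresh bytes object (PyBytes_FromStringAndSize) avoids the dangling
    # pointer at the cost of one extra copy.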