bpo-31993: Do not use memoryview when pickling large strings. (#5154)
PyMemoryView_FromMemory() created a memoryview referring to the internal data of the string. When the string was destroyed, the memoryview was left referring to freed memory.
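
For illustration, a minimal Python-level sketch of the scenario this fix targets (DelayedWriter is a hypothetical stand-in for any file-like object that retains chunks instead of consuming them immediately):

    import pickle

    class DelayedWriter:
        """Hypothetical file-like object that keeps chunks for later use."""
        def __init__(self):
            self.chunks = []

        def write(self, chunk):
            # Keep a reference instead of consuming the data immediately.
            self.chunks.append(chunk)

    writer = DelayedWriter()
    big_string = "0123456789abcdef" * (1 << 16)  # large enough to be framed
    pickle.Pickler(writer, protocol=4).dump(big_string)

    # Reading the chunks only after dump() returns is the dangerous pattern:
    # before this fix, a chunk for a large str could be a memoryview over
    # the string's temporary, already-freed UTF-8 buffer.
    data = b"".join(bytes(c) for c in writer.chunks)
    assert pickle.loads(data) == big_string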
This commit is contained in:
parent f3031b8a7a
commit 5b76bdba07
@@ -2169,67 +2169,67 @@ class AbstractPickleTests(unittest.TestCase):
     def test_framed_write_sizes_with_delayed_writer(self):
         class ChunkAccumulator:
             """Accumulate pickler output in a list of raw chunks."""

             def __init__(self):
                 self.chunks = []

             def write(self, chunk):
                 self.chunks.append(chunk)

             def concatenate_chunks(self):
-                # Some chunks can be memoryview instances, we need to convert
-                # them to bytes to be able to call join
-                return b"".join([c.tobytes() if hasattr(c, 'tobytes') else c
-                                 for c in self.chunks])
-
-        small_objects = [(str(i).encode('ascii'), i % 42, {'i': str(i)})
-                         for i in range(int(1e4))]
+                return b"".join(self.chunks)

         for proto in range(4, pickle.HIGHEST_PROTOCOL + 1):
+            objects = [(str(i).encode('ascii'), i % 42, {'i': str(i)})
+                       for i in range(int(1e4))]
+            # Add a large unique ASCII string
+            objects.append('0123456789abcdef' *
+                           (self.FRAME_SIZE_TARGET // 16 + 1))
+
             # Protocol 4 packs groups of small objects into frames and issues
             # calls to write only once or twice per frame:
             # The C pickler issues one call to write per-frame (header and
             # contents) while Python pickler issues two calls to write: one for
             # the frame header and one for the frame binary contents.
             writer = ChunkAccumulator()
-            self.pickler(writer, proto).dump(small_objects)
+            self.pickler(writer, proto).dump(objects)

             # Actually read the binary content of the chunks after the end
-            # of the call to dump: ant memoryview passed to write should not
+            # of the call to dump: any memoryview passed to write should not
             # be released otherwise this delayed access would not be possible.
             pickled = writer.concatenate_chunks()
             reconstructed = self.loads(pickled)
-            self.assertEqual(reconstructed, small_objects)
+            self.assertEqual(reconstructed, objects)
             self.assertGreater(len(writer.chunks), 1)

-            n_frames, remainder = divmod(len(pickled), self.FRAME_SIZE_TARGET)
-            if remainder > 0:
-                n_frames += 1
+            # memoryviews should own the memory.
+            del objects
+            support.gc_collect()
+            self.assertEqual(writer.concatenate_chunks(), pickled)
+
+            n_frames = (len(pickled) - 1) // self.FRAME_SIZE_TARGET + 1
             # There should be at least one call to write per frame
             self.assertGreaterEqual(len(writer.chunks), n_frames)

             # but not too many either: there can be one for the proto,
-            # one per-frame header and one per frame for the actual contents.
-            self.assertGreaterEqual(2 * n_frames + 1, len(writer.chunks))
+            # one per-frame header, one per frame for the actual contents,
+            # and two for the header.
+            self.assertLessEqual(len(writer.chunks), 2 * n_frames + 3)

-            chunk_sizes = [len(c) for c in writer.chunks[:-1]]
+            chunk_sizes = [len(c) for c in writer.chunks]
             large_sizes = [s for s in chunk_sizes
                            if s >= self.FRAME_SIZE_TARGET]
-            small_sizes = [s for s in chunk_sizes
-                           if s < self.FRAME_SIZE_TARGET]
+            medium_sizes = [s for s in chunk_sizes
+                            if 9 < s < self.FRAME_SIZE_TARGET]
+            small_sizes = [s for s in chunk_sizes if s <= 9]

             # Large chunks should not be too large:
             for chunk_size in large_sizes:
-                self.assertGreater(2 * self.FRAME_SIZE_TARGET, chunk_size)
-
-            last_chunk_size = len(writer.chunks[-1])
-            self.assertGreater(2 * self.FRAME_SIZE_TARGET, last_chunk_size)
-
-            # Small chunks (if any) should be very small
-            # (only proto and frame headers)
-            for chunk_size in small_sizes:
-                self.assertGreaterEqual(9, chunk_size)
+                self.assertLess(chunk_size, 2 * self.FRAME_SIZE_TARGET,
+                                chunk_sizes)
+            # There shouldn't be too many small chunks: the protocol header,
+            # the frame headers and the large string headers are written
+            # in small chunks.
+            self.assertLessEqual(len(small_sizes),
+                                 len(large_sizes) + len(medium_sizes) + 3,
+                                 chunk_sizes)

     def test_nested_names(self):
         global Nested
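
The frame-count bounds asserted in this test follow from ceiling division; a small sketch of that arithmetic (the 64 KiB FRAME_SIZE_TARGET value is pickletester's, restated here as an assumption):

    FRAME_SIZE_TARGET = 64 * 1024  # assumed: pickletester's frame size target

    def min_frames(pickled_len):
        # Ceiling division: N bytes of pickle need at least
        # ceil(N / FRAME_SIZE_TARGET) frames.
        return (pickled_len - 1) // FRAME_SIZE_TARGET + 1

    assert min_frames(64 * 1024) == 1      # exactly one full frame
    assert min_frames(64 * 1024 + 1) == 2  # one extra byte starts a new frame

    # Bound checked by assertLessEqual above: one header write plus one
    # contents write per frame, and up to three small writes besides.
    n_frames = min_frames(300_000)         # -> 5
    assert 2 * n_frames + 3 == 13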
@@ -693,9 +693,9 @@ ctypes._aix.find_library() Patch by: Michael Felt
 .. nonce: -OMNg8
 .. section: Library

-The picklers no longer allocate temporary memory when dumping large
-``bytes`` and ``str`` objects into a file object. Instead the data is
-directly streamed into the underlying file object.
+The pickler now uses less memory when serializing large bytes and str
+objects into a file. Pickles created with protocol 4 will require less
+memory for unpickling large bytes and str objects.

 ..
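
As a usage-level illustration of the entry above (the file name and payload size are arbitrary), dumping a large bytes object with protocol 4 streams the payload to the file's write() instead of accumulating it in the pickler's output buffer:

    import pickle

    payload = b"\x00" * (16 * 1024 * 1024)  # 16 MiB, size chosen arbitrarily
    with open("big.pkl", "wb") as f:
        # The large payload is handed to f.write() directly rather than
        # being copied into the pickler's internal buffer first.
        pickle.dump(payload, f, protocol=4)

    with open("big.pkl", "rb") as f:
        assert pickle.load(f) == payload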
@@ -2184,8 +2184,9 @@ _Pickler_write_bytes(PicklerObject *self,
         /* Stream write the payload into the file without going through the
            output buffer. */
         if (payload == NULL) {
-            payload = mem = PyMemoryView_FromMemory((char *) data, data_size,
-                                                    PyBUF_READ);
+            /* TODO: It would be better to use a memoryview with a linked
+               original string if this is possible. */
+            payload = mem = PyBytes_FromStringAndSize(data, data_size);
             if (payload == NULL) {
                 return -1;
             }
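
A Python-level sketch of the ownership distinction behind this change (an analogy, not the C code path): a memoryview taken from a Python object pins its exporter's buffer, whereas PyMemoryView_FromMemory() wraps a bare pointer with no lifetime guarantee, which is why copying into a bytes object is the safe choice here:

    buf = bytearray(b"0123456789abcdef")
    view = memoryview(buf)
    try:
        buf.clear()    # resizing is refused while the view is exported
    except BufferError:
        pass           # a Python-level view keeps its buffer alive and valid

    # PyMemoryView_FromMemory() gives no such guarantee: it wraps a raw
    # pointer, so freeing the original string left the view dangling.
    # PyBytes_FromStringAndSize() copies the data instead, so the chunk
    # handed to write() owns its memory.
    safe_chunk = bytes(view)
    view.release()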