bpo-31993: Do not create frames for large bytes and str objects (#5114)

when serializing into a memory buffer with the C pickle implementation.

This optimization is already performed when serializing into memory
with the Python pickle implementation, or into a file with both
implementations.
This commit is contained in:
Serhiy Storchaka 2018-01-11 13:03:20 +02:00 committed by GitHub
parent cb3ae5588b
commit 0a2da50e18
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 97 additions and 104 deletions

View File

@ -2097,20 +2097,21 @@ class AbstractPickleTests(unittest.TestCase):
N = 1024 * 1024
obj = [b'x' * N, b'y' * N, 'z' * N]
for proto in range(4, pickle.HIGHEST_PROTOCOL + 1):
for fast in [True, False]:
for fast in [False, True]:
with self.subTest(proto=proto, fast=fast):
if hasattr(self, 'pickler'):
if not fast:
# fast=False by default.
# This covers in-memory pickling with pickle.dumps().
pickled = self.dumps(obj, proto)
else:
# Pickler is required when fast=True.
if not hasattr(self, 'pickler'):
continue
buf = io.BytesIO()
pickler = self.pickler(buf, protocol=proto)
pickler.fast = fast
pickler.dump(obj)
pickled = buf.getvalue()
elif fast:
continue
else:
# Fallback to self.dumps when fast=False and
# self.pickler is not available.
pickled = self.dumps(obj, proto)
unpickled = self.loads(pickled)
# More informative error message in case of failure.
self.assertEqual([len(x) for x in obj],

View File

@ -71,8 +71,6 @@ class PyPicklerTests(AbstractPickleTests):
class InMemoryPickleTests(AbstractPickleTests, AbstractUnpickleTests,
BigmemPickleTests):
pickler = pickle._Pickler
unpickler = pickle._Unpickler
bad_stack_errors = (pickle.UnpicklingError, IndexError)
truncated_errors = (pickle.UnpicklingError, EOFError,
AttributeError, ValueError,
@ -84,6 +82,8 @@ class InMemoryPickleTests(AbstractPickleTests, AbstractUnpickleTests,
def loads(self, buf, **kwds):
return pickle.loads(buf, **kwds)
test_framed_write_sizes_with_delayed_writer = None
class PersistentPicklerUnpicklerMixin(object):

View File

@ -2142,47 +2142,74 @@ done:
return 0;
}
/* Perform direct write of the header and payload of the binary object.

   The large contiguous data is written directly into the underlying file
   object, bypassing the output_buffer of the Pickler.  We intentionally
   do not insert a protocol 4 frame opcode to make it possible to optimize
   file.read calls in the loader.

   header/header_size: the pickle opcode bytes preceding the payload.
   data/data_size: the raw payload bytes.
   payload: optional Python object owning `data` (may be NULL); when NULL
   and the payload is streamed, a temporary memoryview is created over
   `data` instead.

   Returns 0 on success, -1 (with an exception set) on failure. */
static int
_Pickler_write_bytes(PicklerObject *self,
                     const char *header, Py_ssize_t header_size,
                     const char *data, Py_ssize_t data_size,
                     PyObject *payload)
{
    int bypass_buffer = (data_size >= FRAME_SIZE_TARGET);
    int framing = self->framing;

    if (bypass_buffer) {
        assert(self->output_buffer != NULL);
        /* Commit the previous frame. */
        if (_Pickler_CommitFrame(self)) {
            return -1;
        }
        /* Disable framing temporarily so neither the header nor the large
           payload gets wrapped in a new frame. */
        self->framing = 0;
    }

    if (_Pickler_Write(self, header, header_size) < 0) {
        return -1;
    }

    if (bypass_buffer && self->write != NULL) {
        /* Bypass the in-memory buffer to directly stream large data
           into the underlying file object. */
        PyObject *result, *mem = NULL;

        /* Dump the output buffer to the file. */
        if (_Pickler_FlushToFile(self) < 0) {
            return -1;
        }

        /* Stream write the payload into the file without going through the
           output buffer. */
        if (payload == NULL) {
            /* No owning object was supplied: wrap the raw bytes in a
               read-only memoryview so self->write receives a buffer
               object without copying the data. */
            payload = mem = PyMemoryView_FromMemory((char *) data, data_size,
                                                    PyBUF_READ);
            if (payload == NULL) {
                return -1;
            }
        }
        result = PyObject_CallFunctionObjArgs(self->write, payload, NULL);
        Py_XDECREF(mem);
        if (result == NULL) {
            return -1;
        }
        Py_DECREF(result);

        /* Reinitialize the buffer for subsequent calls to _Pickler_Write. */
        if (_Pickler_ClearBuffer(self) < 0) {
            return -1;
        }
    }
    else {
        /* Small payload, or no underlying write method (in-memory
           pickling): go through the regular buffered path. */
        if (_Pickler_Write(self, data, data_size) < 0) {
            return -1;
        }
    }

    /* Restore the caller's framing mode for subsequent calls to
       _Pickler_Write (not unconditionally 1: framing may have been off). */
    self->framing = framing;

    return 0;
}
@ -2265,20 +2292,10 @@ save_bytes(PicklerObject *self, PyObject *obj)
return -1; /* string too large */
}
if (size < FRAME_SIZE_TARGET || self->write == NULL) {
if (_Pickler_Write(self, header, len) < 0) {
return -1;
}
if (_Pickler_Write(self, PyBytes_AS_STRING(obj), size) < 0) {
return -1;
}
}
else {
/* Bypass the in-memory buffer to directly stream large data
into the underlying file object. */
if (_Pickler_write_large_bytes(self, header, len, obj) < 0) {
return -1;
}
if (_Pickler_write_bytes(self, header, len,
PyBytes_AS_STRING(obj), size, obj) < 0)
{
return -1;
}
if (memo_put(self, obj) < 0)
@ -2360,11 +2377,29 @@ error:
}
static int
write_utf8(PicklerObject *self, const char *data, Py_ssize_t size)
write_unicode_binary(PicklerObject *self, PyObject *obj)
{
char header[9];
Py_ssize_t len;
PyObject *mem;
PyObject *encoded = NULL;
Py_ssize_t size;
const char *data;
if (PyUnicode_READY(obj))
return -1;
data = PyUnicode_AsUTF8AndSize(obj, &size);
if (data == NULL) {
/* Issue #8383: for strings with lone surrogates, fallback on the
"surrogatepass" error handler. */
PyErr_Clear();
encoded = PyUnicode_AsEncodedString(obj, "utf-8", "surrogatepass");
if (encoded == NULL)
return -1;
data = PyBytes_AS_STRING(encoded);
size = PyBytes_GET_SIZE(encoded);
}
assert(size >= 0);
if (size <= 0xff && self->proto >= 4) {
@ -2388,61 +2423,18 @@ write_utf8(PicklerObject *self, const char *data, Py_ssize_t size)
else {
PyErr_SetString(PyExc_OverflowError,
"cannot serialize a string larger than 4GiB");
Py_XDECREF(encoded);
return -1;
}
if (size < FRAME_SIZE_TARGET || self->write == NULL) {
if (_Pickler_Write(self, header, len) < 0) {
return -1;
}
if (_Pickler_Write(self, data, size) < 0) {
return -1;
}
}
else {
/* Bypass the in-memory buffer to directly stream large data
into the underlying file object. */
mem = PyMemoryView_FromMemory((char *) data, size, PyBUF_READ);
if (mem == NULL) {
return -1;
}
if (_Pickler_write_large_bytes(self, header, len, mem) < 0) {
Py_DECREF(mem);
return -1;
}
Py_DECREF(mem);
if (_Pickler_write_bytes(self, header, len, data, size, encoded) < 0) {
Py_XDECREF(encoded);
return -1;
}
Py_XDECREF(encoded);
return 0;
}
/* NOTE(review): this is the PRE-commit implementation shown as removed
   lines in this diff; the commit replaces it with a version that inlines
   the UTF-8 encoding and calls _Pickler_write_bytes directly.
   Serialize a str object: obtain its UTF-8 representation and delegate
   the opcode/byte emission to write_utf8().  Returns 0 on success,
   -1 with an exception set on failure. */
static int
write_unicode_binary(PicklerObject *self, PyObject *obj)
{
PyObject *encoded = NULL;
Py_ssize_t size;
const char *data;
int r;
if (PyUnicode_READY(obj))
return -1;
/* Fast path: use the cached UTF-8 representation when available. */
data = PyUnicode_AsUTF8AndSize(obj, &size);
if (data != NULL)
return write_utf8(self, data, size);
/* Issue #8383: for strings with lone surrogates, fallback on the
"surrogatepass" error handler. */
PyErr_Clear();
encoded = PyUnicode_AsEncodedString(obj, "utf-8", "surrogatepass");
if (encoded == NULL)
return -1;
/* `encoded` owns the fallback bytes; release it after writing. */
r = write_utf8(self, PyBytes_AS_STRING(encoded),
PyBytes_GET_SIZE(encoded));
Py_DECREF(encoded);
return r;
}
static int
save_unicode(PicklerObject *self, PyObject *obj)
{