From 0a2da50e1867831251fad75377d0f10713eb6301 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 11 Jan 2018 13:03:20 +0200 Subject: [PATCH] bpo-31993: Do not create frames for large bytes and str objects (#5114) when serialize into memory buffer with C pickle implementations. This optimization already is performed when serialize into memory with Python pickle implementations or into a file with both implementations. --- Lib/test/pickletester.py | 17 ++-- Lib/test/test_pickle.py | 4 +- Modules/_pickle.c | 180 +++++++++++++++++++-------------------- 3 files changed, 97 insertions(+), 104 deletions(-) diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py index 5d983eb617f..f4e3f81249c 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -2097,20 +2097,21 @@ class AbstractPickleTests(unittest.TestCase): N = 1024 * 1024 obj = [b'x' * N, b'y' * N, 'z' * N] for proto in range(4, pickle.HIGHEST_PROTOCOL + 1): - for fast in [True, False]: + for fast in [False, True]: with self.subTest(proto=proto, fast=fast): - if hasattr(self, 'pickler'): + if not fast: + # fast=False by default. + # This covers in-memory pickling with pickle.dumps(). + pickled = self.dumps(obj, proto) + else: + # Pickler is required when fast=True. + if not hasattr(self, 'pickler'): + continue buf = io.BytesIO() pickler = self.pickler(buf, protocol=proto) pickler.fast = fast pickler.dump(obj) pickled = buf.getvalue() - elif fast: - continue - else: - # Fallback to self.dumps when fast=False and - # self.pickler is not available. - pickled = self.dumps(obj, proto) unpickled = self.loads(pickled) # More informative error message in case of failure. self.assertEqual([len(x) for x in obj], diff --git a/Lib/test/test_pickle.py b/Lib/test/test_pickle.py index 895ed48df1d..0051a0118a3 100644 --- a/Lib/test/test_pickle.py +++ b/Lib/test/test_pickle.py @@ -71,8 +71,6 @@ class PyPicklerTests(AbstractPickleTests): class InMemoryPickleTests(AbstractPickleTests, AbstractUnpickleTests, BigmemPickleTests): - pickler = pickle._Pickler - unpickler = pickle._Unpickler bad_stack_errors = (pickle.UnpicklingError, IndexError) truncated_errors = (pickle.UnpicklingError, EOFError, AttributeError, ValueError, @@ -84,6 +82,8 @@ class InMemoryPickleTests(AbstractPickleTests, AbstractUnpickleTests, def loads(self, buf, **kwds): return pickle.loads(buf, **kwds) + test_framed_write_sizes_with_delayed_writer = None + class PersistentPicklerUnpicklerMixin(object): diff --git a/Modules/_pickle.c b/Modules/_pickle.c index 5cb1fba6cc2..5b3de4d96dd 100644 --- a/Modules/_pickle.c +++ b/Modules/_pickle.c @@ -2142,47 +2142,74 @@ done: return 0; } -/* No-copy code-path to write large contiguous data directly into the - underlying file object, bypassing the output_buffer of the Pickler. */ -static int -_Pickler_write_large_bytes( - PicklerObject *self, const char *header, Py_ssize_t header_size, - PyObject *payload) -{ - assert(self->output_buffer != NULL); - assert(self->write != NULL); - PyObject *result; +/* Perform direct write of the header and payload of the binary object. - /* Commit the previous frame. */ - if (_Pickler_CommitFrame(self)) { - return -1; + The large contiguous data is written directly into the underlying file + object, bypassing the output_buffer of the Pickler. We intentionally + do not insert a protocol 4 frame opcode to make it possible to optimize + file.read calls in the loader. + */ +static int +_Pickler_write_bytes(PicklerObject *self, + const char *header, Py_ssize_t header_size, + const char *data, Py_ssize_t data_size, + PyObject *payload) +{ + int bypass_buffer = (data_size >= FRAME_SIZE_TARGET); + int framing = self->framing; + + if (bypass_buffer) { + assert(self->output_buffer != NULL); + /* Commit the previous frame. */ + if (_Pickler_CommitFrame(self)) { + return -1; + } + /* Disable framing temporarily */ + self->framing = 0; } - /* Disable frameing temporarily */ - self->framing = 0; if (_Pickler_Write(self, header, header_size) < 0) { return -1; } - /* Dump the output buffer to the file. */ - if (_Pickler_FlushToFile(self) < 0) { - return -1; - } - /* Stream write the payload into the file without going through the - output buffer. */ - result = PyObject_CallFunctionObjArgs(self->write, payload, NULL); - if (result == NULL) { - return -1; - } - Py_DECREF(result); + if (bypass_buffer && self->write != NULL) { + /* Bypass the in-memory buffer to directly stream large data + into the underlying file object. */ + PyObject *result, *mem = NULL; + /* Dump the output buffer to the file. */ + if (_Pickler_FlushToFile(self) < 0) { + return -1; + } - /* Reinitialize the buffer for subsequent calls to _Pickler_Write. */ - if (_Pickler_ClearBuffer(self) < 0) { - return -1; + /* Stream write the payload into the file without going through the + output buffer. */ + if (payload == NULL) { + payload = mem = PyMemoryView_FromMemory((char *) data, data_size, + PyBUF_READ); + if (payload == NULL) { + return -1; + } + } + result = PyObject_CallFunctionObjArgs(self->write, payload, NULL); + Py_XDECREF(mem); + if (result == NULL) { + return -1; + } + Py_DECREF(result); + + /* Reinitialize the buffer for subsequent calls to _Pickler_Write. */ + if (_Pickler_ClearBuffer(self) < 0) { + return -1; + } + } + else { + if (_Pickler_Write(self, data, data_size) < 0) { + return -1; + } } /* Re-enable framing for subsequent calls to _Pickler_Write. */ - self->framing = 1; + self->framing = framing; return 0; } @@ -2265,20 +2292,10 @@ save_bytes(PicklerObject *self, PyObject *obj) return -1; /* string too large */ } - if (size < FRAME_SIZE_TARGET || self->write == NULL) { - if (_Pickler_Write(self, header, len) < 0) { - return -1; - } - if (_Pickler_Write(self, PyBytes_AS_STRING(obj), size) < 0) { - return -1; - } - } - else { - /* Bypass the in-memory buffer to directly stream large data - into the underlying file object. */ - if (_Pickler_write_large_bytes(self, header, len, obj) < 0) { - return -1; - } + if (_Pickler_write_bytes(self, header, len, + PyBytes_AS_STRING(obj), size, obj) < 0) + { + return -1; } if (memo_put(self, obj) < 0) @@ -2360,11 +2377,29 @@ error: } static int -write_utf8(PicklerObject *self, const char *data, Py_ssize_t size) +write_unicode_binary(PicklerObject *self, PyObject *obj) { char header[9]; Py_ssize_t len; - PyObject *mem; + PyObject *encoded = NULL; + Py_ssize_t size; + const char *data; + + if (PyUnicode_READY(obj)) + return -1; + + data = PyUnicode_AsUTF8AndSize(obj, &size); + if (data == NULL) { + /* Issue #8383: for strings with lone surrogates, fallback on the + "surrogatepass" error handler. */ + PyErr_Clear(); + encoded = PyUnicode_AsEncodedString(obj, "utf-8", "surrogatepass"); + if (encoded == NULL) + return -1; + + data = PyBytes_AS_STRING(encoded); + size = PyBytes_GET_SIZE(encoded); + } assert(size >= 0); if (size <= 0xff && self->proto >= 4) { @@ -2388,61 +2423,18 @@ write_utf8(PicklerObject *self, const char *data, Py_ssize_t size) else { PyErr_SetString(PyExc_OverflowError, "cannot serialize a string larger than 4GiB"); + Py_XDECREF(encoded); return -1; } - if (size < FRAME_SIZE_TARGET || self->write == NULL) { - if (_Pickler_Write(self, header, len) < 0) { - return -1; - } - if (_Pickler_Write(self, data, size) < 0) { - return -1; - } - } - else { - /* Bypass the in-memory buffer to directly stream large data - into the underlying file object. */ - mem = PyMemoryView_FromMemory((char *) data, size, PyBUF_READ); - if (mem == NULL) { - return -1; - } - if (_Pickler_write_large_bytes(self, header, len, mem) < 0) { - Py_DECREF(mem); - return -1; - } - Py_DECREF(mem); + if (_Pickler_write_bytes(self, header, len, data, size, encoded) < 0) { + Py_XDECREF(encoded); + return -1; } + Py_XDECREF(encoded); return 0; } -static int -write_unicode_binary(PicklerObject *self, PyObject *obj) -{ - PyObject *encoded = NULL; - Py_ssize_t size; - const char *data; - int r; - - if (PyUnicode_READY(obj)) - return -1; - - data = PyUnicode_AsUTF8AndSize(obj, &size); - if (data != NULL) - return write_utf8(self, data, size); - - /* Issue #8383: for strings with lone surrogates, fallback on the - "surrogatepass" error handler. */ - PyErr_Clear(); - encoded = PyUnicode_AsEncodedString(obj, "utf-8", "surrogatepass"); - if (encoded == NULL) - return -1; - - r = write_utf8(self, PyBytes_AS_STRING(encoded), - PyBytes_GET_SIZE(encoded)); - Py_DECREF(encoded); - return r; -} - static int save_unicode(PicklerObject *self, PyObject *obj) {