bpo-31993: Do not create frames for large bytes and str objects (#5114)
when serialize into memory buffer with C pickle implementations. This optimization already is performed when serialize into memory with Python pickle implementations or into a file with both implementations.
This commit is contained in:
parent
cb3ae5588b
commit
0a2da50e18
|
@ -2097,20 +2097,21 @@ class AbstractPickleTests(unittest.TestCase):
|
|||
N = 1024 * 1024
|
||||
obj = [b'x' * N, b'y' * N, 'z' * N]
|
||||
for proto in range(4, pickle.HIGHEST_PROTOCOL + 1):
|
||||
for fast in [True, False]:
|
||||
for fast in [False, True]:
|
||||
with self.subTest(proto=proto, fast=fast):
|
||||
if hasattr(self, 'pickler'):
|
||||
if not fast:
|
||||
# fast=False by default.
|
||||
# This covers in-memory pickling with pickle.dumps().
|
||||
pickled = self.dumps(obj, proto)
|
||||
else:
|
||||
# Pickler is required when fast=True.
|
||||
if not hasattr(self, 'pickler'):
|
||||
continue
|
||||
buf = io.BytesIO()
|
||||
pickler = self.pickler(buf, protocol=proto)
|
||||
pickler.fast = fast
|
||||
pickler.dump(obj)
|
||||
pickled = buf.getvalue()
|
||||
elif fast:
|
||||
continue
|
||||
else:
|
||||
# Fallback to self.dumps when fast=False and
|
||||
# self.pickler is not available.
|
||||
pickled = self.dumps(obj, proto)
|
||||
unpickled = self.loads(pickled)
|
||||
# More informative error message in case of failure.
|
||||
self.assertEqual([len(x) for x in obj],
|
||||
|
|
|
@ -71,8 +71,6 @@ class PyPicklerTests(AbstractPickleTests):
|
|||
class InMemoryPickleTests(AbstractPickleTests, AbstractUnpickleTests,
|
||||
BigmemPickleTests):
|
||||
|
||||
pickler = pickle._Pickler
|
||||
unpickler = pickle._Unpickler
|
||||
bad_stack_errors = (pickle.UnpicklingError, IndexError)
|
||||
truncated_errors = (pickle.UnpicklingError, EOFError,
|
||||
AttributeError, ValueError,
|
||||
|
@ -84,6 +82,8 @@ class InMemoryPickleTests(AbstractPickleTests, AbstractUnpickleTests,
|
|||
def loads(self, buf, **kwds):
|
||||
return pickle.loads(buf, **kwds)
|
||||
|
||||
test_framed_write_sizes_with_delayed_writer = None
|
||||
|
||||
|
||||
class PersistentPicklerUnpicklerMixin(object):
|
||||
|
||||
|
|
|
@ -2142,47 +2142,74 @@ done:
|
|||
return 0;
|
||||
}
|
||||
|
||||
/* No-copy code-path to write large contiguous data directly into the
|
||||
underlying file object, bypassing the output_buffer of the Pickler. */
|
||||
static int
|
||||
_Pickler_write_large_bytes(
|
||||
PicklerObject *self, const char *header, Py_ssize_t header_size,
|
||||
PyObject *payload)
|
||||
{
|
||||
assert(self->output_buffer != NULL);
|
||||
assert(self->write != NULL);
|
||||
PyObject *result;
|
||||
/* Perform direct write of the header and payload of the binary object.
|
||||
|
||||
/* Commit the previous frame. */
|
||||
if (_Pickler_CommitFrame(self)) {
|
||||
return -1;
|
||||
The large contiguous data is written directly into the underlying file
|
||||
object, bypassing the output_buffer of the Pickler. We intentionally
|
||||
do not insert a protocol 4 frame opcode to make it possible to optimize
|
||||
file.read calls in the loader.
|
||||
*/
|
||||
static int
|
||||
_Pickler_write_bytes(PicklerObject *self,
|
||||
const char *header, Py_ssize_t header_size,
|
||||
const char *data, Py_ssize_t data_size,
|
||||
PyObject *payload)
|
||||
{
|
||||
int bypass_buffer = (data_size >= FRAME_SIZE_TARGET);
|
||||
int framing = self->framing;
|
||||
|
||||
if (bypass_buffer) {
|
||||
assert(self->output_buffer != NULL);
|
||||
/* Commit the previous frame. */
|
||||
if (_Pickler_CommitFrame(self)) {
|
||||
return -1;
|
||||
}
|
||||
/* Disable framing temporarily */
|
||||
self->framing = 0;
|
||||
}
|
||||
/* Disable frameing temporarily */
|
||||
self->framing = 0;
|
||||
|
||||
if (_Pickler_Write(self, header, header_size) < 0) {
|
||||
return -1;
|
||||
}
|
||||
/* Dump the output buffer to the file. */
|
||||
if (_Pickler_FlushToFile(self) < 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Stream write the payload into the file without going through the
|
||||
output buffer. */
|
||||
result = PyObject_CallFunctionObjArgs(self->write, payload, NULL);
|
||||
if (result == NULL) {
|
||||
return -1;
|
||||
}
|
||||
Py_DECREF(result);
|
||||
if (bypass_buffer && self->write != NULL) {
|
||||
/* Bypass the in-memory buffer to directly stream large data
|
||||
into the underlying file object. */
|
||||
PyObject *result, *mem = NULL;
|
||||
/* Dump the output buffer to the file. */
|
||||
if (_Pickler_FlushToFile(self) < 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Reinitialize the buffer for subsequent calls to _Pickler_Write. */
|
||||
if (_Pickler_ClearBuffer(self) < 0) {
|
||||
return -1;
|
||||
/* Stream write the payload into the file without going through the
|
||||
output buffer. */
|
||||
if (payload == NULL) {
|
||||
payload = mem = PyMemoryView_FromMemory((char *) data, data_size,
|
||||
PyBUF_READ);
|
||||
if (payload == NULL) {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
result = PyObject_CallFunctionObjArgs(self->write, payload, NULL);
|
||||
Py_XDECREF(mem);
|
||||
if (result == NULL) {
|
||||
return -1;
|
||||
}
|
||||
Py_DECREF(result);
|
||||
|
||||
/* Reinitialize the buffer for subsequent calls to _Pickler_Write. */
|
||||
if (_Pickler_ClearBuffer(self) < 0) {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (_Pickler_Write(self, data, data_size) < 0) {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
/* Re-enable framing for subsequent calls to _Pickler_Write. */
|
||||
self->framing = 1;
|
||||
self->framing = framing;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -2265,20 +2292,10 @@ save_bytes(PicklerObject *self, PyObject *obj)
|
|||
return -1; /* string too large */
|
||||
}
|
||||
|
||||
if (size < FRAME_SIZE_TARGET || self->write == NULL) {
|
||||
if (_Pickler_Write(self, header, len) < 0) {
|
||||
return -1;
|
||||
}
|
||||
if (_Pickler_Write(self, PyBytes_AS_STRING(obj), size) < 0) {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
else {
|
||||
/* Bypass the in-memory buffer to directly stream large data
|
||||
into the underlying file object. */
|
||||
if (_Pickler_write_large_bytes(self, header, len, obj) < 0) {
|
||||
return -1;
|
||||
}
|
||||
if (_Pickler_write_bytes(self, header, len,
|
||||
PyBytes_AS_STRING(obj), size, obj) < 0)
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (memo_put(self, obj) < 0)
|
||||
|
@ -2360,11 +2377,29 @@ error:
|
|||
}
|
||||
|
||||
static int
|
||||
write_utf8(PicklerObject *self, const char *data, Py_ssize_t size)
|
||||
write_unicode_binary(PicklerObject *self, PyObject *obj)
|
||||
{
|
||||
char header[9];
|
||||
Py_ssize_t len;
|
||||
PyObject *mem;
|
||||
PyObject *encoded = NULL;
|
||||
Py_ssize_t size;
|
||||
const char *data;
|
||||
|
||||
if (PyUnicode_READY(obj))
|
||||
return -1;
|
||||
|
||||
data = PyUnicode_AsUTF8AndSize(obj, &size);
|
||||
if (data == NULL) {
|
||||
/* Issue #8383: for strings with lone surrogates, fallback on the
|
||||
"surrogatepass" error handler. */
|
||||
PyErr_Clear();
|
||||
encoded = PyUnicode_AsEncodedString(obj, "utf-8", "surrogatepass");
|
||||
if (encoded == NULL)
|
||||
return -1;
|
||||
|
||||
data = PyBytes_AS_STRING(encoded);
|
||||
size = PyBytes_GET_SIZE(encoded);
|
||||
}
|
||||
|
||||
assert(size >= 0);
|
||||
if (size <= 0xff && self->proto >= 4) {
|
||||
|
@ -2388,61 +2423,18 @@ write_utf8(PicklerObject *self, const char *data, Py_ssize_t size)
|
|||
else {
|
||||
PyErr_SetString(PyExc_OverflowError,
|
||||
"cannot serialize a string larger than 4GiB");
|
||||
Py_XDECREF(encoded);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (size < FRAME_SIZE_TARGET || self->write == NULL) {
|
||||
if (_Pickler_Write(self, header, len) < 0) {
|
||||
return -1;
|
||||
}
|
||||
if (_Pickler_Write(self, data, size) < 0) {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
else {
|
||||
/* Bypass the in-memory buffer to directly stream large data
|
||||
into the underlying file object. */
|
||||
mem = PyMemoryView_FromMemory((char *) data, size, PyBUF_READ);
|
||||
if (mem == NULL) {
|
||||
return -1;
|
||||
}
|
||||
if (_Pickler_write_large_bytes(self, header, len, mem) < 0) {
|
||||
Py_DECREF(mem);
|
||||
return -1;
|
||||
}
|
||||
Py_DECREF(mem);
|
||||
if (_Pickler_write_bytes(self, header, len, data, size, encoded) < 0) {
|
||||
Py_XDECREF(encoded);
|
||||
return -1;
|
||||
}
|
||||
Py_XDECREF(encoded);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
write_unicode_binary(PicklerObject *self, PyObject *obj)
|
||||
{
|
||||
PyObject *encoded = NULL;
|
||||
Py_ssize_t size;
|
||||
const char *data;
|
||||
int r;
|
||||
|
||||
if (PyUnicode_READY(obj))
|
||||
return -1;
|
||||
|
||||
data = PyUnicode_AsUTF8AndSize(obj, &size);
|
||||
if (data != NULL)
|
||||
return write_utf8(self, data, size);
|
||||
|
||||
/* Issue #8383: for strings with lone surrogates, fallback on the
|
||||
"surrogatepass" error handler. */
|
||||
PyErr_Clear();
|
||||
encoded = PyUnicode_AsEncodedString(obj, "utf-8", "surrogatepass");
|
||||
if (encoded == NULL)
|
||||
return -1;
|
||||
|
||||
r = write_utf8(self, PyBytes_AS_STRING(encoded),
|
||||
PyBytes_GET_SIZE(encoded));
|
||||
Py_DECREF(encoded);
|
||||
return r;
|
||||
}
|
||||
|
||||
static int
|
||||
save_unicode(PicklerObject *self, PyObject *obj)
|
||||
{
|
||||
|
|
Loading…
Reference in New Issue