Issue #3873: Speed up unpickling from file objects which have a peek()
method.
This commit is contained in:
parent
b0182c8ca5
commit
04248a8d99
|
@ -30,6 +30,21 @@ def count_opcode(code, pickle):
|
|||
n += 1
|
||||
return n
|
||||
|
||||
|
||||
class UnseekableIO(io.BytesIO):
|
||||
def peek(self, *args):
|
||||
raise NotImplementedError
|
||||
|
||||
def seekable(self):
|
||||
return False
|
||||
|
||||
def seek(self, *args):
|
||||
raise io.UnsupportedOperation
|
||||
|
||||
def tell(self):
|
||||
raise io.UnsupportedOperation
|
||||
|
||||
|
||||
# We can't very well test the extension registry without putting known stuff
|
||||
# in it, but we have to be careful to restore its original state. Code
|
||||
# should do this:
|
||||
|
@ -1072,9 +1087,10 @@ class AbstractPickleTests(unittest.TestCase):
|
|||
# Test the correctness of internal buffering routines when handling
|
||||
# large data.
|
||||
for proto in protocols:
|
||||
data = (1, b'x' * (256 * 1024))
|
||||
data = (1, min, b'xy' * (30 * 1024), len)
|
||||
dumped = self.dumps(data, proto)
|
||||
loaded = self.loads(dumped)
|
||||
self.assertEqual(len(loaded), len(data))
|
||||
self.assertEqual(loaded, data)
|
||||
|
||||
|
||||
|
@ -1373,6 +1389,31 @@ class AbstractPicklerUnpicklerObjectTests(unittest.TestCase):
|
|||
f.seek(0)
|
||||
self.assertEqual(unpickler.load(), data2)
|
||||
|
||||
def _check_multiple_unpicklings(self, ioclass):
|
||||
for proto in protocols:
|
||||
data1 = [(x, str(x)) for x in range(2000)] + [b"abcde", len]
|
||||
f = ioclass()
|
||||
pickler = self.pickler_class(f, protocol=proto)
|
||||
pickler.dump(data1)
|
||||
pickled = f.getvalue()
|
||||
|
||||
N = 5
|
||||
f = ioclass(pickled * N)
|
||||
unpickler = self.unpickler_class(f)
|
||||
for i in range(N):
|
||||
if f.seekable():
|
||||
pos = f.tell()
|
||||
self.assertEqual(unpickler.load(), data1)
|
||||
if f.seekable():
|
||||
self.assertEqual(f.tell(), pos + len(pickled))
|
||||
self.assertRaises(EOFError, unpickler.load)
|
||||
|
||||
def test_multiple_unpicklings_seekable(self):
|
||||
self._check_multiple_unpicklings(io.BytesIO)
|
||||
|
||||
def test_multiple_unpicklings_unseekable(self):
|
||||
self._check_multiple_unpicklings(UnseekableIO)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Print some stuff that can be used to rewrite DATA{0,1,2}
|
||||
|
|
|
@ -13,6 +13,9 @@ Core and Builtins
|
|||
Library
|
||||
-------
|
||||
|
||||
- Issue #3873: Speed up unpickling from file objects which have a peek()
|
||||
method.
|
||||
|
||||
- Issue #10075: Add a session_stats() method to SSLContext objects.
|
||||
|
||||
- Issue #9948: Fixed problem of losing filename case information.
|
||||
|
|
|
@ -101,6 +101,9 @@ enum {
|
|||
/* Maximum size of the write buffer of Pickler when pickling to a
|
||||
stream. This is ignored for in-memory pickling. */
|
||||
MAX_WRITE_BUF_SIZE = 64 * 1024,
|
||||
|
||||
/* Prefetch size when unpickling (disabled on unpeekable streams) */
|
||||
PREFETCH = 8192 * 16,
|
||||
};
|
||||
|
||||
/* Exception classes for pickle. These should override the ones defined in
|
||||
|
@ -355,8 +358,10 @@ typedef struct UnpicklerObject {
|
|||
char *input_line;
|
||||
Py_ssize_t input_len;
|
||||
Py_ssize_t next_read_idx;
|
||||
Py_ssize_t prefetched_idx; /* index of first prefetched byte */
|
||||
PyObject *read; /* read() method of the input stream. */
|
||||
PyObject *readline; /* readline() method of the input stream. */
|
||||
PyObject *peek; /* peek() method of the input stream, or NULL */
|
||||
|
||||
char *encoding; /* Name of the encoding to be used for
|
||||
decoding strings pickled using Python
|
||||
|
@ -859,9 +864,28 @@ _Unpickler_SetStringInput(UnpicklerObject *self, PyObject *input)
|
|||
self->input_buffer = self->buffer.buf;
|
||||
self->input_len = self->buffer.len;
|
||||
self->next_read_idx = 0;
|
||||
self->prefetched_idx = self->input_len;
|
||||
return self->input_len;
|
||||
}
|
||||
|
||||
static int
|
||||
_Unpickler_SkipConsumed(UnpicklerObject *self)
|
||||
{
|
||||
Py_ssize_t consumed = self->next_read_idx - self->prefetched_idx;
|
||||
|
||||
if (consumed > 0) {
|
||||
PyObject *r;
|
||||
assert(self->peek); /* otherwise we did something wrong */
|
||||
/* This makes an useless copy... */
|
||||
r = PyObject_CallFunction(self->read, "n", consumed);
|
||||
if (r == NULL)
|
||||
return -1;
|
||||
Py_DECREF(r);
|
||||
self->prefetched_idx = self->next_read_idx;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const Py_ssize_t READ_WHOLE_LINE = -1;
|
||||
|
||||
/* If reading from a file, we need to only pull the bytes we need, since there
|
||||
|
@ -882,10 +906,12 @@ static Py_ssize_t
|
|||
_Unpickler_ReadFromFile(UnpicklerObject *self, Py_ssize_t n)
|
||||
{
|
||||
PyObject *data;
|
||||
Py_ssize_t read_size;
|
||||
Py_ssize_t read_size, prefetched_size = 0;
|
||||
|
||||
assert(self->read != NULL);
|
||||
assert(self->next_read_idx == 0);
|
||||
|
||||
if (_Unpickler_SkipConsumed(self) < 0)
|
||||
return -1;
|
||||
|
||||
if (n == READ_WHOLE_LINE)
|
||||
data = PyObject_Call(self->readline, empty_tuple, NULL);
|
||||
|
@ -895,13 +921,41 @@ _Unpickler_ReadFromFile(UnpicklerObject *self, Py_ssize_t n)
|
|||
return -1;
|
||||
data = _Unpickler_FastCall(self, self->read, len);
|
||||
}
|
||||
|
||||
if (data == NULL)
|
||||
return -1;
|
||||
|
||||
read_size = _Unpickler_SetStringInput(self, data);
|
||||
self->input_len = 0;
|
||||
/* Prefetch some data without advancing the file pointer, if possible */
|
||||
if (self->peek) {
|
||||
PyObject *len, *prefetched;
|
||||
len = PyLong_FromSsize_t(PREFETCH);
|
||||
if (len == NULL) {
|
||||
Py_DECREF(data);
|
||||
return -1;
|
||||
}
|
||||
prefetched = _Unpickler_FastCall(self, self->peek, len);
|
||||
if (prefetched == NULL) {
|
||||
if (PyErr_ExceptionMatches(PyExc_NotImplementedError)) {
|
||||
/* peek() is probably not supported by the given file object */
|
||||
PyErr_Clear();
|
||||
Py_CLEAR(self->peek);
|
||||
}
|
||||
else {
|
||||
Py_DECREF(data);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
else {
|
||||
assert(PyBytes_Check(prefetched));
|
||||
prefetched_size = PyBytes_GET_SIZE(prefetched);
|
||||
PyBytes_ConcatAndDel(&data, prefetched);
|
||||
if (data == NULL)
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
read_size = _Unpickler_SetStringInput(self, data) - prefetched_size;
|
||||
Py_DECREF(data);
|
||||
self->prefetched_idx = read_size;
|
||||
return read_size;
|
||||
}
|
||||
|
||||
|
@ -921,30 +975,31 @@ _Unpickler_ReadFromFile(UnpicklerObject *self, Py_ssize_t n)
|
|||
static Py_ssize_t
|
||||
_Unpickler_Read(UnpicklerObject *self, char **s, Py_ssize_t n)
|
||||
{
|
||||
Py_ssize_t num_read;
|
||||
|
||||
if (n == 0) {
|
||||
*s = NULL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* This condition will always be true if self->read. */
|
||||
if (self->next_read_idx + n > self->input_len) {
|
||||
if (self->read) {
|
||||
Py_ssize_t num_read;
|
||||
assert(self->next_read_idx == self->input_len);
|
||||
num_read = _Unpickler_ReadFromFile(self, n);
|
||||
if (n < 0)
|
||||
return -1;
|
||||
if (num_read == n) {
|
||||
*s = self->input_buffer;
|
||||
return num_read;
|
||||
}
|
||||
}
|
||||
if (self->next_read_idx + n <= self->input_len) {
|
||||
*s = self->input_buffer + self->next_read_idx;
|
||||
self->next_read_idx += n;
|
||||
return n;
|
||||
}
|
||||
if (!self->read) {
|
||||
PyErr_Format(PyExc_EOFError, "Ran out of input");
|
||||
return -1;
|
||||
}
|
||||
assert(self->read == NULL);
|
||||
*s = self->input_buffer + self->next_read_idx;
|
||||
self->next_read_idx += n;
|
||||
num_read = _Unpickler_ReadFromFile(self, n);
|
||||
if (num_read < 0)
|
||||
return -1;
|
||||
if (num_read < n) {
|
||||
PyErr_Format(PyExc_EOFError, "Ran out of input");
|
||||
return -1;
|
||||
}
|
||||
*s = self->input_buffer;
|
||||
self->next_read_idx = n;
|
||||
return n;
|
||||
}
|
||||
|
||||
|
@ -972,9 +1027,7 @@ _Unpickler_Readline(UnpicklerObject *self, char **result)
|
|||
{
|
||||
Py_ssize_t i, num_read;
|
||||
|
||||
/* This loop will never be entered if self->read is not NULL. */
|
||||
for (i = self->next_read_idx; i < self->input_len; i++) {
|
||||
assert(self->read == NULL);
|
||||
if (self->input_buffer[i] == '\n') {
|
||||
char *line_start = self->input_buffer + self->next_read_idx;
|
||||
num_read = i - self->next_read_idx + 1;
|
||||
|
@ -983,11 +1036,11 @@ _Unpickler_Readline(UnpicklerObject *self, char **result)
|
|||
}
|
||||
}
|
||||
if (self->read) {
|
||||
assert(self->next_read_idx == self->input_len);
|
||||
num_read = _Unpickler_ReadFromFile(self, READ_WHOLE_LINE);
|
||||
if (num_read < 0)
|
||||
return -1;
|
||||
*result = self->input_buffer;
|
||||
self->next_read_idx = num_read;
|
||||
return num_read;
|
||||
}
|
||||
|
||||
|
@ -1106,8 +1159,10 @@ _Unpickler_New(void)
|
|||
self->input_line = NULL;
|
||||
self->input_len = 0;
|
||||
self->next_read_idx = 0;
|
||||
self->prefetched_idx = 0;
|
||||
self->read = NULL;
|
||||
self->readline = NULL;
|
||||
self->peek = NULL;
|
||||
self->encoding = NULL;
|
||||
self->errors = NULL;
|
||||
self->marks = NULL;
|
||||
|
@ -1124,6 +1179,13 @@ _Unpickler_New(void)
|
|||
static int
|
||||
_Unpickler_SetInputStream(UnpicklerObject *self, PyObject *file)
|
||||
{
|
||||
self->peek = PyObject_GetAttrString(file, "peek");
|
||||
if (self->peek == NULL) {
|
||||
if (PyErr_ExceptionMatches(PyExc_AttributeError))
|
||||
PyErr_Clear();
|
||||
else
|
||||
return -1;
|
||||
}
|
||||
self->read = PyObject_GetAttrString(file, "read");
|
||||
self->readline = PyObject_GetAttrString(file, "readline");
|
||||
if (self->readline == NULL || self->read == NULL) {
|
||||
|
@ -1132,6 +1194,7 @@ _Unpickler_SetInputStream(UnpicklerObject *self, PyObject *file)
|
|||
"file must have 'read' and 'readline' attributes");
|
||||
Py_CLEAR(self->read);
|
||||
Py_CLEAR(self->readline);
|
||||
Py_CLEAR(self->peek);
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
|
@ -5207,6 +5270,9 @@ load(UnpicklerObject *self)
|
|||
break; /* and we are done! */
|
||||
}
|
||||
|
||||
if (_Unpickler_SkipConsumed(self) < 0)
|
||||
return NULL;
|
||||
|
||||
/* XXX: It is not clear what this is actually for. */
|
||||
if ((err = PyErr_Occurred())) {
|
||||
if (err == PyExc_EOFError) {
|
||||
|
@ -5356,6 +5422,7 @@ Unpickler_dealloc(UnpicklerObject *self)
|
|||
PyObject_GC_UnTrack((PyObject *)self);
|
||||
Py_XDECREF(self->readline);
|
||||
Py_XDECREF(self->read);
|
||||
Py_XDECREF(self->peek);
|
||||
Py_XDECREF(self->stack);
|
||||
Py_XDECREF(self->pers_func);
|
||||
Py_XDECREF(self->arg);
|
||||
|
@ -5378,6 +5445,7 @@ Unpickler_traverse(UnpicklerObject *self, visitproc visit, void *arg)
|
|||
{
|
||||
Py_VISIT(self->readline);
|
||||
Py_VISIT(self->read);
|
||||
Py_VISIT(self->peek);
|
||||
Py_VISIT(self->stack);
|
||||
Py_VISIT(self->pers_func);
|
||||
Py_VISIT(self->arg);
|
||||
|
@ -5389,6 +5457,7 @@ Unpickler_clear(UnpicklerObject *self)
|
|||
{
|
||||
Py_CLEAR(self->readline);
|
||||
Py_CLEAR(self->read);
|
||||
Py_CLEAR(self->peek);
|
||||
Py_CLEAR(self->stack);
|
||||
Py_CLEAR(self->pers_func);
|
||||
Py_CLEAR(self->arg);
|
||||
|
|
Loading…
Reference in New Issue