Issue #3873: Speed up unpickling from file objects which have a peek()

method.
This commit is contained in:
Antoine Pitrou 2010-10-12 20:51:21 +00:00
parent b0182c8ca5
commit 04248a8d99
3 changed files with 138 additions and 25 deletions

View File

@ -30,6 +30,21 @@ def count_opcode(code, pickle):
n += 1
return n
class UnseekableIO(io.BytesIO):
def peek(self, *args):
raise NotImplementedError
def seekable(self):
return False
def seek(self, *args):
raise io.UnsupportedOperation
def tell(self):
raise io.UnsupportedOperation
# We can't very well test the extension registry without putting known stuff
# in it, but we have to be careful to restore its original state. Code
# should do this:
@ -1072,9 +1087,10 @@ class AbstractPickleTests(unittest.TestCase):
# Test the correctness of internal buffering routines when handling
# large data.
for proto in protocols:
data = (1, b'x' * (256 * 1024))
data = (1, min, b'xy' * (30 * 1024), len)
dumped = self.dumps(data, proto)
loaded = self.loads(dumped)
self.assertEqual(len(loaded), len(data))
self.assertEqual(loaded, data)
@ -1373,6 +1389,31 @@ class AbstractPicklerUnpicklerObjectTests(unittest.TestCase):
f.seek(0)
self.assertEqual(unpickler.load(), data2)
def _check_multiple_unpicklings(self, ioclass):
for proto in protocols:
data1 = [(x, str(x)) for x in range(2000)] + [b"abcde", len]
f = ioclass()
pickler = self.pickler_class(f, protocol=proto)
pickler.dump(data1)
pickled = f.getvalue()
N = 5
f = ioclass(pickled * N)
unpickler = self.unpickler_class(f)
for i in range(N):
if f.seekable():
pos = f.tell()
self.assertEqual(unpickler.load(), data1)
if f.seekable():
self.assertEqual(f.tell(), pos + len(pickled))
self.assertRaises(EOFError, unpickler.load)
def test_multiple_unpicklings_seekable(self):
self._check_multiple_unpicklings(io.BytesIO)
def test_multiple_unpicklings_unseekable(self):
self._check_multiple_unpicklings(UnseekableIO)
if __name__ == "__main__":
# Print some stuff that can be used to rewrite DATA{0,1,2}

View File

@ -13,6 +13,9 @@ Core and Builtins
Library
-------
- Issue #3873: Speed up unpickling from file objects which have a peek()
method.
- Issue #10075: Add a session_stats() method to SSLContext objects.
- Issue #9948: Fixed problem of losing filename case information.

View File

@ -101,6 +101,9 @@ enum {
/* Maximum size of the write buffer of Pickler when pickling to a
stream. This is ignored for in-memory pickling. */
MAX_WRITE_BUF_SIZE = 64 * 1024,
/* Prefetch size when unpickling (disabled on unpeekable streams) */
PREFETCH = 8192 * 16,
};
/* Exception classes for pickle. These should override the ones defined in
@ -355,8 +358,10 @@ typedef struct UnpicklerObject {
char *input_line;
Py_ssize_t input_len;
Py_ssize_t next_read_idx;
Py_ssize_t prefetched_idx; /* index of first prefetched byte */
PyObject *read; /* read() method of the input stream. */
PyObject *readline; /* readline() method of the input stream. */
PyObject *peek; /* peek() method of the input stream, or NULL */
char *encoding; /* Name of the encoding to be used for
decoding strings pickled using Python
@ -859,9 +864,28 @@ _Unpickler_SetStringInput(UnpicklerObject *self, PyObject *input)
self->input_buffer = self->buffer.buf;
self->input_len = self->buffer.len;
self->next_read_idx = 0;
self->prefetched_idx = self->input_len;
return self->input_len;
}
static int
_Unpickler_SkipConsumed(UnpicklerObject *self)
{
Py_ssize_t consumed = self->next_read_idx - self->prefetched_idx;
if (consumed > 0) {
PyObject *r;
assert(self->peek); /* otherwise we did something wrong */
/* This makes an useless copy... */
r = PyObject_CallFunction(self->read, "n", consumed);
if (r == NULL)
return -1;
Py_DECREF(r);
self->prefetched_idx = self->next_read_idx;
}
return 0;
}
static const Py_ssize_t READ_WHOLE_LINE = -1;
/* If reading from a file, we need to only pull the bytes we need, since there
@ -882,10 +906,12 @@ static Py_ssize_t
_Unpickler_ReadFromFile(UnpicklerObject *self, Py_ssize_t n)
{
PyObject *data;
Py_ssize_t read_size;
Py_ssize_t read_size, prefetched_size = 0;
assert(self->read != NULL);
assert(self->next_read_idx == 0);
if (_Unpickler_SkipConsumed(self) < 0)
return -1;
if (n == READ_WHOLE_LINE)
data = PyObject_Call(self->readline, empty_tuple, NULL);
@ -895,13 +921,41 @@ _Unpickler_ReadFromFile(UnpicklerObject *self, Py_ssize_t n)
return -1;
data = _Unpickler_FastCall(self, self->read, len);
}
if (data == NULL)
return -1;
read_size = _Unpickler_SetStringInput(self, data);
self->input_len = 0;
/* Prefetch some data without advancing the file pointer, if possible */
if (self->peek) {
PyObject *len, *prefetched;
len = PyLong_FromSsize_t(PREFETCH);
if (len == NULL) {
Py_DECREF(data);
return -1;
}
prefetched = _Unpickler_FastCall(self, self->peek, len);
if (prefetched == NULL) {
if (PyErr_ExceptionMatches(PyExc_NotImplementedError)) {
/* peek() is probably not supported by the given file object */
PyErr_Clear();
Py_CLEAR(self->peek);
}
else {
Py_DECREF(data);
return -1;
}
}
else {
assert(PyBytes_Check(prefetched));
prefetched_size = PyBytes_GET_SIZE(prefetched);
PyBytes_ConcatAndDel(&data, prefetched);
if (data == NULL)
return -1;
}
}
read_size = _Unpickler_SetStringInput(self, data) - prefetched_size;
Py_DECREF(data);
self->prefetched_idx = read_size;
return read_size;
}
@ -921,30 +975,31 @@ _Unpickler_ReadFromFile(UnpicklerObject *self, Py_ssize_t n)
static Py_ssize_t
_Unpickler_Read(UnpicklerObject *self, char **s, Py_ssize_t n)
{
Py_ssize_t num_read;
if (n == 0) {
*s = NULL;
return 0;
}
/* This condition will always be true if self->read. */
if (self->next_read_idx + n > self->input_len) {
if (self->read) {
Py_ssize_t num_read;
assert(self->next_read_idx == self->input_len);
num_read = _Unpickler_ReadFromFile(self, n);
if (n < 0)
return -1;
if (num_read == n) {
*s = self->input_buffer;
return num_read;
}
}
if (self->next_read_idx + n <= self->input_len) {
*s = self->input_buffer + self->next_read_idx;
self->next_read_idx += n;
return n;
}
if (!self->read) {
PyErr_Format(PyExc_EOFError, "Ran out of input");
return -1;
}
assert(self->read == NULL);
*s = self->input_buffer + self->next_read_idx;
self->next_read_idx += n;
num_read = _Unpickler_ReadFromFile(self, n);
if (num_read < 0)
return -1;
if (num_read < n) {
PyErr_Format(PyExc_EOFError, "Ran out of input");
return -1;
}
*s = self->input_buffer;
self->next_read_idx = n;
return n;
}
@ -972,9 +1027,7 @@ _Unpickler_Readline(UnpicklerObject *self, char **result)
{
Py_ssize_t i, num_read;
/* This loop will never be entered if self->read is not NULL. */
for (i = self->next_read_idx; i < self->input_len; i++) {
assert(self->read == NULL);
if (self->input_buffer[i] == '\n') {
char *line_start = self->input_buffer + self->next_read_idx;
num_read = i - self->next_read_idx + 1;
@ -983,11 +1036,11 @@ _Unpickler_Readline(UnpicklerObject *self, char **result)
}
}
if (self->read) {
assert(self->next_read_idx == self->input_len);
num_read = _Unpickler_ReadFromFile(self, READ_WHOLE_LINE);
if (num_read < 0)
return -1;
*result = self->input_buffer;
self->next_read_idx = num_read;
return num_read;
}
@ -1106,8 +1159,10 @@ _Unpickler_New(void)
self->input_line = NULL;
self->input_len = 0;
self->next_read_idx = 0;
self->prefetched_idx = 0;
self->read = NULL;
self->readline = NULL;
self->peek = NULL;
self->encoding = NULL;
self->errors = NULL;
self->marks = NULL;
@ -1124,6 +1179,13 @@ _Unpickler_New(void)
static int
_Unpickler_SetInputStream(UnpicklerObject *self, PyObject *file)
{
self->peek = PyObject_GetAttrString(file, "peek");
if (self->peek == NULL) {
if (PyErr_ExceptionMatches(PyExc_AttributeError))
PyErr_Clear();
else
return -1;
}
self->read = PyObject_GetAttrString(file, "read");
self->readline = PyObject_GetAttrString(file, "readline");
if (self->readline == NULL || self->read == NULL) {
@ -1132,6 +1194,7 @@ _Unpickler_SetInputStream(UnpicklerObject *self, PyObject *file)
"file must have 'read' and 'readline' attributes");
Py_CLEAR(self->read);
Py_CLEAR(self->readline);
Py_CLEAR(self->peek);
return -1;
}
return 0;
@ -5207,6 +5270,9 @@ load(UnpicklerObject *self)
break; /* and we are done! */
}
if (_Unpickler_SkipConsumed(self) < 0)
return NULL;
/* XXX: It is not clear what this is actually for. */
if ((err = PyErr_Occurred())) {
if (err == PyExc_EOFError) {
@ -5356,6 +5422,7 @@ Unpickler_dealloc(UnpicklerObject *self)
PyObject_GC_UnTrack((PyObject *)self);
Py_XDECREF(self->readline);
Py_XDECREF(self->read);
Py_XDECREF(self->peek);
Py_XDECREF(self->stack);
Py_XDECREF(self->pers_func);
Py_XDECREF(self->arg);
@ -5378,6 +5445,7 @@ Unpickler_traverse(UnpicklerObject *self, visitproc visit, void *arg)
{
Py_VISIT(self->readline);
Py_VISIT(self->read);
Py_VISIT(self->peek);
Py_VISIT(self->stack);
Py_VISIT(self->pers_func);
Py_VISIT(self->arg);
@ -5389,6 +5457,7 @@ Unpickler_clear(UnpicklerObject *self)
{
Py_CLEAR(self->readline);
Py_CLEAR(self->read);
Py_CLEAR(self->peek);
Py_CLEAR(self->stack);
Py_CLEAR(self->pers_func);
Py_CLEAR(self->arg);