cPickle.c: Full support for the new LONG1 and LONG4. Added comments.

Assorted code cleanups; e.g., sizeof(char) is 1 by definition, so there's
no need to do things like multiply by sizeof(char) in hairy malloc
arguments.  Fixed an undetected-overflow bug in readline_file().

longobject.c:  Fixed a really stupid bug in the new _PyLong_NumBits.

pickle.py:  Fixed stupid bug in save_long():  When proto is 2, it
wrote LONG1 or LONG4, but forgot to return then -- it went on to
append the proto 1 LONG opcode too.
Fixed equally stupid cancelling bugs in load_long1() and
load_long4():  they *returned* the unpickled long instead of pushing
it on the stack.  The return values were ignored.  Tests passed
before only because save_long() pickled the long twice.

Fixed bugs in encode_long().

Noted that decode_long() is quadratic-time despite our hopes,
because long(string, 16) is still quadratic-time in len(string).
It's hex() that's linear-time.  I don't know a way to make decode_long()
linear-time in Python, short of maybe transforming the 256's-complement
bytes into marshal's funky internal format, and letting marshal decode
that.  It would be more valuable to make long(string, 16) linear time.

pickletester.py:  Added a global "protocols" vector so tests can try
all the protocols in a sane way.  Changed test_ints() and test_unicode()
to do so.  Added a new test_long(), but the tail end of it is disabled
because it "takes forever" under pickle.py (but runs very quickly under
cPickle:  cPickle proto 2 for longs is linear-time).
This commit is contained in:
Tim Peters 2003-02-02 02:57:53 +00:00
parent 5bd2a79b22
commit ee1a53cbb1
4 changed files with 227 additions and 41 deletions

View File

@ -553,6 +553,7 @@ class Pickler:
self.write(LONG1 + chr(n) + bytes)
else:
self.write(LONG4 + pack("<i", n) + bytes)
return
self.write(LONG + `obj` + '\n')
dispatch[LongType] = save_long
@ -1042,13 +1043,13 @@ class Unpickler:
def load_long1(self):
n = ord(self.read(1))
bytes = self.read(n)
return decode_long(bytes)
self.append(decode_long(bytes))
dispatch[LONG1] = load_long1
def load_long4(self):
n = mloads('i' + self.read(4))
bytes = self.read(n)
return decode_long(bytes)
self.append(decode_long(bytes))
dispatch[LONG4] = load_long4
def load_float(self):
@ -1404,24 +1405,31 @@ def encode_long(x):
njunkchars = 2 + ashex.endswith('L')
nibbles = len(ashex) - njunkchars
if nibbles & 1:
# need an even # of nibbles for unhexlify
# Extend to a full byte.
nibbles += 1
nbits = nibbles * 4
x += 1L << nbits
assert x > 0
ashex = hex(x)
if x >> (nbits - 1) == 0:
njunkchars = 2 + ashex.endswith('L')
newnibbles = len(ashex) - njunkchars
if newnibbles < nibbles:
ashex = "0x" + "0" * (nibbles - newnibbles) + ashex[2:]
if int(ashex[2], 16) < 8:
# "looks positive", so need a byte of sign bits
ashex = "0xff" + x[2:]
ashex = "0xff" + ashex[2:]
if ashex.endswith('L'):
ashex = ashex[2:-1]
else:
ashex = ashex[2:]
assert len(ashex) & 1 == 0
assert len(ashex) & 1 == 0, (x, ashex)
binary = _binascii.unhexlify(ashex)
return binary[::-1]
# XXX OOPS! This is still quadratic-time. While hex(n) is linear-time
# XXX in the # of digits in n, long(s, 16) is still quadratic-time
# XXX in len(s).
def decode_long(data):
r"""Decode a long from a two's complement little-endian binary string.
@ -1445,7 +1453,7 @@ def decode_long(data):
if nbytes == 0:
return 0L
ashex = _binascii.hexlify(data[::-1])
n = long(ashex, 16)
n = long(ashex, 16) # quadratic time
if data[-1] >= '\x80':
n -= 1L << (nbytes * 8)
return n

View File

@ -1,6 +1,11 @@
import unittest
from test.test_support import TestFailed, have_unicode, TESTFN
# Tests that try a number of pickle protocols should have a
# for proto in protocols:
# kind of outer loop. Bump the 3 to 4 if/when protocol 3 is invented.
protocols = range(3)
class C:
def __cmp__(self, other):
return cmp(self.__dict__, other.__dict__)
@ -28,6 +33,9 @@ class metaclass(type):
class use_metaclass(object):
__metaclass__ = metaclass
# DATA and BINDATA are the protocol 0 and protocol 1 pickles of the object
# returned by create_data().
# break into multiple strings to avoid confusing font-lock-mode
DATA = """(lp1
I0
@ -210,20 +218,22 @@ class AbstractPickleTests(unittest.TestCase):
def test_unicode(self):
endcases = [unicode(''), unicode('<\\u>'), unicode('<\\\u1234>'),
unicode('<\n>'), unicode('<\\>')]
for u in endcases:
p = self.dumps(u)
u2 = self.loads(p)
self.assertEqual(u2, u)
for proto in protocols:
for u in endcases:
p = self.dumps(u, proto)
u2 = self.loads(p)
self.assertEqual(u2, u)
def test_ints(self):
import sys
n = sys.maxint
while n:
for expected in (-n, n):
s = self.dumps(expected)
n2 = self.loads(s)
self.assertEqual(expected, n2)
n = n >> 1
for proto in protocols:
n = sys.maxint
while n:
for expected in (-n, n):
s = self.dumps(expected, proto)
n2 = self.loads(s)
self.assertEqual(expected, n2)
n = n >> 1
def test_maxint64(self):
maxint64 = (1L << 63) - 1
@ -235,6 +245,34 @@ class AbstractPickleTests(unittest.TestCase):
data = 'I' + str(maxint64) + 'JUNK\n.'
self.assertRaises(ValueError, self.loads, data)
def test_long(self):
for proto in protocols:
# 256 bytes is where LONG4 begins
for nbits in 1, 8, 8*254, 8*255, 8*256, 8*257:
nbase = 1L << nbits
for npos in nbase-1, nbase, nbase+1:
for n in npos, -npos:
pickle = self.dumps(n, proto)
got = self.loads(pickle)
self.assertEqual(n, got)
# Try a monster. This is quadratic-time in protos 0 & 1, so don't
# bother with those.
# XXX Damn. pickle.py is still quadratic-time here, due to
# XXX long(string, 16). cPickle runs this in an eyeblink, but I
# XXX gave up waiting for pickle.py to get beyond "loading". Giving
# XXX up for now.
return
print "building long"
nbase = long("deadbeeffeedface", 16)
nbase += nbase << 1000000
for n in nbase, -nbase:
print "dumping"
p = self.dumps(n, 2)
print "loading"
got = self.loads(p)
print "checking"
self.assertEqual(n, got)
def test_reduce(self):
pass

View File

@ -464,7 +464,7 @@ write_other(Picklerobject *self, char *s, int n)
static int
read_file(Unpicklerobject *self, char **s, int n)
read_file(Unpicklerobject *self, char **s, int n)
{
size_t nbytesread;
@ -472,7 +472,7 @@ read_file(Unpicklerobject *self, char **s, int n)
int size;
size = ((n < 32) ? 32 : n);
if (!( self->buf = (char *)malloc(size * sizeof(char)))) {
if (!( self->buf = (char *)malloc(size))) {
PyErr_NoMemory();
return -1;
}
@ -480,12 +480,11 @@ read_file(Unpicklerobject *self, char **s, int n)
self->buf_size = size;
}
else if (n > self->buf_size) {
self->buf = (char *)realloc(self->buf, n * sizeof(char));
self->buf = (char *)realloc(self->buf, n);
if (!self->buf) {
PyErr_NoMemory();
return -1;
}
self->buf_size = n;
}
@ -514,16 +513,16 @@ readline_file(Unpicklerobject *self, char **s)
int i;
if (self->buf_size == 0) {
if (!( self->buf = (char *)malloc(40 * sizeof(char)))) {
if (!( self->buf = (char *)malloc(40))) {
PyErr_NoMemory();
return -1;
}
self->buf_size = 40;
}
i = 0;
while (1) {
int bigger;
for (; i < (self->buf_size - 1); i++) {
if (feof(self->fp) ||
(self->buf[i] = getc(self->fp)) == '\n') {
@ -532,14 +531,17 @@ readline_file(Unpicklerobject *self, char **s)
return i + 1;
}
}
self->buf = (char *)realloc(self->buf,
(self->buf_size * 2) * sizeof(char));
bigger = self->buf_size << 1;
if (bigger <= 0) { /* overflow */
PyErr_NoMemory();
return -1;
}
self->buf = (char *)realloc(self->buf, bigger);
if (!self->buf) {
PyErr_NoMemory();
return -1;
}
self->buf_size *= 2;
self->buf_size = bigger;
}
}
@ -620,14 +622,18 @@ readline_other(Unpicklerobject *self, char **s)
return str_size;
}
/* Copy the first n bytes from s into newly malloc'ed memory, plus a
* trailing 0 byte. Return a pointer to that, or NULL if out of memory.
* The caller is responsible for free()'ing the return value.
*/
static char *
pystrndup(char *s, int l)
pystrndup(char *s, int n)
{
char *r;
if (!( r=malloc((l+1)*sizeof(char)))) return (char*)PyErr_NoMemory();
memcpy(r,s,l);
r[l]=0;
char *r = (char *)malloc(n+1);
if (r == NULL)
return (char*)PyErr_NoMemory();
memcpy(r, s, n);
r[n] = 0;
return r;
}
@ -1012,11 +1018,93 @@ save_int(Picklerobject *self, PyObject *args)
static int
save_long(Picklerobject *self, PyObject *args)
{
int size, res = -1;
PyObject *repr = 0;
int size;
int res = -1;
PyObject *repr = NULL;
static char l = LONG;
if (self->proto >= 2) {
/* Linear-time pickling. */
size_t nbits;
size_t nbytes;
unsigned char *pdata;
char c_str[5];
int i;
int sign = _PyLong_Sign(args);
if (sign == 0) {
/* It's 0 -- an empty bytestring. */
c_str[0] = LONG1;
c_str[1] = 0;
i = self->write_func(self, c_str, 2);
if (i < 0) goto finally;
res = 0;
goto finally;
}
nbits = _PyLong_NumBits(args);
if (nbits == (size_t)-1 && PyErr_Occurred())
goto finally;
/* How many bytes do we need? There are nbits >> 3 full
* bytes of data, and nbits & 7 leftover bits. If there
* are any leftover bits, then we clearly need another
* byte. What's not so obvious is that we *probably*
* need another byte even if there aren't any leftovers:
* the most-significant bit of the most-significant byte
* acts like a sign bit, and it's usually got a sense
* opposite of the one we need. The exception is longs
* of the form -(2**(8*j-1)) for j > 0. Such a long is
* its own 256's-complement, so has the right sign bit
* even without the extra byte. That's a pain to check
* for in advance, though, so we always grab an extra
* byte at the start, and cut it back later if possible.
*/
nbytes = (nbits >> 3) + 1;
if ((int)nbytes < 0 || (size_t)(int)nbytes != nbytes) {
PyErr_SetString(PyExc_OverflowError, "long too large "
"to pickle");
goto finally;
}
repr = PyString_FromStringAndSize(NULL, (int)nbytes);
if (repr == NULL) goto finally;
pdata = (unsigned char *)PyString_AS_STRING(repr);
i = _PyLong_AsByteArray((PyLongObject *)args,
pdata, nbytes,
1 /* little endian */, 1 /* signed */);
if (i < 0) goto finally;
/* If the long is negative, this may be a byte more than
* needed. This is so iff the MSB is all redundant sign
* bits.
*/
if (sign < 0 && nbytes > 1 && pdata[nbytes - 1] == 0xff &&
(pdata[nbytes - 2] & 0x80) != 0)
--nbytes;
if (nbytes < 256) {
c_str[0] = LONG1;
c_str[1] = (char)nbytes;
size = 2;
}
else {
c_str[0] = LONG4;
size = (int)nbytes;
for (i = 1; i < 5; i++) {
c_str[i] = (char)(size & 0xff);
size >>= 8;
}
size = 5;
}
i = self->write_func(self, c_str, size);
if (i < 0) goto finally;
i = self->write_func(self, (char *)pdata, (int)nbytes);
if (i < 0) goto finally;
res = 0;
goto finally;
}
/* proto < 2: write the repr and newline. This is quadratic-time
* (in the number of digits), in both directions.
*/
if (!( repr = PyObject_Repr(args)))
goto finally;
@ -1038,7 +1126,6 @@ save_long(Picklerobject *self, PyObject *args)
finally:
Py_XDECREF(repr);
return res;
}
@ -2687,9 +2774,13 @@ load_int(Unpicklerobject *self)
return res;
}
/* s contains x bytes of a little-endian integer. Return its value as a
* C long. Obscure: when x is 1 or 2, this is an unsigned little-endian
* int, but when x is 4 it's a signed one. This is an historical source
* of x-platform bugs.
*/
static long
calc_binint(char *s, int x)
calc_binint(char *s, int x)
{
unsigned char c;
int i;
@ -2786,6 +2877,45 @@ load_long(Unpicklerobject *self)
return res;
}
/* Load the argument of a counted-long opcode and push the resulting long
 * onto the unpickler's stack.
 *
 * 'size' is the width in bytes (1 or 4) of the count that comes first in
 * the stream; that count is the # of bytes of little-endian
 * 256's-complement data following it.
 *
 * Returns 0 on success and -1 on failure.
 */
static int
load_counted_long(Unpicklerobject *self, int size)
{
	int i;
	char *nbytes;
	char *raw;
	PyObject *along;

	assert(size == 1 || size == 4);
	i = self->read_func(self, &nbytes, size);
	if (i < 0) return -1;

	size = calc_binint(nbytes, size);
	if (size < 0) {
		/* Corrupt or hostile pickle -- we never write one like
		 * this.
		 */
		PyErr_SetString(PyExc_ValueError, "LONG pickle has negative "
				"byte count");
		return -1;
	}

	if (size == 0)
		along = PyLong_FromLong(0L);
	else {
		/* Read the raw little-endian bytes & convert.  The read
		 * function stores a char *, so receive it in a genuine
		 * char * object and convert the pointer on use; taking the
		 * address of a cast expression is not valid standard C.
		 */
		i = self->read_func(self, &raw, size);
		if (i < 0) return -1;
		along = _PyLong_FromByteArray((unsigned char *)raw,
				(size_t)size,
				1 /* little endian */, 1 /* signed */);
	}
	if (along == NULL)
		return -1;
	PDATA_PUSH(self->stack, along, -1);
	return 0;
}
static int
load_float(Unpicklerobject *self)
@ -3784,6 +3914,16 @@ load(Unpicklerobject *self)
break;
continue;
case LONG1:
if (load_counted_long(self, 1) < 0)
break;
continue;
case LONG4:
if (load_counted_long(self, 4) < 0)
break;
continue;
case FLOAT:
if (load_float(self) < 0)
break;

View File

@ -264,13 +264,13 @@ int
_PyLong_Sign(PyObject *vv)
{
PyLongObject *v = (PyLongObject *)vv;
const int ndigits = v->ob_size;
const int ndigits = ABS(v->ob_size);
assert(v != NULL);
assert(PyLong_Check(v));
assert(ndigits == 0 || v->ob_digit[ndigits - 1] != 0);
return ndigits == 0 ? 0 : (ndigits < 0 ? -1 : 1);
return v->ob_size == 0 ? 0 : (v->ob_size < 0 ? -1 : 1);
}
size_t