From 2e6a4b37baf560adeee13e9a5702b23f275cc7bd Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Fri, 4 May 2007 19:56:22 +0000 Subject: [PATCH] Checkpoint. Make pickle.py read/write bytes. Fix strict.py so it reads/writes bytes and accepts both string types as format. --- Lib/pickle.py | 392 ++++++++++++++++++++++++-------------------------- Lib/struct.py | 12 +- 2 files changed, 192 insertions(+), 212 deletions(-) diff --git a/Lib/pickle.py b/Lib/pickle.py index 26b2b8c6e3e..89129a7f538 100644 --- a/Lib/pickle.py +++ b/Lib/pickle.py @@ -33,6 +33,7 @@ import marshal import sys import struct import re +import io __all__ = ["PickleError", "PicklingError", "UnpicklingError", "Pickler", "Unpickler", "dump", "dumps", "load", "loads"] @@ -50,6 +51,9 @@ compatible_formats = ["1.0", # Original protocol 0 # know how to read. HIGHEST_PROTOCOL = 2 +# The protocol we write by default. May be less than HIGHEST_PROTOCOL. +DEFAULT_PROTOCOL = 2 + # Why use struct.pack() for pickling but marshal.loads() for # unpickling? struct.pack() is 40% faster than marshal.dumps(), but # marshal.loads() is twice as fast as struct.unpack()! @@ -89,75 +93,69 @@ try: except ImportError: PyStringMap = None -# UnicodeType may or may not be exported (normally imported from types) -try: - UnicodeType -except NameError: - UnicodeType = None - # Pickle opcodes. See pickletools.py for extensive docs. The listing # here is in kind-of alphabetical order of 1-character pickle code. # pickletools groups them by purpose. -MARK = '(' # push special markobject on stack -STOP = '.' # every pickle ends with STOP -POP = '0' # discard topmost stack item -POP_MARK = '1' # discard stack top through topmost markobject -DUP = '2' # duplicate top stack item -FLOAT = 'F' # push float object; decimal string argument -INT = 'I' # push integer or bool; decimal string argument -BININT = 'J' # push four-byte signed int -BININT1 = 'K' # push 1-byte unsigned int -LONG = 'L' # push long; decimal string argument -BININT2 = 'M' # push 2-byte unsigned int -NONE = 'N' # push None -PERSID = 'P' # push persistent object; id is taken from string arg -BINPERSID = 'Q' # " " " ; " " " " stack -REDUCE = 'R' # apply callable to argtuple, both on stack -STRING = 'S' # push string; NL-terminated string argument -BINSTRING = 'T' # push string; counted binary string argument -SHORT_BINSTRING = 'U' # " " ; " " " " < 256 bytes -UNICODE = 'V' # push Unicode string; raw-unicode-escaped'd argument -BINUNICODE = 'X' # " " " ; counted UTF-8 string argument -APPEND = 'a' # append stack top to list below it -BUILD = 'b' # call __setstate__ or __dict__.update() -GLOBAL = 'c' # push self.find_class(modname, name); 2 string args -DICT = 'd' # build a dict from stack items -EMPTY_DICT = '}' # push empty dict -APPENDS = 'e' # extend list on stack by topmost stack slice -GET = 'g' # push item from memo on stack; index is string arg -BINGET = 'h' # " " " " " " ; " " 1-byte arg -INST = 'i' # build & push class instance -LONG_BINGET = 'j' # push item from memo on stack; index is 4-byte arg -LIST = 'l' # build list from topmost stack items -EMPTY_LIST = ']' # push empty list -OBJ = 'o' # build & push class instance -PUT = 'p' # store stack top in memo; index is string arg -BINPUT = 'q' # " " " " " ; " " 1-byte arg -LONG_BINPUT = 'r' # " " " " " ; " " 4-byte arg -SETITEM = 's' # add key+value pair to dict -TUPLE = 't' # build tuple from topmost stack items -EMPTY_TUPLE = ')' # push empty tuple -SETITEMS = 'u' # modify dict by adding topmost key+value pairs -BINFLOAT = 'G' # push float; arg is 8-byte float encoding +MARK = b'(' # push special markobject on stack +STOP = b'.' # every pickle ends with STOP +POP = b'0' # discard topmost stack item +POP_MARK = b'1' # discard stack top through topmost markobject +DUP = b'2' # duplicate top stack item +FLOAT = b'F' # push float object; decimal string argument +INT = b'I' # push integer or bool; decimal string argument +BININT = b'J' # push four-byte signed int +BININT1 = b'K' # push 1-byte unsigned int +LONG = b'L' # push long; decimal string argument +BININT2 = b'M' # push 2-byte unsigned int +NONE = b'N' # push None +PERSID = b'P' # push persistent object; id is taken from string arg +BINPERSID = b'Q' # " " " ; " " " " stack +REDUCE = b'R' # apply callable to argtuple, both on stack +STRING = b'S' # push string; NL-terminated string argument +BINSTRING = b'T' # push string; counted binary string argument +SHORT_BINSTRING= b'U' # " " ; " " " " < 256 bytes +UNICODE = b'V' # push Unicode string; raw-unicode-escaped'd argument +BINUNICODE = b'X' # " " " ; counted UTF-8 string argument +APPEND = b'a' # append stack top to list below it +BUILD = b'b' # call __setstate__ or __dict__.update() +GLOBAL = b'c' # push self.find_class(modname, name); 2 string args +DICT = b'd' # build a dict from stack items +EMPTY_DICT = b'}' # push empty dict +APPENDS = b'e' # extend list on stack by topmost stack slice +GET = b'g' # push item from memo on stack; index is string arg +BINGET = b'h' # " " " " " " ; " " 1-byte arg +INST = b'i' # build & push class instance +LONG_BINGET = b'j' # push item from memo on stack; index is 4-byte arg +LIST = b'l' # build list from topmost stack items +EMPTY_LIST = b']' # push empty list +OBJ = b'o' # build & push class instance +PUT = b'p' # store stack top in memo; index is string arg +BINPUT = b'q' # " " " " " ; " " 1-byte arg +LONG_BINPUT = b'r' # " " " " " ; " " 4-byte arg +SETITEM = b's' # add key+value pair to dict +TUPLE = b't' # build tuple from topmost stack items +EMPTY_TUPLE = b')' # push empty tuple +SETITEMS = b'u' # modify dict by adding topmost key+value pairs +BINFLOAT = b'G' # push float; arg is 8-byte float encoding -TRUE = 'I01\n' # not an opcode; see INT docs in pickletools.py -FALSE = 'I00\n' # not an opcode; see INT docs in pickletools.py +TRUE = b'I01\n' # not an opcode; see INT docs in pickletools.py +FALSE = b'I00\n' # not an opcode; see INT docs in pickletools.py # Protocol 2 -PROTO = '\x80' # identify pickle protocol -NEWOBJ = '\x81' # build object by applying cls.__new__ to argtuple -EXT1 = '\x82' # push object from extension registry; 1-byte index -EXT2 = '\x83' # ditto, but 2-byte index -EXT4 = '\x84' # ditto, but 4-byte index -TUPLE1 = '\x85' # build 1-tuple from stack top -TUPLE2 = '\x86' # build 2-tuple from two topmost stack items -TUPLE3 = '\x87' # build 3-tuple from three topmost stack items -NEWTRUE = '\x88' # push True -NEWFALSE = '\x89' # push False -LONG1 = '\x8a' # push long from < 256 bytes -LONG4 = '\x8b' # push really big long +PROTO = b'\x80' # identify pickle protocol +NEWOBJ = b'\x81' # build object by applying cls.__new__ to argtuple +EXT1 = b'\x82' # push object from extension registry; 1-byte index +EXT2 = b'\x83' # ditto, but 2-byte index +EXT4 = b'\x84' # ditto, but 4-byte index +TUPLE1 = b'\x85' # build 1-tuple from stack top +TUPLE2 = b'\x86' # build 2-tuple from two topmost stack items +TUPLE3 = b'\x87' # build 3-tuple from three topmost stack items +NEWTRUE = b'\x88' # push True +NEWFALSE = b'\x89' # push False +LONG1 = b'\x8a' # push long from < 256 bytes +LONG4 = b'\x8b' # push really big long _tuplesize2code = [EMPTY_TUPLE, TUPLE1, TUPLE2, TUPLE3] @@ -170,15 +168,13 @@ __all__.extend([x for x in dir() if re.match("[A-Z][A-Z0-9_]+$",x)]) class Pickler: def __init__(self, file, protocol=None): - """This takes a file-like object for writing a pickle data stream. + """This takes a binary file for writing a pickle data stream. + + All protocols now read and write bytes. The optional protocol argument tells the pickler to use the given protocol; supported protocols are 0, 1, 2. The default - protocol is 0, to be backwards compatible. (Protocol 0 is the - only protocol that can be written to a file opened in text - mode and read back successfully. When using a protocol higher - than 0, make sure the file is opened in binary mode, both when - pickling and unpickling.) + protocol is 2; it's been supported for many years now. Protocol 1 is more efficient than protocol 0; protocol 2 is more efficient than protocol 1. @@ -194,7 +190,7 @@ class Pickler: """ if protocol is None: - protocol = 0 + protocol = DEFAULT_PROTOCOL if protocol < 0: protocol = HIGHEST_PROTOCOL elif not 0 <= protocol <= HIGHEST_PROTOCOL: @@ -219,7 +215,7 @@ class Pickler: def dump(self, obj): """Write a pickled representation of obj to the open file.""" if self.proto >= 2: - self.write(PROTO + chr(self.proto)) + self.write(PROTO + bytes([self.proto])) self.save(obj) self.write(STOP) @@ -249,21 +245,21 @@ class Pickler: def put(self, i, pack=struct.pack): if self.bin: if i < 256: - return BINPUT + chr(i) + return BINPUT + bytes([i]) else: return LONG_BINPUT + pack("= 0: if obj <= 0xff: - self.write(BININT1 + chr(obj)) + self.write(BININT1 + bytes([obj])) return if obj <= 0xffff: - self.write("%c%c%c" % (BININT2, obj&0xff, obj>>8)) + self.write(BININT2 + bytes([obj&0xff, obj>>8])) return # Next check for 4-byte signed ints: high_bits = obj >> 31 # note that Python shift sign-extends @@ -454,7 +450,7 @@ class Pickler: self.write(BININT + pack("= 0: if obj <= 0xff: - self.write(BININT1 + chr(obj)) + self.write(BININT1 + bytes([obj])) return if obj <= 0xffff: - self.write("%c%c%c" % (BININT2, obj&0xff, obj>>8)) + self.write(BININT2, bytes([obj&0xff, obj>>8])) return # Next check for 4-byte signed ints: high_bits = obj >> 31 # note that Python shift sign-extends @@ -479,74 +475,46 @@ class Pickler: self.write(BININT + pack("= 2: - bytes = encode_long(obj) - n = len(bytes) + encoded = encode_long(obj) + n = len(encoded) if n < 256: - self.write(LONG1 + chr(n) + bytes) + self.write(LONG1 + bytes([n]) + encoded) else: - self.write(LONG4 + pack("d', obj)) else: - self.write(FLOAT + repr(obj) + '\n') + self.write(FLOAT + bytes(repr(obj)) + b'\n') dispatch[FloatType] = save_float def save_string(self, obj, pack=struct.pack): if self.bin: n = len(obj) if n < 256: - self.write(SHORT_BINSTRING + chr(n) + obj) + self.write(SHORT_BINSTRING + bytes([n]) + bytes(obj)) else: - self.write(BINSTRING + pack(" 0 if code <= 0xff: - write(EXT1 + chr(code)) + write(EXT1 + bytes([code])) elif code <= 0xffff: - write("%c%c%c" % (EXT2, code&0xff, code>>8)) + write(EXT2 + bytes([code&0xff, code>>8])) else: write(EXT4 + pack("d', self.read(8))[0]) - dispatch[BINFLOAT] = load_binfloat + dispatch[BINFLOAT[0]] = load_binfloat def load_string(self): rep = self.readline()[:-1] @@ -948,60 +929,60 @@ class Unpickler: else: raise ValueError, "insecure string pickle" self.append(rep.decode("string-escape")) - dispatch[STRING] = load_string + dispatch[STRING[0]] = load_string def load_binstring(self): - len = mloads('i' + self.read(4)) + len = mloads(b'i' + self.read(4)) self.append(self.read(len)) - dispatch[BINSTRING] = load_binstring + dispatch[BINSTRING[0]] = load_binstring def load_unicode(self): - self.append(str(self.readline()[:-1],'raw-unicode-escape')) - dispatch[UNICODE] = load_unicode + self.append(str(self.readline()[:-1], 'raw-unicode-escape')) + dispatch[UNICODE[0]] = load_unicode def load_binunicode(self): - len = mloads('i' + self.read(4)) - self.append(str(self.read(len),'utf-8')) - dispatch[BINUNICODE] = load_binunicode + len = mloads(b'i' + self.read(4)) + self.append(str(self.read(len), 'utf-8')) + dispatch[BINUNICODE[0]] = load_binunicode def load_short_binstring(self): len = ord(self.read(1)) self.append(self.read(len)) - dispatch[SHORT_BINSTRING] = load_short_binstring + dispatch[SHORT_BINSTRING[0]] = load_short_binstring def load_tuple(self): k = self.marker() self.stack[k:] = [tuple(self.stack[k+1:])] - dispatch[TUPLE] = load_tuple + dispatch[TUPLE[0]] = load_tuple def load_empty_tuple(self): self.stack.append(()) - dispatch[EMPTY_TUPLE] = load_empty_tuple + dispatch[EMPTY_TUPLE[0]] = load_empty_tuple def load_tuple1(self): self.stack[-1] = (self.stack[-1],) - dispatch[TUPLE1] = load_tuple1 + dispatch[TUPLE1[0]] = load_tuple1 def load_tuple2(self): self.stack[-2:] = [(self.stack[-2], self.stack[-1])] - dispatch[TUPLE2] = load_tuple2 + dispatch[TUPLE2[0]] = load_tuple2 def load_tuple3(self): self.stack[-3:] = [(self.stack[-3], self.stack[-2], self.stack[-1])] - dispatch[TUPLE3] = load_tuple3 + dispatch[TUPLE3[0]] = load_tuple3 def load_empty_list(self): self.stack.append([]) - dispatch[EMPTY_LIST] = load_empty_list + dispatch[EMPTY_LIST[0]] = load_empty_list def load_empty_dictionary(self): self.stack.append({}) - dispatch[EMPTY_DICT] = load_empty_dictionary + dispatch[EMPTY_DICT[0]] = load_empty_dictionary def load_list(self): k = self.marker() self.stack[k:] = [self.stack[k+1:]] - dispatch[LIST] = load_list + dispatch[LIST[0]] = load_list def load_dict(self): k = self.marker() @@ -1012,7 +993,7 @@ class Unpickler: value = items[i+1] d[key] = value self.stack[k:] = [d] - dispatch[DICT] = load_dict + dispatch[DICT[0]] = load_dict # INST and OBJ differ only in how they get a class object. It's not # only sensible to do the rest in a common routine, the two routines @@ -1047,43 +1028,43 @@ class Unpickler: name = self.readline()[:-1] klass = self.find_class(module, name) self._instantiate(klass, self.marker()) - dispatch[INST] = load_inst + dispatch[INST[0]] = load_inst def load_obj(self): # Stack is ... markobject classobject arg1 arg2 ... k = self.marker() klass = self.stack.pop(k+1) self._instantiate(klass, k) - dispatch[OBJ] = load_obj + dispatch[OBJ[0]] = load_obj def load_newobj(self): args = self.stack.pop() cls = self.stack[-1] obj = cls.__new__(cls, *args) self.stack[-1] = obj - dispatch[NEWOBJ] = load_newobj + dispatch[NEWOBJ[0]] = load_newobj def load_global(self): module = self.readline()[:-1] name = self.readline()[:-1] klass = self.find_class(module, name) self.append(klass) - dispatch[GLOBAL] = load_global + dispatch[GLOBAL[0]] = load_global def load_ext1(self): code = ord(self.read(1)) self.get_extension(code) - dispatch[EXT1] = load_ext1 + dispatch[EXT1[0]] = load_ext1 def load_ext2(self): - code = mloads('i' + self.read(2) + '\000\000') + code = mloads(b'i' + self.read(2) + b'\000\000') self.get_extension(code) - dispatch[EXT2] = load_ext2 + dispatch[EXT2[0]] = load_ext2 def load_ext4(self): - code = mloads('i' + self.read(4)) + code = mloads(b'i' + self.read(4)) self.get_extension(code) - dispatch[EXT4] = load_ext4 + dispatch[EXT4[0]] = load_ext4 def get_extension(self, code): nil = [] @@ -1111,55 +1092,55 @@ class Unpickler: func = stack[-1] value = func(*args) stack[-1] = value - dispatch[REDUCE] = load_reduce + dispatch[REDUCE[0]] = load_reduce def load_pop(self): del self.stack[-1] - dispatch[POP] = load_pop + dispatch[POP[0]] = load_pop def load_pop_mark(self): k = self.marker() del self.stack[k:] - dispatch[POP_MARK] = load_pop_mark + dispatch[POP_MARK[0]] = load_pop_mark def load_dup(self): self.append(self.stack[-1]) - dispatch[DUP] = load_dup + dispatch[DUP[0]] = load_dup def load_get(self): self.append(self.memo[self.readline()[:-1]]) - dispatch[GET] = load_get + dispatch[GET[0]] = load_get def load_binget(self): i = ord(self.read(1)) self.append(self.memo[repr(i)]) - dispatch[BINGET] = load_binget + dispatch[BINGET[0]] = load_binget def load_long_binget(self): - i = mloads('i' + self.read(4)) + i = mloads(b'i' + self.read(4)) self.append(self.memo[repr(i)]) - dispatch[LONG_BINGET] = load_long_binget + dispatch[LONG_BINGET[0]] = load_long_binget def load_put(self): self.memo[self.readline()[:-1]] = self.stack[-1] - dispatch[PUT] = load_put + dispatch[PUT[0]] = load_put def load_binput(self): i = ord(self.read(1)) self.memo[repr(i)] = self.stack[-1] - dispatch[BINPUT] = load_binput + dispatch[BINPUT[0]] = load_binput def load_long_binput(self): - i = mloads('i' + self.read(4)) + i = mloads(b'i' + self.read(4)) self.memo[repr(i)] = self.stack[-1] - dispatch[LONG_BINPUT] = load_long_binput + dispatch[LONG_BINPUT[0]] = load_long_binput def load_append(self): stack = self.stack value = stack.pop() list = stack[-1] list.append(value) - dispatch[APPEND] = load_append + dispatch[APPEND[0]] = load_append def load_appends(self): stack = self.stack @@ -1167,7 +1148,7 @@ class Unpickler: list = stack[mark - 1] list.extend(stack[mark + 1:]) del stack[mark:] - dispatch[APPENDS] = load_appends + dispatch[APPENDS[0]] = load_appends def load_setitem(self): stack = self.stack @@ -1175,7 +1156,7 @@ class Unpickler: key = stack.pop() dict = stack[-1] dict[key] = value - dispatch[SETITEM] = load_setitem + dispatch[SETITEM[0]] = load_setitem def load_setitems(self): stack = self.stack @@ -1185,7 +1166,7 @@ class Unpickler: dict[stack[i]] = stack[i + 1] del stack[mark:] - dispatch[SETITEMS] = load_setitems + dispatch[SETITEMS[0]] = load_setitems def load_build(self): stack = self.stack @@ -1216,16 +1197,16 @@ class Unpickler: if slotstate: for k, v in slotstate.items(): setattr(inst, k, v) - dispatch[BUILD] = load_build + dispatch[BUILD[0]] = load_build def load_mark(self): self.append(self.mark) - dispatch[MARK] = load_mark + dispatch[MARK[0]] = load_mark def load_stop(self): value = self.stack.pop() raise _Stop(value) - dispatch[STOP] = load_stop + dispatch[STOP[0]] = load_stop # Helper class for load_inst/load_obj @@ -1332,16 +1313,11 @@ def decode_long(data): # Shorthands -try: - from cStringIO import StringIO -except ImportError: - from StringIO import StringIO - def dump(obj, file, protocol=None): Pickler(file, protocol).dump(obj) def dumps(obj, protocol=None): - file = StringIO() + file = io.BytesIO() Pickler(file, protocol).dump(obj) return file.getvalue() @@ -1349,7 +1325,7 @@ def load(file): return Unpickler(file).load() def loads(str): - file = StringIO(str) + file = io.BytesIO(str) return Unpickler(file).load() # Doctest diff --git a/Lib/struct.py b/Lib/struct.py index 07c21bf9bc3..1077437a4ae 100644 --- a/Lib/struct.py +++ b/Lib/struct.py @@ -25,7 +25,11 @@ Whitespace between formats is ignored. The variable struct.error is an exception raised on errors. """ -__version__ = '0.1' + +# XXX Move the bytes and str8 casts into the _struct module + +__version__ = '3.0' + from _struct import Struct, error @@ -36,7 +40,7 @@ def _compile(fmt): # Internal: compile struct pattern if len(_cache) >= _MAXCACHE: _cache.clear() - s = Struct(fmt) + s = Struct(str8(fmt)) _cache[fmt] = s return s @@ -60,7 +64,7 @@ def pack(fmt, *args): o = _cache[fmt] except KeyError: o = _compile(fmt) - return o.pack(*args) + return bytes(o.pack(*args)) def pack_into(fmt, buf, offset, *args): """ @@ -72,7 +76,7 @@ def pack_into(fmt, buf, offset, *args): o = _cache[fmt] except KeyError: o = _compile(fmt) - return o.pack_into(buf, offset, *args) + return bytes(o.pack_into(buf, offset, *args)) def unpack(fmt, s): """