Issue #19858: pickletools.optimize() is now aware of the MEMOIZE opcode, can
produce a more compact result, and no longer produces invalid output if the
input data contains MEMOIZE opcodes together with PUT or BINPUT opcodes.
This commit is contained in:
Serhiy Storchaka 2014-12-16 18:02:49 +02:00
commit f5553bbb0e
3 changed files with 92 additions and 24 deletions

View File

@ -2282,40 +2282,61 @@ def genops(pickle):
def optimize(p): def optimize(p):
'Optimize a pickle string by removing unused PUT opcodes' 'Optimize a pickle string by removing unused PUT opcodes'
not_a_put = object() put = 'PUT'
gets = { not_a_put } # set of args used by a GET opcode get = 'GET'
opcodes = [] # (startpos, stoppos, putid) oldids = set() # set of all PUT ids
newids = {} # set of ids used by a GET opcode
opcodes = [] # (op, idx) or (pos, end_pos)
proto = 0 proto = 0
protoheader = b''
for opcode, arg, pos, end_pos in _genops(p, yield_end_pos=True): for opcode, arg, pos, end_pos in _genops(p, yield_end_pos=True):
if 'PUT' in opcode.name: if 'PUT' in opcode.name:
opcodes.append((pos, end_pos, arg)) oldids.add(arg)
opcodes.append((put, arg))
elif opcode.name == 'MEMOIZE':
idx = len(oldids)
oldids.add(idx)
opcodes.append((put, idx))
elif 'FRAME' in opcode.name: elif 'FRAME' in opcode.name:
pass pass
else: elif 'GET' in opcode.name:
if 'GET' in opcode.name: if opcode.proto > proto:
gets.add(arg) proto = opcode.proto
elif opcode.name == 'PROTO': newids[arg] = None
assert pos == 0, pos opcodes.append((get, arg))
elif opcode.name == 'PROTO':
if arg > proto:
proto = arg proto = arg
opcodes.append((pos, end_pos, not_a_put)) if pos == 0:
prevpos, prevarg = pos, None protoheader = p[pos: end_pos]
else:
opcodes.append((pos, end_pos))
else:
opcodes.append((pos, end_pos))
del oldids
# Copy the opcodes except for PUTS without a corresponding GET # Copy the opcodes except for PUTS without a corresponding GET
out = io.BytesIO() out = io.BytesIO()
opcodes = iter(opcodes) # Write the PROTO header before any framing
if proto >= 2: out.write(protoheader)
# Write the PROTO header before any framing pickler = pickle._Pickler(out, proto)
start, stop, _ = next(opcodes)
out.write(p[start:stop])
buf = pickle._Framer(out.write)
if proto >= 4: if proto >= 4:
buf.start_framing() pickler.framer.start_framing()
for start, stop, putid in opcodes: idx = 0
if putid in gets: for op, arg in opcodes:
buf.commit_frame() if op is put:
buf.write(p[start:stop]) if arg not in newids:
if proto >= 4: continue
buf.end_framing() data = pickler.put(idx)
newids[arg] = idx
idx += 1
elif op is get:
data = pickler.get(newids[arg])
else:
data = p[op:arg]
pickler.framer.commit_frame()
pickler.write(data)
pickler.framer.end_framing()
return out.getvalue() return out.getvalue()
############################################################################## ##############################################################################

View File

@ -1,3 +1,4 @@
import struct
import pickle import pickle
import pickletools import pickletools
from test import support from test import support
@ -15,6 +16,48 @@ class OptimizedPickleTests(AbstractPickleTests, AbstractPickleModuleTests):
# Test relies on precise output of dumps() # Test relies on precise output of dumps()
test_pickle_to_2x = None test_pickle_to_2x = None
def test_optimize_long_binget(self):
data = [str(i) for i in range(257)]
data.append(data[-1])
for proto in range(pickle.HIGHEST_PROTOCOL + 1):
pickled = pickle.dumps(data, proto)
unpickled = pickle.loads(pickled)
self.assertEqual(unpickled, data)
self.assertIs(unpickled[-1], unpickled[-2])
pickled2 = pickletools.optimize(pickled)
unpickled2 = pickle.loads(pickled2)
self.assertEqual(unpickled2, data)
self.assertIs(unpickled2[-1], unpickled2[-2])
self.assertNotIn(pickle.LONG_BINGET, pickled2)
self.assertNotIn(pickle.LONG_BINPUT, pickled2)
def test_optimize_binput_and_memoize(self):
pickled = (b'\x80\x04\x95\x15\x00\x00\x00\x00\x00\x00\x00'
b']\x94(\x8c\x04spamq\x01\x8c\x03ham\x94h\x02e.')
# 0: \x80 PROTO 4
# 2: \x95 FRAME 21
# 11: ] EMPTY_LIST
# 12: \x94 MEMOIZE
# 13: ( MARK
# 14: \x8c SHORT_BINUNICODE 'spam'
# 20: q BINPUT 1
# 22: \x8c SHORT_BINUNICODE 'ham'
# 27: \x94 MEMOIZE
# 28: h BINGET 2
# 30: e APPENDS (MARK at 13)
# 31: . STOP
self.assertIn(pickle.BINPUT, pickled)
unpickled = pickle.loads(pickled)
self.assertEqual(unpickled, ['spam', 'ham', 'ham'])
self.assertIs(unpickled[1], unpickled[2])
pickled2 = pickletools.optimize(pickled)
unpickled2 = pickle.loads(pickled2)
self.assertEqual(unpickled2, ['spam', 'ham', 'ham'])
self.assertIs(unpickled2[1], unpickled2[2])
self.assertNotIn(pickle.BINPUT, pickled2)
def test_main(): def test_main():
support.run_unittest(OptimizedPickleTests) support.run_unittest(OptimizedPickleTests)

View File

@ -196,6 +196,10 @@ Core and Builtins
Library Library
------- -------
- Issue #19858: pickletools.optimize() is now aware of the MEMOIZE opcode, can
produce a more compact result, and no longer produces invalid output if the
input data contains MEMOIZE opcodes together with PUT or BINPUT opcodes.
- Issue #22095: Fixed HTTPConnection.set_tunnel with default port. The port - Issue #22095: Fixed HTTPConnection.set_tunnel with default port. The port
value in the host header was set to "None". Patch by Demian Brecht. value in the host header was set to "None". Patch by Demian Brecht.