diff -r 1cad9e4bba40 Lib/pickletools.py
--- a/Lib/pickletools.py	Fri Nov 28 17:46:05 2014 +0100
+++ b/Lib/pickletools.py	Sat Nov 29 15:53:04 2014 +0200
@@ -2282,40 +2282,61 @@ def genops(pickle):
 
 def optimize(p):
     'Optimize a pickle string by removing unused PUT opcodes'
-    not_a_put = object()
-    gets = { not_a_put }    # set of args used by a GET opcode
-    opcodes = []            # (startpos, stoppos, putid)
+    put = 'PUT'
+    get = 'GET'
+    oldids = set()          # set of all PUT ids
+    newids = {}             # set of ids used by a GET opcode
+    opcodes = []            # (op, idx) or (pos, end_pos)
     proto = 0
+    protoheader = b''
     for opcode, arg, pos, end_pos in _genops(p, yield_end_pos=True):
         if 'PUT' in opcode.name:
-            opcodes.append((pos, end_pos, arg))
+            oldids.add(arg)
+            opcodes.append((put, arg))
+        elif opcode.name == 'MEMOIZE':
+            idx = len(oldids)
+            oldids.add(idx)
+            opcodes.append((put, idx))
         elif 'FRAME' in opcode.name:
             pass
+        elif 'GET' in opcode.name:
+            if opcode.proto > proto:
+                proto = opcode.proto
+            newids[arg] = None
+            opcodes.append((get, arg))
+        elif opcode.name == 'PROTO':
+            if arg > proto:
+                proto = arg
+            if pos == 0:
+                protoheader = p[pos: end_pos]
+            else:
+                opcodes.append((pos, end_pos))
         else:
-            if 'GET' in opcode.name:
-                gets.add(arg)
-            elif opcode.name == 'PROTO':
-                assert pos == 0, pos
-                proto = arg
-            opcodes.append((pos, end_pos, not_a_put))
-            prevpos, prevarg = pos, None
+            opcodes.append((pos, end_pos))
+    del oldids
 
     # Copy the opcodes except for PUTS without a corresponding GET
     out = io.BytesIO()
-    opcodes = iter(opcodes)
-    if proto >= 2:
-        # Write the PROTO header before any framing
-        start, stop, _ = next(opcodes)
-        out.write(p[start:stop])
-    buf = pickle._Framer(out.write)
+    # Write the PROTO header before any framing
+    out.write(protoheader)
+    pickler = pickle._Pickler(out, proto)
     if proto >= 4:
-        buf.start_framing()
-    for start, stop, putid in opcodes:
-        if putid in gets:
-            buf.commit_frame()
-            buf.write(p[start:stop])
-    if proto >= 4:
-        buf.end_framing()
+        pickler.framer.start_framing()
+    idx = 0
+    for op, arg in opcodes:
+        if op is put:
+            if arg not in newids:
+                continue
+            data = pickler.put(idx)
+            newids[arg] = idx
+            idx += 1
+        elif op is get:
+            data = pickler.get(newids[arg])
+        else:
+            data = p[op:arg]
+        pickler.framer.commit_frame()
+        pickler.write(data)
+    pickler.framer.end_framing()
     return out.getvalue()
 
 ##############################################################################
diff -r 1cad9e4bba40 Lib/test/test_pickletools.py
--- a/Lib/test/test_pickletools.py	Fri Nov 28 17:46:05 2014 +0100
+++ b/Lib/test/test_pickletools.py	Sat Nov 29 15:53:04 2014 +0200
@@ -15,6 +15,35 @@ class OptimizedPickleTests(AbstractPickl
     # Test relies on precise output of dumps()
     test_pickle_to_2x = None
 
+    def test_optimize_long_binget(self):
+        data = [str(i) for i in range(257)]
+        data.append(data[-1])
+        for proto in range(pickle.HIGHEST_PROTOCOL + 1):
+            pickled = pickle.dumps(data, proto)
+            unpickled = pickle.loads(pickled)
+            self.assertEqual(unpickled, data)
+            self.assertIs(unpickled[-1], unpickled[-2])
+
+            pickled2 = pickletools.optimize(pickled)
+            unpickled2 = pickle.loads(pickled2)
+            self.assertEqual(unpickled2, data)
+            self.assertIs(unpickled2[-1], unpickled2[-2])
+            self.assertNotIn(pickle.LONG_BINGET, pickled2)
+            self.assertNotIn(pickle.LONG_BINPUT, pickled2)
+
+    def test_optimize_binput_and_memoize(self):
+        pickled = (b'\x80\x04\x95\x15\x00\x00\x00\x00\x00\x00\x00'
+                   b']\x94(\x8c\x04spamq\x01\x8c\x03ham\x94h\x02e.')
+        unpickled = pickle.loads(pickled)
+        self.assertEqual(unpickled, ['spam', 'ham', 'ham'])
+        self.assertIs(unpickled[1], unpickled[2])
+
+        pickled2 = pickletools.optimize(pickled)
+        unpickled2 = pickle.loads(pickled2)
+        self.assertEqual(unpickled2, ['spam', 'ham', 'ham'])
+        self.assertIs(unpickled2[1], unpickled2[2])
+        self.assertNotIn(b'q', pickled2)
+
 
 def test_main():
     support.run_unittest(OptimizedPickleTests)