diff -r 4b3238923b01 -r d0c3a8d4947a Doc/library/operator.rst --- a/Doc/library/operator.rst Fri May 10 19:57:44 2013 -0700 +++ b/Doc/library/operator.rst Sat May 11 03:03:53 2013 +0300 @@ -11,9 +11,6 @@ import operator from operator import itemgetter, iadd -**Source code:** :source:`Lib/operator.py` - --------------- The :mod:`operator` module exports a set of efficient functions corresponding to the intrinsic operators of Python. For example, ``operator.add(x, y)`` is diff -r 4b3238923b01 -r d0c3a8d4947a Lib/copyreg.py --- a/Lib/copyreg.py Fri May 10 19:57:44 2013 -0700 +++ b/Lib/copyreg.py Sat May 11 03:03:53 2013 +0300 @@ -87,6 +87,12 @@ def __newobj__(cls, *args): return cls.__new__(cls, *args) +# Helper for __reduce_ex__ protocol 4 +# Similar to __newobj__, but allows passing keyword arguments to __new__ + +def __newobj_kw__(cls, kargs, kwargs): + return cls.__new__(cls, *kargs, **kwargs) + def _slotnames(cls): """Return a list of slot names for a given class. diff -r 4b3238923b01 -r d0c3a8d4947a Lib/pickle.py --- a/Lib/pickle.py Fri May 10 19:57:44 2013 -0700 +++ b/Lib/pickle.py Sat May 11 03:03:53 2013 +0300 @@ -17,13 +17,12 @@ Misc variables: - __version__ format_version compatible_formats """ -from types import FunctionType, BuiltinFunctionType +from types import FunctionType, BuiltinFunctionType, MethodType, ModuleType from copyreg import dispatch_table from copyreg import _extension_registry, _inverted_registry, _extension_cache from itertools import islice @@ -34,10 +33,44 @@ import io import codecs import _compat_pickle +import builtins +from inspect import ismodule, isclass __all__ = ["PickleError", "PicklingError", "UnpicklingError", "Pickler", "Unpickler", "dump", "dumps", "load", "loads"] +# Issue 15397: Unbinding of methods +# Adds the possibility to unbind methods as well as a few definitions missing +# from the types module. 
+ +_MethodDescriptorType = type(list.append) +_WrapperDescriptorType = type(list.__add__) +_MethodWrapperType = type([].__add__) + +def _unbind(f): + """Unbinds a bound method.""" + self = getattr(f, '__self__', None) + if self is not None and not isinstance(self, ModuleType) \ + and not isinstance(self, type): + if hasattr(f, '__func__'): + return f.__func__ + return getattr(type(f.__self__), f.__name__) + raise TypeError('not a bound method') + + +def _bind_method(self, func): + """This method is used internally to pickle bound methods using the REDUCE + opcode.""" + return func.__get__(self) + +def _isclassmethod(func): + """Tests if a given function is a classmethod.""" + if type(func) not in [MethodType, BuiltinFunctionType]: + return False + if hasattr(func, '__self__') and type(func.__self__) is type: + return True + return False + # Shortcut for use in isinstance testing bytes_types = (bytes, bytearray) @@ -52,13 +85,25 @@ ] # Old format versions we can read # This is the highest protocol number we know how to read. -HIGHEST_PROTOCOL = 3 +HIGHEST_PROTOCOL = 4 # The protocol we write by default. May be less than HIGHEST_PROTOCOL. # We intentionally write a protocol that Python 2.x cannot read; # there are too many issues with that. DEFAULT_PROTOCOL = 3 +# XXX we should consider writing SecurePickler/SecureUnpickler in the future, +# which can be safely used with untrusted sources. They would likely only have +# a python implementation (less likely to create security flaws) and will take +# various security parameters, such as: max_memo, max_stack, max_depth, +# allow_globals, globals_whitelist. This will only work with picklev4 and +# obsolete opcodes will be disabled (i.e. opcodes that are never generated by a +# v4 pickler because they have newer alternatives, such as INST or GLOBAL). +# This is again useful for lowering the chances of a successful attack. 
We +# could create a list of builtin "profiles" with predefined values for the +# security parameters above, so the average user doesn't need to worry about +# what are good values for parameters such as "max_memo". + class PickleError(Exception): """A common base class for the other pickling exceptions.""" pass @@ -164,17 +209,46 @@ BINBYTES = b'B' # push bytes; counted binary string argument SHORT_BINBYTES = b'C' # " " ; " " " " < 256 bytes +# Protocol 4 (Python 3.x) +BINBYTES64 = b'\x8c' # push large bytes (64bit size) +BINUNICODE16 = b'\x8d' # push string (16bit size) +SHORT_BINUNICODE = b'\x8e' # push small string (8bit size) +BINUNICODE64 = b'\x8f' # push large string (64bit size) +BINBYTES16 = b'\x90' # push bytes (16bit size) +BINGLOBAL = b'\x91' # push a global (like GLOBAL) +BINGLOBAL_BIG = b'\x92' # push an unusually large global name +BINGLOBAL_COMMON = b'\x93' # push a with module in V4_COMMON_MODULES +EMPTY_SET = b'\x94' # push an empty set +EMPTY_FROZENSET = b'\x95' # push an empty frozenset +UPDATE_SET = b'\x96' # update the set in the stack with all the + # elements upto mark (union inplace) +FROZENSET = b'\x97' # create a new frozenset out of the top stack + # slice +NEWOBJ_KW = b'\x98' # like NEWOBJ, but with keyword args +BAIL_OUT = b'\xff' # opcode that forces the unpickler to abandon + # pickling with failure + +# Used by picklev4 for more efficient serialization of commonly used module +# names. +# Note: Once picklev4 is released, you may not add new module names here. +V4_COMMON_MODULES = [ + '__main__', + 'builtins', + 'collections', + 'pickle' +] +assert len(V4_COMMON_MODULES) <= 255 + __all__.extend([x for x in dir() if re.match("[A-Z][A-Z0-9_]+$",x)]) # Pickling machinery - class _Pickler: - - def __init__(self, file, protocol=None, *, fix_imports=True): + def __init__(self, file, protocol=None, *, fix_imports=True, + common_modules=None): """This takes a binary file for writing a pickle data stream. 
The optional protocol argument tells the pickler to use the - given protocol; supported protocols are 0, 1, 2, 3. The default + given protocol; supported protocols are 0, 1, 2, 3, 4. The default protocol is 3; a backward-incompatible protocol designed for Python 3.0. @@ -191,6 +265,14 @@ If fix_imports is True and protocol is less than 3, pickle will try to map the new Python 3.x names to the old module names used in Python 2.x, so that the pickle data stream is readable with Python 2.x. + + The optional common_modules argument is a list representing commonly + used module names. This value is ignored prior to version 4. In later + versions, this indicates module names that are likely to be serialized, + in order to provide more efficient serialization for their name + (space-wise). If this value is omitted, it defaults to + V4_COMMON_MODULES. If this argument is provided to the pickler, then + this exact same value must be provided to the unpickler. """ if protocol is None: protocol = DEFAULT_PROTOCOL @@ -202,11 +284,14 @@ self.write = file.write except AttributeError: raise TypeError("file must have a 'write' attribute") + + self.proto = int(protocol) self.memo = {} - self.proto = int(protocol) self.bin = protocol >= 1 self.fast = 0 self.fix_imports = fix_imports and protocol < 3 + self.common_modules = V4_COMMON_MODULES if common_modules is None \ + else common_modules def clear_memo(self): """Clears the pickler's "memo". @@ -228,7 +313,14 @@ "%s.__init__()" % (self.__class__.__name__,)) if self.proto >= 2: self.write(PROTO + pack("= 4: + self.write(BAIL_OUT) + raise + self.write(STOP) def memoize(self, obj): @@ -242,15 +334,48 @@ # pickling. # The use of the Unpickler memo length as the memo key is just a - # convention. The only requirement is that the memo values be unique. - # But there appears no advantage to any other scheme, and this - # scheme allows the Unpickler memo to be implemented as a plain (but - # growable) array, indexed by memo key. 
+ # convention up to version 3. The only requirement is that the memo + # values be unique. But there appears no advantage to any other + # scheme, and this scheme allows the Unpickler memo to be implemented + # as a plain (but growable) array, indexed by memo key. + + # Note: As of version 4, this is no longer just a convention. The + # pickler and unpickler agree on this indexing scheme so the BINPUT + # operations can be eliminated completely. + # Generally, if save_X() does memoization, then so does load_X(). + # "memos" in pickletools.py indicate which opcodes perform memoization + + # Basically, as of v4, if save_bytes calls self.memoize() in + # the pickler, then load_bytes should also call self.memoize() + # in the unpickler. Because the PUT opcodes are not generated + # anymore, the pickler and the unpickler need to perform + # memoization in the same places. + # + # For instance: + # dis(dumps( (b'abc', b'abc'), 4)) + # 0: \x80 PROTO 4 + # 2: C SHORT_BINBYTES 'abc' <-- automatic memoization + # 7: h BINGET 0 + # 9: \x86 TUPLE2 <-- here too + # 10: . STOP + # + # The pickler performs memoization at the indicated places i.e. + # in save_bytes and save_tuple. This means that the unpickler + # must also perform memoization in those places, so load_bytes + # and load_tuple2 also have to call self.memoize. + if self.fast: return assert id(obj) not in self.memo memo_len = len(self.memo) - self.write(self.put(memo_len)) + + # XXX weird things can happen when, for instance, you're trying to + # reuse the pickler/unpickler after a dump/load have thrown an + # exception without clearing the memo, but that doesn't sound like a + # real scenario + if self.proto < 4: + self.write(self.put(memo_len)) + self.memo[id(obj)] = memo_len, obj # Return a PUT (BINPUT, LONG_BINPUT) opcode string, with argument i. 
@@ -403,6 +528,48 @@ save(cls) save(args) write(NEWOBJ) + elif self.proto >= 4 and \ + getattr(func, '__name__', '') == '__newobj_kw__': + # This is similar to __newobj__ above, but also allows calling + # __new__ with keyword parameters. + # + if len(args) != 3: + # TODO test that this exception is raised + raise PicklingError( + 'When pickling an object via the reduce protocol, ' + '__newobj_kw__ was called with %d arguments ' + '(should have been 3: class name, tuple of kargs, ' + 'dictionary of kwargs)' % len(args)) + cls = args[0] + kargs = args[1] + kwargs = args[2] + + if not isclass(cls): + # TODO test that this exception is raised + raise PicklingError( + 'When pickling an object via the reduce protocol, ' + '__newobj_kw__ was called with a first argument of ' + 'type %s, when it should have been a class type' % + cls.__class__.__name__) + elif not isinstance(kargs, tuple): + # TODO test that this exception is raised + raise PicklingError( + 'When pickling an object via the reduce protocol, ' + '__newobj_kw__ was called with a second argument of ' + 'type %s, when it should have been a tuple' % + kargs.__class__.__name__) + elif not isinstance(kwargs, dict): + # TODO test that this exception is raised + raise PicklingError( + 'When pickling an object via the reduce protocol, ' + '__newobj_kw__ was called with a third argument of ' + 'type %s, when it should have been a dict' % + kwargs.__class__.__name__) + + save(cls) + save(kargs) + save(kwargs) + write(NEWOBJ_KW) else: save(func) save(args) @@ -495,16 +662,28 @@ n = len(obj) if n < 256: self.write(SHORT_BINBYTES + pack("= 4 and (n>>16) == 0: + self.write(BINBYTES16 + pack('= 4 and n>>31: + self.write(BINBYTES64 + pack('= 4 and n >> 8 == 0: + self.write(SHORT_BINUNICODE + bytes([n]) + encoded) + elif self.proto >= 4 and n >> 16 == 0: + self.write(BINUNICODE16 + pack('= 4 and n>>31: + self.write(BINUNICODE64 + pack('= 4 else getattr + if name is None: - name = obj.__name__ + if self.proto >= 4: + name = 
obj.__qualname__ + else: + name = obj.__name__ module = getattr(obj, "__module__", None) if module is None: - module = whichmodule(obj, name) + module = whichmodule(obj, name, getattr_func) try: __import__(module, level=0) mod = sys.modules[module] - klass = getattr(mod, name) - except (ImportError, KeyError, AttributeError): + klass = getattr_func(mod, name) + except (ImportError, KeyError, AttributeError) as e: raise PicklingError( "Can't pickle %r: it's not found as %s.%s" % - (obj, module, name)) + (obj, module, name)) from e else: - if klass is not obj: + # Note: The 'is' operator does not currently work as expected when + # applied on functions which are classmethods ("dict.fromkeys is + # dict.fromkeys" is False). Therefore, we only perform the check + # below if the object we are dealing with ("obj") is not a + # classmethod. + # XXX remove the additional check when this is fixed + if klass is not obj and not _isclassmethod(obj): raise PicklingError( "Can't pickle %r: it's not the same object as %s.%s" % (obj, module, name)) @@ -683,36 +873,215 @@ else: write(EXT4 + pack("= 4 and module == '__builtins__': + module = 'builtins' + + if self.fix_imports: + if (module, name) in _compat_pickle.REVERSE_NAME_MAPPING: + module, name = _compat_pickle.REVERSE_NAME_MAPPING[ + (module, name)] + if module in _compat_pickle.REVERSE_IMPORT_MAPPING: + module = _compat_pickle.REVERSE_IMPORT_MAPPING[module] + # Non-ASCII identifiers are supported only with protocols >= 3. 
# Sentinel distinguishing "no default supplied" from any user-provided
# default (including None).
_None = object()


def getattr_recurse(module, name, default=_None):
    """getattr() that understands dotted attribute paths.

    Resolves e.g. 'path.isdir' on the os module by walking one attribute
    at a time.  If *default* is given, it is returned when any step of
    the lookup fails; otherwise the AttributeError propagates.

    Raises TypeError for an empty path component (e.g. 'a..b'), which
    would indicate an attempt to reach into function locals.

    >>> import sys
    >>> getattr_recurse(sys.modules['os'], 'path.isdir')('.')
    True
    """
    # Fixes vs. the original: a dead string-literal block (referencing
    # undefined names `n` and `ret_`) is removed, and the empty-component
    # error no longer assumes obj.__qualname__ exists — modules have no
    # __qualname__, which turned the intended TypeError into an
    # AttributeError.
    obj = module
    for attr in name.split('.'):
        if not attr:
            label = getattr(obj, '__qualname__',
                            getattr(obj, '__name__', repr(obj)))
            raise TypeError('Cannot work with the locals of ' + label)
        if default is _None:
            obj = getattr(obj, attr)
        else:
            try:
                obj = getattr(obj, attr)
            except AttributeError:
                return default
    return obj
since whichmodule() uses `is' to compare + # for equality of functions and "dict.fromkeys is dict.fromkeys" evaluates + # to False, whichmodule(dict.fromkeys, 'dict.fromkeys') would incorrectly + # return '__main__' + elif hasattr(func, "__self__") and hasattr(func.__self__, "__module__") \ + and func.__self__.__module__ is not None: + return func.__self__.__module__ if func in classmap: return classmap[func] for name, module in list(sys.modules.items()): if module is None: continue # skip dummy package entries - if name != '__main__' and getattr(module, funcname, None) is func: + if name != '__main__' and getattr_func(module, funcname, None) is func: break else: name = '__main__' @@ -743,7 +1119,7 @@ class _Unpickler: def __init__(self, file, *, fix_imports=True, - encoding="ASCII", errors="strict"): + encoding="ASCII", errors="strict", common_modules=None): """This takes a binary file for reading a pickle data stream. The protocol version of the pickle is detected automatically, so no @@ -771,6 +1147,18 @@ self.errors = errors self.proto = 0 self.fix_imports = fix_imports + self.common_modules = V4_COMMON_MODULES if common_modules is None \ + else common_modules + + def _read(self, size): + """Like self.read(), but raises an exception if it + failed to read all `size' bytes. + """ + data = self.read(size) + if len(data) < size: + raise EOFError + else: + return data def load(self): """Read a pickled object representation from the open file. 
@@ -793,10 +1181,27 @@ if not key: raise EOFError assert isinstance(key, bytes_types) - dispatch[key[0]](self) + try: + dispatch_func = dispatch[key[0]] + except KeyError: + raise UnpicklingError('Nonexistent opcode ' + str(key[0])) + else: + # TODO maybe I should just wrap this in a try-except clause + # and get rid of the exceptions raised all over the place + dispatch_func(self) except _Stop as stopinst: return stopinst.value + def memoize(self, obj): + """Store an object in the memo.""" + + # This is only used in pickle v4 and later for automatic memoization of + # objects. For more details, see Pickler.memoize + if self.proto < 4: + return + + self.memo.append(obj) + # Return largest index k such that self.stack[k] is self.mark. # If the stack doesn't contain a mark, eventually raises IndexError. # This could be sped by maintaining another stack, of indices at which @@ -808,8 +1213,13 @@ def marker(self): stack = self.stack mark = self.mark - k = len(stack)-1 - while stack[k] is not mark: k = k-1 + try: + k = len(stack)-1 + while stack[k] is not mark: + k = k-1 + except IndexError: + raise UnpicklingError("Couldn't find MARK in " + "the unpickler's stack") return k def persistent_load(self, pid): @@ -818,10 +1228,26 @@ dispatch = {} def load_proto(self): - proto = self.read(1)[0] + proto = self._read(1)[0] if not 0 <= proto <= HIGHEST_PROTOCOL: raise ValueError("unsupported pickle protocol: %d" % proto) self.proto = proto + + # In versions prior to 4, the pickler was allowed to memoize objects at + # any desired position by providing any index to the PUT opcode. This + # required that the unpickler's memo structure was a dict. + # However, as of pickle4, the pickler and the unpickler agree to use + # the same indexing technique of consecutive indices, so explicit PUT + # opcodes are not necessary anymore. In addition, the unpickler's memo + # can now be a list instead (it has always been a list-like structure + # for _pickle.c). 
+ if self.proto >= 4: + if self.memo and type(self.memo) is dict: + raise UnpicklingError("Can't mix unpickling older versions " + "of pickle with newer versions") + if type(self.memo) is dict: + # invariant: self.memo = {} + self.memo = [] dispatch[PROTO[0]] = load_proto def load_persid(self): @@ -830,6 +1256,9 @@ dispatch[PERSID[0]] = load_persid def load_binpersid(self): + if not self.stack: + raise UnpicklingError('Applying BINPERSID on empty ' + 'unpickling stack') pid = self.stack.pop() self.append(self.persistent_load(pid)) dispatch[BINPERSID[0]] = load_binpersid @@ -858,15 +1287,15 @@ dispatch[INT[0]] = load_int def load_binint(self): - self.append(unpack('d', self.read(8))[0]) + self.append(unpack('>d', self._read(8))[0]) dispatch[BINFLOAT[0]] = load_binfloat + # Note that self.memoize() in the unpickler is only used for + # pickle>=4 to do the automatic memoization, but load_string never + # gets called as of pickle3, because the STRING opcode has been + # superseded by BINUNICODE/BINBYTES. The pickler never generates the + # STRING opcode as of pickle3. 
def load_string(self): data = self.readline()[:-1] # Strip outermost quotes @@ -908,24 +1342,27 @@ raise UnpicklingError("the STRING opcode argument must be quoted") self.append(codecs.escape_decode(data)[0] .decode(self.encoding, self.errors)) + dispatch[STRING[0]] = load_string def load_binstring(self): # Deprecated BINSTRING uses signed 32-bit length - len, = unpack(' maxsize: raise UnpicklingError("BINBYTES exceeds system's maximum size " "of %d bytes" % maxsize) - self.append(self.read(len)) + data = self._read(len) + self.append(data) + self.memoize(data) dispatch[BINBYTES[0]] = load_binbytes def load_unicode(self): @@ -933,28 +1370,35 @@ dispatch[UNICODE[0]] = load_unicode def load_binunicode(self): - len, = unpack(' maxsize: raise UnpicklingError("BINUNICODE exceeds system's maximum size " "of %d bytes" % maxsize) - self.append(str(self.read(len), 'utf-8', 'surrogatepass')) + data = str(self._read(len), 'utf-8', 'surrogatepass') + self.append(data) + self.memoize(data) dispatch[BINUNICODE[0]] = load_binunicode def load_short_binstring(self): - len = self.read(1)[0] - data = self.read(len) + len = self._read(1)[0] + data = self._read(len) value = str(data, self.encoding, self.errors) self.append(value) + self.memoize(value) dispatch[SHORT_BINSTRING[0]] = load_short_binstring def load_short_binbytes(self): - len = self.read(1)[0] - self.append(self.read(len)) + len = self._read(1)[0] + data = self._read(len) + self.append(data) + self.memoize(data) dispatch[SHORT_BINBYTES[0]] = load_short_binbytes def load_tuple(self): k = self.marker() - self.stack[k:] = [tuple(self.stack[k+1:])] + t = tuple(self.stack[k+1:]) + self.stack[k:] = [t] + self.memoize(t) dispatch[TUPLE[0]] = load_tuple def load_empty_tuple(self): @@ -962,28 +1406,59 @@ dispatch[EMPTY_TUPLE[0]] = load_empty_tuple def load_tuple1(self): - self.stack[-1] = (self.stack[-1],) + if not len(self.stack): + raise UnpicklingError('Found opcode TUPLE1 on an empty stack') + elif self.stack[-1] is self.mark: 
+ raise UnpicklingError('Unexpected MARK on stack when applying ' + 'TUPLE1') + t = (self.stack[-1],) + self.stack[-1] = t + self.memoize(t) dispatch[TUPLE1[0]] = load_tuple1 def load_tuple2(self): - self.stack[-2:] = [(self.stack[-2], self.stack[-1])] + if len(self.stack) < 2: + raise UnpicklingError('Found opcode TUPLE2 on a stack with ' + 'less than 2 elements') + elif self.stack[-1] is self.mark or self.stack[-2] is self.mark: + raise UnpicklingError('Unexpected MARK on stack when applying ' + 'TUPLE2') + t = (self.stack[-2], self.stack[-1]) + self.stack[-2:] = [t] + self.memoize(t) dispatch[TUPLE2[0]] = load_tuple2 def load_tuple3(self): - self.stack[-3:] = [(self.stack[-3], self.stack[-2], self.stack[-1])] + if len(self.stack) < 3: + raise UnpicklingError('Found opcode TUPLE3 on a stack with ' + 'less than 3 elements') + elif self.stack[-1] is self.mark or \ + self.stack[-2] is self.mark or \ + self.stack[-3] is self.mark: + raise UnpicklingError('Unexpected MARK on stack when applying ' + 'TUPLE3') + data = (self.stack[-3], self.stack[-2], self.stack[-1]) + self.stack[-3:] = [data] + self.memoize(data) dispatch[TUPLE3[0]] = load_tuple3 def load_empty_list(self): - self.append([]) + l = [] + self.append(l) + self.memoize(l) dispatch[EMPTY_LIST[0]] = load_empty_list def load_empty_dictionary(self): - self.append({}) + d = {} + self.append(d) + self.memoize(d) dispatch[EMPTY_DICT[0]] = load_empty_dictionary def load_list(self): k = self.marker() - self.stack[k:] = [self.stack[k+1:]] + l = self.stack[k+1:] + self.stack[k:] = [l] + self.memoize(l) dispatch[LIST[0]] = load_list def load_dict(self): @@ -992,6 +1467,7 @@ d = {items[i]: items[i+1] for i in range(0, len(items), 2)} self.stack[k:] = [d] + self.memoize(d) dispatch[DICT[0]] = load_dict # INST and OBJ differ only in how they get a class object. 
It's not @@ -1020,6 +1496,9 @@ self._instantiate(klass, self.marker()) dispatch[INST[0]] = load_inst + # TODO should we mark this operator, INST and some others as deprecated in + # v4 and disallow them when unpickling? less attack vectors when you want a + # secure (sandboxed) unpickler. def load_obj(self): # Stack is ... markobject classobject arg1 arg2 ... k = self.marker() @@ -1028,10 +1507,25 @@ dispatch[OBJ[0]] = load_obj def load_newobj(self): + if len(self.stack) < 2: + raise UnpicklingError('NEWOBJ applied with a stack ' + 'size of %d (expected >= 2)' % len(self.stack)) args = self.stack.pop() cls = self.stack[-1] - obj = cls.__new__(cls, *args) + + if not isclass(cls): + raise UnpicklingError('NEWOBJ applied with a first ' + 'parameter not of class type') + elif not isinstance(args, tuple): + raise UnpicklingError('NEWOBJ applied with a second ' + 'parameter which is not a tuple') + try: + obj = cls.__new__(cls, *args) + except Exception as e: + raise UnpicklingError('NEWOBJ failed to instantiate') from e self.stack[-1] = obj + if obj is not None: + self.memoize(obj) dispatch[NEWOBJ[0]] = load_newobj def load_global(self): @@ -1039,20 +1533,21 @@ name = self.readline()[:-1].decode("utf-8") klass = self.find_class(module, name) self.append(klass) + self.memoize(klass) dispatch[GLOBAL[0]] = load_global def load_ext1(self): - code = self.read(1)[0] + code = self._read(1)[0] self.get_extension(code) dispatch[EXT1[0]] = load_ext1 def load_ext2(self): - code, = unpack('= 4: + klass = getattr_recurse(mod, name) + else: + klass = getattr(mod, name) + return klass + except Exception as e: + raise UnpicklingError("Couldn't find class %s.%s" % + (module,name)) from e def load_reduce(self): + if len(self.stack) < 2: + raise UnpicklingError('Found opcode REDUCE on a stack ' + 'with less than 2 elements') stack = self.stack args = stack.pop() func = stack[-1] + + if not isinstance(args, tuple): + raise UnpicklingError('Second argument of REDUCE should be a ' + 'tuple') 
+ + if not callable(func): + raise UnpicklingError('REDUCE applied on non-callable') + try: value = func(*args) - except: - print(sys.exc_info()) - print(func, args) - raise - stack[-1] = value + except Exception as e: + raise UnpicklingError('REDUCE failed to instantiate object') from e + else: + stack[-1] = value + if value is not None: + self.memoize(value) dispatch[REDUCE[0]] = load_reduce def load_pop(self): - del self.stack[-1] + try: + del self.stack[-1] + except IndexError: + raise UnpicklingError('POP on an empty stack') dispatch[POP[0]] = load_pop def load_pop_mark(self): @@ -1107,53 +1627,98 @@ dispatch[POP_MARK[0]] = load_pop_mark def load_dup(self): - self.append(self.stack[-1]) + try: + self.append(self.stack[-1]) + except IndexError: + raise UnpicklingError('DUP on an empty stack') dispatch[DUP[0]] = load_dup + def _get(self, i): + try: + self.append(self.memo[i]) + except IndexError: + raise UnpicklingError('Couldn\'t get memoized index %d (only have ' + '0-%d' % (i, len(self.memo))) + def load_get(self): i = int(self.readline()[:-1]) - self.append(self.memo[i]) + self._get(i) dispatch[GET[0]] = load_get def load_binget(self): - i = self.read(1)[0] - self.append(self.memo[i]) + i = self._read(1)[0] + self._get(i) dispatch[BINGET[0]] = load_binget def load_long_binget(self): - i, = unpack('= 4: + self._put(i) + else: + self.memo[i] = self.stack[-1] dispatch[PUT[0]] = load_put def load_binput(self): - i = self.read(1)[0] + i = self._read(1)[0] if i < 0: raise ValueError("negative BINPUT argument") - self.memo[i] = self.stack[-1] + if self.proto >= 4: + self._put(i) + else: + self.memo[i] = self.stack[-1] dispatch[BINPUT[0]] = load_binput def load_long_binput(self): - i, = unpack(' maxsize: raise ValueError("negative LONG_BINPUT argument") - self.memo[i] = self.stack[-1] + if self.proto >= 4: + self._put(i) + else: + self.memo[i] = self.stack[-1] dispatch[LONG_BINPUT[0]] = load_long_binput def load_append(self): + if len(self.stack) < 2: + raise 
UnpicklingError('Found opcode APPEND on a stack ' + 'with less than 2 elements') stack = self.stack value = stack.pop() list = stack[-1] - list.append(value) + if value is self.mark: + raise UnpicklingError('APPEND does not work with MARK') + try: + list.append(value) + except Exception as e: + raise UnpicklingError('APPEND failed on object of type %s' % + type(list).__name__) from e + dispatch[APPEND[0]] = load_append def load_appends(self): + if len(self.stack) < 2: + raise UnpicklingError('Found opcode APPENDS on a stack ' + 'with less than 2 elements') stack = self.stack mark = self.marker() list_obj = stack[mark - 1] @@ -1168,27 +1733,54 @@ dispatch[APPENDS[0]] = load_appends def load_setitem(self): + # TODO if there isn't too much runtime overhead, maybe we should + # decorate all these functions with something like + # @needs_stack(3) + if len(self.stack) < 3: + raise UnpicklingError('Found opcode SETITEM on a stack with ' + 'less than 3 elements') stack = self.stack value = stack.pop() key = stack.pop() dict = stack[-1] - dict[key] = value + if key is self.mark or value is self.mark: + raise UnpicklingError('SETITEM cannot work with MARK') + try: + dict[key] = value + except Exception as e: + raise UnpicklingError('SETITEM assignment failed') from e dispatch[SETITEM[0]] = load_setitem def load_setitems(self): + if len(self.stack) < 2: + raise UnpicklingError('Found opcode SETITEMS on a stack with ' + 'less than 2 elements') stack = self.stack mark = self.marker() + if mark == 0: + raise UnpicklingError('No dict supplied to SETITEMS') dict = stack[mark - 1] - for i in range(mark + 1, len(stack), 2): - dict[stack[i]] = stack[i + 1] + if len(stack)-mark-1 % 2 == 1: + raise UnpicklingError('SETITEMS needs an even amount of ' + 'objects pushed after MARK') + try: + for i in range(mark + 1, len(stack), 2): + dict[stack[i]] = stack[i + 1] + except Exception as e: + raise UnpicklingError('SETITEMS assignment failed') from e del stack[mark:] dispatch[SETITEMS[0]] = 
load_setitems def load_build(self): + if len(self.stack) < 2: + raise UnpicklingError('Found opcode BUILD on a stack with ' + 'less than 2 elements') stack = self.stack state = stack.pop() inst = stack[-1] + if state is self.mark or inst is self.mark: + raise UnpicklingError('BUILD cannot work with MARK') setstate = getattr(inst, "__setstate__", None) if setstate is not None: setstate(state) @@ -1197,7 +1789,12 @@ if isinstance(state, tuple) and len(state) == 2: state, slotstate = state if state: - inst_dict = inst.__dict__ + try: + inst_dict = inst.__dict__ + except: + raise UnpicklingError('Attempting to apply BUILD opcode ' + 'on an object that doesn\'t support __setstate__ ' + 'or __dict__') intern = sys.intern for k, v in state.items(): if type(k) is str: @@ -1205,8 +1802,12 @@ else: inst_dict[k] = v if slotstate: - for k, v in slotstate.items(): - setattr(inst, k, v) + try: + for k, v in slotstate.items(): + setattr(inst, k, v) + except Exception as e: + raise UnpicklingError('Failed applying BUILD opcode\'s ' + 'slotstate') from e dispatch[BUILD[0]] = load_build def load_mark(self): @@ -1214,10 +1815,152 @@ dispatch[MARK[0]] = load_mark def load_stop(self): - value = self.stack.pop() + try: + value = self.stack.pop() + except: + raise UnpicklingError('Finished unpickling, but the stack was ' + 'empty') raise _Stop(value) dispatch[STOP[0]] = load_stop + def load_binbytes64(self): + len = unpack('= 3)' % len(self.stack)) + + kwargs = self.stack.pop() + kargs = self.stack.pop() + cls = self.stack[-1] + + if not isinstance(kargs, tuple): + raise UnpicklingError('NEWOBJ_KW applied with a second ' + 'parameter which is not a tuple') + elif not isinstance(kwargs, dict): + raise UnpicklingError('NEWOBJ_KW applied with a third ' + 'parameter which is not a dict') + + try: + obj = cls.__new__(cls, *kargs, **kwargs) + except Exception as e: + raise UnpicklingError('NEWOBJ_KW failed to instantiate ' + 'object') from e + self.stack[-1] = obj + if obj is not None: + 
# Sentinels for ArgumentDescriptor.n describing how an opcode's byte count
# is encoded.  Each sentinel must be distinct: the original patch assigned
# TAKEN_FROM_ARGUMENTu8 the value -4, colliding with TAKEN_FROM_ARGUMENT4U,
# so any code switching on .n could not tell an 8-byte length prefix from a
# 4-byte one.
TAKEN_FROM_ARGUMENTu8 = -6  # num bytes is 8-byte little-endian unsigned int
TAKEN_FROM_ARGUMENTu2 = -5  # num bytes is 2-byte little-endian unsigned short


def read_uint8(f):
    r"""Read an 8-byte unsigned little-endian integer from *f*.

    >>> import io
    >>> read_uint8(io.BytesIO(b'\xff' + 7 * b'\x00'))
    255
    >>> read_uint8(io.BytesIO(b'\x00' * 8))
    0
    >>> read_uint8(io.BytesIO(b'\xff' * 8)) == 2**64 - 1
    True
    """
    data = f.read(8)
    # Guard clause: a short read means the pickle stream was truncated.
    if len(data) < 8:
        raise ValueError("not enough data in stream to read uint8")
    return _unpack('<Q', data)[0]
read_unicodestring1_pair(io.BytesIO(b"\x00\x00whatever")) + ' ' + >>> read_unicodestring1_pair(io.BytesIO(b"\x05hello\x06world!blabla")) + 'hello world!' + """ + return "%s %s" % (read_unicodestring1(f), read_unicodestring1(f)) + +unicodestring1_pair = ArgumentDescriptor( + name='unicodestring1_pair', + n=TAKEN_FROM_ARGUMENT1, + reader=read_unicodestring1_pair, + doc="""Read a pair of small unicode strings. + + Both of the strings are preceded by an uint1 + indicating the length of the utf-8 encoded + string to follow""") + +def read_unicodestringu2_pair(f): + r""" + >>> import io + >>> read_unicodestringu2_pair(io.BytesIO(b"\x00\x00\x00\x00whatever")) + ' ' + >>> read_unicodestringu2_pair(io.BytesIO( + ... b"\x05\x00hello\x06\x00world!blabla")) + 'hello world!' + """ + return "%s %s" % (read_unicodestringu2(f), read_unicodestringu2(f)) + +unicodestringu2_pair = ArgumentDescriptor( + name='unicodestringu2_pair', + n=TAKEN_FROM_ARGUMENTu2, + reader=read_unicodestringu2_pair, + doc="""Read a pair of semi-small unicode strings. + + Both of the strings are preceded by a + little-endian uint2 indicating the length + of the utf-8 encoded string to follow""") + + +def read_uint1_unicodestring1_pair(f): + r""" + >>> import io + >>> read_uint1_unicodestring1_pair(io.BytesIO(b"\x20\x00whatever")) + '32 ' + >>> read_uint1_unicodestring1_pair( + ... io.BytesIO(b"\x00\x0chello world!blabla")) + '0 hello world!' + """ + return "%d %s" % (read_uint1(f), read_unicodestring1(f)) + +uint1_unicodestring1_pair = ArgumentDescriptor( + name='uint1_unicodestring1_pair', + n=TAKEN_FROM_ARGUMENT1, # ? + reader=read_uint1_unicodestring1_pair, + doc="""Read a pair: an uint1 and a small unicode + string. + + The small unicode string is also preceded + by an uint1 indicating the length of the + utf-8 encoded string. + """) + + def read_string4(f): r""" >>> import io @@ -414,6 +510,73 @@ that many bytes. 
""") +def read_stringu2(f): + r""" + >>> import io + >>> read_stringu2(io.BytesIO(b'\x00\x00hello, world!')) + '' + >>> read_stringu2(io.BytesIO(b'\x05\x00hello, world!')) + 'hello' + >>> read_stringu2(io.BytesIO(b'\xff\xff'+b'a'*65535)) == 'a'*65535 + True + >>> read_stringu2(io.BytesIO(b'\xff\xff'+b'a'*65530)) + Traceback (most recent call last): + ... + ValueError: expected 65535 bytes in a stringu2, but only 65530 remain + """ + + n = read_uint2(f) + assert n >= 0 + + data = f.read(n) + if len(data) == n: + return data.decode('latin-1') + raise ValueError("expected %d bytes in a stringu2, but only %d remain" % + (n, len(data))) + +stringu2 = ArgumentDescriptor( + name='stringu2', + n=TAKEN_FROM_ARGUMENTu2, + reader=read_stringu2, + doc="""A counted semi-short string (type bytes). + + The first argument is a 2-byte little-endian unsigned int + giving the number of bytes in the string and the second + argument is that amount of bytes. + """) + +def read_stringu8(f): + r""" + >>> import io + >>> read_stringu8(io.BytesIO(b'\x04'+ b'\x00'*7 + b'abcde')) + 'abcd' + >>> read_stringu8(io.BytesIO(b'\xff\x01\x01' + b'\x00'*5 + b'a'*0x0101ff)) == 'a'*0x0101ff + True + >>> read_stringu8(io.BytesIO(b'\x05' + b'\x00'*7 + b'a'*4)) + Traceback (most recent call last): + ... + ValueError: expected 5 bytes in a stringu8, but only 4 remain + """ + n = read_uint8(f) + assert n >= 0 + + data = f.read(n) + if len(data) == n: + return data.decode("latin-1") + raise ValueError("expected %d bytes in a stringu8, but only %d remain" % + (n, len(data))) + +stringu8 = ArgumentDescriptor( + name="stringu8", + n=TAKEN_FROM_ARGUMENTu8, + reader=read_stringu8, + doc="""A counted long string (type bytes). + + The first argument is a 8-byte little-endian unsigned int giving + the number of bytes in the string, and the second argument + consists of that many bytes. 
+ """) + def read_string1(f): r""" @@ -426,6 +589,7 @@ n = read_uint1(f) assert n >= 0 + data = f.read(n) if len(data) == n: return data.decode("latin-1") @@ -530,6 +694,74 @@ escape sequences. """) +def read_unicodestringu2(f): + r""" + >>> import io + >>> read_unicodestringu2(io.BytesIO(b'\x00\x00abc')) + '' + >>> read_unicodestringu2(io.BytesIO(b'\x03\x00abc')) + 'abc' + >>> read_unicodestringu2(io.BytesIO(b'\x04\x00' + ('\U0001D223'.encode('utf-8')))) + '\U0001d223' + >>> read_unicodestringu2(io.BytesIO(b'\x0d\x00' + ('\ufb93' * 4).encode('utf-8'))) + Traceback (most recent call last): + ... + ValueError: expected 13 bytes in a unicodestringu2, but only 12 remain + """ + n = read_uint2(f) + assert n >= 0 + + data = f.read(n) + if len(data) == n: + return str(data, 'utf-8', 'surrogatepass') + raise ValueError("expected %d bytes in a unicodestringu2, but only %d " + "remain" % (n, len(data))) + +unicodestringu2 = ArgumentDescriptor( + name='unicodestringu2', + n=TAKEN_FROM_ARGUMENTu2, + reader=read_unicodestringu2, + doc="""A counted semi-short Unicode string. + + The first argument is a 2-byte little-endian unsigned short, giving + the number of bytes in the string, and the second argument is + the UTF-8 encoding of the Unicode string + """) + +def read_unicodestring1(f): + r""" + >>> import io + >>> read_unicodestring1(io.BytesIO(b'\x00abc')) + '' + >>> read_unicodestring1(io.BytesIO(b'\xff' + ('\ufb93'*85).encode('utf-8'))) == '\ufb93'*85 + True + >>> read_unicodestring1(io.BytesIO(b'\x00abc')) + '' + >>> read_unicodestring1(io.BytesIO(b'\xff' + b'a'*254)) + Traceback (most recent call last): + ... 
+ ValueError: expected 255 bytes in a unicodestring1, but only 254 remain + """ + n = read_uint1(f) + assert n >= 0 + + data = f.read(n) + if len(data) == n: + return str(data, 'utf-8', 'surrogatepass') + raise ValueError("expected %d bytes in a unicodestring1, but only " + "%d remain" % (n, len(data))) + +unicodestring1 = ArgumentDescriptor( + name='unicodestring1', + n=TAKEN_FROM_ARGUMENT1, + reader=read_unicodestring1, + doc="""A counted short Unicode string. + + The first argument is a 1-byte unsigned integer giving the number + of bytes in the string, and the second argument is the UTF-8 + encoding of the Unicode string + """) + def read_unicodestring4(f): r""" >>> import io @@ -569,6 +801,38 @@ contains that many bytes. """) +def read_unicodestringu8(f): + r""" + >>> import io + >>> read_unicodestringu8(io.BytesIO(b'\x00'*8 + b'a'*254)) + '' + >>> read_unicodestringu8(io.BytesIO(b'\xff\xff' + b'\x00'*6 + ('\ufb93'*21845).encode('utf-8'))) == '\ufb93'*21845 + True + >>> read_unicodestringu8(io.BytesIO(b'\xff\xff\xff\x7f' + b'\x00'*4)) + Traceback (most recent call last): + ... + ValueError: expected 2147483647 bytes in a unicodestringu8, but only 0 remain + """ + n = read_uint8(f) + assert n >= 0 + + data = f.read(n) + if len(data) == n: + return str(data, 'utf-8', 'surrogatepass') + raise ValueError("expected %d bytes in a unicodestringu8, but only %d " + "remain" % (n, len(data))) + +unicodestringu8 = ArgumentDescriptor( + name="unicodestringu8", + n=TAKEN_FROM_ARGUMENTu8, + reader=read_unicodestringu8, + doc="""A counted long unicode string. + + The first argument is an 8-byte little-endian unsigned int + giving the number of bytes in the unicode string, and + the second argument is the UTF-8 representation of that + unicode string + """) def read_decimalnl_short(f): r""" @@ -900,6 +1164,31 @@ topmost markobject too). 
""") +# v4 +pyset = StackObject( + name="set", + obtype=set, + doc="A Python set object.") + +pyfrozenset = StackObject( + name="frozenset", + obtype=frozenset, + doc="A Python frozenset object.") + +# TODO implement using issubclass(T, object) or something similar +pyclass = anyobject + +""" +pyset_or_frozenset = StackObject( + name="set_or_frozenset", + obtype=(set,frozenset), + doc="A Python set or frozenset object.") +""" + +# TODO. +pyunbound_method = pybound_method = anyobject + + ############################################################################## # Descriptors for pickle opcodes. @@ -927,6 +1216,15 @@ # what the stack looks like after this opcode runs; a list 'stack_after', + # only used in versions >= 4. indicates how many objects are to be + # memoized + # "old" opcodes do not perform automatic memoization (i.e. the ones + # that can't be generated by pickle >= 4) + # if for some weird reason you decide to reintroduce the use of some + # outdated opcode, you need to add automatic memoization to it too (or + # at least manual using BINPUT) + 'memos', + # the protocol number in which this opcode was introduced; an int 'proto', @@ -935,7 +1233,7 @@ ) def __init__(self, name, code, arg, - stack_before, stack_after, proto, doc): + stack_before, stack_after, memos, proto, doc): assert isinstance(name, str) self.name = name @@ -956,7 +1254,12 @@ assert isinstance(x, StackObject) self.stack_after = stack_after + assert memos >= 0 assert isinstance(proto, int) and 0 <= proto <= pickle.HIGHEST_PROTOCOL + self.memos = memos + + assert isinstance(proto, int) and \ + 0 <= proto <= 4 self.proto = proto assert isinstance(doc, str) @@ -972,6 +1275,7 @@ arg=decimalnl_short, stack_before=[], stack_after=[pyinteger_or_bool], + memos=0, proto=0, doc="""Push an integer or bool. @@ -997,6 +1301,7 @@ arg=int4, stack_before=[], stack_after=[pyint], + memos=0, proto=1, doc="""Push a four-byte signed integer. 
@@ -1011,6 +1316,7 @@ arg=uint1, stack_before=[], stack_after=[pyint], + memos=0, proto=1, doc="""Push a one-byte unsigned integer. @@ -1023,6 +1329,7 @@ arg=uint2, stack_before=[], stack_after=[pyint], + memos=0, proto=1, doc="""Push a two-byte unsigned integer. @@ -1036,6 +1343,7 @@ arg=decimalnl_long, stack_before=[], stack_after=[pylong], + memos=0, proto=0, doc="""Push a long integer. @@ -1054,6 +1362,7 @@ arg=long1, stack_before=[], stack_after=[pylong], + memos=0, proto=2, doc="""Long integer using one-byte length. @@ -1065,6 +1374,7 @@ arg=long4, stack_before=[], stack_after=[pylong], + memos=0, proto=2, doc="""Long integer using found-byte length. @@ -1078,6 +1388,7 @@ arg=stringnl, stack_before=[], stack_after=[pystring], + memos=0, proto=0, doc="""Push a Python string object. @@ -1093,6 +1404,7 @@ arg=string4, stack_before=[], stack_after=[pystring], + memos=0, proto=1, doc="""Push a Python string object. @@ -1108,6 +1420,7 @@ arg=string1, stack_before=[], stack_after=[pystring], + memos=1, proto=1, doc="""Push a Python string object. @@ -1116,6 +1429,9 @@ which are taken literally as the string content. (Actually, they are decoded into a str instance using the encoding given to the Unpickler constructor. or the default, 'ASCII'.) + + On versions 4 and above, this object is automatically memoized by the + unpickler (there's no need for BINPUT after this opcode). """), # Bytes (protocol 3 only; older protocols don't support bytes at all) @@ -1125,12 +1441,16 @@ arg=bytes4, stack_before=[], stack_after=[pybytes], + memos=1, proto=3, doc="""Push a Python bytes object. There are two arguments: the first is a 4-byte little-endian unsigned int giving the number of bytes, and the second is that many bytes, which are taken literally as the bytes content. + + On versions 4 and above, this object is automatically memoized by the + unpickler (there's no need for BINPUT after this opcode). 
"""), I(name='SHORT_BINBYTES', @@ -1138,12 +1458,16 @@ arg=bytes1, stack_before=[], stack_after=[pybytes], + memos=1, proto=3, doc="""Push a Python bytes object. There are two arguments: the first is a 1-byte unsigned int giving the number of bytes, and the second is that many bytes, which are taken literally as the string content. + + On versions 4 and above, this object is automatically memoized by the + unpickler (there's no need for BINPUT after this opcode). """), # Ways to spell None. @@ -1153,6 +1477,7 @@ arg=None, stack_before=[], stack_after=[pynone], + memos=0, proto=0, doc="Push None on the stack."), @@ -1164,6 +1489,7 @@ arg=None, stack_before=[], stack_after=[pybool], + memos=0, proto=2, doc="""True. @@ -1174,6 +1500,7 @@ arg=None, stack_before=[], stack_after=[pybool], + memos=0, proto=2, doc="""True. @@ -1186,6 +1513,7 @@ arg=unicodestringnl, stack_before=[], stack_after=[pyunicode], + memos=0, proto=0, # this may be pure-text, but it's a later addition doc="""Push a Python Unicode string object. @@ -1199,12 +1527,16 @@ arg=unicodestring4, stack_before=[], stack_after=[pyunicode], + memos=1, proto=1, doc="""Push a Python Unicode string object. There are two arguments: the first is a 4-byte little-endian unsigned int giving the number of bytes in the string. The second is that many bytes, and is the UTF-8 encoding of the Unicode string. + + On versions 4 and above, this object is automatically memoized by the + unpickler (there's no need for BINPUT after this opcode). """), # Ways to spell floats. @@ -1214,6 +1546,7 @@ arg=floatnl, stack_before=[], stack_after=[pyfloat], + memos=0, proto=0, doc="""Newline-terminated decimal float literal. @@ -1234,6 +1567,7 @@ arg=float8, stack_before=[], stack_after=[pyfloat], + memos=0, proto=1, doc="""Float stored in binary form, with 8 bytes of data. @@ -1252,14 +1586,20 @@ arg=None, stack_before=[], stack_after=[pylist], + memos=1, proto=1, - doc="Push an empty list."), + doc="""Push an empty list. 
+ + On versions 4 and above, this object is automatically memoized by the + unpickler (there's no need for BINPUT after this opcode). + """), I(name='APPEND', code='a', arg=None, stack_before=[pylist, anyobject], stack_after=[pylist], + memos=0, proto=0, doc="""Append an object to a list. @@ -1274,6 +1614,7 @@ arg=None, stack_before=[pylist, markobject, stackslice], stack_after=[pylist], + memos=0, proto=1, doc="""Extend a list by a slice of stack objects. @@ -1288,8 +1629,9 @@ arg=None, stack_before=[markobject, stackslice], stack_after=[pylist], + memos=1, proto=0, - doc="""Build a list out of the topmost stack slice, after markobject. + doc="""Build a list out of the topmost stack slice. All the stack entries following the topmost markobject are placed into a single Python list, which single list object replaces all of the @@ -1297,6 +1639,9 @@ Stack before: ... markobject 1 2 3 'abc' Stack after: ... [1, 2, 3, 'abc'] + + On versions 4 and above, this object is automatically memoized by the + unpickler (there's no need for BINPUT after this opcode). """), # Ways to build tuples. @@ -1306,6 +1651,7 @@ arg=None, stack_before=[], stack_after=[pytuple], + memos=0, proto=1, doc="Push an empty tuple."), @@ -1314,6 +1660,7 @@ arg=None, stack_before=[markobject, stackslice], stack_after=[pytuple], + memos=1, proto=0, doc="""Build a tuple out of the topmost stack slice, after markobject. @@ -1323,6 +1670,9 @@ Stack before: ... markobject 1 2 3 'abc' Stack after: ... (1, 2, 3, 'abc') + + On versions 4 and above, this object is automatically memoized by the + unpickler (there's no need for BINPUT after this opcode). """), I(name='TUPLE1', @@ -1330,6 +1680,7 @@ arg=None, stack_before=[anyobject], stack_after=[pytuple], + memos=1, proto=2, doc="""Build a one-tuple out of the topmost item on the stack. 
@@ -1338,6 +1689,9 @@ words: stack[-1] = tuple(stack[-1:]) + + On versions 4 and above, this object is automatically memoized by the + unpickler (there's no need for BINPUT after this opcode). """), I(name='TUPLE2', @@ -1345,6 +1699,7 @@ arg=None, stack_before=[anyobject, anyobject], stack_after=[pytuple], + memos=1, proto=2, doc="""Build a two-tuple out of the top two items on the stack. @@ -1353,6 +1708,9 @@ words: stack[-2:] = [tuple(stack[-2:])] + + On versions 4 and above, this object is automatically memoized by the + unpickler (there's no need for BINPUT after this opcode). """), I(name='TUPLE3', @@ -1360,6 +1718,7 @@ arg=None, stack_before=[anyobject, anyobject, anyobject], stack_after=[pytuple], + memos=1, proto=2, doc="""Build a three-tuple out of the top three items on the stack. @@ -1368,6 +1727,9 @@ words: stack[-3:] = [tuple(stack[-3:])] + + On versions 4 and above, this object is automatically memoized by the + unpickler (there's no need for BINPUT after this opcode). """), # Ways to build dicts. @@ -1377,14 +1739,20 @@ arg=None, stack_before=[], stack_after=[pydict], + memos=1, proto=1, - doc="Push an empty dict."), + doc="""Push an empty dict. + + On versions 4 and above, this object is automatically memoized by the + unpickler (there's no need for BINPUT after this opcode). + """), I(name='DICT', code='d', arg=None, stack_before=[markobject, stackslice], stack_after=[pydict], + memos=1, proto=0, doc="""Build a dict out of the topmost stack slice, after markobject. @@ -1395,6 +1763,9 @@ Stack before: ... markobject 1 2 3 'abc' Stack after: ... {1: 2, 3: 'abc'} + + On versions 4 and above, this object is automatically memoized by the + unpickler (there's no need for BINPUT after this opcode). """), I(name='SETITEM', @@ -1402,6 +1773,7 @@ arg=None, stack_before=[pydict, anyobject, anyobject], stack_after=[pydict], + memos=0, proto=0, doc="""Add a key+value pair to an existing dict. 
@@ -1416,6 +1788,7 @@ arg=None, stack_before=[pydict, markobject, stackslice], stack_after=[pydict], + memos=0, proto=1, doc="""Add an arbitrary number of key+value pairs to an existing dict. @@ -1439,6 +1812,7 @@ arg=None, stack_before=[anyobject], stack_after=[], + memos=0, proto=0, doc="Discard the top stack item, shrinking the stack by one item."), @@ -1447,6 +1821,7 @@ arg=None, stack_before=[anyobject], stack_after=[anyobject, anyobject], + memos=0, proto=0, doc="Push the top stack item onto the stack again, duplicating it."), @@ -1455,6 +1830,7 @@ arg=None, stack_before=[], stack_after=[markobject], + memos=0, proto=0, doc="""Push markobject onto the stack. @@ -1468,6 +1844,7 @@ arg=None, stack_before=[markobject, stackslice], stack_after=[], + memos=0, proto=1, doc="""Pop all the stack objects at and above the topmost markobject. @@ -1484,6 +1861,7 @@ arg=decimalnl_short, stack_before=[], stack_after=[anyobject], + memos=0, proto=0, doc="""Read an object from the memo and push it on the stack. @@ -1497,6 +1875,7 @@ arg=uint1, stack_before=[], stack_after=[anyobject], + memos=0, proto=1, doc="""Read an object from the memo and push it on the stack. @@ -1509,6 +1888,7 @@ arg=uint4, stack_before=[], stack_after=[anyobject], + memos=0, proto=1, doc="""Read an object from the memo and push it on the stack. @@ -1521,6 +1901,7 @@ arg=decimalnl_short, stack_before=[], stack_after=[], + memos=1, proto=0, doc="""Store the stack top into the memo. The stack is not popped. @@ -1534,6 +1915,7 @@ arg=uint1, stack_before=[], stack_after=[], + memos=1, proto=1, doc="""Store the stack top into the memo. The stack is not popped. @@ -1546,6 +1928,7 @@ arg=uint4, stack_before=[], stack_after=[], + memos=1, proto=1, doc="""Store the stack top into the memo. The stack is not popped. @@ -1561,6 +1944,7 @@ arg=uint1, stack_before=[], stack_after=[anyobject], + memos=0, proto=2, doc="""Extension code. 
@@ -1583,6 +1967,7 @@ arg=uint2, stack_before=[], stack_after=[anyobject], + memos=0, proto=2, doc="""Extension code. @@ -1594,6 +1979,7 @@ arg=int4, stack_before=[], stack_after=[anyobject], + memos=0, proto=2, doc="""Extension code. @@ -1608,6 +1994,7 @@ arg=stringnl_noescape_pair, stack_before=[], stack_after=[anyobject], + memos=1, proto=0, doc="""Push a global object (module.attr) on the stack. @@ -1616,6 +2003,9 @@ object module.class is pushed on the stack. More accurately, the object returned by self.find_class(module, class) is pushed on the stack, so unpickling subclasses can override this form of lookup. + + On versions 4 and above, this object is automatically memoized by the + unpickler (there's no need for BINPUT after this opcode). """), # Ways to build objects of classes pickle doesn't know about directly @@ -1628,8 +2018,10 @@ arg=None, stack_before=[anyobject, anyobject], stack_after=[anyobject], + memos=1, proto=0, - doc="""Push an object built from a callable and an argument tuple. + doc="""Push an object built from a callable and an argument tuple. This + object is memoized automatically. The opcode is named to remind of the __reduce__() method. @@ -1650,6 +2042,9 @@ '__safe_for_unpickling__' attribute with a true value. I'm not sure why it does this, but I've sure seen this complaint often enough when I didn't want to . + + On versions 4 and above, this object is automatically memoized by the + unpickler (there's no need for BINPUT after this opcode). """), I(name='BUILD', @@ -1657,6 +2052,7 @@ arg=None, stack_before=[anyobject, anyobject], stack_after=[anyobject], + memos=0, proto=0, doc="""Finish building an object, via __setstate__ or dict update. @@ -1682,6 +2078,7 @@ arg=stringnl_noescape_pair, stack_before=[markobject, stackslice], stack_after=[anyobject], + memos=0, proto=0, doc="""Build a class instance. 
@@ -1732,6 +2129,7 @@ arg=None, stack_before=[markobject, anyobject, stackslice], stack_after=[anyobject], + memos=0, proto=1, doc="""Build a class instance. @@ -1764,6 +2162,7 @@ arg=None, stack_before=[anyobject, anyobject], stack_after=[anyobject], + memos=1, proto=2, doc="""Build an object instance. @@ -1772,6 +2171,9 @@ top). Call these cls and args. They are popped off the stack, and the value returned by cls.__new__(cls, *args) is pushed back onto the stack. + + On versions 4 and above, this object is automatically memoized by the + unpickler (there's no need for BINPUT after this opcode). """), # Machine control. @@ -1781,6 +2183,7 @@ arg=uint1, stack_before=[], stack_after=[], + memos=0, proto=2, doc="""Protocol version indicator. @@ -1793,6 +2196,7 @@ arg=None, stack_before=[anyobject], stack_after=[], + memos=0, proto=0, doc="""Stop the unpickling machine. @@ -1808,6 +2212,7 @@ arg=stringnl_noescape, stack_before=[], stack_after=[anyobject], + memos=0, proto=0, doc="""Push an object identified by a persistent ID. @@ -1825,6 +2230,7 @@ arg=None, stack_before=[anyobject], stack_after=[anyobject], + memos=0, proto=1, doc="""Push an object identified by a persistent ID. @@ -1833,6 +2239,248 @@ ID is passed to self.persistent_load(), and whatever object that returns is pushed on the stack. See PERSID for more detail. """), + + # Protocol 4 only + + I(name='BINBYTES64', + code='\x8c', + arg=stringu8, + stack_before=[], + stack_after=[pybytes], + memos=1, + proto=4, + doc="""Push a large Python bytes object. + + There are two arguments: the first is an 8-byte little-endian unsigned + int, providing the number of bytes in the string and the second is formed + of that many bytes, taken literally. + + On versions 4 and above, this object is automatically memoized by the + unpickler (there's no need for BINPUT after this opcode). 
+ """), + + I(name='BINUNICODE16', + code='\x8d', + arg=unicodestringu2, + stack_before=[], + stack_after=[pyunicode], + memos=1, + proto=4, + doc="""Push a semi-short Python string object. + + There are two arguments: the first is a 2-byte little-endian unsigned + short giving the number of bytes in the unicode string. The second is + that amount of bytes, and represents the UTF-8 encoding of the Unicode + string. + + On versions 4 and above, this object is automatically memoized by the + unpickler (there's no need for BINPUT after this opcode). + """ + ), + + I(name='SHORT_BINUNICODE', + code='\x8e', + arg=unicodestring1, + stack_before=[], + stack_after=[pyunicode], + memos=1, + proto=4, + doc="""Push a short Python string object. + + There are two arguments: the first is a 1-byte unsigned integer giving + the number of bytes in the unicode string. The second is that amount of + bytes, and represents the UTF-8 encoding of the Unicode string. + + On versions 4 and above, this object is automatically memoized by the + unpickler (there's no need for BINPUT after this opcode). + """ + ), + + I(name='BINUNICODE64', + code='\x8f', + arg=unicodestringu8, + stack_before=[], + stack_after=[pyunicode], + memos=1, + proto=4, + doc="""Push a long Python string object (unicode). + + There are two arguments: the first is an 8-byte unsigned integer giving + the number of bytes in the unicode string. The second is that amount of + bytes, and represents the UTF-8 encoding of the Unicode string. + + On versions 4 and above, this object is automatically memoized by the + unpickler (there's no need for BINPUT after this opcode). + """ + ), + + I(name='BINBYTES16', + code='\x90', + arg=stringu2, + stack_before=[], + stack_after=[pybytes], + memos=1, + proto=4, + doc="""Push a Python semi-short bytes object. + + There are two arguments: the first is an 8-byte little-endian unsigned + integer giving the number of bytes in the string. 
The second is that + amount of bytes, which are taken literally as the bytes content. + + On versions 4 and above, this object is automatically memoized by the + unpickler (there's no need for BINPUT after this opcode). + """ + ), + + I(name='BINGLOBAL', + code='\x91', + arg=unicodestring1_pair, + stack_before=[], + stack_after=[anyobject], + memos=1, + proto=4, + doc="""Push a global object (module.obj) on the stack. + + This works in a similar way to GLOBAL, but instead of taking a pair of + newline-terminated strings as parameters (representing the module name + and the attribute respectively), it takes a pair of two small utf-8 + encoded strings, with their 8bit size prepended to them (the + equivalent of two consecutive SHORT_BINUNICODE opcodes). + + On versions 4 and above, this object is automatically memoized by the + unpickler (there's no need for BINPUT after this opcode). + """), + + I(name='BINGLOBAL_BIG', + code='\x92', + arg=unicodestringu2_pair, + stack_before=[], + stack_after=[anyobject], + memos=1, + proto=4, + doc="""Push a global object (module.obj) on the stack. + + This is used instead of BINGLOBAL for unusually large global names (i.e. + >255 bytes). + """), + + I(name='BINGLOBAL_COMMON', + code='\x93', + arg=uint1_unicodestring1_pair, + stack_before=[], + stack_after=[anyobject], + memos=1, + proto=4, + doc="""Push a global object (module.obj) on the stack. + + This works in a similar way to BINGLOBAL (which in turn works similar to + GLOBAL), but instead of taking the module name as a string for the first + parameter, it takes an uint1 representing the index of the module in a + predefined list of commonly used modules (pickle.COMMON_MODULES). + """), + + # the advantage of not having an opcode that constructs a set with a given + # list of elements is that sets may be self-referential, in which case the + # set itself must be created prior to unpickling its elements. 
+ I(name='EMPTY_SET', + code='\x94', + arg=None, + stack_before=[], + stack_after=[pyset], + memos=1, + proto=4, + doc="""Push an empty set. + + On versions 4 and above, this object is automatically memoized by the + unpickler (there's no need for BINPUT after this opcode). + """ + ), + + I(name='EMPTY_FROZENSET', + code='\x95', + arg=None, + stack_before=[], + stack_after=[pyfrozenset], + memos=0, + proto=4, + doc="""Push an empty frozenset."""), + + I(name='UPDATE_SET', + code='\x96', + arg=None, + stack_before=[pyset, markobject, stackslice], + stack_after=[pyset], + memos=0, + proto=4, + doc="""Update a set (inplace union) with a slice of stack objects. + + Stack before: ... pyset markobject stackslice + Stack after: ... pyset|stackslice. (without creating a new object) + + This can be used to construct a set from more stack slices as follows: + 1. EMPTY_SET + 2. MARK + 3. .. serialize multiple set elements into the stack .. + 4. UPDATE_SET + 5. MARK + 6. .. serialize more set elements into the stack .. + 7. UPDATE_SET + + Note that this opcode does not need to perform any memoization. EMPTY_SET + already memoizes the empty set, and UPDATE_SET does not create a new + set. + """), + + I(name='FROZENSET', + code='\x97', + arg=None, + stack_before=[markobject, stackslice], + stack_after=[pyfrozenset], + memos=1, + proto=4, + doc="""Create a frozenset from a slice of stack objects. + + Stack before: ... markobject stackslice + Stack after: ... pyfrozenset(stackslice) + + The asymmetry between the way sets and frozensets are pickled is caused + by the limitations of immutable containers such as frozensets. They + cannot be pickled until they are filled, so self-referential frozensets + cannot easily BINGET themselves. See save_frozenset or save_tuple for + details on how this issue is circumvented. 
+ """), + I(name='NEWOBJ_KW', + code='\x98', + arg=None, + stack_before=[anyobject, pytuple, pydict], + stack_after=[anyobject], + memos=1, + proto=4, + doc="""Create an object instance. + + The stack should initially contain the class to be instantiated, a tuple + indicating the positional parameters (kargs) and a dictionary indicating + the keyword parameters (kwargs). + + The value returned by cls.__new__(cls, *kargs, **kwargs) is pushed into + the stack. + + Note that this opcode differs from the version 2 opcode NEWOBJ in that it + allows calling __new__ with keyword arguments. + """), + + I(name='BAIL_OUT', + code='\xff', + arg=None, + stack_before=[], + stack_after=[], + memos=0, + proto=4, + doc="""Forces the unpickler to exit unsuccessfully. + + Whenever the pickler fails after some data has already been written into + the stream, this opcode is also written. + """), ] del I @@ -1863,6 +2511,7 @@ code2op[d.code] = d del d + + def assure_pickle_consistency(verbose=False): copy = code2op.copy() @@ -1959,6 +2608,8 @@ if code == b'.': assert opcode.name == 'STOP' break + elif code == b'\xff': + break ############################################################################## # A pickle optimizer. @@ -1995,7 +2646,7 @@ 'pickle' is a file-like object, or string, containing a (at least one) pickle. The pickle is disassembled from the current position, through - the first STOP opcode encountered. + the first STOP or BAIL_OUT opcode encountered. Optional arg 'out' is a file-like object to which the disassembly is printed. It defaults to sys.stdout. 
@@ -2043,6 +2694,8 @@ indentchunk = ' ' * indentlevel errormsg = None annocol = annotate # columnt hint for annotations + failed = False + used_proto = 0 for opcode, arg, pos in genops(pickle): if pos is not None: print("%5d:" % pos, end=' ', file=out) @@ -2051,11 +2704,17 @@ indentchunk * len(markstack), opcode.name) + if opcode.name == 'PROTO': + used_proto = arg + if used_proto >= 4: + memo = [] + maxproto = max(maxproto, opcode.proto) before = opcode.stack_before # don't mutate after = opcode.stack_after # don't mutate numtopop = len(before) + # See whether a MARK should be popped. markmsg = None if markobject in before or (opcode.name == "POP" and @@ -2084,24 +2743,27 @@ else: errormsg = markmsg = "no MARK exists on stack" - # Check for correct memo usage. - if opcode.name in ("PUT", "BINPUT", "LONG_BINPUT"): - assert arg is not None - if arg in memo: - errormsg = "memo key %r already defined" % arg - elif not stack: - errormsg = "stack is empty -- can't store into memo" - elif stack[-1] is markobject: - errormsg = "can't store markobject in the memo" - else: - memo[arg] = stack[-1] - - elif opcode.name in ("GET", "BINGET", "LONG_BINGET"): - if arg in memo: - assert len(after) == 1 - after = [memo[arg]] # for better stack emulation - else: - errormsg = "memo key %r has never been stored into" % arg + if used_proto < 4: + # Check for correct memo usage in pickle <4 + if opcode.name in ("PUT", "BINPUT", "LONG_BINPUT"): + assert arg is not None + if arg in memo: + errormsg = "memo key %r already defined" % arg + elif not stack: + errormsg = "stack is empty -- can't store into memo" + elif stack[-1] is markobject: + errormsg = "can't store markobject in the memo" + else: + memo[arg] = stack[-1] + elif opcode.name in ("GET", "BINGET", "LONG_BINGET"): + if arg in memo: + assert len(after) == 1 + after = [memo[arg]] # for better stack emulation + else: + errormsg = "memo key %r has never been stored into" % arg + + if opcode.name == 'BAIL_OUT': + failed = True if arg 
is not None or markmsg: # make a mild effort to align arguments @@ -2135,11 +2797,53 @@ markstack.append(pos) stack.extend(after) + + if used_proto >= 4: + # Check for correct memo usage in pickle>=4. + if opcode.name in ("PUT", "BINPUT", "LONG_BINPUT"): + if len(memo) > arg: + errormsg = "memo key %r already defined" % arg + elif not stack: + errormsg = "stack is empty -- can't store into memo" + elif stack[-1] is markobject: + errormsg = "can't store markobject in the memo" + else: + if len(memo) == arg: + memo.append(stack[-1]) + else: + errormsg = "memo index %r too large" % arg + elif opcode.name in ("GET", "BINGET", "LONG_BINGET"): + if arg < len(memo): + assert len(after) == 1 + after = [memo[arg]] # for better stack emulation + else: + errormsg = "memo index %r has never been stored into" % arg + else: + # we're dealing with an opcode which does automatic memoization + for i in range(opcode.memos): + # don't know exactly what it memoizes, but top of the stack + # is a good guess since we currently only have memos=0/1 + memo.append(stack[-1]) + print("highest protocol among opcodes =", maxproto, file=out) - if stack: + if not failed and stack: raise ValueError("stack not empty after STOP: %r" % stack) +def maxversion(pickled_data): + """Find the maximum version amongst the used opcodes in the given pickled + data. + + Like in `dis', pickle is a file-like object, or string, containing at least + one pickle. The pickle is disassembled from the current position until the + first STOP opcode is encountered, and the maximum version of the + encountered opcodes is returned. + """ + ret = -1 + for opcode, arg, pos in genops(pickled_data): + ret = max(ret, opcode.proto) + return ret + # For use in the doctest, simply as an example of a class to pickle. 
class _Example: def __init__(self, value): diff -r 4b3238923b01 -r d0c3a8d4947a Lib/test/pickletester.py --- a/Lib/test/pickletester.py Fri May 10 19:57:44 2013 -0700 +++ b/Lib/test/pickletester.py Sat May 11 03:03:53 2013 +0300 @@ -6,6 +6,7 @@ import copyreg import weakref from http.cookies import SimpleCookie +from struct import pack from test.support import ( TestFailed, TESTFN, run_with_locale, no_tracing, @@ -405,8 +406,7 @@ b'coded_valueq\x0fU\x05valueq\x10h\x10h\x10h\x02h\x02ubs}q\x11b.') # set([3]) pickled from 2.x with protocol 2 -DATA6 = b'\x80\x02c__builtin__\nset\nq\x00]q\x01K\x03a\x85q\x02Rq\x03.' - +DATA6 = b'\x80\x02c__builtin__\nset\nq\x00)Rq\x01]q\x02K\x03ab.' def create_data(): c = C() @@ -428,6 +428,104 @@ x.append(5) return x + +class Nested: + n = 'Nested' + + class B: + n = 'Nested.B' + + def f(): + return 'Nested.B.f' + def ff(self): + return 'Nested.B.ff' + + @classmethod + def cm(klass): + return klass.n + + @staticmethod + def sm(): + return 'sm' + + class C: + n = 'Nested.B.C' + + def __init__(self): + self.a = 123 + + def f(): + return 'Nested.B.C.f' + def ff(self): + return 'Nested.B.C.ff' + + def get_a(self): + return self.a +# used to test pickling of unusually large names +class _aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa: + pass + +class MySet(set): + pass + +class MyDict(dict): + pass + +class MyList(list): + pass + +class KwargsNew: + def __new__(cls, *kargs, **kwargs): + self = super(KwargsNew, cls).__new__(cls) + self.kargs = kargs + self.kwargs = kwargs + return self + + def __init__(self, *kargs, **kwargs): + pass + + __dict__ = {} + + def __reduce_ex__(self, proto): + if proto < 4 and self.kwargs: + raise RuntimeError('Can\'t pickle pickletester.KwargsNew ' + 'with kwargs 
in pickle<4') + # note that this function always takes 3 arguments: class name, kargs + # and kwargs (as a tuple and a dict respectively) + from copyreg import __newobj_kw__ + return (__newobj_kw__, (self.__class__, self.kargs, self.kwargs)) + +class KwargsNew2: + def __new__(cls, *kargs, **kwargs): + self = super(KwargsNew2, cls).__new__(cls) + self.kargs = kargs + self.kwargs = kwargs + self.called_getnewargs = False + self.called_getnewargs_kw = False + self._state = [] + self._new_called = True + return self + + __dict__ = {} + + def __getstate__(self): + return [1,2,3] + + def __setstate__(self, state): + self._state = state + + def __getnewargs_kw__(self): + self.called_getnewargs_kw = True + return self.kargs, self.kwargs + + def __getnewargs__(self): + self.called_getnewargs = True + return self.kargs + +class BadKwargsNew: + pass + + class AbstractPickleTests(unittest.TestCase): # Subclass must define self.dumps, self.loads. @@ -681,6 +779,38 @@ got = self.loads(pickle) self.assertEqual(value, got) + def test_set(self): + sets=[set([]), set([1]), set([1,2]), + set(range(1,1000)), set(range(1,1001))] + for proto in protocols: + for s in sets: + s_ = self.loads(self.dumps(s, proto)) + self.assertEqual(s_, s) + self.assertEqual(s_.__getstate__(), s.__getstate__()) + + # self-referential sets + c=myint(1) + s=set([c]) + c.s=s + for proto in protocols: + s_ = self.loads(self.dumps(s, proto)) + self.assertIsNot(s_, s) + self.assertIs( next(iter(s_)).s, s_) + self.assertEqual(1, len(s_)) + + # setting state + s=set([]) + s.__setstate__([1,2,3]) + self.assertEqual(set([1,2,3]), s) + s.__setstate__([]) + self.assertEqual(set([]), s) + self.assertRaises(AttributeError, set.__setstate__, s, ([1], {1:2})) + self.assertRaises(TypeError, set.__setstate__, s, 2) + self.assertRaises(TypeError, set.__setstate__, s, ([],{},3)) + self.assertRaises(TypeError, set.__setstate__, s, ((),{})) + self.assertRaises(TypeError, set.__setstate__, s, ([],[])) + + 
@run_with_locale('LC_ALL', 'de_DE', 'fr_FR') def test_float_format(self): # make sure that floats are formatted locale independent with proto 0 @@ -806,6 +936,12 @@ (3, 2): pickle.TUPLE2, (3, 3): pickle.TUPLE3, (3, 4): pickle.TUPLE, + + (4, 0): pickle.EMPTY_TUPLE, + (4, 1): pickle.TUPLE1, + (4, 2): pickle.TUPLE2, + (4, 3): pickle.TUPLE3, + (4, 4): pickle.TUPLE, } a = () b = (1,) @@ -826,16 +962,19 @@ (1, None): pickle.NONE, (2, None): pickle.NONE, (3, None): pickle.NONE, + (4, None): pickle.NONE, (0, True): pickle.INT, (1, True): pickle.INT, (2, True): pickle.NEWTRUE, (3, True): pickle.NEWTRUE, + (4, True): pickle.NEWTRUE, (0, False): pickle.INT, (1, False): pickle.INT, (2, False): pickle.NEWFALSE, (3, False): pickle.NEWFALSE, + (4, False): pickle.NEWFALSE, } for proto in protocols: for x in None, False, True: @@ -1233,6 +1372,15 @@ self._check_pickling_with_opcode(obj, pickle.APPEND, proto) else: self._check_pickling_with_opcode(obj, pickle.APPENDS, proto) + + def test_appends_on_non_lists(self): + # Issue #17720 + obj = REX_six([1, 2, 3]) + for proto in protocols: + if proto == 0: + self._check_pickling_with_opcode(obj, pickle.APPEND, proto) + else: + self._check_pickling_with_opcode(obj, pickle.APPENDS, proto) def test_setitems_on_non_dicts(self): obj = REX_seven({1: -1, 2: -2, 3: -3}) @@ -1242,6 +1390,941 @@ else: self._check_pickling_with_opcode(obj, pickle.SETITEMS, proto) + def _loads(self, data, version=pickle.HIGHEST_PROTOCOL, minversion=-1, + *kargs, **kwargs): + """Uses loads, but first makes sure there aren't any opcodes of too + high or too low of a version number. + + Usecase: + data = self.dumps([1, 2, 3], proto) + undata = self._loads(data, proto) + + v3_feature = .. 
+ data = self.dumps(v3_feature, 4) + undata = self._loads(v3_feature, 4, 3) + """ + maxv = pickletools.maxversion(data) + self.assertLessEqual(maxv, version) + self.assertLessEqual(minversion, maxv) + return self.loads(data, *kargs, **kwargs) + + def _test_v4_efficient_opcodes(self, strings): + for proto in range(3, 1+pickle.HIGHEST_PROTOCOL): + for expected_opcode, grouped_strings in strings: + for s in grouped_strings: + if s is None: + continue + data = self.dumps(s, proto) + undata = self._loads(data, proto) + + self.assertEqual(s, undata) + + # make sure that version 3 still does it "the old way" + if proto == 3: + if expected_opcode in [ + pickle.SHORT_BINUNICODE, + pickle.BINUNICODE16]: + expected_opcode = pickle.BINUNICODE + elif expected_opcode == pickle.BINBYTES16: + expected_opcode = pickle.BINBYTES + + argument_size = pickletools.code2op[ + expected_opcode.decode('latin-1')].arg.n + + if isinstance(s, str): + s = s.encode('utf-8') + + expected_size = len(s) + if argument_size == pickletools.TAKEN_FROM_ARGUMENT1: + expected_size = bytes([expected_size]) + elif argument_size == pickletools.TAKEN_FROM_ARGUMENTu2: + expected_size = pack('= 0xffff + else None + ] + + # strings of sizes representable on unsigned 16bits are considered + # semi-small + semishort = [ + 'a' * 255 + '\u06ca', # 255+2 + 'a' * 256, + 'a' * 65535, + '\u06ca' * 32767, + + # 21600*3 + 183*4 + 3 = 65535 + ('\ufb93' * 21600 + '\U0001D223' * 183 + 3*'z') + if sys.maxunicode >= 0xffff + else None + ] + + # TODO BINUNICODE64 tests in BigmemPickleTests + normal = [ + 'a' * 65536, + '\ufb93' * 65432, # 65432 * 3 = 196296 + # 65533 + 4 = 65537 + ('a' * 65533 + '\U0001D223') + if sys.maxunicode >= 0xffff + else None + ] + + strings = [(pickle.SHORT_BINUNICODE , short), + (pickle.BINUNICODE16, semishort), + (pickle.BINUNICODE , normal)] + self._test_v4_efficient_opcodes(strings) + + def test_v4_efficient_binbytes(self): + """test pickling various sizes of bytes in v4""" + short = [ + b'', + b'a' 
* 255, + b'\x00' * 255, + b'\x00' + ] + + semishort = [ + b'\x00\x01' * 32767 + b'a', + b'a' * 65535, + b'\x00' * 65535, + b'd' * 12345 + ] + + # TODO BINBYTES64 tests with BigmemPickleTests + normal = [ + b'\x00' * 65536, + '\u06ca'.encode('utf-8') * 32767 + b'aa' + ] + + binbytes = [(pickle.SHORT_BINBYTES , short), + (pickle.BINBYTES16, semishort), + (pickle.BINBYTES , normal)] + self._test_v4_efficient_opcodes(binbytes) + + def test_v4_nested_classes(self): + """test pickling nested classes""" + for proto in range(4, 1+pickle.HIGHEST_PROTOCOL): + for klass in (Nested, Nested.B, Nested.B.C): + data = self.dumps(klass, proto) + undata = self._loads(data, proto, 4) + + self.assertEqual(klass.n, undata.n) + self.assertEqual(klass.n, undata.__qualname__) + self.assertEqual(klass.__qualname__, undata.__qualname__) + + for func in (Nested.B.f, Nested.B.C.f): + data = self.dumps(func, proto) + undata = self._loads(data, proto, 4) + + self.assertEqual(func.__qualname__, undata.__qualname__) + self.assertEqual(func(), undata()) + self.assertLessEqual(4, pickletools.maxversion(data)) + + inst = Nested.B.C() + inst.a = 42 + + data = self.dumps(inst, proto) + undata = self._loads(data, proto, 4) + + self.assertEqual(inst.a, undata.get_a()) + + data = self.dumps( [(inst, Nested.B), (Nested.B.C.f, Nested.B.f, + Nested.B.C.f), + Nested, Nested.B.C, inst, Nested.B.f], proto) + inst.a = -42 + undata = self._loads(data, proto, 4) + + self.assertEqual(42, undata[0][0].a) + self.assertEqual('Nested.B.f', undata[0][1].f()) + self.assertEqual('Nested.B.C.f', undata[1][0]()) + self.assertEqual('Nested.B.f', undata[1][1]()) + self.assertEqual('Nested.B.C.f', undata[1][2]()) + self.assertEqual('Nested', undata[2].n) + self.assertEqual('Nested.B.C', undata[3].n) + self.assertEqual(42, undata[4].get_a()) + self.assertEqual('Nested.B.f', undata[5]()) + + # TODO test classes in other modules, (anon funcs)?, __main__, + # __builtins__ + + def test_v4_binglobal_common(self): + """test pickling 
of globals using BINGLOBAL_COMMON""" + for proto in range(4, 1+pickle.HIGHEST_PROTOCOL): + from glob import glob + globals = [glob, ArithmeticError, UnicodeDecodeError] + data = self.dumps(globals, proto) + undata = self._loads(data, proto, 4) + self.assertEqual(undata, globals) + + globals_dis_actual = io.StringIO() + pickletools.dis(data, globals_dis_actual) + + builtins_id = bytes([pickle.V4_COMMON_MODULES.index('builtins')]) + + # must contain a BINGLOBAL 'glob glob' + self.assertTrue(data.find(pickle.BINGLOBAL + + b'\x04glob\x04glob') != -1) + # must contain a BINGLOBAL_COMMON '{builtins_id} ArithmeticError' + self.assertTrue(data.find(pickle.BINGLOBAL_COMMON + + builtins_id + + b'\x0fArithmeticError') != -1) + # must contain a BINGLOBAL_COMMON '{builtins_id} UnicodeDecodeError' + self.assertTrue(data.find(pickle.BINGLOBAL_COMMON + + builtins_id + + b'\x12UnicodeDecodeError') != -1) + + def test_v4_sets(self): + """test pickling of sets and frozensets""" + # ll contains a list of the objects that will be pickled + ll = [set(), frozenset(), [set()], [set([1])], [set(),set()], + [set([b'a']), set(['a'])], + [set([b'a', b'a']), frozenset([b'a'])], + [frozenset()], [frozenset([1])], [frozenset(), frozenset()], + [frozenset(), frozenset([1])] + ] + # create an ugly object to be pickled + ugly = [[], [1], [1,2], [1,2,3], + range(1,1000), range(1,2000), range(1,1001), range(1,1234), + range(1,2345), + [1, 2, frozenset([1,2,3]), frozenset([4,'abc',b'hello'])] + ] + ugly = list(map(frozenset, ugly)) + # nasty frozenset + ugly.append( + frozenset([frozenset([frozenset([frozenset([1])]),2]),3]) + ) + # nasty set + ugly.append( set(ugly[:]) ) + # add ugly to the list of objects that will be pickled and tested + ll.append(ugly) + for proto in protocols: + for obj in ll: + data = self.dumps(obj, proto) + undata = self._loads(data, proto) + self.assertEqual(obj, undata) + if proto == 3: + data3_len = len(data) + elif proto >= 4: + # make sure that v4 does it better + 
self.assertLess(len(data), data3_len) + + def test_v4_set_refs(self): + """test that set()s and frozenset()s can be memoized correctly""" + for proto in protocols: + for datastructure in (set, frozenset): + s=datastructure([1,2]) + data = self.dumps( (s, s), proto) + ss1, ss2 = self._loads(data, proto) + + self.assertIs(ss1, ss2) + self.assertEqual(ss1, s) + + def _make_pickle_str(self, s, **vars): + """ + Allows creating of serialized data to be loaded and tested. + + E.g.: + self._make_pickle_str('{PROTO}\x04{EMPTY_LIST}{STOP}') + """ + # convert opcode values from bytes to str so we can use them in format + # note that '{0}'.format(b'a') = "b'a'" + b2s = lambda c: c.decode('latin-1') if type(c)==bytes else c + # add opcodes from pickle.* and their corresponding values to `vars' + vars.update( + dict(map(lambda var:(var, b2s(getattr(pickle,var))), + pickle.__all__)) + ) + + return s.replace(' ','') \ + .replace('\n','') \ + .replace('\r','') \ + .format(**vars) \ + .encode('latin-1') + + def test_v4_sets_opcodes(self): + """test unpickling of sets""" + for proto in range(4, 1+pickle.HIGHEST_PROTOCOL): + proto_byte = bytes([proto]).decode('latin-1') + # test creating a set in v4: + # 1. call empty_set + # 2. add mark,1,2,3,4 to stack + # 3. call update_set + # 4. add mark,5,1,2,6 to stack + # 5. call update_set + # 6. 
stop + data = self._make_pickle_str( + ''' + {PROTO} {proto_byte} + {EMPTY_SET} + {MARK} + {BININT1} \x01 {BININT1} \x02 {BININT1} \x03 {BININT1} \x04 + {UPDATE_SET} + {MARK} + {BININT1} \x05 {BININT1} \x01 {BININT1} \x02 {BININT1} \x06 + {UPDATE_SET} + {STOP} + ''', proto_byte=proto_byte) + self.assertEqual( set([1,2,3,4,5,6]), + self._loads(data, minversion=4) ) + + # test creating a set of two frozensets in v4: + data = self._make_pickle_str( + ''' + {PROTO} {proto_byte} + {EMPTY_SET} + {MARK} + {MARK} + {BININT1} \x01 {BININT2} \xff\x7f {BININT1} \x03 + {FROZENSET} + {MARK} + {BININT1} \x05 {BININT1} \x06 {BININT1} \x07 + {FROZENSET} + {UPDATE_SET} + {STOP} + ''', proto_byte=proto_byte) + self.assertEqual( set([frozenset([1,32767,3]), + frozenset([5,6,7]) ]), + self._loads(data, minversion=4) ) + + def _assert_v4_bad_pickle(self, opcodes, additional_exc=(), **kwargs): + number = (self._make_pickle_str('{BININT1} \x01') + .decode('latin-1')) + func = (self._make_pickle_str('{BINGLOBAL} \x08operator\x03add') + .decode('latin-1')) + klass = (self._make_pickle_str('{BINGLOBAL} \x08builtins\x04list') + .decode('latin-1')) + + kwargs['number'] = number + kwargs['func'] = func + kwargs['klass'] = klass + allowed_exceptions = [pickle.UnpicklingError] + list(additional_exc) + for proto in range(4, 1+pickle.HIGHEST_PROTOCOL): + proto_byte = bytes([proto]).decode('latin-1') + opcodes_ = '{PROTO} {proto_byte} %s {STOP}' % opcodes + with self.assertRaises(tuple(allowed_exceptions)): + data = self._make_pickle_str(opcodes_, + proto_byte=proto_byte, **kwargs) + # self._loads(data, proto) + self.loads(data) + + def test_v4_exceptions(self): + """test that the unpickler raises the right exceptions""" + + # try to apply BINPERSID with an empty stack + self._assert_v4_bad_pickle('{BINPERSID}') + + # try TUPLE on stack with no marker + self._assert_v4_bad_pickle('{number} {TUPLE}') + + # try TUPLE1 on empty stack + self._assert_v4_bad_pickle('{TUPLE1}') + + # try TUPLE2 on 
one-element stack + self._assert_v4_bad_pickle('{number} {TUPLE2}') + + # try TUPLE3 on two-element stack + self._assert_v4_bad_pickle('{number} {number} {TUPLE3}') + + # try MARK + TUPLE1 + self._assert_v4_bad_pickle('{MARK} {TUPLE1}') + + # try number + MARK + TUPLE2 + self._assert_v4_bad_pickle('{number} {MARK} {TUPLE2}') + + # try number + MARK + number + TUPLE3 + self._assert_v4_bad_pickle('{number} {MARK} {number} {TUPLE3}') + + # try LIST with no MARK + self._assert_v4_bad_pickle('{LIST}') + + # try DICT with no MARK + self._assert_v4_bad_pickle('{DICT}') + + # try NEWOBJ on empty stack + self._assert_v4_bad_pickle('{NEWOBJ}') + + # try NEWOBJ on one-element stack + self._assert_v4_bad_pickle('{klass} {NEWOBJ}') + + # try NEWOBJ when the first argument is not a class + self._assert_v4_bad_pickle('{func} {number} {TUPLE1} {NEWOBJ}') + + # try NEWOBJ when the second argument is not a tuple + self._assert_v4_bad_pickle('{klass} {number} {NEWOBJ}') + + # try NEWOBJ_KW on too small of a stack (needs 3 elements) + self._assert_v4_bad_pickle('{func} {number} {TUPLE1} {NEWOBJ_KW}') + + # try NEWOBJ_KW when the first argument is not a class + self._assert_v4_bad_pickle('{func} {number} {TUPLE1} {EMPTY_DICT}' + '{NEWOBJ_KW}') + + # try NEWOBJ_KW when the second argument is not a tuple + self._assert_v4_bad_pickle('{klass} {number} {EMPTY_DICT} {NEWOBJ_KW}') + + # try NEWOBJ_KW when the third argument is not a dict + self._assert_v4_bad_pickle('{klass} {number} {TUPLE1} {EMPTY_TUPLE}' + '{NEWOBJ_KW}') + + # try REDUCE on a 1-element stack + self._assert_v4_bad_pickle('{func} {REDUCE}') + + # try REDUCE when the first argument is not a callable + self._assert_v4_bad_pickle('{number} {EMPTY_TUPLE} {REDUCE}') + + # try REDUCE when the second argument is not a tuple + self._assert_v4_bad_pickle('{func} {number} {REDUCE}', + additional_exc=(TypeError,)) + + # try POP on an empty stack + self._assert_v4_bad_pickle('{POP}') + self._assert_v4_bad_pickle('{number} {POP} 
{number} {POP} {POP}') + + # try DUP on an empty stack + self._assert_v4_bad_pickle('{DUP}') + + # try BINGET on nonexistent index + self._assert_v4_bad_pickle('{BINGET} \x00') + self._assert_v4_bad_pickle('{BINPUT} \x00') + + # try APPEND on bad stack size + self._assert_v4_bad_pickle('{APPEND}') + self._assert_v4_bad_pickle('{EMPTY_LIST} {APPEND}') + + # try APPEND when the first argument is not a list + self._assert_v4_bad_pickle('{number} {number} {APPEND}', + additional_exc=(AttributeError,)) + + # try APPEND when the second argument is MARK + self._assert_v4_bad_pickle('{EMPTY_LIST} {MARK} {APPEND}') + + # try APPENDS on bad stack size + self._assert_v4_bad_pickle('{APPENDS}') + + # try APPENDS on no marker + self._assert_v4_bad_pickle('{EMPTY_LIST} {number} {number} {APPENDS}') + + # try APPENDS on no list + self._assert_v4_bad_pickle('{MARK} {number} {number} {APPENDS}', + additional_exc=(AttributeError,)) + + # try APPENDS with non-list first argument + self._assert_v4_bad_pickle('{number} {MARK} {number} {APPENDS}', + additional_exc=(AttributeError,)) + + # try SETITEM with too small of a stack (needs 3 elements) + self._assert_v4_bad_pickle('{EMPTY_DICT} {number} {SETITEM}') + + # try SETITEM with non-dict first argument + self._assert_v4_bad_pickle('{number} {number} {number} {SETITEM}', + additional_exc=(TypeError,)) + + # try SETITEM with MARK as key + self._assert_v4_bad_pickle('{EMPTY_DICT} {MARK} {number} {SETITEM}') + + # try SETITEM with MARK as value + self._assert_v4_bad_pickle('{EMPTY_DICT} {number} {MARK} {SETITEM}') + + # try SETITEMS on empty stack + self._assert_v4_bad_pickle('{SETITEMS}') + + # try SETITEMS on stack with no marker + self._assert_v4_bad_pickle('{EMPTY_DICT} {number} {SETITEMS}') + + # try SETITEMS on stack with no dict + self._assert_v4_bad_pickle('{MARK} {number} {number} {SETITEMS}') + + # try SETITEMS on stack with non-dict first argument + self._assert_v4_bad_pickle('{number} {MARK} {number} {number}' + '{SETITEMS}', + 
additional_exc=(TypeError,)) + + # try SETITEMS on stack with odd amount of elements + self._assert_v4_bad_pickle('{number} {MARK} {number} {number} {number}' + '{SETITEMS}') + + # try BUILD on too small stack + self._assert_v4_bad_pickle('{BUILD}') + self._assert_v4_bad_pickle('{number} {BUILD}') + + # try BUILD on an object that doesn't support __setstate__ or __dict__ + self._assert_v4_bad_pickle( # BUILD 1 {1:1} + '{number} {EMPTY_DICT} {number} {number} {SETITEM} {BUILD}', + additional_exc=(AttributeError,)) + + # try BUILD with MARK + self._assert_v4_bad_pickle( + '{number} {MARK} {SETITEM} {BUILD}') + self._assert_v4_bad_pickle( + '{MARK} {EMPTY_DICT} {SETITEM} {BUILD}') + + # try BINGLOBAL on nonexistent module name + self._assert_v4_bad_pickle('{BINGLOBAL} \x0bqwertyuiop_\x01list', + additional_exc=(ImportError,)) + + # try BINGLOBAL on nonexistent name + self._assert_v4_bad_pickle('{BINGLOBAL} \x08builtins\x0bqwertyuiop_', + additional_exc=(AttributeError,)) + + # try BINGLOBAL on invalid string size + self._assert_v4_bad_pickle('{BINGLOBAL} \x09builtins\x04list', + additional_exc=(EOFError,)) + self._assert_v4_bad_pickle('{BINGLOBAL} \x08builtins\x05list', + additional_exc=(AttributeError,)) + + # try invalid sizes on some other operators + # TODO IMPORTANT FIX! 
+ self._assert_v4_bad_pickle('{BINBYTES64} \x01'+'\x00'*7, + additional_exc=(EOFError,OverflowError)) + self._assert_v4_bad_pickle('{BINBYTES64} \x02'+'\x00'*7 + '\xff'*3, + additional_exc=(OverflowError,)) + self._assert_v4_bad_pickle('{BINUNICODE16} \x02\x00a', + additional_exc=(EOFError,)) + self._assert_v4_bad_pickle('{SHORT_BINUNICODE} \x02a', + additional_exc=(EOFError,)) + self._assert_v4_bad_pickle('{BINUNICODE64} \x02'+'\x00'*7+'a'*255, + additional_exc=(OverflowError,)) + self._assert_v4_bad_pickle('{BINUNICODE64} \x02'+'\x00'*7, + additional_exc=(OverflowError,EOFError)) + self._assert_v4_bad_pickle('{BINBYTES16} \x02\x00a', + additional_exc=(EOFError,)) + self._assert_v4_bad_pickle('{BINBYTES16} \x01\x00aa') + self._assert_v4_bad_pickle('{BINBYTES16} \x01\x00aaa') + + # try BINGLOBAL_COMMON on no name/invalid name + # TODO BINGLOBAL_COMMON? + self._assert_v4_bad_pickle('{BINGLOBAL} \x00\x00', + additional_exc=(ValueError,)) + self._assert_v4_bad_pickle('{BINGLOBAL} \xff\x00', + additional_exc=(EOFError,)) + self._assert_v4_bad_pickle('{BINGLOBAL} \x00', + additional_exc=(EOFError,)) + self._assert_v4_bad_pickle('{BINGLOBAL} \xff\x0bqwertyuiop_', + additional_exc=(EOFError,)) + + # try to apply UPDATE_SET without a MARK + self._assert_v4_bad_pickle('{number} {UPDATE_SET}') + + # try to apply UPDATE_SET without an EMPTY_SET before MARK + self._assert_v4_bad_pickle('{MARK} {number} {UPDATE_SET}') + + # try to apply UPDATE_SET with the wrong thing before MARK + # TODO fix SystemError + self._assert_v4_bad_pickle( + '{EMPTY_LIST} {MARK} {number} {UPDATE_SET}', + additional_exc=(SystemError,)) + + # try STOP on empty stack + self._assert_v4_bad_pickle('') + self._assert_v4_bad_pickle('{number} {POP}') + + # XXX this can only be enabled after we've disabled deprecated operators + """ + # starting with a random character + for i in range(0,256): + # POP is to make sure that we don't accidentaly run into a valid + # opcode such as EMPTY_DICT, so we empty the 
stack to trigger an + # error. + self._assert_v4_bad_pickle('{c} {POP}', + c=bytes([i]).decode('latin-1')) + """ + + # TODO test NEWOBJ with just one argument in reduce (save) + + def test_v4_weird_funcs(self): + funcs = [list.append, list.__add__, dict.fromkeys, len, Nested.B.cm, + Nested.B.sm] + for proto in range(4, 1+pickle.HIGHEST_PROTOCOL): + data=self.dumps(funcs, proto) + funcs_=self._loads(data, proto) + l=[] + funcs_[0](l, 1) # l.append(1) + l=funcs_[1](l, [2,3]) # l += [2,3] + self.assertEqual([1,2,3], l) + self.assertEqual(3, funcs_[3](l)) # len(l) + # dict.fromkeys([1,2]) = {1: None, 2: None} + self.assertEqual({1 : None, 2 : None}, funcs_[2]([1,2])) + self.assertEqual('Nested.B', funcs_[4]()) # Nested.B.cm() + self.assertEqual('sm', funcs_[5]()) # Nested.B.sm() + + + def test_v4_cycles(self): + """test pickling cyclic data structures""" + for proto in range(4, 1+pickle.HIGHEST_PROTOCOL): + # creating a cycle + a=Nested.B() + a.cycle = a + + data = self.dumps(a, proto) + aa = self._loads(data, proto) + + self.assertIs(aa, aa.cycle) + + # creating a cycle using a list + a=Nested.B() + b=[a] + b.append(b) + a.cycle = b + + data = self.dumps((b,a), proto) + bb, aa = self._loads(data, proto) + + self.assertIs(aa, aa.cycle[0]) + self.assertIs(aa.cycle, aa.cycle[1]) + + # creating a cycle using a dict + a=Nested.B() + b={1 : a, 2 : a, 3 : a, a : 4 } + b[5] = b + a.cycle = b + a.cycle_again = b + + data = self.dumps((b, a), proto) + bb, aa = self._loads(data, proto) + + self.assertIs(aa, aa.cycle[1]) + self.assertIs(aa, aa.cycle[2]) + self.assertIs(aa, aa.cycle[3]) + self.assertEquals(4, aa.cycle[aa]) + self.assertIs(aa.cycle, aa.cycle_again) + self.assertIs(aa.cycle[5], aa.cycle) + + # creating a cycle using a set + # http://bugs.python.org/issue998998#msg77200 + a=Nested.B() + b=set([1,2,a,4]) + a.cycle = b + + data = self.dumps((b,a), proto) + bb, aa = self._loads(data, proto, 4) + + self.assertIs(aa.cycle, bb) + self.assertIn(1, bb) + self.assertIn(2, bb) 
+ self.assertIn(aa, bb) + self.assertIn(4, bb) + self.assertIn(aa, bb) + + # creating a cycle using a frozenset + a=Nested.B() + b=frozenset([1,2,a,4, b'hello','hello']) + a.cycle = b + + data = self.dumps((b,a), proto) + bb, aa = self._loads(data, proto, 4) + + self.assertIn(1, bb) + self.assertIn(2, bb) + self.assertIn(aa, bb) + self.assertIn(4, bb) + self.assertIn(b'hello', bb) + self.assertIn('hello', bb) + self.assertIs(aa.cycle, bb) + + a = Nested.B() + b = frozenset([frozenset([frozenset([a])])]) + a.cycle = b + a.cycle2 = ((b,),) + + data = self.dumps(b, proto) + bb = self._loads(data, proto, 4) + + # beautiful! + aa = next(iter( + next(iter( + next(iter( + bb + )) )) )) + self.assertIs(aa.cycle, bb) + self.assertIs(aa.cycle2[0][0], bb) + + # 3-cycle + a=Nested.B() + b=[ (a, a), {1 : a, 2 : a, a : 3}, [a, a, a], set([1,2,a]), + frozenset([1,2,a])] + c=(b, a) + b.append(c) + a.cycle = c + + data = self.dumps(a, proto) + aa = self._loads(data, proto, 4) + + bb = aa.cycle[0] + self.assertIs(aa, bb[0][0]) + self.assertIs(aa, bb[0][1]) + + self.assertIs(aa, bb[1][1]) + self.assertIs(aa, bb[1][2]) + self.assertEqual(3, bb[1][aa]) + + self.assertIs(aa, bb[2][0]) + self.assertIs(aa, bb[2][1]) + self.assertIs(aa, bb[2][2]) + + self.assertIn(1, bb[3]) + self.assertIn(2, bb[3]) + self.assertIn(aa, bb[3]) + + self.assertIn(1, bb[4]) + self.assertIn(2, bb[4]) + self.assertIn(aa, bb[4]) + + self.assertIs(aa.cycle[1], aa) + + # TODO XXX URGENT the next two tests will need fixing in the C code and do + # not pass yet + def test_v4_reduce_cycles(self): + """test reduce-pickling self-referential data structures""" + for proto in range(4, 1+pickle.HIGHEST_PROTOCOL): + a=MySet() + b=Nested.B() + b.a = a + a.add(b) + + data = self.dumps( (a, b), proto) + aa,bb = self._loads(data) + + self.assertIs(type(aa), MySet) + self.assertIs(type(bb), Nested.B) + self.assertIs(next(iter(aa)), bb) + self.assertIs(bb.a, aa) + + def test_v4_bound_methods(self): + obj1 = Nested.B.C() + obj2 = 
Nested.B.C() + obj3 = Nested.B() + + obj1.a = obj2 + obj2.a = 42 + obj2.b = obj3 + obj3.c = 'hey' + + obj4 = MySet() + obj4.test = 'some test' + obj4.add('other test') + + l = [1,2,3,4] + + for proto in range(4, 1+pickle.HIGHEST_PROTOCOL): + # pickle obj1.__getattr__ + data = self.dumps(obj1.__getattribute__, proto) + obj1_getattr_ = self._loads(data, proto, 4) + + obj2_ = obj1_getattr_('a') # obj1.a + self.assertIs(type(obj2_), Nested.B.C) + obj3_ = obj2_.b + self.assertIs(type(obj3_), Nested.B) + self.assertEqual(obj3_.c, 'hey') + + # pickle list functions + data = self.dumps( (l.__repr__, l.append), 4) + # trailing _ indicates the function is not bound to obj3, but + # another object identical to obj3 + l_repr_, l_append_ = self._loads(data, proto, 4) + self.assertEqual('[1, 2, 3, 4]', l_repr_()) + l_append_(5); l_append_(6); l_append_(l_append_.__self__) + self.assertEqual('[1, 2, 3, 4, 5, 6, [...]]', l_repr_()) + + # pickle unbound list functions + data = self.dumps( (l, list.__repr__, list.append), 4) + l_, l_repr_, l_append_ = self._loads(data, proto) + l_repr_ = l_repr_.__get__(l_) # bind + l_append_ = l_append_.__get__(l_) + self.assertEqual('[1, 2, 3, 4]', l_repr_()) + l_append_(5); l_append_(6); l_append_(l_) + self.assertEqual('[1, 2, 3, 4, 5, 6, [...]]', l_repr_()) + + # pickle a set's add and lookup methods and then use them + s = set([]) + data = self.dumps( (s.add, s.__contains__), proto) + add_func, contains_func = self._loads(data, proto, 4) + for i in range(1,2000): + add_func(i) + + for i in range(1,2000): + self.assertTrue(contains_func(i)) + self.assertFalse(contains_func(2000)) + self.assertFalse(s) # the original set should still be empty + + # pickle obj3 stuff + data = self.dumps(obj3.ff, proto) + obj3_ff_ = self._loads(data, proto, 4) + self.assertEqual(obj3_ff_(), 'Nested.B.ff') + + # pickle obj2 stuff + data = self.dumps(obj2.get_a, proto) + obj2_get_a_ = self._loads(data, proto, 4) + self.assertEqual(42, obj2_get_a_()) + + # like 
above, but pickle the object and the function separately + data = self.dumps( (obj2, Nested.B.C.get_a), proto) + obj2_, obj2_get_a_ = self._loads(data, proto, 4) + self.assertEqual(42, obj2_get_a_(obj2_)) + + # pickle obj4 stuff + data = self.dumps( (obj4.__getattribute__, obj4.__contains__, + obj4.__repr__), proto) + obj4_getattr_, obj4_contains_, obj4_repr_ = \ + self._loads(data, proto, 4) + self.assertEqual('some test', obj4_getattr_('test')) + self.assertTrue(obj4_contains_('other test')) + self.assertFalse(obj4_contains_('yet another test')) + self.assertEquals("MySet({'other test'})", obj4_repr_()) + + def test_v4_bail_out(self): + """test that the BAIL_OUT opcode is written on failure""" + # can't unpickle locals + class ImpossibleToUnpickle: + pass + for proto in range(4, 1+pickle.HIGHEST_PROTOCOL): + stream = io.BytesIO() + with self.assertRaises(BaseException): + p=self.pickler(stream, proto) + p.dump( (1,2, ImpossibleToUnpickle) ) + + with self.assertRaisesRegex(pickle.UnpicklingError, + 'does not contain correctly pickled ' + 'data'): + stream.seek(0) + u=self.unpickler(stream) + u.load() + + def test_v4_new_kwargs(self): + """test __new__ with keyword arguments""" + obj1 = KwargsNew(1,2,3,foo=4,bar=5) + obj2 = KwargsNew(x=obj1, y=obj1) + obj3 = KwargsNew2(1,2,a=obj1,b=obj1,c=obj2,d=obj2) + + for proto in range(4, 1+pickle.HIGHEST_PROTOCOL): + self.assertEqual('__newobj_kw__', + obj3.__reduce_ex__(proto)[0].__name__) + data = self.dumps(obj3, proto) + obj3_ = self._loads(data, proto, 4) + + # memoization test + obj2_ = obj3_.kwargs['c'] + obj1_ = obj3_.kwargs['a'] + self.assertIs(obj2_, obj3_.kwargs['d']) + self.assertIs(obj1_, obj3_.kwargs['b']) + self.assertIs(obj2_.kwargs['x'], obj2_.kwargs['y']) + self.assertIs(obj1_, obj2_.kwargs['x']) + + self.assertEqual(obj2.kargs, obj2_.kargs) # empty tuple + self.assertEqual(obj1.kargs, obj1_.kargs) + self.assertEqual(obj1.kwargs, obj1_.kwargs) + + def test_v4_getnewargs_and_getstate(self): + kargs = 
tuple(range(1,50)) + x = 'x' + y = 'y' + kwargs = {x : y, y : x} + # implements __getstate__, __setstate__, __getnewargs__, + # __getnewargs_kw__ + obj = KwargsNew2(*kargs, **kwargs) + for proto in protocols: + # KwargsNew2 sets one of these to true, depending on whether + # __getnewargs__ or __getnewargs_kw__ was called + obj.called_getnewargs = False + obj.called_getnewargs_kw = False + data = self.dumps(obj, proto) + # pickle>=4 will consider __getnewargs_kw__ as well, and will + # therefore generate a NEWOBJ_KW opcode, thus the pickled data must + # contain at least one >=4 opcode. + obj_ = self._loads(data, proto, 0 if proto < 4 else 4) + # __getstate__ was used in pickling/unpickling + self.assertEqual([1, 2, 3], obj_._state) + if proto < 2: + # 0 and 1 have __getstate__, but no __getnewargs__ + # XXX __init__ and __new__ are not called on pickle<2. is this + # a bug in _reconstructor? + self.assertFalse(obj.called_getnewargs) + self.assertFalse(obj.called_getnewargs_kw) + # self.assertFalse(obj_.kargs) + # self.assertFalse(obj_.kwargs) + elif proto < 4: + # 2 and 3 have __getstate__ and __getnewargs__ + self.assertTrue(obj.called_getnewargs) + self.assertFalse(obj.called_getnewargs_kw) + self.assertEqual(obj_.kargs, kargs) + else: + # 4 and above have __getstate__, __getnewargs__ and + # __getnewargs_kw__ + self.assertFalse(obj.called_getnewargs) + self.assertTrue(obj.called_getnewargs_kw) + self.assertEqual(obj_.kargs, kargs) + self.assertEqual(obj_.kwargs, kwargs) + + def test_v4_bad_getnewargs_kw(self): + obj = BadKwargsNew() + getnewargs_kw_funcs = [ + lambda: [(1,2), {3:4}], + lambda: ('abc', {2:3}), + lambda: ((1,2), (3,4)), + lambda: ((1,2), {3:4}, {3:4}) + ] + for proto in range(4, 1+pickle.HIGHEST_PROTOCOL): + for func in getnewargs_kw_funcs: + obj.__getnewargs_kw__ = func + with self.assertRaises(TypeError): + obj.__reduce_ex__(proto) + + + def _used_opcodes(self, data): + opcodes=set() + for opcode, arg, pos in pickletools.genops(data): + 
opcodes.update([opcode.name]) + return opcodes + + # TODO test pickling when supplying common_modules + # TODO test "can't work with the locals of" + + def test_v4_binglobal_big(self): + klass=_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa + for proto in protocols: + data=self.dumps(klass, proto) + klass_=self._loads(data, proto) + self.assertEqual(klass_, klass) + opcodes=self._used_opcodes(data) + if proto < 4: + self.assertNotIn('BINGLOBAL', opcodes) + self.assertNotIn('BINGLOBAL_BIG', opcodes) + self.assertIn('GLOBAL', opcodes) + else: + self.assertNotIn('GLOBAL', opcodes) + self.assertNotIn('BINGLOBAL', opcodes) + self.assertIn('BINGLOBAL_BIG', opcodes) class BigmemPickleTests(unittest.TestCase): @@ -1464,7 +2547,7 @@ def test_highest_protocol(self): # Of course this needs to be changed when HIGHEST_PROTOCOL changes. - self.assertEqual(pickle.HIGHEST_PROTOCOL, 3) + self.assertEqual(pickle.HIGHEST_PROTOCOL, 4) def test_callapi(self): f = io.BytesIO() diff -r 4b3238923b01 -r d0c3a8d4947a Lib/test/test_pickletools.py --- a/Lib/test/test_pickletools.py Fri May 10 19:57:44 2013 -0700 +++ b/Lib/test/test_pickletools.py Sat May 11 03:03:53 2013 +0300 @@ -5,6 +5,8 @@ from test.pickletester import AbstractPickleModuleTests class OptimizedPickleTests(AbstractPickleTests, AbstractPickleModuleTests): + pickler = pickle.Pickler + unpickler = pickle.Unpickler def dumps(self, arg, proto=None): return pickletools.optimize(pickle.dumps(arg, proto)) diff -r 4b3238923b01 -r d0c3a8d4947a Modules/_pickle.c --- a/Modules/_pickle.c Fri May 10 19:57:44 2013 -0700 +++ b/Modules/_pickle.c Sat May 11 03:03:53 2013 +0300 @@ -6,7 +6,7 @@ /* Bump this when new opcodes are added to the pickle protocol. 
*/ enum { - HIGHEST_PROTOCOL = 3, + HIGHEST_PROTOCOL = 4, DEFAULT_PROTOCOL = 3 }; @@ -71,7 +71,23 @@ /* Protocol 3 (Python 3.x) */ BINBYTES = 'B', - SHORT_BINBYTES = 'C' + SHORT_BINBYTES = 'C', + + /* Protocol 4 (Python 3.) */ + BINBYTES64 = '\x8c', + BINUNICODE16 = '\x8d', + SHORT_BINUNICODE = '\x8e', + BINUNICODE64 = '\x8f', + BINBYTES16 = '\x90', + BINGLOBAL = '\x91', + BINGLOBAL_BIG = '\x92', + BINGLOBAL_COMMON = '\x93', + EMPTY_SET = '\x94', + EMPTY_FROZENSET = '\x95', + UPDATE_SET = '\x96', + FROZENSET = '\x97', + NEWOBJ_KW = '\x98', + BAIL_OUT = '\xff' }; /* These aren't opcodes -- they're ways to pickle bools before protocol 2 @@ -136,6 +152,161 @@ /* For looking up name pairs in copyreg._extension_registry. */ static PyObject *two_tuple = NULL; +static PyObject *v4_common_modules = NULL; + +static PyObject * +unbind (PyObject *func) +{ + PyObject *self = NULL, *unbound = NULL, *name; + static PyObject *self_str = NULL, *func_str = NULL, *name_str = NULL; + + if (!self_str) { + self_str = PyUnicode_InternFromString("__self__"); + if (!self_str) return NULL; + } + + self = PyObject_GetAttr(func, self_str); + PyErr_Clear(); + if (!self || PyModule_Check(self) || PyType_Check(self)) { + PyErr_SetString(PyExc_TypeError, "not a bound method"); + Py_XDECREF(self); + return NULL; + } + else { + if (!func_str) { + func_str = PyUnicode_InternFromString("__func__"); + if (!func_str) goto done; + } + unbound = PyObject_GetAttr(func, func_str); + if (unbound) goto done; + else { + if (PyErr_ExceptionMatches(PyExc_AttributeError)) + PyErr_Clear(); + else return NULL; + if (!name_str) { + name_str = PyUnicode_InternFromString("__name__"); + if (!name_str) goto done; + } + name = PyObject_GetAttr(func, name_str); + if (!name) goto done; + unbound = PyObject_GetAttr((PyObject*)Py_TYPE(self), name); + Py_DECREF(name); + } + } + +done: + Py_DECREF(self); + return unbound; +} + +static int isclassmethod (PyObject *func) +{ + PyObject *self; + static PyObject *self_str = NULL; 
+ + if (Py_TYPE(func) != &PyMethod_Type && + Py_TYPE(func) != &PyCFunction_Type && + Py_TYPE(func) != &PyClassMethod_Type && + Py_TYPE(func) != &PyClassMethodDescr_Type) return 0; + + if (!self_str) { + self_str = PyUnicode_InternFromString("__self__"); + if (!self_str) return 0; + } + + self = PyObject_GetAttr(func, self_str); + if (self && PyType_Check(self)) { Py_DECREF(self); return 1; } + Py_XDECREF(self); + return 0; +} + +static PyObject * +getattr_recurse (PyObject *obj, PyObject *attr) +{ + static PyObject *locals_str = NULL, *qualname_str = NULL, *dot = NULL; + PyObject *attr_parts, *iter, *item, *crt = obj, *prev; + + assert(PyUnicode_Check(attr)); + + if (locals_str == NULL) { + /* + appears as a token in __qualname__. E.g.: + >>> def f(): + ... def g(): + ... pass + ... return g.__qualname__ + ... + >>> f() + 'f..g' + */ + locals_str = PyUnicode_InternFromString(""); + if (locals_str == NULL) return NULL; + } + if (qualname_str == NULL) { + qualname_str = PyUnicode_InternFromString("__qualname__"); + if (qualname_str == NULL) return NULL; + } + if (dot == NULL) { + dot = PyUnicode_InternFromString("."); + if (dot == NULL) return NULL; + } + + attr_parts = PyUnicode_Split(attr, dot, 128); + if (!attr_parts) + return NULL; + + iter = PyObject_GetIter(attr_parts); + + // Making sure that the first call to Py_DECREF(prev) below won't decrement + // obj's refcount + Py_INCREF(obj); + + while ( (item = PyIter_Next(iter)) ) { + //check item=="" + PyObject *is_locals = PyUnicode_RichCompare(item, locals_str, Py_EQ); + + if (is_locals == Py_True) { + PyObject *qualname = PyObject_GetAttr(crt, qualname_str); + if (qualname == NULL) { crt = NULL; goto error; } + PyErr_Format(PyExc_TypeError, + "Cannot work with the locals of %U", qualname); + Py_DECREF(item); + Py_DECREF(qualname); + Py_DECREF(is_locals); + crt = NULL; + goto error; + } + else if (is_locals == Py_NotImplemented) { + PyErr_BadInternalCall(); + crt = NULL; + Py_DECREF(item); + Py_DECREF(is_locals); 
+ goto error; + } + else if (is_locals == NULL) { + crt = NULL; + Py_DECREF(item); + goto error; + } + + prev = crt; + crt = PyObject_GetAttr(crt, item); + Py_DECREF(prev); + Py_DECREF(is_locals); + if (crt == NULL) { Py_DECREF(item); goto error; } + + Py_DECREF(item); + } + + //iteration failed + if (PyErr_Occurred()) crt = NULL; + +error: + Py_DECREF(iter); + Py_DECREF(attr_parts); + return crt; +} + static int stack_underflow(void) { @@ -200,6 +371,7 @@ return 0; while (--i >= clearto) { + //XXX are you sure Py_CLEAR is desired here as opposed to Py_(X)DECREF? Py_CLEAR(self->data[i]); } Py_SIZE(self) = clearto; @@ -340,6 +512,7 @@ int fix_imports; /* Indicate whether Pickler should fix the name of globals for Python 2.x. */ PyObject *fast_memo; + PyObject *common_modules; } PicklerObject; typedef struct UnpicklerObject { @@ -349,7 +522,11 @@ /* The unpickler memo is just an array of PyObject *s. Using a dict is unnecessary, since the keys are contiguous ints. */ PyObject **memo; + /* memo_size indicates the size allocated for the memo */ Py_ssize_t memo_size; + /* memo_real_size is used in pickle>=4 to indicate the first free index for + memoization */ + Py_ssize_t memo_real_size; PyObject *arg; PyObject *pers_func; /* persistent_load() method, can be NULL. */ @@ -377,6 +554,7 @@ int proto; /* Protocol of the pickle loaded. */ int fix_imports; /* Indicate whether Unpickler should fix the name of globals pickled by Python 2.x. 
*/ + PyObject *common_modules; } UnpicklerObject; /* Forward declarations */ @@ -774,6 +952,7 @@ self->fast_nesting = 0; self->fix_imports = 0; self->fast_memo = NULL; + self->common_modules = v4_common_modules; self->memo = PyMemoTable_New(); if (self->memo == NULL) { @@ -1120,6 +1299,7 @@ if (self->memo == NULL) return; self->memo = NULL; + self->memo_real_size = 0; i = self->memo_size; while (--i >= 0) { Py_XDECREF(memo[i]); @@ -1144,6 +1324,7 @@ memset(&self->buffer, 0, sizeof(Py_buffer)); self->memo_size = 32; + self->memo_real_size = 0; self->memo = _Unpickler_NewMemo(self->memo_size); if (self->memo == NULL) { Py_DECREF(self); @@ -1167,6 +1348,7 @@ self->marks_size = 0; self->proto = 0; self->fix_imports = 0; + self->common_modules = v4_common_modules; return self; } @@ -1222,6 +1404,18 @@ return 0; } +// copy (i-1)-th byte from integer SRC into i-th byte of char* dest +// ITH should be a constant, as it's used twice in the macro definition +// COPY_BYTE(pdata, size, 1) --> pdata[1] = (unsigned char)(size & 0xff) +// COPY_BYTE(pdata, size, 2) --> pdata[2] = (unsigned char)((size>>8)&0xff) +// COPY_BYTE(pdata, size, 3) --> pdata[3] = (unsigned char)((size>>16)&0xff) +#define COPY_BYTE(DEST, SRC, ITH) \ + (DEST)[(ITH)] = (unsigned char)( ((SRC) >> (((ITH)-1)<<3)) & 0xff ) +//like above, but copy i-th byte from integer SRC into i-th byte of char* dest +#define COPY_BYTE_NO_OFFSET(DEST, SRC, ITH) \ + (DEST)[(ITH)] = (unsigned char)( ((SRC) >> ((ITH)<<3)) & 0xff ) + + /* Generate a GET opcode for an object stored in the memo. */ static int memo_get(PicklerObject *self, PyObject *key) @@ -1269,8 +1463,13 @@ return 0; } -/* Store an object in the memo, assign it a new unique ID based on the number - of objects currently stored in the memo and generate a PUT opcode. */ +/** + * Store an object in the memo, assign it a new unique ID based on the number + of objects currently stored in the memo. 
+ * Until v4, this generates a PUT opcode, but as of v4, the pickler and + unpickler agree to use the same indexing technique, making the writing of a + PUT opcode unnecessary. This considerably reduces the size of a pickle. + */ static int memo_put(PicklerObject *self, PyObject *obj) { @@ -1286,35 +1485,37 @@ if (PyMemoTable_Set(self->memo, obj, x) < 0) goto error; - if (!self->bin) { - pdata[0] = PUT; - PyOS_snprintf(pdata + 1, sizeof(pdata) - 1, - "%" PY_FORMAT_SIZE_T "d\n", x); - len = strlen(pdata); - } - else { - if (x < 256) { - pdata[0] = BINPUT; - pdata[1] = (unsigned char)x; - len = 2; - } - else if (x <= 0xffffffffL) { - pdata[0] = LONG_BINPUT; - pdata[1] = (unsigned char)(x & 0xff); - pdata[2] = (unsigned char)((x >> 8) & 0xff); - pdata[3] = (unsigned char)((x >> 16) & 0xff); - pdata[4] = (unsigned char)((x >> 24) & 0xff); - len = 5; - } - else { /* unlikely */ - PyErr_SetString(PicklingError, - "memo id too large for LONG_BINPUT"); - return -1; - } - } - - if (_Pickler_Write(self, pdata, len) < 0) - goto error; + //report the memoization on older versions + if (self->proto < 4) { + if (!self->bin) { + pdata[0] = PUT; + PyOS_snprintf(pdata + 1, sizeof(pdata) - 1, + "%" PY_FORMAT_SIZE_T "d\n", x); + len = strlen(pdata); + } + else { + if (x < 256) { + pdata[0] = BINPUT; + pdata[1] = (unsigned char)x; + len = 2; + } + else if (x <= 0xffffffffL) { + pdata[0] = LONG_BINPUT; + pdata[1] = (unsigned char)(x & 0xff); + pdata[2] = (unsigned char)((x >> 8) & 0xff); + pdata[3] = (unsigned char)((x >> 16) & 0xff); + pdata[4] = (unsigned char)((x >> 24) & 0xff); + len = 5; + } + else { /* unlikely */ + PyErr_SetString(PicklingError, + "memo id too large for LONG_BINPUT"); + return -1; + } + } + if (_Pickler_Write(self, pdata, len) < 0) + goto error; + } if (0) { error: @@ -1324,15 +1525,29 @@ return status; } +/** + * In pickle>=4, this is called by the unpickler when an object that is loaded + * is to be memoized. 
+ */ +static int +_Unpickler_Memoize(UnpicklerObject *self, PyObject *obj) +{ + if (self->proto < 4) return 0; + + return _Unpickler_MemoPut(self, self->memo_real_size++, obj); +} + static PyObject * whichmodule(PyObject *global, PyObject *global_name) { Py_ssize_t i, j; static PyObject *module_str = NULL; static PyObject *main_str = NULL; + static PyObject *self_str = NULL; PyObject *module_name; PyObject *modules_dict; PyObject *module; + PyObject *self; PyObject *obj; if (module_str == NULL) { @@ -1342,27 +1557,47 @@ main_str = PyUnicode_InternFromString("__main__"); if (main_str == NULL) return NULL; + self_str = PyUnicode_InternFromString("__self__"); + if (self_str == NULL) + return NULL; } module_name = PyObject_GetAttr(global, module_str); /* In some rare cases (e.g., bound methods of extension types), - __module__ can be None. If it is so, then search sys.modules - for the module of global. */ - if (module_name == Py_None) { + __module__ can be None. If it is so, then search sys.modules for the + module of global. Before doing so, check if the global has a __self__ + attribute which in turn has a __module__. 
*/ + if (!module_name) { + if (PyErr_ExceptionMatches(PyExc_AttributeError)) + PyErr_Clear(); + else + return NULL; + } + else if (module_name == Py_None) { Py_DECREF(module_name); - goto search; - } - - if (module_name) { - return module_name; - } - if (PyErr_ExceptionMatches(PyExc_AttributeError)) - PyErr_Clear(); - else - return NULL; - - search: + } + else return module_name; + + self = PyObject_GetAttr(global, self_str); + if (!self) { + if (PyErr_ExceptionMatches(PyExc_AttributeError)) + PyErr_Clear(); + else + return NULL; + } + else { + module_name = PyObject_GetAttr(self, module_str); + Py_DECREF(self); + if (!module_name) { + if (PyErr_ExceptionMatches(PyExc_AttributeError)) + PyErr_Clear(); + else + return NULL; + } + else return module_name; + } + modules_dict = PySys_GetObject("modules"); if (modules_dict == NULL) return NULL; @@ -1373,7 +1608,7 @@ if (PyObject_RichCompareBool(module_name, main_str, Py_EQ) == 1) continue; - obj = PyObject_GetAttr(module, global_name); + obj = getattr_recurse(module, global_name); if (obj == NULL) { if (PyErr_ExceptionMatches(PyExc_AttributeError)) PyErr_Clear(); @@ -1778,18 +2013,37 @@ header[1] = (unsigned char)size; len = 2; } - else if (size <= 0xffffffffL) { - header[0] = BINBYTES; - header[1] = (unsigned char)(size & 0xff); - header[2] = (unsigned char)((size >> 8) & 0xff); - header[3] = (unsigned char)((size >> 16) & 0xff); - header[4] = (unsigned char)((size >> 24) & 0xff); - len = 5; - } - else { + else if (self->proto >= 4 && size <= 0xffff) { + header[0] = BINBYTES16; + COPY_BYTE(header, size, 1); + COPY_BYTE(header, size, 2); + len = 3; + } + else if (size > 0xffffffffL) { + //TODO find better types to use +#if SIZEOF_SIZE_T > 4 + if (self->proto <= 3) { + PyErr_SetString(PyExc_OverflowError, + "cannot serialize a bytes object larger than 4GB"); + return -1; /* string too large */ + } + header[0] = BINBYTES64; + COPY_BYTE(header, size, 1); COPY_BYTE(header, size, 2); + COPY_BYTE(header, size, 3); 
COPY_BYTE(header, size, 4); + COPY_BYTE(header, size, 5); COPY_BYTE(header, size, 6); + COPY_BYTE(header, size, 7); COPY_BYTE(header, size, 8); + len = 9; +#else PyErr_SetString(PyExc_OverflowError, "cannot serialize a bytes object larger than 4 GiB"); return -1; /* string too large */ +#endif + } + else { + header[0] = BINBYTES; + COPY_BYTE(header, size, 1); COPY_BYTE(header, size, 2); + COPY_BYTE(header, size, 3); COPY_BYTE(header, size, 4); + len = 5; } if (_Pickler_Write(self, header, len) < 0) @@ -1875,24 +2129,51 @@ static int write_utf8(PicklerObject *self, char *data, Py_ssize_t size) { - char pdata[5]; - + char pdata[9]; + Py_ssize_t len; + + // 8bit size + if (self->proto >= 4 && size < 256) { + pdata[0] = SHORT_BINUNICODE; + COPY_BYTE(pdata, size, 1); + len = 2; + } + //16bit size + else if (self->proto >= 4 && size <= 0xffff) { + pdata[0] = BINUNICODE16; + COPY_BYTE(pdata, size, 1); + COPY_BYTE(pdata, size, 2); + len = 3; + } + //64bit size (or fail on pickle<4) + else if (size > 0xffffffffL) { #if SIZEOF_SIZE_T > 4 - if (size > 0xffffffffUL) { - /* string too large */ + if (self->proto < 4) { + PyErr_SetString(PyExc_OverflowError, + "cannot serialize a string larger than 4GiB"); + return -1; /* string too large */ + } + pdata[0] = BINUNICODE64; + COPY_BYTE(pdata, size, 1); COPY_BYTE(pdata, size, 2); + COPY_BYTE(pdata, size, 3); COPY_BYTE(pdata, size, 4); + COPY_BYTE(pdata, size, 5); COPY_BYTE(pdata, size, 6); + COPY_BYTE(pdata, size, 7); COPY_BYTE(pdata, size, 8); + len = 9; +#else PyErr_SetString(PyExc_OverflowError, "cannot serialize a string larger than 4GiB"); - return -1; - } + return -1; /* string too large */ #endif - - pdata[0] = BINUNICODE; - pdata[1] = (unsigned char)(size & 0xff); - pdata[2] = (unsigned char)((size >> 8) & 0xff); - pdata[3] = (unsigned char)((size >> 16) & 0xff); - pdata[4] = (unsigned char)((size >> 24) & 0xff); - - if (_Pickler_Write(self, pdata, sizeof(pdata)) < 0) + } + //32bit size + else { + pdata[0] = BINUNICODE; + 
COPY_BYTE(pdata, size, 1); COPY_BYTE(pdata, size, 2); + COPY_BYTE(pdata, size, 3); COPY_BYTE(pdata, size, 4); + len = 5; + } + + if (_Pickler_Write(self, pdata, len) < 0) return -1; if (_Pickler_Write(self, data, size) < 0) @@ -2590,30 +2871,252 @@ return status; } +static int save_global_nonbinary( + PicklerObject *self, + PyObject *module_name, + PyObject *global_name) +{ + static char global_op = GLOBAL; + PyObject *encoded; + PyObject *(*unicode_encoder)(PyObject *); + + /* Since Python 3.0 now supports non-ASCII identifiers, we encode both + the module name and the global name using UTF-8. We do so only when + we are using the pickle protocol newer than version 3. This is to + ensure compatibility with older Unpickler running on Python 2.x. */ + if (self->proto >= 3) { + unicode_encoder = PyUnicode_AsUTF8String; + } + else { + unicode_encoder = PyUnicode_AsASCIIString; + } + + if ( _Pickler_Write(self, &global_op, 1) < 0) + return -1; + + /* Save the name of the module. */ + encoded = unicode_encoder(module_name); + if (encoded == NULL) { + if (PyErr_ExceptionMatches(PyExc_UnicodeEncodeError)) + PyErr_Format(PicklingError, + "can't pickle module identifier '%S' using " + "pickle protocol %i", module_name, self->proto); + return -1; + } + if (_Pickler_Write(self, PyBytes_AS_STRING(encoded), + PyBytes_GET_SIZE(encoded)) < 0) { + Py_DECREF(encoded); + return -1; + } + Py_DECREF(encoded); + if(_Pickler_Write(self, "\n", 1) < 0) + return -1; + + /* Save the name of the global. 
*/ + encoded = unicode_encoder(global_name); + if (encoded == NULL) { + if (PyErr_ExceptionMatches(PyExc_UnicodeEncodeError)) + PyErr_Format(PicklingError, + "can't pickle global identifier '%S' using " + "pickle protocol %i", global_name, self->proto); + return -1; + } + if (_Pickler_Write(self, PyBytes_AS_STRING(encoded), + PyBytes_GET_SIZE(encoded)) < 0) { + Py_DECREF(encoded); + return -1; + } + Py_DECREF(encoded); + if(_Pickler_Write(self, "\n", 1) < 0) + return -1; + + return 0; +} + +/* + * Only for pickle >= 4. + * Uses opcodes BINGLOBAL, BINGLOBAL_COMMON, BINGLOBAL_BIG + */ +static int save_global_binary( + PicklerObject *self, + PyObject *module_name, + PyObject *global_name) +{ + char global_op; + int return_code = 0; + PyObject *encoded_module_name, *encoded_global_name; + Py_ssize_t common_module_id, encoded_module_size, encoded_global_size; + + assert(module_name != NULL && global_name != NULL); + + if ( (common_module_id = PySequence_Index(self->common_modules, + module_name)) < 0) { + if (!PyErr_ExceptionMatches(PyExc_ValueError)) + return -1; + PyErr_Clear(); + } + + encoded_module_name = PyUnicode_AsUTF8String(module_name); + if (encoded_module_name == NULL) { + if (PyErr_ExceptionMatches(PyExc_UnicodeEncodeError)) + PyErr_Format(PicklingError, + "can't pickle module identifier '%S' using " + "pickle protocol %i", encoded_module_name, + self->proto); + return -1; + } + encoded_module_size = PyBytes_GET_SIZE(encoded_module_name); + if (encoded_module_size < 0) { + Py_DECREF(encoded_module_name); + return -1; + } + + encoded_global_name = PyUnicode_AsUTF8String(global_name); + if (encoded_global_name == NULL) { + if (PyErr_ExceptionMatches(PyExc_UnicodeEncodeError)) + PyErr_Format(PicklingError, + "can't pickle global identifier '%S' using " + "pickle protocol %i", global_name, self->proto); + Py_DECREF(encoded_module_name); + return -1; + } + encoded_global_size = PyBytes_GET_SIZE(encoded_global_name); + if (encoded_global_size < 0) goto error; + + 
/* BINGLOBAL_COMMON */ + if (common_module_id >= 0 && common_module_id <= 0xff && + encoded_global_size <= 0xff) { + char module_id_byte = common_module_id, + global_size_byte = encoded_global_size; + + /* write the opcode */ + global_op = BINGLOBAL_COMMON; + if (_Pickler_Write(self, &global_op, 1) < 0) + goto error; + + /* write the module id (1 byte) */ + if (_Pickler_Write(self, &module_id_byte, 1) < 0) + goto error; + + /* write the size of the global (1 byte) */ + if (_Pickler_Write(self, &global_size_byte, 1) < 0) + goto error; + + /* write the global name */ + if (_Pickler_Write(self, PyBytes_AS_STRING(encoded_global_name), + encoded_global_size) < 0) + goto error; + } + /* BINGLOBAL */ + else if (encoded_module_size <= 0xff && encoded_global_size <= 0xff) { + char module_size_byte = encoded_module_size, + global_size_byte = encoded_global_size; + + /* write the opcode */ + global_op = BINGLOBAL; + if (_Pickler_Write(self, &global_op, 1) < 0) + goto error; + + /* write the size of the module (1 byte) */ + if (_Pickler_Write(self, &module_size_byte, 1) < 0) + goto error; + + /* write the module name */ + if (_Pickler_Write(self, PyBytes_AS_STRING(encoded_module_name), + encoded_module_size) < 0) + goto error; + + /* write the size of the global (1 byte) */ + if (_Pickler_Write(self, &global_size_byte, 1) < 0) + goto error; + + /* write the global name */ + if (_Pickler_Write(self, PyBytes_AS_STRING(encoded_global_name), + encoded_global_size) < 0) + goto error; + + } + /* BINGLOBAL_BIG */ + else { + char data[2]; + /* nearly useless checks */ + if (encoded_module_size > 0xffff) { + PyErr_Format(PyExc_OverflowError, "Unusually large module name."); + goto error; + } + else if (encoded_global_size > 0xffff) { + PyErr_Format(PyExc_OverflowError, "Unusually large global name."); + goto error; + } + + /* write the opcode */ + global_op = BINGLOBAL_BIG; + if (_Pickler_Write(self, &global_op, 1) < 0) + goto error; + + /* write the size of the module (2 bytes) */ + 
COPY_BYTE_NO_OFFSET(data, encoded_module_size, 0); + COPY_BYTE_NO_OFFSET(data, encoded_module_size, 1); + if (_Pickler_Write(self, data, 2) < 0) + goto error; + + /* write the module name */ + if (_Pickler_Write(self, PyBytes_AS_STRING(encoded_module_name), + encoded_module_size) < 0) + goto error; + + /* write the size of the global (2 bytes) */ + COPY_BYTE_NO_OFFSET(data, encoded_global_size, 0); + COPY_BYTE_NO_OFFSET(data, encoded_global_size, 1); + if (_Pickler_Write(self, data, 2) < 0) + goto error; + + /* write the global name */ + if (_Pickler_Write(self, PyBytes_AS_STRING(encoded_global_name), + encoded_global_size) < 0) + goto error; + } + + if (0) { + // only goto error after both encoded_global_name + // and encoded_module_name have been initialized +error: + return_code = -1; + } + Py_DECREF(encoded_module_name); + Py_DECREF(encoded_global_name); + return return_code; +} + static int save_global(PicklerObject *self, PyObject *obj, PyObject *name) { - static PyObject *name_str = NULL; + static PyObject *name_str = NULL, + *qualname_str = NULL; PyObject *global_name = NULL; PyObject *module_name = NULL; PyObject *module = NULL; PyObject *cls; int status = 0; - const char global_op = GLOBAL; - - if (name_str == NULL) { + if (self->proto < 4 && name_str == NULL) { name_str = PyUnicode_InternFromString("__name__"); if (name_str == NULL) goto error; } + else if (self->proto >= 4 && qualname_str == NULL) { + qualname_str = PyUnicode_InternFromString("__qualname__"); + if (qualname_str == NULL) + goto error; + } if (name) { global_name = name; Py_INCREF(global_name); } else { - global_name = PyObject_GetAttr(obj, name_str); + global_name = PyObject_GetAttr(obj, + self->proto >= 4 ? 
qualname_str : name_str); if (global_name == NULL) goto error; } @@ -2637,14 +3140,21 @@ obj, module_name); goto error; } - cls = PyObject_GetAttr(module, global_name); + if (self->proto < 4) { + cls = PyObject_GetAttr(module, global_name); + } + else { + cls = getattr_recurse(module, global_name); + } if (cls == NULL) { PyErr_Format(PicklingError, "Can't pickle %R: attribute lookup %S.%S failed", obj, module_name, global_name); goto error; } - if (cls != obj) { + // we ignore this step for classmethods because + // "dict.fromkeys is dict.fromkeys" evaluates to false + if (cls != obj && !isclassmethod(obj)) { Py_DECREF(cls); PyErr_Format(PicklingError, "Can't pickle %R: it's not the same object as %S.%S", @@ -2685,8 +3195,8 @@ if (code <= 0 || code > 0x7fffffffL) { if (!PyErr_Occurred()) PyErr_Format(PicklingError, - "Can't pickle %R: extension code %ld is out of range", - obj, code); + "Can't pickle %R: extension code %ld is out of range", + obj, code); goto error; } @@ -2698,16 +3208,14 @@ } else if (code <= 0xffff) { pdata[0] = EXT2; - pdata[1] = (unsigned char)(code & 0xff); - pdata[2] = (unsigned char)((code >> 8) & 0xff); + COPY_BYTE(pdata, code, 1); + COPY_BYTE(pdata, code, 2); n = 3; } else { pdata[0] = EXT4; - pdata[1] = (unsigned char)(code & 0xff); - pdata[2] = (unsigned char)((code >> 8) & 0xff); - pdata[3] = (unsigned char)((code >> 16) & 0xff); - pdata[4] = (unsigned char)((code >> 24) & 0xff); + COPY_BYTE(pdata, code, 1); COPY_BYTE(pdata, code, 2); + COPY_BYTE(pdata, code, 3); COPY_BYTE(pdata, code, 4); n = 5; } @@ -2718,23 +3226,9 @@ /* Generate a normal global opcode if we are using a pickle protocol <= 2, or if the object is not registered in the extension registry. */ - PyObject *encoded; - PyObject *(*unicode_encoder)(PyObject *); gen_global: - if (_Pickler_Write(self, &global_op, 1) < 0) - goto error; - - /* Since Python 3.0 now supports non-ASCII identifiers, we encode both - the module name and the global name using UTF-8. 
We do so only when - we are using the pickle protocol newer than version 3. This is to - ensure compatibility with older Unpickler running on Python 2.x. */ - if (self->proto >= 3) { - unicode_encoder = PyUnicode_AsUTF8String; - } - else { - unicode_encoder = PyUnicode_AsASCIIString; - } + /* For protocol < 3 and if the user didn't request against doing so, we convert module names to the old 2.x module names. */ @@ -2792,42 +3286,18 @@ goto error; } } - - /* Save the name of the module. */ - encoded = unicode_encoder(module_name); - if (encoded == NULL) { - if (PyErr_ExceptionMatches(PyExc_UnicodeEncodeError)) - PyErr_Format(PicklingError, - "can't pickle module identifier '%S' using " - "pickle protocol %i", module_name, self->proto); - goto error; - } - if (_Pickler_Write(self, PyBytes_AS_STRING(encoded), - PyBytes_GET_SIZE(encoded)) < 0) { - Py_DECREF(encoded); - goto error; - } - Py_DECREF(encoded); - if(_Pickler_Write(self, "\n", 1) < 0) - goto error; - - /* Save the name of the module. */ - encoded = unicode_encoder(global_name); - if (encoded == NULL) { - if (PyErr_ExceptionMatches(PyExc_UnicodeEncodeError)) - PyErr_Format(PicklingError, - "can't pickle global identifier '%S' using " - "pickle protocol %i", global_name, self->proto); - goto error; - } - if (_Pickler_Write(self, PyBytes_AS_STRING(encoded), - PyBytes_GET_SIZE(encoded)) < 0) { - Py_DECREF(encoded); - goto error; - } - Py_DECREF(encoded); - if(_Pickler_Write(self, "\n", 1) < 0) - goto error; + + if (self->proto < 4) { + //uses opcode GLOBAL + if (save_global_nonbinary(self, module_name, global_name) < 0) + goto error; + } + else if (self->proto >= 4) { + //uses one of the opcodes + //BINGLOBAL, BINGLOBAL_BIG, BINGLOBAL_COMMON + if (save_global_binary(self, module_name, global_name) < 0) + goto error; + } /* Memoize the object. 
*/ if (memo_put(self, obj) < 0) @@ -2846,6 +3316,183 @@ } static int +save_global_or_method(PicklerObject *self, PyObject *obj) +{ + PyObject *unbound, *obj_self = NULL, *tuple, *inner_tuple; + static PyObject *str_self = NULL, *binding_function = NULL, + *pickle_str = NULL; + int ret = -1; + + unbound = unbind(obj); + if (unbound == NULL) { + if (PyErr_ExceptionMatches(PyExc_TypeError)) { + PyErr_Clear(); + return save_global(self, obj, NULL); + } + return -1; + } + else if (self->proto < 4) { + PyErr_SetString(PicklingError, + "Can't pickle bound methods in pickle<4"); + Py_DECREF(unbound); + return -1; + } + else { + if (pickle_str == NULL) { + pickle_str = PyUnicode_InternFromString("pickle"); + if (pickle_str == NULL) { + Py_DECREF(unbound); + return -1; + } + } + if (binding_function == NULL) { + PyObject *pickle_module = PyImport_Import(pickle_str); + if (pickle_module == NULL) { + Py_DECREF(unbound); + return -1; + } + binding_function = PyObject_GetAttrString(pickle_module, + "_bind_method"); + if (binding_function == NULL) { + Py_DECREF(unbound); + return -1; + } + } + if (str_self == NULL) { + str_self = PyUnicode_InternFromString("__self__"); + if (str_self == NULL) { + Py_DECREF(unbound); + return -1; + } + } + + obj_self = PyObject_GetAttr(obj, str_self); + if (obj_self == NULL) { + Py_DECREF(unbound); + return -1; + } + + inner_tuple = PyTuple_Pack(2, obj_self, unbound); + if (!inner_tuple) goto done; + + tuple = PyTuple_Pack(2, binding_function, inner_tuple); + if (!tuple) goto done; + + ret = save_reduce(self, tuple, obj); + Py_DECREF(tuple); +done: + Py_DECREF(obj_self); + Py_DECREF(unbound); + return ret; + } +} + +static int +save_set(PicklerObject *self, PyObject *obj) +{ + static const char empty_set_opcode = EMPTY_SET, + mark_opcode = MARK, + update_set_opcode = UPDATE_SET; + PyObject *iter, *item; + Py_ssize_t i=0; + + if (_Pickler_Write(self, &empty_set_opcode, 1) < 1) + return -1; + + if (memo_put(self, obj) < 0) + return -1; + + if 
(PySet_GET_SIZE(obj) == 0) + return 0; + + iter = PyObject_GetIter(obj); + if (iter == NULL) return -1; + + for (; item = PyIter_Next(iter); ++i) { + if (i % BATCHSIZE == 0) { + if (_Pickler_Write(self, &mark_opcode, 1) < 1) + goto error_in_loop; + } + + if (save(self, item, 0) < 0) + goto error_in_loop; + + if (i % BATCHSIZE == BATCHSIZE-1) { + if (_Pickler_Write(self, &update_set_opcode, 1) < 1) + goto error_in_loop; + } + + Py_DECREF(item); + } + + Py_DECREF(iter); + + if (PyErr_Occurred()) return -1; + + if (PySet_GET_SIZE(obj) % BATCHSIZE != 0) + if (_Pickler_Write(self, &update_set_opcode, 1) < 1) + return -1; + + return 0; + +error_in_loop: + Py_DECREF(item); + Py_DECREF(iter); + return -1; +} + +static int +save_frozenset(PicklerObject *self, PyObject *obj) +{ + static const char empty_frozenset_opcode = EMPTY_FROZENSET, + mark_opcode = MARK, + frozenset_opcode = FROZENSET, + pop_mark_opcode = POP_MARK; + PyObject *iter, *item; + + if (PySet_GET_SIZE(obj) == 0) { + if (_Pickler_Write(self, &empty_frozenset_opcode, 1) < 1) + return -1; + return 0; + } + + if (_Pickler_Write(self, &mark_opcode, 1) < 1) + return -1; + + iter = PyObject_GetIter(obj); + if (iter == NULL) return -1; + + while ( item = PyIter_Next(iter) ) { + if (save(self, item, 0) < 0) { + Py_DECREF(item); + Py_DECREF(iter); + return -1; + } + Py_DECREF(item); + } + + Py_DECREF(iter); + + if (PyErr_Occurred()) return -1; + + //an explanation of this is in pickle.py:save_frozenset + if (PyMemoTable_Get(self->memo, obj) != NULL) { + //self-referential frozenset + if (_Pickler_Write(self, &pop_mark_opcode, 1) < 1) + return -1; + memo_get(self, obj); + } + else { + //regular frozenset + if (_Pickler_Write(self, &frozenset_opcode, 1) < 1) + return -1; + if (memo_put(self, obj) < 0) + return -1; + } + return 0; +} + +static int save_ellipsis(PicklerObject *self, PyObject *obj) { PyObject *str = PyUnicode_FromString("Ellipsis"); @@ -2927,7 +3574,7 @@ get_class(PyObject *obj) { PyObject *cls; - static 
PyObject *str_class; + static PyObject *str_class = NULL; if (str_class == NULL) { str_class = PyUnicode_InternFromString("__class__"); @@ -2958,11 +3605,13 @@ PyObject *dictitems = Py_None; Py_ssize_t size; - int use_newobj = self->proto >= 2; + int use_newobj = self->proto >= 2, + use_newobj_kw = self->proto >= 4; const char reduce_op = REDUCE; const char build_op = BUILD; const char newobj_op = NEWOBJ; + const char newobj_kw_op = NEWOBJ_KW; size = PyTuple_Size(args); if (size < 2 || size > 5) { @@ -3010,13 +3659,16 @@ /* Protocol 2 special case: if callable's name is __newobj__, use NEWOBJ. */ if (use_newobj) { - static PyObject *newobj_str = NULL, *name_str = NULL; + static PyObject *newobj_str = NULL, *name_str = NULL, + *newobj_kw_str = NULL; PyObject *name; if (newobj_str == NULL) { newobj_str = PyUnicode_InternFromString("__newobj__"); name_str = PyUnicode_InternFromString("__name__"); - if (newobj_str == NULL || name_str == NULL) + newobj_kw_str = PyUnicode_InternFromString("__newobj_kw__"); + if (newobj_str == NULL || name_str == NULL || + newobj_kw_str == NULL) return -1; } @@ -3027,10 +3679,13 @@ else return -1; use_newobj = 0; + use_newobj_kw = 0; } else { use_newobj = PyUnicode_Check(name) && PyUnicode_Compare(name, newobj_str) == 0; + use_newobj_kw = PyUnicode_Check(name) && + PyUnicode_Compare(name, newobj_kw_str) == 0; Py_DECREF(name); } } @@ -3107,6 +3762,67 @@ if (_Pickler_Write(self, &newobj_op, 1) < 0) return -1; } + else if (use_newobj_kw) { + PyObject *cls, *kargs, *kwargs; + + /* Using NEWOBJ_KW (like, NEWOBJ but with keyword args) */ + if (Py_SIZE(argtup) != 3) { + PyErr_Format(PicklingError, + "When pickling a reduced object, " + "__newobj_kw__ was called with %zd arguments " + "(should have been 3: class name, tuple of kargs, " + "dictionary of kwargs)", Py_SIZE(argtup)); + return -1; + } + cls = PyTuple_GET_ITEM(argtup, 0); + kargs = PyTuple_GET_ITEM(argtup, 1); + kwargs = PyTuple_GET_ITEM(argtup, 2); + + //see NEWOBJ case above + if 
(obj != NULL) { + PyObject *obj_class; + int p; + + obj_class = get_class(obj); + p = obj_class != cls; + Py_DECREF(obj_class); + if (p) { + PyErr_SetString(PicklingError, + "args[0] from __newobj_kw__ args has the wrong class"); + return -1; + } + } + + if (!PyType_Check(cls)) { + PyErr_Format(PicklingError, + "When pickling a reduced object, " + "__newobj_kw__ was called with a first argument of " + "type %.200s, when it should have been a class type", + cls->ob_type->tp_name); + return -1; + } + if (!PyTuple_Check(kargs)) { + PyErr_Format(PicklingError, + "When pickling a reduced object, " + "__newobj_kw__ was called with a second argument of " + "type %.200s, when it should have been a tuple", + kargs->ob_type->tp_name); + return -1; + } + if (!PyDict_Check(kwargs)) { + PyErr_Format(PicklingError, + "When pickling a reduced object, " + "__newobj_kw__ was called with a third argument of " + "type %.200s, when it should have been a dict", + kwargs->ob_type->tp_name); + return -1; + } + if (save(self, cls, 0) < 0 || + save(self, kargs, 0) < 0 || + save(self, kwargs, 0) < 0 || + _Pickler_Write(self, &newobj_kw_op, 1) < 1) + return -1; + } else { /* Not using NEWOBJ. */ if (save(self, callable, 0) < 0 || save(self, argtup, 0) < 0 || @@ -3236,10 +3952,24 @@ goto done; } } - else if (type == &PyCFunction_Type) { + else if (type == &PyCFunction_Type || type == &PyMethod_Type || + type == &_PyMethodWrapper_Type) { + status = save_global_or_method(self, obj); + goto done; + } + else if (type == &PyWrapperDescr_Type || type == &PyMethodDescr_Type || + type == &PyClassMethodDescr_Type) { status = save_global(self, obj, NULL); goto done; } + else if (type == &PySet_Type && self->proto >= 4) { + status = save_set(self, obj); + goto done; + } + else if (type == &PyFrozenSet_Type && self->proto >= 4) { + status = save_frozenset(self, obj); + goto done; + } /* XXX: This part needs some unit tests. 
*/ @@ -3349,7 +4079,7 @@ static int dump(PicklerObject *self, PyObject *obj) { - const char stop_op = STOP; + static const char stop_op = STOP, bail_out_op = BAIL_OUT; if (self->proto >= 2) { char header[2]; @@ -3361,9 +4091,17 @@ return -1; } - if (save(self, obj, 0) < 0 || - _Pickler_Write(self, &stop_op, 1) < 0) - return -1; + if (save(self, obj, 0) < 0 || _Pickler_Write(self, &stop_op, 1) < 0) { + //error pickling, report BAIL_OUT opcode + if (self->proto >= 4) { + PyObject *ptype, *pvalue, *ptraceback; + PyErr_Fetch(&ptype, &pvalue, &ptraceback); + _Pickler_Write(self, &bail_out_op, 1); + PyErr_Restore(ptype, pvalue, ptraceback); + } + + return -1; + } return 0; } @@ -3409,8 +4147,16 @@ if (_Pickler_ClearBuffer(self) < 0) return NULL; - if (dump(self, obj) < 0) + if (dump(self, obj) < 0) { + if (self->proto >= 4) { + //flush to file anyway, for BAIL_OUT + PyObject *ptype, *pvalue, *ptraceback; + PyErr_Fetch(&ptype, &pvalue, &ptraceback); + _Pickler_FlushToFile(self); + PyErr_Restore(ptype, pvalue, ptraceback); + } return NULL; + } if (_Pickler_FlushToFile(self) < 0) return NULL; @@ -3500,15 +4246,18 @@ static int Pickler_init(PicklerObject *self, PyObject *args, PyObject *kwds) { - static char *kwlist[] = {"file", "protocol", "fix_imports", 0}; + static char *kwlist[] = { "file", "protocol", "fix_imports", + "common_modules", 0}; PyObject *file; PyObject *proto_obj = NULL; PyObject *fix_imports = Py_True; + PyObject *common_modules = v4_common_modules; _Py_IDENTIFIER(persistent_id); _Py_IDENTIFIER(dispatch_table); - if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|OO:Pickler", - kwlist, &file, &proto_obj, &fix_imports)) + if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|OOO:Pickler", + kwlist, &file, &proto_obj, &fix_imports, + &common_modules)) return -1; /* In case of multiple __init__() calls, clear previous content. 
*/ @@ -3554,6 +4303,8 @@ if (self->dispatch_table == NULL) return -1; } + + self->common_modules = common_modules; return 0; } @@ -3993,12 +4744,35 @@ unsigned char *s = (unsigned char *)bytes; size_t x = 0; - assert(size == 4); - - x = (size_t) s[0]; - x |= (size_t) s[1] << 8; - x |= (size_t) s[2] << 16; - x |= (size_t) s[3] << 24; + switch (size) { + case 1: + x = (size_t) s[0]; + break; + case 2: + x = (size_t) s[0]; + x |= (size_t) s[1] << 8; + break; + case 4: + x = (size_t) s[0]; + x |= (size_t) s[1] << 8; + x |= (size_t) s[2] << 16; + x |= (size_t) s[3] << 24; + break; +# if SIZEOF_SIZE_T > 4 + case 8: + x = (size_t) s[0]; + x |= (size_t) s[1] << 8; + x |= (size_t) s[2] << 16; + x |= (size_t) s[3] << 24; + x |= (size_t)(s[4]) << 32; + x |= (size_t)(s[5]) << 40; + x |= (size_t)(s[6]) << 48; + x |= (size_t)(s[7]) << 56; + break; +# endif + default: + return -1; + } if (x > PY_SSIZE_T_MAX) return -1; @@ -4234,54 +5008,31 @@ } static int -load_binbytes(UnpicklerObject *self) +load_binbytes_arbitrary(UnpicklerObject *self, int n) { PyObject *bytes; - Py_ssize_t x; + Py_ssize_t size; char *s; - if (_Unpickler_Read(self, &s, 4) < 0) - return -1; - - x = calc_binsize(s, 4); - if (x < 0) { + if (_Unpickler_Read(self, &s, n) < n) + return -1; + + size = calc_binsize(s, n); + if (size < 0) { PyErr_Format(PyExc_OverflowError, "BINBYTES exceeds system's maximum size of %zd bytes", PY_SSIZE_T_MAX); return -1; } - if (_Unpickler_Read(self, &s, x) < 0) - return -1; - bytes = PyBytes_FromStringAndSize(s, x); + if (_Unpickler_Read(self, &s, size) < size) + return -1; + bytes = PyBytes_FromStringAndSize(s, size); if (bytes == NULL) return -1; PDATA_PUSH(self->stack, bytes, -1); - return 0; -} - -static int -load_short_binbytes(UnpicklerObject *self) -{ - PyObject *bytes; - Py_ssize_t x; - char *s; - - if (_Unpickler_Read(self, &s, 1) < 0) - return -1; - - x = (unsigned char)s[0]; - - if (_Unpickler_Read(self, &s, x) < 0) - return -1; - - bytes = PyBytes_FromStringAndSize(s, 
x); - if (bytes == NULL) - return -1; - - PDATA_PUSH(self->stack, bytes, -1); - return 0; + return _Unpickler_Memoize(self, bytes); } static int @@ -4357,17 +5108,20 @@ return 0; } + +//load a str whose size is represented in n bytes static int -load_binunicode(UnpicklerObject *self) +load_binunicode_arbitrary(UnpicklerObject *self, int n) { PyObject *str; Py_ssize_t size; char *s; - if (_Unpickler_Read(self, &s, 4) < 0) - return -1; - - size = calc_binsize(s, 4); + assert(n >= 1 && n <= 8); + if (_Unpickler_Read(self, &s, n) < 0) + return -1; + + size = calc_binsize(s, n); if (size < 0) { PyErr_Format(PyExc_OverflowError, "BINUNICODE exceeds system's maximum size of %zd bytes", @@ -4384,7 +5138,7 @@ return -1; PDATA_PUSH(self->stack, str, -1); - return 0; + return _Unpickler_Memoize(self, str); } static int @@ -4399,14 +5153,16 @@ tuple = Pdata_poptuple(self->stack, i); if (tuple == NULL) return -1; + PDATA_PUSH(self->stack, tuple, -1); - return 0; + return _Unpickler_Memoize(self, tuple); } static int load_counted_tuple(UnpicklerObject *self, int len) { PyObject *tuple; + int memoize = len != 0; tuple = PyTuple_New(len); if (tuple == NULL) @@ -4420,8 +5176,10 @@ return -1; PyTuple_SET_ITEM(tuple, len, item); } + PDATA_PUSH(self->stack, tuple, -1); - return 0; + if (memoize) return _Unpickler_Memoize(self, tuple); + else return 0; } static int @@ -4432,7 +5190,7 @@ if ((list = PyList_New(0)) == NULL) return -1; PDATA_PUSH(self->stack, list, -1); - return 0; + return _Unpickler_Memoize(self, list); } static int @@ -4443,7 +5201,7 @@ if ((dict = PyDict_New()) == NULL) return -1; PDATA_PUSH(self->stack, dict, -1); - return 0; + return _Unpickler_Memoize(self, dict); } static int @@ -4459,7 +5217,7 @@ if (list == NULL) return -1; PDATA_PUSH(self->stack, list, -1); - return 0; + return _Unpickler_Memoize(self, list); } static int @@ -4485,7 +5243,7 @@ } Pdata_clear(self->stack, i); PDATA_PUSH(self->stack, dict, -1); - return 0; + return _Unpickler_Memoize(self, dict); 
} static PyObject * @@ -4630,7 +5388,7 @@ Py_DECREF(args); Py_DECREF(clsraw); PDATA_PUSH(self->stack, obj, -1); - return 0; + if (obj != Py_None) return _Unpickler_Memoize(self, obj); error: Py_XDECREF(args); @@ -4639,6 +5397,279 @@ } static int +load_newobj_kw(UnpicklerObject *self) +{ + Py_ssize_t stack_size; + PyObject *clsraw = NULL, *kargs = NULL, *kwargs = NULL, *obj; + PyTypeObject *cls; + + stack_size = Py_SIZE(self->stack); + if (stack_size < 3) { + PyErr_Format(UnpicklingError, + "NEWOBJ_KW applied with a stack size of %zd (expected >= 3)", + stack_size); + return -1; + } + + PDATA_POP(self->stack, kwargs); + if (kwargs == NULL) goto error; + if (!PyDict_Check(kwargs)) { + PyErr_SetString(UnpicklingError, "NEWOBJ_KW applied with a third " + "parameter which is not a dict"); + goto error; + } + + PDATA_POP(self->stack, kargs); + if (kargs == NULL) goto error; + if (!PyTuple_Check(kargs)) { + PyErr_SetString(UnpicklingError, "NEWOBJ_KW applied with a second " + "parameter which is not a tuple"); + goto error; + } + + PDATA_POP(self->stack, clsraw); + if (!clsraw) goto error; + cls = (PyTypeObject *)clsraw; + if (!PyType_Check(cls)) { + PyErr_SetString(UnpicklingError, "NEWOBJ_KW class argument " + "isn't a type object"); + goto error; + } + else if (cls->tp_new == NULL) { + PyErr_SetString(UnpicklingError, "NEWOBJ_KW class argument " + "has NULL tp_new"); + return -1; + } + + obj = cls->tp_new(cls, kargs, kwargs); + if (obj == NULL) + return -1; + + PDATA_PUSH(self->stack, obj, -1); + + if (obj != Py_None) + return _Unpickler_Memoize(self, obj); + else return 0; + +error: + Py_XDECREF(clsraw); + Py_XDECREF(kargs); + Py_XDECREF(kwargs); + return -1; +} + +static int +load_binglobal(UnpicklerObject *self) +{ + PyObject *module_name, *global_name, *global = NULL; + char *s; + Py_ssize_t encoded_size; + + /* read module's size (1 byte) */ + if (_Unpickler_Read(self, &s, 1) < 1) + return -1; + encoded_size = (unsigned char)s[0]; + + /* read module name */ + if 
(_Unpickler_Read(self, &s, encoded_size) < encoded_size) + return -1; + module_name = PyUnicode_DecodeUTF8(s, encoded_size, "strict"); + if (!module_name) + return -1; + + /* read global's size */ + if (_Unpickler_Read(self, &s, 1) < 1) + { + Py_DECREF(module_name); + return -1; + } + encoded_size = (unsigned char)s[0]; + + /* read global name */ + if (_Unpickler_Read(self, &s, encoded_size) < encoded_size) { + Py_DECREF(module_name); + return -1; + } + global_name = PyUnicode_DecodeUTF8(s, encoded_size, "strict"); + + if (global_name) { + global = find_class(self, module_name, global_name); + Py_DECREF(global_name); + } + + Py_DECREF(module_name); + + if (global) { + PDATA_PUSH(self->stack, global, -1); + return _Unpickler_Memoize(self, global); + } + return -1; +} + +static int +load_binglobal_big(UnpicklerObject *self) +{ + /* like load_binglobal, s/1/2/g */ + PyObject *module_name, *global_name, *global = NULL; + char *s; + Py_ssize_t encoded_size; + + /* read module's size (2 bytes) */ + if (_Unpickler_Read(self, &s, 2) < 2) + return -1; + encoded_size = (Py_ssize_t)(s[0]) | ((Py_ssize_t)(s[1])<<8); + + /* read module name */ + if (_Unpickler_Read(self, &s, encoded_size) < encoded_size) + return -1; + module_name = PyUnicode_DecodeUTF8(s, encoded_size, "strict"); + if (!module_name) + return -1; + + /* read global's size */ + if (_Unpickler_Read(self, &s, 2) < 2) + return -1; + encoded_size = (Py_ssize_t)(s[0]) | ((Py_ssize_t)(s[1])<<8); + + /* read global name */ + if (_Unpickler_Read(self, &s, encoded_size) < encoded_size) { + Py_DECREF(module_name); + return -1; + } + global_name = PyUnicode_DecodeUTF8(s, encoded_size, "strict"); + + if (global_name) { + global = find_class(self, module_name, global_name); + Py_DECREF(global_name); + } + + Py_DECREF(module_name); + + if (global) { + PDATA_PUSH(self->stack, global, -1); + return _Unpickler_Memoize(self, global); + } + return -1; +} + +static int +load_binglobal_common(UnpicklerObject *self) +{ + PyObject 
*module_name, *global_name, *global = NULL; + char *s; + Py_ssize_t num; + + /* read common module id */ + if (_Unpickler_Read(self, &s, 1) < 1) + return -1; + num = (unsigned char)s[0]; + if (num >= PyList_GET_SIZE(self->common_modules)) { + PyErr_Format(UnpicklingError, + "Invalid common module id %d", num); + return -1; + } + assert(num >= 0); + module_name = PyList_GET_ITEM(self->common_modules, num); + + /* read global's size */ + if (_Unpickler_Read(self, &s, 1) < 1) + return -1; + num = (unsigned char)s[0]; + + /* read global name */ + if (_Unpickler_Read(self, &s, num) < num) + return -1; + global_name = PyUnicode_DecodeUTF8(s, num, "strict"); + + if (global_name) { + global = find_class(self, module_name, global_name); + Py_DECREF(global_name); + } + + if (global) { + PDATA_PUSH(self->stack, global, -1); + return _Unpickler_Memoize(self, global); + } + return -1; +} + +static int +load_empty_set(UnpicklerObject *self) +{ + PyObject *set = PySet_New(NULL); + if (!set) return -1; + + PDATA_PUSH(self->stack, set, -1); + return _Unpickler_Memoize(self, set); +} + +static int +load_empty_frozenset(UnpicklerObject *self) +{ + PyObject *fset = PyFrozenSet_New(NULL); + if (!fset) return -1; + + PDATA_PUSH(self->stack, fset, -1); + return 0; +} + +static int +load_update_set(UnpicklerObject *self) +{ + PyObject *set; + Py_ssize_t mark, stack_size, i; + + mark = marker(self); + if (mark < 0) return -1; + + if (mark == 0) { + PyErr_SetString(UnpicklingError, + "UPDATE_SET: found no set-like type before MARK"); + return -1; + } + + set = self->stack->data[mark - 1]; + stack_size = Py_SIZE(self->stack); + + for (i = mark; i < stack_size; ++i) { + if (PySet_Add(set, self->stack->data[i]) < 0) + return -1; + } + + Pdata_clear(self->stack, mark); + return 0; +} + +static int +load_frozenset(UnpicklerObject *self) +{ + PyObject *fset; + Py_ssize_t mark, stack_size, i; + + mark = marker(self); + if (mark < 0) return -1; + + fset = PyFrozenSet_New(NULL); + if (fset == NULL) 
return -1; + + stack_size = Py_SIZE(self->stack); + for (i = mark; i < stack_size; ++i) { + if (PySet_Add(fset, self->stack->data[i]) < 0) { + Py_DECREF(fset); + return -1; + } + } + + Pdata_clear(self->stack, mark); + if (Pdata_push(self->stack, fset) < 0) { + Py_DECREF(fset); + return -1; + } + + return _Unpickler_Memoize(self, fset); +} + +static int load_global(UnpicklerObject *self) { PyObject *global = NULL; @@ -4671,7 +5702,9 @@ if (global == NULL) return -1; PDATA_PUSH(self->stack, global, -1); - return 0; + //XXX should remove memoization from here and python. this is a deprecated + //opcode + return _Unpickler_Memoize(self, global); } static int @@ -4834,10 +5867,10 @@ value = _Unpickler_MemoGet(self, idx); if (value == NULL) { - PyObject *key = PyLong_FromSsize_t(idx); - if (!PyErr_Occurred()) - PyErr_SetObject(PyExc_KeyError, key); - Py_DECREF(key); + if (!PyErr_Occurred()) { + PyErr_Format(UnpicklingError, + "Couldn't get memoized index %zd", idx); + } return -1; } @@ -5297,6 +6330,12 @@ return -1; PDATA_POP(self->stack, callable); if (callable) { + if (!PyCallable_Check(callable)) { + Py_DECREF(callable); + PyErr_SetString(UnpicklingError, + "REDUCE applied on non-callable"); + return -1; + } obj = PyObject_CallObject(callable, argtup); Py_DECREF(callable); } @@ -5306,6 +6345,8 @@ return -1; PDATA_PUSH(self->stack, obj, -1); + + if (obj != Py_None) return _Unpickler_Memoize(self, obj); return 0; } @@ -5331,6 +6372,14 @@ return -1; } +static int +load_bail_out(UnpicklerObject *self) +{ + PyErr_SetString(UnpicklingError, "This stream does not contain " + "correctly pickled data"); + return -1; +} + static PyObject * load(UnpicklerObject *self) { @@ -5350,7 +6399,7 @@ case opcode: if (load_func(self, (arg)) < 0) break; continue; while (1) { - if (_Unpickler_Read(self, &s, 1) < 0) + if (_Unpickler_Read(self, &s, 1) < 1) break; switch ((enum opcode)s[0]) { @@ -5364,13 +6413,18 @@ OP_ARG(LONG4, load_counted_long, 4) OP(FLOAT, load_float) OP(BINFLOAT, 
load_binfloat) - OP(BINBYTES, load_binbytes) - OP(SHORT_BINBYTES, load_short_binbytes) + OP_ARG(SHORT_BINBYTES, load_binbytes_arbitrary, 1) + OP_ARG(BINBYTES16, load_binbytes_arbitrary, 2) + OP_ARG(BINBYTES, load_binbytes_arbitrary, 4) + OP_ARG(BINBYTES64, load_binbytes_arbitrary, 8) OP(BINSTRING, load_binstring) OP(SHORT_BINSTRING, load_short_binstring) OP(STRING, load_string) OP(UNICODE, load_unicode) - OP(BINUNICODE, load_binunicode) + OP_ARG(SHORT_BINUNICODE, load_binunicode_arbitrary, 1) + OP_ARG(BINUNICODE16, load_binunicode_arbitrary, 2) + OP_ARG(BINUNICODE, load_binunicode_arbitrary, 4) + OP_ARG(BINUNICODE64, load_binunicode_arbitrary, 8) OP_ARG(EMPTY_TUPLE, load_counted_tuple, 0) OP_ARG(TUPLE1, load_counted_tuple, 1) OP_ARG(TUPLE2, load_counted_tuple, 2) @@ -5383,7 +6437,15 @@ OP(OBJ, load_obj) OP(INST, load_inst) OP(NEWOBJ, load_newobj) + OP(NEWOBJ_KW, load_newobj_kw) OP(GLOBAL, load_global) + OP(BINGLOBAL, load_binglobal) + OP(BINGLOBAL_BIG, load_binglobal_big) + OP(BINGLOBAL_COMMON, load_binglobal_common) + OP(EMPTY_SET, load_empty_set) + OP(EMPTY_FROZENSET, load_empty_frozenset) + OP(UPDATE_SET, load_update_set) + OP(FROZENSET, load_frozenset) OP(APPEND, load_append) OP(APPENDS, load_appends) OP(BUILD, load_build) @@ -5408,6 +6470,7 @@ OP_ARG(EXT4, load_extension, 4) OP_ARG(NEWTRUE, load_bool, Py_True) OP_ARG(NEWFALSE, load_bool, Py_False) + OP(BAIL_OUT, load_bail_out) case STOP: break; @@ -5553,11 +6616,15 @@ module = PyImport_Import(module_name); if (module == NULL) return NULL; + if (self->proto < 4) global = PyObject_GetAttr(module, global_name); + else global = getattr_recurse(module, global_name); + Py_DECREF(module); + } + else if (self->proto < 4) { global = PyObject_GetAttr(module, global_name); - Py_DECREF(module); } else { - global = PyObject_GetAttr(module, global_name); + global = getattr_recurse(module, global_name); } return global; } @@ -5659,9 +6726,11 @@ static int Unpickler_init(UnpicklerObject *self, PyObject *args, PyObject *kwds) 
{ - static char *kwlist[] = {"file", "fix_imports", "encoding", "errors", 0}; + static char *kwlist[] = {"file", "fix_imports", "encoding", "errors", + "common_modules", 0}; PyObject *file; PyObject *fix_imports = Py_True; + PyObject *common_modules = v4_common_modules; char *encoding = NULL; char *errors = NULL; _Py_IDENTIFIER(persistent_load); @@ -5681,8 +6750,9 @@ extra careful in the other Unpickler methods, since a subclass could forget to call Unpickler.__init__() thus breaking our internal invariants. */ - if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|Oss:Unpickler", kwlist, - &file, &fix_imports, &encoding, &errors)) + if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|OssO:Unpickler", kwlist, + &file, &fix_imports, &encoding, &errors, + &common_modules)) return -1; /* In case of multiple __init__() calls, clear previous content. */ @@ -5714,12 +6784,14 @@ return -1; self->memo_size = 32; + self->memo_real_size = 0; self->memo = _Unpickler_NewMemo(self->memo_size); if (self->memo == NULL) return -1; self->arg = NULL; self->proto = 0; + self->common_modules = common_modules; return 0; } @@ -5750,6 +6822,7 @@ { _Unpickler_MemoCleanup(self->unpickler); self->unpickler->memo = _Unpickler_NewMemo(self->unpickler->memo_size); + self->unpickler->memo_real_size = 0; if (self->unpickler->memo == NULL) return NULL; Py_RETURN_NONE; @@ -5930,6 +7003,7 @@ Py_XINCREF(unpickler->memo[i]); new_memo[i] = unpickler->memo[i]; } + self->memo_real_size = unpickler->memo_real_size; } else if (PyDict_Check(obj)) { Py_ssize_t i = 0; @@ -5940,6 +7014,7 @@ if (new_memo == NULL) return -1; + self->memo_real_size = 0; while (PyDict_Next(obj, &i, &key, &value)) { Py_ssize_t idx; if (!PyLong_Check(key)) { @@ -5952,6 +7027,7 @@ goto error; if (_Unpickler_MemoPut(self, idx, value) < 0) goto error; + if (idx > self->memo_real_size) self->memo_real_size = idx; } } else { @@ -5973,6 +7049,7 @@ while (--i >= 0) { Py_XDECREF(new_memo[i]); } + //XXX leak when new_memo_size=0? 
PyMem_FREE(new_memo); } return -1; @@ -6342,6 +7419,10 @@ { PyObject *copyreg = NULL; PyObject *compat_pickle = NULL; + // Must be consistent with pickle.V4_COMMON_MODULES + static const char *v4_common_modules_static[] = { + "__main__", "builtins", "collections", "pickle" + }; /* XXX: We should ensure that the types of the dictionaries imported are exactly PyDict objects. Otherwise, it is possible to crash the pickle @@ -6423,6 +7504,21 @@ */ PyObject_GC_UnTrack(two_tuple); + if (v4_common_modules == NULL) { + int count = sizeof(v4_common_modules_static) / + sizeof(*v4_common_modules_static), i; + v4_common_modules = PyList_New(count); + + for (i = 0; i < count; ++i) { + //TODO PyList_SET_ITEM? + if (PyList_SetItem(v4_common_modules, i, + PyUnicode_InternFromString( + v4_common_modules_static[i])) < 0) { + goto error; + } + } + } + return 0; error: @@ -6438,6 +7534,7 @@ Py_CLEAR(import_mapping_3to2); Py_CLEAR(empty_tuple); Py_CLEAR(two_tuple); + Py_CLEAR(v4_common_modules); return -1; } diff -r 4b3238923b01 -r d0c3a8d4947a Objects/setobject.c --- a/Objects/setobject.c Fri May 10 19:57:44 2013 -0700 +++ b/Objects/setobject.c Sat May 11 03:03:53 2013 +0300 @@ -1991,31 +1991,6 @@ \n\ If the element is not a member, do nothing."); -static PyObject * -set_reduce(PySetObject *so) -{ - PyObject *keys=NULL, *args=NULL, *result=NULL, *dict=NULL; - _Py_IDENTIFIER(__dict__); - - keys = PySequence_List((PyObject *)so); - if (keys == NULL) - goto done; - args = PyTuple_Pack(1, keys); - if (args == NULL) - goto done; - dict = _PyObject_GetAttrId((PyObject *)so, &PyId___dict__); - if (dict == NULL) { - PyErr_Clear(); - dict = Py_None; - Py_INCREF(dict); - } - result = PyTuple_Pack(3, Py_TYPE(so), args, dict); -done: - Py_XDECREF(args); - Py_XDECREF(keys); - Py_XDECREF(dict); - return result; -} static PyObject * set_sizeof(PySetObject *so) @@ -2047,6 +2022,124 @@ return set_update_internal(self, iterable); } +//returns either a list or a tuple of the form (list, dict) +//list is 
the list of elements currently in the set and dict is the value +//of so.__dict__, if there is one +static PyObject * +set_getstate(PySetObject *so) +{ + PyObject *keys, *dict, *state = NULL; + _Py_IDENTIFIER(__dict__); + + keys = PySequence_List((PyObject *)so); + if (keys == NULL) return NULL; + + dict = _PyObject_GetAttrId((PyObject *)so, &PyId___dict__); + if (dict == NULL || PyDict_Size(dict) == 0) { + //no __dict__, the state is a list + PyErr_Clear(); + state = keys; + Py_XDECREF(dict); + } + else { + //got a __dict__, the state is a 2-tuple + state = PyTuple_Pack(2, keys, dict); + Py_DECREF(keys); + Py_DECREF(dict); + } + return state; +} + +static PyObject * +set_reduce(PySetObject *so) +{ + PyObject *empty_tuple = PyTuple_New(0); + PyObject *state; + PyObject *ret; + + empty_tuple = PyTuple_New(0); + if (!empty_tuple) return NULL; + + state = set_getstate(so); + if (!state) return NULL; + + ret = PyTuple_Pack(3, Py_TYPE(so), empty_tuple, state); + Py_DECREF(empty_tuple); + Py_DECREF(state); + + return ret; +} + +PyDoc_STRVAR(setstate_doc, "Update a set from an unpickled state"); +static PyObject * +set_setstate(PySetObject *so, PyObject *state) +{ + PyObject *keys, *dict = NULL; + PyObject *it, *key; + int is_list; + _Py_IDENTIFIER(__dict__); + + is_list = PyList_Check(state); + if (!is_list && !PyTuple_Check(state)) { + PyErr_Format(PyExc_TypeError, + "set.__setstate__ expected a tuple or list, got a '%.200s'", + state->ob_type->tp_name); + return NULL; + } + + if (is_list) keys = state; + else { + if (!PyArg_UnpackTuple(state, + "set.__setstate__'s tuple", 2, 2, &keys, &dict)) + return NULL; + + if (!PyList_Check(keys)) { + PyErr_Format(PyExc_TypeError, + "set.__setstate__ expected the tuple's first parameter " + "to be a list, got a '%.200s'", + keys->ob_type->tp_name); + return NULL; + } + + if (!PyDict_Check(dict)) { + PyErr_Format(PyExc_TypeError, + "set.__setstate__ expected the tuple's second parameter " + "to be a dict, got a '%.200s'", + 
dict->ob_type->tp_name); + return NULL; + } + + if (!_PyObject_HasAttrId((PyObject *)so, &PyId___dict__)) { + PyErr_SetString(PyExc_AttributeError, + "set.__setstate__ was provided with a dict " + "when the underlying object does not have a __dict__ " + "attribute."); + return NULL; + } + } + + set_clear_internal(so); + + it = PyObject_GetIter(keys); + if (it == NULL) return NULL; + + while ((key = PyIter_Next(it)) != NULL) { + if (set_add_key(so, key) == -1) { + Py_DECREF(key); + Py_DECREF(it); + return NULL; + } + Py_DECREF(key); + } + Py_DECREF(it); + if (PyErr_Occurred()) return NULL; + + if (dict && _PyObject_SetAttrId((PyObject *)so, &PyId___dict__, dict) < 0) + return NULL; + + Py_RETURN_NONE; +} + static PySequenceMethods set_as_sequence = { set_len, /* sq_length */ 0, /* sq_concat */ @@ -2094,6 +2187,10 @@ issuperset_doc}, {"pop", (PyCFunction)set_pop, METH_NOARGS, pop_doc}, + {"__getstate__", (PyCFunction)set_getstate, METH_NOARGS, + reduce_doc}, + {"__setstate__", (PyCFunction)set_setstate, METH_O, + setstate_doc}, {"__reduce__", (PyCFunction)set_reduce, METH_NOARGS, reduce_doc}, {"remove", (PyCFunction)set_remove, METH_O, @@ -2199,6 +2296,31 @@ /* frozenset object ********************************************************/ +static PyObject * +frozenset_reduce(PySetObject *so) +{ + PyObject *keys=NULL, *args=NULL, *result=NULL, *dict=NULL; + _Py_IDENTIFIER(__dict__); + + keys = PySequence_List((PyObject *)so); + if (keys == NULL) + goto done; + args = PyTuple_Pack(1, keys); + if (args == NULL) + goto done; + dict = _PyObject_GetAttrId((PyObject *)so, &PyId___dict__); + if (dict == NULL) { + PyErr_Clear(); + dict = Py_None; + Py_INCREF(dict); + } + result = PyTuple_Pack(3, Py_TYPE(so), args, dict); +done: + Py_XDECREF(args); + Py_XDECREF(keys); + Py_XDECREF(dict); + return result; +} static PyMethodDef frozenset_methods[] = { {"__contains__",(PyCFunction)set_direct_contains, METH_O | METH_COEXIST, @@ -2215,7 +2337,7 @@ issubset_doc}, {"issuperset", 
(PyCFunction)set_issuperset, METH_O, issuperset_doc}, - {"__reduce__", (PyCFunction)set_reduce, METH_NOARGS, + {"__reduce__", (PyCFunction)frozenset_reduce, METH_NOARGS, reduce_doc}, {"__sizeof__", (PyCFunction)set_sizeof, METH_NOARGS, sizeof_doc}, diff -r 4b3238923b01 -r d0c3a8d4947a Objects/typeobject.c --- a/Objects/typeobject.c Fri May 10 19:57:44 2013 -0700 +++ b/Objects/typeobject.c Sat May 11 03:03:53 2013 +0300 @@ -3402,22 +3402,73 @@ } static PyObject * -reduce_2(PyObject *obj) -{ - PyObject *cls, *getnewargs; +reduce_2(PyObject *obj, int proto) +{ + PyObject *cls, *getnewargs, *getnewargs_kw; PyObject *args = NULL, *args2 = NULL; PyObject *getstate = NULL, *state = NULL, *names = NULL; PyObject *slots = NULL, *listitems = NULL, *dictitems = NULL; PyObject *copyreg = NULL, *newobj = NULL, *res = NULL; Py_ssize_t i, n; + int use_newobj_kw = 0; + _Py_IDENTIFIER(__getnewargs_kw__); _Py_IDENTIFIER(__getnewargs__); _Py_IDENTIFIER(__getstate__); _Py_IDENTIFIER(__newobj__); + _Py_IDENTIFIER(__newobj_kw__); cls = (PyObject *) Py_TYPE(obj); + if (proto >= 4) + getnewargs_kw = _PyObject_GetAttrId(obj, &PyId___getnewargs_kw__); + else getnewargs_kw = NULL; + if (getnewargs_kw != NULL) { + args = PyObject_CallObject(getnewargs_kw, NULL); + Py_DECREF(getnewargs_kw); + if (args != NULL) { + PyObject *kargs, *kwargs; + + // __getnewargs_kw__ must return a (tuple, dict) tuple + if (!PyArg_UnpackTuple(args, "__getnewargs_kw__", 2, 2, + &kargs, &kwargs)) { + if (PyTuple_Check(args)) { + PyErr_Format(PyExc_TypeError, + "__getnewargs_kw__ should return a tuple of size 2, " + "not %zd", Py_SIZE(args)); + goto end; + } + else { + PyErr_Format(PyExc_TypeError, + "__getnewargs_kw__ should return a tuple, " + "not '%.200s'", Py_TYPE(args)->tp_name); + goto end; + } + } + else if (!PyTuple_Check(kargs)) { + PyErr_Format(PyExc_TypeError, + "__getnewargs_kw__ should return a tuple whose first " + "parameter is a tuple, not '%.200s'", + Py_TYPE(kargs)->tp_name); + goto end; + } + else 
if (!PyDict_Check(kwargs)) { + PyErr_Format(PyExc_TypeError, + "__getnewargs_kw__ should return a tuple whose second " + "parameter is a dict, not '%.200s'", + Py_TYPE(kwargs)->tp_name); + goto end; + } + else use_newobj_kw = 1; + } + } + else { + //no __getnewargs_kw__, move on. + PyErr_Clear(); + } + + //Precondition: args != NULL iff __getnewargs_kw__ succeeded. getnewargs = _PyObject_GetAttrId(obj, &PyId___getnewargs__); - if (getnewargs != NULL) { + if (args == NULL && getnewargs != NULL) { args = PyObject_CallObject(getnewargs, NULL); Py_DECREF(getnewargs); if (args != NULL && !PyTuple_Check(args)) { @@ -3429,7 +3480,7 @@ } else { PyErr_Clear(); - args = PyTuple_New(0); + if (args == NULL) args = PyTuple_New(0); } if (args == NULL) goto end; @@ -3513,7 +3564,9 @@ copyreg = import_copyreg(); if (copyreg == NULL) goto end; - newobj = _PyObject_GetAttrId(copyreg, &PyId___newobj__); + if (use_newobj_kw) + newobj = _PyObject_GetAttrId(copyreg, &PyId___newobj_kw__); + else newobj = _PyObject_GetAttrId(copyreg, &PyId___newobj__); if (newobj == NULL) goto end; @@ -3565,7 +3618,7 @@ PyObject *copyreg, *res; if (proto >= 2) - return reduce_2(self); + return reduce_2(self, proto); copyreg = import_copyreg(); if (!copyreg) diff -r 4b3238923b01 -r d0c3a8d4947a PCbuild/build.bat --- a/PCbuild/build.bat Fri May 10 19:57:44 2013 -0700 +++ b/PCbuild/build.bat Sat May 11 03:03:53 2013 +0300 @@ -5,15 +5,39 @@ setlocal set platf=Win32 set conf=Release -set target=build -set dir=%~dp0 +set rebuild=0 +set buildcmd=vcbuild +set clean=0 +set build= :CheckOpts if "%1"=="-c" (set conf=%2) & shift & shift & goto CheckOpts if "%1"=="-p" (set platf=%2) & shift & shift & goto CheckOpts -if "%1"=="-r" (set target=rebuild) & shift & goto CheckOpts +if "%1"=="-r" (set rebuild=1) & shift & goto CheckOpts if "%1"=="-d" (set conf=Debug) & shift & goto CheckOpts +if "%1"=="-10" (set buildcmd=msbuild) & shift & goto CheckOpts +if "%1"=="-clean" (set clean=1) & shift & goto CheckOpts -set 
cmd=msbuild /p:useenv=true %dir%pcbuild.sln /t:%target% /p:Configuration=%conf% /p:Platform=%platf% +IF "%rebuild%"=="1" ( + IF "%buildcmd%"=="vcbuild" ( + set build=/rebuild + ) ELSE ( + set build=/t:rebuild + ) +) + +IF "%clean%" == "1" ( + IF "%buildcmd%" == "vcbuild" ( + set build=/clean + ) ELSE ( + set build=/t:clean + ) +) +IF "%buildcmd%"=="vcbuild" ( + set cmd=%buildcmd% /useenv pcbuild.sln %build% "%conf%|%platf%" +) ELSE ( + set cmd=%buildcmd% /p:useenv=true "/p:configuration=%conf%" "/p:platform=%platf%" %build% pcbuild.sln +) + echo %cmd% %cmd%