diff -r cb392bce91e6 -r 9f1be171da08 Lib/pickle.py
--- a/Lib/pickle.py	Thu Apr 18 09:41:34 2013 +0200
+++ b/Lib/pickle.py	Thu Apr 18 03:24:00 2013 -0700
@@ -42,17 +42,18 @@
 bytes_types = (bytes, bytearray)
 
 # These are purely informational; no code uses these.
-format_version = "3.0"                  # File format version we write
+format_version = "4.0"                  # File format version we write
 compatible_formats = ["1.0",            # Original protocol 0
                       "1.1",            # Protocol 0 with INST added
                       "1.2",            # Original protocol 1
                       "1.3",            # Protocol 1 with BINFLOAT added
                       "2.0",            # Protocol 2
                       "3.0",            # Protocol 3
+                      "4.0",            # Protocol 4
                       ]                 # Old format versions we can read
 
 # This is the highest protocol number we know how to read.
-HIGHEST_PROTOCOL = 3
+HIGHEST_PROTOCOL = 4
 
 # The protocol we write by default.  May be less than HIGHEST_PROTOCOL.
 # We intentionally write a protocol that Python 2.x cannot read;
@@ -164,7 +165,18 @@
 BINBYTES       = b'B'   # push bytes; counted binary string argument
 SHORT_BINBYTES = b'C'   #  "     "   ;    "      "       "      " < 256 bytes
 
-__all__.extend([x for x in dir() if re.match("[A-Z][A-Z0-9_]+$",x)])
+# Protocol 4
+SHORT_BINUNICODE = b'\x8c'  # push short string; UTF-8 length < 256 bytes
+BINUNICODE8      = b'\x8d'  # push very long string; 8-byte UTF-8 length
+BINBYTES8        = b'\x8e'  # push very long bytes string; 8-byte length
+EMPTY_SET        = b'\x8f'  # push empty set on the stack
+ADDITEM          = b'\x90'  # add topmost stack item to set
+ADDITEMS         = b'\x91'  # modify set by adding topmost stack items
+EMPTY_FROZENSET  = b'\x92'  # push empty frozenset on the stack
+FROZENSET        = b'\x93'  # build frozenset from topmost stack items
+STACK_GLOBAL     = b'\x94'  # same as GLOBAL but using names on the stack
+
+__all__.extend([x for x in dir() if re.match("[A-Z][A-Z0-9_]+$", x)])
 
 # Pickling machinery
 
@@ -174,9 +186,9 @@
         """This takes a binary file for writing a pickle data stream.
 
         The optional protocol argument tells the pickler to use the
-        given protocol; supported protocols are 0, 1, 2, 3.  The default
-        protocol is 3; a backward-incompatible protocol designed for
-        Python 3.0.
+        given protocol; supported protocols are 0, 1, 2, 3 and 4.  The
+        default protocol is 3; a backward-incompatible protocol designed for
+        Python 3.
 
         Specifying a negative protocol version selects the highest
         protocol version supported.  The higher the protocol used, the
@@ -189,8 +201,8 @@
         meets this interface.
 
         If fix_imports is True and protocol is less than 3, pickle will try to
-        map the new Python 3.x names to the old module names used in Python
-        2.x, so that the pickle data stream is readable with Python 2.x.
+        map the new Python 3 names to the old module names used in Python 2,
+        so that the pickle data stream is readable with Python 2.
         """
         if protocol is None:
             protocol = DEFAULT_PROTOCOL
@@ -409,7 +421,13 @@
             write(REDUCE)
 
         if obj is not None:
-            self.memoize(obj)
+            # If the object is already in the memo, this means it is
+            # recursive. In this case, throw away everything we put on the
+            # stack, and fetch the object back from the memo.
+            if id(obj) in self.memo:
+                write(POP + self.get(self.memo[id(obj)][0]))
+            else:
+                self.memoize(obj)
 
         # More new special cases (that work with older protocols as
         # well): when __reduce__ returns a tuple with 4 or 5 items,
@@ -493,23 +511,32 @@
                                  (str(obj, 'latin1'), 'latin1'), obj=obj)
             return
         n = len(obj)
-        if n < 256:
+        if n <= 0xff:
             self.write(SHORT_BINBYTES + pack("<B", n) + obj)
+        elif n > 0xffffffff and self.proto >= 4:
+            self.write(BINBYTES8 + pack("<Q", n) + obj)
         else:
             self.write(BINBYTES + pack("<I", n) + obj)
         self.memoize(obj)
     dispatch[bytes] = save_bytes
 
-    def save_str(self, obj):
+    def save_str(self, obj, memoize=True):
         if self.bin:
             encoded = obj.encode('utf-8', 'surrogatepass')
             n = len(encoded)
-            self.write(BINUNICODE + pack("<I", n) + encoded)
+            if n <= 0xff and self.proto >= 4:
+                self.write(SHORT_BINUNICODE + pack("<B", n) + encoded)
+            elif n > 0xffffffff and self.proto >= 4:
+                self.write(BINUNICODE8 + pack("<Q", n) + encoded)
+            else:
+                self.write(BINUNICODE + pack("<I", n) + encoded)
         else:
             obj = obj.replace("\\", "\\u005c")
             obj = obj.replace("\n", "\\u000a")
-            self.write(UNICODE + obj.encode('raw-unicode-escape') + b'\n')
-        self.memoize(obj)
+            self.write(UNICODE + bytes(obj.encode('raw-unicode-escape')) +
+                       b'\n')
+        if memoize:
+            self.memoize(obj)
     dispatch[str] = save_str
 
     def save_tuple(self, obj):
@@ -647,6 +674,66 @@
             if n < self._BATCHSIZE:
                 return
 
+    def save_set(self, obj):
+        """Pickle a set: EMPTY_SET plus batched ADDITEM(S) for protocol 4+."""
+        if self.proto < 4:
+            # No native set opcodes before protocol 4; reduce to set(list).
+            self.save_reduce(set, (list(obj),), obj=obj)
+            return
+
+        emit = self.write
+        emit(EMPTY_SET)
+        # Memoize the (still empty) set before its members are saved.
+        self.memoize(obj)
+
+        remaining = iter(obj)
+        exhausted = False
+        while not exhausted:
+            chunk = []
+            while len(chunk) < self._BATCHSIZE:
+                try:
+                    chunk.append(next(remaining))
+                except StopIteration:
+                    exhausted = True
+                    break
+            if len(chunk) == 1:
+                self.save(chunk[0])
+                emit(ADDITEM)
+            elif chunk:
+                emit(MARK)
+                for member in chunk:
+                    self.save(member)
+                emit(ADDITEMS)
+    dispatch[set] = save_set
+
+    def save_frozenset(self, obj):
+        """Pickle a frozenset: natively for protocol 4+, else via reduce."""
+        save = self.save
+        write = self.write
+
+        if self.proto < 4:
+            # No frozenset opcodes here; reduce to frozenset(list), not set.
+            self.save_reduce(frozenset, (list(obj),), obj=obj)
+            return
+
+        if not obj:
+            write(EMPTY_FROZENSET)
+            return
+
+        write(MARK)
+        for item in obj:
+            save(item)
+
+        if id(obj) in self.memo:
+            # Saving an item already memoized obj, so obj is recursive: drop
+            # what we put on the stack and fetch the object from the memo.
+            write(POP_MARK + self.get(self.memo[id(obj)][0]))
+            return
+
+        write(FROZENSET)
+        self.memoize(obj)
+    dispatch[frozenset] = save_frozenset
+
     def save_global(self, obj, name=None):
         write = self.write
         memo = self.memo
@@ -684,7 +771,11 @@
                     write(EXT4 + pack("<i", code))
                 return
         # Non-ASCII identifiers are supported only with protocols >= 3.
-        if self.proto >= 3:
+        if self.proto >= 4:
+            self.save_str(module, memoize=False)
+            self.save_str(name, memoize=False)
+            write(STACK_GLOBAL)
+        elif self.proto == 3:
             write(GLOBAL + bytes(module, "utf-8") + b'\n' +
                   bytes(name, "utf-8") + b'\n')
         else:
@@ -940,6 +1031,14 @@
         self.append(str(self.read(len), 'utf-8', 'surrogatepass'))
     dispatch[BINUNICODE[0]] = load_binunicode
 
+    def load_binunicode8(self):
+        len, = unpack('<Q', self.read(8))
+        if len > maxsize:
+            raise UnpicklingError("BINUNICODE8 exceeds system's maximum size "
+                                  "of %d bytes" % maxsize)
+        self.append(str(self.read(len), 'utf-8', 'surrogatepass'))
+    dispatch[BINUNICODE8[0]] = load_binunicode8
+
     def load_short_binstring(self):
         len = self.read(1)[0]
         data = self.read(len)
@@ -952,6 +1051,11 @@
         self.append(self.read(len))
     dispatch[SHORT_BINBYTES[0]] = load_short_binbytes
 
+    def load_short_binunicode(self):
+        len = ord(self.read(1))
+        self.append(str(self.read(len), 'utf-8', 'surrogatepass'))
+    dispatch[SHORT_BINUNICODE[0]] = load_short_binunicode
+
     def load_tuple(self):
         k = self.marker()
         self.stack[k:] = [tuple(self.stack[k+1:])]
@@ -981,6 +1085,19 @@
         self.append({})
     dispatch[EMPTY_DICT[0]] = load_empty_dictionary
 
+    def load_empty_set(self):
+        self.append(set())
+    dispatch[EMPTY_SET[0]] = load_empty_set
+
+    def load_empty_frozenset(self):
+        self.append(frozenset())
+    dispatch[EMPTY_FROZENSET[0]] = load_empty_frozenset
+
+    def load_frozenset(self):
+        k = self.marker()
+        self.stack[k:] = [frozenset(self.stack[k+1:])]
+    dispatch[FROZENSET[0]] = load_frozenset
+
     def load_list(self):
         k = self.marker()
         self.stack[k:] = [self.stack[k+1:]]
@@ -1041,6 +1158,14 @@
         self.append(klass)
     dispatch[GLOBAL[0]] = load_global
 
+    def load_stack_global(self):
+        name = self.stack.pop()    # pushed last, so it is popped first
+        module = self.stack.pop()
+        if type(name) is not str or type(module) is not str:
+            raise UnpicklingError("STACK_GLOBAL requires str")
+        self.append(self.find_class(module, name))
+    dispatch[STACK_GLOBAL[0]] = load_stack_global
+
     def load_ext1(self):
         code = self.read(1)[0]
         self.get_extension(code)
@@ -1179,6 +1304,20 @@
         del stack[mark:]
     dispatch[SETITEMS[0]] = load_setitems
 
+    def load_additem(self):
+        stack = self.stack
+        item = stack.pop()
+        stack[-1].add(item)
+    dispatch[ADDITEM[0]] = load_additem
+
+    def load_additems(self):
+        # ADDITEMS: add everything above the topmost mark to the set below it.
+        mark = self.marker()
+        new_members = self.stack[mark + 1:]
+        self.stack[mark - 1].update(new_members)
+        del self.stack[mark:]
+    dispatch[ADDITEMS[0]] = load_additems
+
     def load_build(self):
         stack = self.stack
         state = stack.pop()
diff -r cb392bce91e6 -r 9f1be171da08 Lib/pickletools.py
--- a/Lib/pickletools.py	Thu Apr 18 09:41:34 2013 +0200
+++ b/Lib/pickletools.py	Thu Apr 18 03:24:00 2013 -0700
@@ -168,6 +168,7 @@
 TAKEN_FROM_ARGUMENT1  = -2   # num bytes is 1-byte unsigned int
 TAKEN_FROM_ARGUMENT4  = -3   # num bytes is 4-byte signed little-endian int
 TAKEN_FROM_ARGUMENT4U = -4   # num bytes is 4-byte unsigned little-endian int
+TAKEN_FROM_ARGUMENT8U = -5   # num bytes is 8-byte unsigned little-endian int
 
 class ArgumentDescriptor(object):
     __slots__ = (
@@ -175,7 +176,7 @@
         'name',
 
         # length of argument, in bytes; an int; UP_TO_NEWLINE and
-        # TAKEN_FROM_ARGUMENT{1,4} are negative values for variable-length
+        # TAKEN_FROM_ARGUMENT{1,4,4U,8U} are negative values for variable-length
         # cases
         'n',
 
@@ -196,7 +197,8 @@
                                        n in (UP_TO_NEWLINE,
                                              TAKEN_FROM_ARGUMENT1,
                                              TAKEN_FROM_ARGUMENT4,
-                                             TAKEN_FROM_ARGUMENT4U))
+                                             TAKEN_FROM_ARGUMENT4U,
+                                             TAKEN_FROM_ARGUMENT8U))
         self.n = n
 
         self.reader = reader
@@ -288,6 +290,27 @@
             doc="Four-byte unsigned integer, little-endian.")
 
 
+def read_uint8(f):
+    r"""
+    >>> import io
+    >>> read_uint8(io.BytesIO(b'\xff\x00\x00\x00\x00\x00\x00\x00'))
+    255
+    >>> read_uint8(io.BytesIO(b'\xff' * 8)) == 2**64-1
+    True
+    """
+
+    data = f.read(8)
+    if len(data) == 8:
+        return _unpack("<Q", data)[0]
+    raise ValueError("not enough data in stream to read uint8")
+
+uint8 = ArgumentDescriptor(
+            name='uint8',
+            n=8,
+            reader=read_uint8,
+            doc="Eight-byte unsigned integer, little-endian.")
+
+
 def read_stringnl(f, decode=True, stripquotes=True):
     r"""
     >>> import io
@@ -381,6 +404,36 @@
                              a single blank separating the two strings.
                              """)
 
+
+def read_string1(f):
+    r"""
+    >>> import io
+    >>> read_string1(io.BytesIO(b"\x00"))
+    ''
+    >>> read_string1(io.BytesIO(b"\x03abcdef"))
+    'abc'
+    """
+
+    n = read_uint1(f)
+    assert n >= 0
+    data = f.read(n)
+    if len(data) == n:
+        return data.decode("latin-1")
+    raise ValueError("expected %d bytes in a string1, but only %d remain" %
+                     (n, len(data)))
+
+string1 = ArgumentDescriptor(
+              name="string1",
+              n=TAKEN_FROM_ARGUMENT1,
+              reader=read_string1,
+              doc="""A counted string.
+
+              The first argument is a 1-byte unsigned int giving the number
+              of bytes in the string, and the second argument is that many
+              bytes.
+              """)
+
+
 def read_string4(f):
     r"""
     >>> import io
@@ -415,28 +468,28 @@
               """)
 
 
-def read_string1(f):
+def read_bytes1(f):
     r"""
     >>> import io
-    >>> read_string1(io.BytesIO(b"\x00"))
-    ''
-    >>> read_string1(io.BytesIO(b"\x03abcdef"))
-    'abc'
+    >>> read_bytes1(io.BytesIO(b"\x00"))
+    b''
+    >>> read_bytes1(io.BytesIO(b"\x03abcdef"))
+    b'abc'
     """
 
     n = read_uint1(f)
     assert n >= 0
     data = f.read(n)
     if len(data) == n:
-        return data.decode("latin-1")
-    raise ValueError("expected %d bytes in a string1, but only %d remain" %
+        return data
+    raise ValueError("expected %d bytes in a bytes1, but only %d remain" %
                      (n, len(data)))
 
-string1 = ArgumentDescriptor(
-              name="string1",
+bytes1 = ArgumentDescriptor(
+              name="bytes1",
               n=TAKEN_FROM_ARGUMENT1,
-              reader=read_string1,
-              doc="""A counted string.
+              reader=read_bytes1,
+              doc="""A counted bytes string.
 
               The first argument is a 1-byte unsigned int giving the number
               of bytes in the string, and the second argument is that many
@@ -505,6 +558,38 @@
               """)
 
 
+def read_bytes8(f):
+    r"""
+    >>> import io
+    >>> read_bytes8(io.BytesIO(b"\x00\x00\x00\x00\x00\x00\x00\x00abc"))
+    b''
+    >>> read_bytes8(io.BytesIO(b"\x03\x00\x00\x00\x00\x00\x00\x00abcdef"))
+    b'abc'
+    >>> read_bytes8(io.BytesIO(b"\x00\x00\x00\x00\x00\x00\x03\x00abcdef"))
+    Traceback (most recent call last):
+    ...
+    ValueError: expected 844424930131968 bytes in a bytes8, but only 6 remain
+    """
+
+    n = read_uint8(f)
+    if n > sys.maxsize:
+        raise ValueError("bytes8 byte count > sys.maxsize: %d" % n)
+    data = f.read(n)
+    if len(data) == n:
+        return data
+    raise ValueError("expected %d bytes in a bytes8, but only %d remain" %
+                     (n, len(data)))
+
+bytes8 = ArgumentDescriptor(
+              name="bytes8",
+              n=TAKEN_FROM_ARGUMENT8U,
+              reader=read_bytes8,
+              doc="""A counted bytes string.
+
+              The first argument is a 8-byte little-endian unsigned int giving
+              the number of bytes, and the second argument is that many bytes.
+              """)
+
 def read_unicodestringnl(f):
     r"""
     >>> import io
@@ -530,6 +615,47 @@
                       escape sequences.
                       """)
 
+
+def read_unicodestring1(f):
+    r"""
+    >>> import io
+    >>> s = 'abcd\uabcd'
+    >>> enc = s.encode('utf-8')
+    >>> enc
+    b'abcd\xea\xaf\x8d'
+    >>> n = bytes([len(enc)])  # little-endian 1-byte length
+    >>> t = read_unicodestring1(io.BytesIO(n + enc + b'junk'))
+    >>> s == t
+    True
+
+    >>> read_unicodestring1(io.BytesIO(n + enc[:-1]))
+    Traceback (most recent call last):
+    ...
+    ValueError: expected 7 bytes in a unicodestring1, but only 6 remain
+    """
+
+    n = read_uint1(f)
+    if n < 0:
+        raise ValueError("unicodestring1 byte count < 0: %d" % n)
+    data = f.read(n)
+    if len(data) == n:
+        return str(data, 'utf-8', 'surrogatepass')
+    raise ValueError("expected %d bytes in a unicodestring1, but only %d "
+                     "remain" % (n, len(data)))
+
+unicodestring1 = ArgumentDescriptor(
+                    name="unicodestring1",
+                    n=TAKEN_FROM_ARGUMENT1,
+                    reader=read_unicodestring1,
+                    doc="""A counted Unicode string.
+
+                    The first argument is a 1-byte unsigned int giving the
+                    number of bytes in the string, and the second argument --
+                    the UTF-8 encoding of the Unicode string -- contains that
+                    many bytes.
+                    """)
+
+
 def read_unicodestring4(f):
     r"""
     >>> import io
@@ -570,6 +696,46 @@
                     """)
 
 
+def read_unicodestring8(f):
+    r"""Read a UTF-8 string prefixed by an 8-byte little-endian length.
+
+    >>> import io
+    >>> s = 'abcd\uabcd'
+    >>> enc = s.encode('utf-8')
+    >>> enc
+    b'abcd\xea\xaf\x8d'
+    >>> n = bytes([len(enc)]) + bytes(7)  # little-endian 8-byte length
+    >>> t = read_unicodestring8(io.BytesIO(n + enc + b'junk'))
+    >>> s == t
+    True
+
+    >>> read_unicodestring8(io.BytesIO(n + enc[:-1]))
+    Traceback (most recent call last):
+    ...
+    ValueError: expected 7 bytes in a unicodestring8, but only 6 remain
+    """
+    n = read_uint8(f)
+    if n > sys.maxsize:  # the count is unsigned, so only the upper bound matters
+        raise ValueError("unicodestring8 byte count > sys.maxsize: %d" % n)
+    data = f.read(n)
+    if len(data) == n:
+        return str(data, 'utf-8', 'surrogatepass')
+    raise ValueError("expected %d bytes in a unicodestring8, but only %d "
+                     "remain" % (n, len(data)))
+
+unicodestring8 = ArgumentDescriptor(
+                    name="unicodestring8",
+                    n=TAKEN_FROM_ARGUMENT8U,
+                    reader=read_unicodestring8,
+                    doc="""A counted Unicode string.
+
+                    The first argument is an 8-byte little-endian unsigned
+                    int giving the number of bytes in the string, and the
+                    second argument -- the UTF-8 encoding of the Unicode
+                    string -- contains that many bytes.
+                    """)
+
+
 def read_decimalnl_short(f):
     r"""
     >>> import io
@@ -863,6 +1029,16 @@
              obtype=dict,
              doc="A Python dict object.")
 
+pyset = StackObject(
+            name="set",
+            obtype=set,
+            doc="A Python set object.")
+
+pyfrozenset = StackObject(
+                  name="frozenset",
+                  obtype=frozenset,
+                  doc="A Python frozenset object.")
+
 anyobject = StackObject(
                 name='any',
                 obtype=object,
@@ -1146,6 +1322,19 @@
       literally as the string content.
       """),
 
+    I(name='BINBYTES8',
+      code='\x8e',
+      arg=bytes8,
+      stack_before=[],
+      stack_after=[pybytes],
+      proto=4,
+      doc="""Push a Python bytes object.
+
+      There are two arguments:  the first is a 8-byte unsigned int giving
+      the number of bytes in the string, and the second is that many bytes,
+      which are taken literally as the string content.
+      """),
+
     # Ways to spell None.
 
     I(name='NONE',
@@ -1194,6 +1383,19 @@
       until the next newline character.
       """),
 
+    I(name='SHORT_BINUNICODE',
+      code='\x8c',
+      arg=unicodestring1,
+      stack_before=[],
+      stack_after=[pyunicode],
+      proto=4,
+      doc="""Push a Python Unicode string object.
+
+      There are two arguments:  the first is a 1-byte unsigned int giving
+      the number of bytes in the string.  The second is that many bytes,
+      and is the UTF-8 encoding of the Unicode string.
+      """),
+
     I(name='BINUNICODE',
       code='X',
       arg=unicodestring4,
@@ -1207,6 +1409,19 @@
       bytes, and is the UTF-8 encoding of the Unicode string.
       """),
 
+    I(name='BINUNICODE8',
+      code='\x8d',
+      arg=unicodestring8,
+      stack_before=[],
+      stack_after=[pyunicode],
+      proto=4,
+      doc="""Push a Python Unicode string object.
+
+      There are two arguments:  the first is an 8-byte little-endian unsigned
+      int giving the number of bytes in the string.  The second is that many
+      bytes, and is the UTF-8 encoding of the Unicode string.
+      """),
+
     # Ways to spell floats.
 
     I(name='FLOAT',
@@ -1432,6 +1647,77 @@
       1, 2, ..., n, and in that order.
       """),
 
+    # Ways to build sets
+
+    I(name='EMPTY_SET',
+      code='\x8f',
+      arg=None,
+      stack_before=[],
+      stack_after=[pyset],
+      proto=4,
+      doc="Push an empty set."),
+
+    I(name='ADDITEM',
+      code='\x90',
+      arg=None,
+      stack_before=[pyset, anyobject],
+      stack_after=[pyset],
+      proto=4,
+      doc="""Add an item to an existing set.
+
+      Stack before:  ... pyset item
+      Stack after:   ... pyset
+
+      where pyset has been modified via pyset.add(item).
+      """),
+
+    I(name='ADDITEMS',
+      code='\x91',
+      arg=None,
+      stack_before=[pyset, markobject, stackslice],
+      stack_after=[pyset],
+      proto=4,
+      doc="""Add an arbitrary number of items to an existing set.
+
+      The slice of the stack following the topmost markobject is taken as
+      a sequence of items, added to the set immediately under the topmost
+      markobject.  Everything at and after the topmost markobject is popped,
+      leaving the mutated set at the top of the stack.
+
+      Stack before:  ... pyset markobject item_1 ... item_n
+      Stack after:   ... pyset
+
+      where pyset has been modified via pyset.add(item_i) for i in
+      1, 2, ..., n, and in that order.
+      """),
+
+    # Ways to build frozensets
+
+    I(name='EMPTY_FROZENSET',
+      code='\x92',
+      arg=None,
+      stack_before=[],
+      stack_after=[pyfrozenset],
+      proto=4,
+      doc="Push an empty frozenset."),
+
+    I(name='FROZENSET',
+      code='\x93',
+      arg=None,
+      stack_before=[markobject, stackslice],
+      stack_after=[pyfrozenset],
+      proto=4,
+      doc="""Build a frozenset out of the topmost slice, after markobject.
+
+      All the stack entries following the topmost markobject are placed into
+      a single Python frozenset, which single frozenset object replaces all
+      of the stack from the topmost markobject onward.  For example,
+
+      Stack before: ... markobject 1 2 3
+      Stack after:  ... frozenset({1, 2, 3})
+      """),
+
+
     # Stack manipulation.
 
     I(name='POP',
@@ -1618,6 +1904,15 @@
       stack, so unpickling subclasses can override this form of lookup.
       """),
 
+    I(name='STACK_GLOBAL',
+      code='\x94',
+      arg=None,
+      stack_before=[pyunicode, pyunicode],
+      stack_after=[anyobject],
+      proto=4,
+      doc="""Push a global object (module.attr) on the stack.
+      """),
+
     # Ways to build objects of classes pickle doesn't know about directly
     # (user-defined classes).  I despair of documenting this accurately
     # and comprehensibly -- you really have to read the pickle code to
diff -r cb392bce91e6 -r 9f1be171da08 Lib/test/pickletester.py
--- a/Lib/test/pickletester.py	Thu Apr 18 09:41:34 2013 +0200
+++ b/Lib/test/pickletester.py	Thu Apr 18 03:24:00 2013 -0700
@@ -95,6 +95,9 @@
     def __getinitargs__(self):
         return ()
 
+class H(object):
+    pass
+
 import __main__
 __main__.C = C
 C.__module__ = "__main__"
@@ -102,6 +105,8 @@
 D.__module__ = "__main__"
 __main__.E = E
 E.__module__ = "__main__"
+__main__.H = H
+H.__module__ = "__main__"
 
 class myint(int):
     def __init__(self, x):
@@ -574,6 +579,26 @@
             self.assertEqual(list(x.keys()), [1])
             self.assertTrue(x[1] is x)
 
+    def test_recursive_set(self):
+        h = H()
+        y = set({h})
+        h.attr = y
+        for proto in protocols:
+            s = self.dumps(y, proto)
+            x = self.loads(s)
+            self.assertIs(list(x)[0].attr, x)
+            self.assertEqual(len(x), 1)
+
+    def test_recursive_frozenset(self):
+        h = H()
+        y = frozenset({h})
+        h.attr = y
+        for proto in protocols:
+            s = self.dumps(y, proto)
+            x = self.loads(s)
+            self.assertIs(list(x)[0].attr, x)
+            self.assertEqual(len(x), 1)
+
     def test_recursive_inst(self):
         i = C()
         i.attr = i
@@ -817,7 +842,7 @@
                 s = self.dumps(x, proto)
                 y = self.loads(s)
                 self.assertEqual(x, y, (proto, x, s, y))
-                expected = expected_opcode[proto, len(x)]
+                expected = expected_opcode[min(proto, 3), len(x)]
                 self.assertEqual(opcode_in_pickle(expected, s), True)
 
     def test_singletons(self):
@@ -842,7 +867,7 @@
                 s = self.dumps(x, proto)
                 y = self.loads(s)
                 self.assertTrue(x is y, (proto, x, s, y))
-                expected = expected_opcode[proto, x]
+                expected = expected_opcode[min(proto, 3), x]
                 self.assertEqual(opcode_in_pickle(expected, s), True)
 
     def test_newobj_tuple(self):
@@ -990,6 +1015,31 @@
             else:
                 self.assertTrue(num_setitems >= 2)
 
+    def test_set_chunking(self):
+        n = 10  # too small to chunk
+        x = set(range(n))
+        for proto in protocols:
+            s = self.dumps(x, proto)
+            y = self.loads(s)
+            self.assertEqual(x, y)
+            num_additems = count_opcode(pickle.ADDITEMS, s)
+            if proto < 4:
+                self.assertEqual(num_additems, 0)
+            else:
+                self.assertEqual(num_additems, 1)
+
+        n = 2500  # expect at least two chunks when proto >= 4
+        x = set(range(n))
+        for proto in protocols:
+            s = self.dumps(x, proto)
+            y = self.loads(s)
+            self.assertEqual(x, y)
+            num_additems = count_opcode(pickle.ADDITEMS, s)
+            if proto < 4:
+                self.assertEqual(num_additems, 0)
+            else:
+                self.assertGreaterEqual(num_additems, 2)
+
     def test_simple_newobj(self):
         x = object.__new__(SimpleNewObj)  # avoid __init__
         x.abc = 666
@@ -1285,18 +1335,27 @@
         finally:
             data = None
 
-    # BINUNICODE (protocols 1, 2 and 3) cannot carry more than
-    # 2**32 - 1 bytes of utf-8 encoded unicode.
+    # BINUNICODE (protocols 1, 2 and 3) cannot carry more than 2**32 - 1 bytes
+    # of utf-8 encoded unicode. BINUNICODE8 (protocol 4) supports these huge
+    # unicode strings however.
 
-    @bigmemtest(size=_4G, memuse=1 + ascii_char_size, dry_run=False)
+    @bigmemtest(size=_4G, memuse=2 + ascii_char_size, dry_run=False)
     def test_huge_str_64b(self, size):
-        data = "a" * size
+        data = "abcd" * (size // 4)
         try:
             for proto in protocols:
                 if proto == 0:
                     continue
-                with self.assertRaises((ValueError, OverflowError)):
-                    self.dumps(data, protocol=proto)
+                if proto < 4:
+                    with self.assertRaises((ValueError, OverflowError)):
+                        self.dumps(data, protocol=proto)
+                else:
+                    try:
+                        pickled = self.dumps(data, protocol=proto)
+                        self.assertTrue(b"abcd" in pickled[:15])
+                        self.assertTrue(b"abcd" in pickled[-15:])
+                    finally:
+                        pickled = None
         finally:
             data = None
 
@@ -1365,10 +1424,16 @@
 class MyDict(dict):
     sample = {"a": 1, "b": 2}
 
+class MySet(set):
+    sample = {"a", "b"}
+
+class MyFrozenSet(frozenset):
+    sample = frozenset({"a", "b"})
+
 myclasses = [MyInt, MyFloat,
              MyComplex,
              MyStr, MyUnicode,
-             MyTuple, MyList, MyDict]
+             MyTuple, MyList, MyDict, MySet, MyFrozenSet]
 
 
 class SlotList(MyList):
@@ -1414,7 +1479,7 @@
 
     def test_highest_protocol(self):
         # Of course this needs to be changed when HIGHEST_PROTOCOL changes.
-        self.assertEqual(pickle.HIGHEST_PROTOCOL, 3)
+        self.assertEqual(pickle.HIGHEST_PROTOCOL, 4)
 
     def test_callapi(self):
         f = io.BytesIO()
diff -r cb392bce91e6 -r 9f1be171da08 Modules/_pickle.c
--- a/Modules/_pickle.c	Thu Apr 18 09:41:34 2013 +0200
+++ b/Modules/_pickle.c	Thu Apr 18 03:24:00 2013 -0700
@@ -6,7 +6,7 @@
 
 /* Bump this when new opcodes are added to the pickle protocol. */
 enum {
-    HIGHEST_PROTOCOL = 3,
+    HIGHEST_PROTOCOL = 4,
     DEFAULT_PROTOCOL = 3
 };
 
@@ -71,7 +71,18 @@
 
     /* Protocol 3 (Python 3.x) */
     BINBYTES       = 'B',
-    SHORT_BINBYTES = 'C'
+    SHORT_BINBYTES = 'C',
+
+    /* Protocol 4 */
+    SHORT_BINUNICODE = '\x8c',
+    BINUNICODE8      = '\x8d',
+    BINBYTES8        = '\x8e',
+    EMPTY_SET        = '\x8f',
+    ADDITEM          = '\x90',
+    ADDITEMS         = '\x91',
+    EMPTY_FROZENSET  = '\x92',
+    FROZENSET        = '\x93',
+    STACK_GLOBAL     = '\x94'
 };
 
 /* These aren't opcodes -- they're ways to pickle bools before protocol 2
@@ -1766,14 +1777,14 @@
     }
     else {
         Py_ssize_t size;
-        char header[5];
+        char header[9];
         Py_ssize_t len;
 
         size = PyBytes_GET_SIZE(obj);
         if (size < 0)
             return -1;
 
-        if (size < 256) {
+        if (size <= 0xff) {
             header[0] = SHORT_BINBYTES;
             header[1] = (unsigned char)size;
             len = 2;
@@ -1786,6 +1797,14 @@
             header[4] = (unsigned char)((size >> 24) & 0xff);
             len = 5;
         }
+        else if (self->proto >= 4) {
+            int i;
+            header[0] = BINBYTES8;
+            for (i = 0; i < 8; i++) {
+                header[i+1] = (unsigned char)((size >> (8 * i)) & 0xff);
+            }
+            len = 9;  /* opcode byte + 8 little-endian length bytes */
+        }
         else {
             PyErr_SetString(PyExc_OverflowError,
                             "cannot serialize a bytes object larger than 4 GiB");
@@ -1875,26 +1894,39 @@
 static int
 write_utf8(PicklerObject *self, char *data, Py_ssize_t size)
 {
-    char pdata[5];
-
-#if SIZEOF_SIZE_T > 4
-    if (size > 0xffffffffUL) {
-        /* string too large */
+    char header[9];
+    Py_ssize_t len;
+
+    if (size <= 0xff && self->proto >= 4) {
+        header[0] = SHORT_BINUNICODE;
+        header[1] = (unsigned char)(size & 0xff);
+        len = 2;
+    }
+    else if (size <= 0xffffffffUL) {
+        header[0] = BINUNICODE;
+        header[1] = (unsigned char)(size & 0xff);
+        header[2] = (unsigned char)((size >> 8) & 0xff);
+        header[3] = (unsigned char)((size >> 16) & 0xff);
+        header[4] = (unsigned char)((size >> 24) & 0xff);
+        len = 5;
+    }
+    else if (self->proto >= 4) {
+        int i;
+
+        header[0] = BINUNICODE8;
+        for (i = 0; i < 8; i++) {
+            header[i+1] = (unsigned char)((size >> (8 * i)) & 0xff);
+        }
+        len = 9;
+    }
+    else {
         PyErr_SetString(PyExc_OverflowError,
                         "cannot serialize a string larger than 4GiB");
         return -1;
     }
-#endif
-
-    pdata[0] = BINUNICODE;
-    pdata[1] = (unsigned char)(size & 0xff);
-    pdata[2] = (unsigned char)((size >> 8) & 0xff);
-    pdata[3] = (unsigned char)((size >> 16) & 0xff);
-    pdata[4] = (unsigned char)((size >> 24) & 0xff);
-
-    if (_Pickler_Write(self, pdata, sizeof(pdata)) < 0)
-        return -1;
-
+
+    if (_Pickler_Write(self, header, len) < 0)
+        return -1;
     if (_Pickler_Write(self, data, size) < 0)
         return -1;
 
@@ -1930,7 +1962,7 @@
 }
 
 static int
-save_unicode(PicklerObject *self, PyObject *obj)
+save_unicode(PicklerObject *self, PyObject *obj, int memoize)
 {
     if (self->bin) {
         if (write_unicode_binary(self, obj) < 0)
@@ -1960,7 +1992,7 @@
         if (_Pickler_Write(self, "\n", 1) < 0)
             return -1;
     }
-    if (memo_put(self, obj) < 0)
+    if (memoize && memo_put(self, obj) < 0)
         return -1;
 
     return 0;
@@ -2591,6 +2623,284 @@
 }
 
+/* Pickle the exact set OBJ.
+ *
+ * Before protocol 4 there is no set opcode, so the set is written through
+ * save_reduce() as a call of the set type on a list of its items.  With
+ * protocol 4 an EMPTY_SET opcode is emitted and memoized first, and the
+ * items follow: a single item with ADDITEM, anything larger as
+ * MARK ... ADDITEMS batches of at most BATCHSIZE items.
+ *
+ * Returns 0 on success, -1 with an exception set on failure.
+ */
 static int
+save_set(PicklerObject *self, PyObject *obj)
+{
+    PyObject *item;
+    int i, status;
+    Py_ssize_t set_size, ppos = 0;
+    Py_hash_t hash;
+
+    const char empty_set_op = EMPTY_SET;
+    const char mark_op = MARK;
+    const char additem_op = ADDITEM;
+    const char additems_op = ADDITEMS;
+
+    if (self->proto < 4) {
+        PyObject *items;
+        PyObject *reduce_value;
+
+        items = PySequence_List(obj);
+        if (items == NULL) {
+            return -1;
+        }
+        reduce_value = Py_BuildValue("(O(O))", (PyObject*)&PySet_Type, items);
+        Py_DECREF(items);
+        if (reduce_value == NULL) {
+            return -1;
+        }
+        /* save_reduce() will memoize the object automatically. */
+        status = save_reduce(self, reduce_value, obj);
+        Py_DECREF(reduce_value);
+        return status;
+    }
+
+    if (_Pickler_Write(self, &empty_set_op, 1) < 0)
+        return -1;
+
+    /* Memoize the still-empty set before any of its items is saved. */
+    if (memo_put(self, obj) < 0)
+        return -1;
+
+    set_size = PySet_GET_SIZE(obj);
+    if (set_size == 0)
+        return 0;  /* nothing to do */
+
+    /* Special-case len(obj) == 1 to save space. */
+    if (set_size == 1) {
+        if (!_PySet_NextEntry(obj, &ppos, &item, &hash)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                            "set changed size during iteration");
+            return -1;
+        }
+        /* item is only borrowed from the set, and save() can call back
+           into Python code that drops it from the set: hold our own
+           reference while it is being saved. */
+        Py_INCREF(item);
+        status = save(self, item, 0);
+        Py_DECREF(item);
+        if (status < 0)
+            return -1;
+        if (_Pickler_Write(self, &additem_op, 1) < 0)
+            return -1;
+        return 0;
+    }
+
+    /* Write in batches of BATCHSIZE. */
+    do {
+        i = 0;
+        if (_Pickler_Write(self, &mark_op, 1) < 0)
+            return -1;
+        while (_PySet_NextEntry(obj, &ppos, &item, &hash)) {
+            /* Borrowed reference again: keep the item alive over save(). */
+            Py_INCREF(item);
+            status = save(self, item, 0);
+            Py_DECREF(item);
+            if (status < 0)
+                return -1;
+            if (++i == BATCHSIZE)
+                break;
+        }
+        if (_Pickler_Write(self, &additems_op, 1) < 0)
+            return -1;
+        if (PySet_GET_SIZE(obj) != set_size) {
+            PyErr_Format(
+                PyExc_RuntimeError,
+                "set changed size during iteration");
+            return -1;
+        }
+    } while (i == BATCHSIZE);
+
+    return 0;
+}
+
+/* Pickle the exact frozenset OBJ.
+ *
+ * Before protocol 4 the frozenset is written through save_reduce() as a
+ * call of the frozenset type on a list of its items.  With protocol 4 an
+ * empty frozenset is a lone EMPTY_FROZENSET opcode; otherwise the items
+ * are saved after a MARK and gathered by FROZENSET, whose result is then
+ * memoized.
+ *
+ * Returns 0 on success, -1 with an exception set on failure.
+ */
+static int
+save_frozenset(PicklerObject *self, PyObject *obj)
+{
+    PyObject *iter;
+    Py_ssize_t len;
+
+    const char mark_op = MARK;
+    const char frozenset_op = FROZENSET;
+    const char empty_frozenset_op = EMPTY_FROZENSET;
+
+    /* NOTE(review): no exit path below calls fast_save_leave(); confirm
+       that this fast_save_enter() does not need to be balanced. */
+    if (self->fast && !fast_save_enter(self, obj))
+        return -1;
+
+    if (self->proto < 4) {
+        PyObject *items;
+        PyObject *reduce_value;
+        int status;
+
+        items = PySequence_List(obj);
+        if (items == NULL) {
+            return -1;
+        }
+        reduce_value = Py_BuildValue("(O(O))", (PyObject*)&PyFrozenSet_Type,
+                                     items);
+        Py_DECREF(items);
+        if (reduce_value == NULL) {
+            return -1;
+        }
+        /* save_reduce() will memoize the object automatically. */
+        status = save_reduce(self, reduce_value, obj);
+        Py_DECREF(reduce_value);
+        return status;
+    }
+
+    len = PySet_GET_SIZE(obj);
+    if (len == 0) {
+        if (_Pickler_Write(self, &empty_frozenset_op, 1) < 0)
+            return -1;
+        return 0;
+    }
+
+    if (_Pickler_Write(self, &mark_op, 1) < 0)
+        return -1;
+
+    iter = PyObject_GetIter(obj);
+    if (iter == NULL)
+        return -1;
+    for (;;) {
+        PyObject *item;
+
+        item = PyIter_Next(iter);
+        if (item == NULL) {
+            if (PyErr_Occurred()) {
+                Py_DECREF(iter);
+                return -1;
+            }
+            break;
+        }
+        if (save(self, item, 0) < 0) {
+            Py_DECREF(item);
+            Py_DECREF(iter);
+            return -1;
+        }
+        Py_DECREF(item);
+    }
+    Py_DECREF(iter);
+
+    /* If the object is already in the memo, this means it is
+       recursive. In this case, throw away everything we put on the
+       stack, and fetch the object back from the memo. */
+    if (PyMemoTable_Get(self->memo, obj)) {
+        const char pop_mark_op = POP_MARK;
+
+        if (_Pickler_Write(self, &pop_mark_op, 1) < 0)
+            return -1;
+        if (memo_get(self, obj) < 0)
+            return -1;
+        return 0;
+    }
+
+    if (_Pickler_Write(self, &frozenset_op, 1) < 0)
+        return -1;
+    if (memo_put(self, obj) < 0)
+        return -1;
+
+    return 0;
+}
+
+/* Map a Python 3 (module, global) name pair to its Python 2 spelling.
+ *
+ * The pair is looked up in name_mapping_3to2 and, when present, both names
+ * are replaced.  The (possibly replaced) module name is then looked up in
+ * import_mapping_3to2 and replaced when present.  Each replaced name is
+ * released with Py_CLEAR() and swapped for a new reference, so
+ * *module_name and *global_name must be references owned by the caller.
+ *
+ * Returns 0 on success (a missing mapping is not an error), -1 with an
+ * exception set on failure.
+ */
+static int
+fix_imports(PyObject **module_name, PyObject **global_name)
+{
+    PyObject *key;
+    PyObject *item;
+
+    key = PyTuple_Pack(2, *module_name, *global_name);
+    if (key == NULL)
+        return -1;
+    item = PyDict_GetItemWithError(name_mapping_3to2, key);
+    Py_DECREF(key);
+    if (item) {
+        PyObject *fixed_module_name;
+        PyObject *fixed_global_name;
+
+        if (!PyTuple_Check(item) || PyTuple_GET_SIZE(item) != 2) {
+            PyErr_Format(PyExc_RuntimeError,
+                         "_compat_pickle.REVERSE_NAME_MAPPING values "
+                         "should be 2-tuples, not %.200s",
+                         Py_TYPE(item)->tp_name);
+            return -1;
+        }
+        fixed_module_name = PyTuple_GET_ITEM(item, 0);
+        fixed_global_name = PyTuple_GET_ITEM(item, 1);
+        if (!PyUnicode_Check(fixed_module_name) ||
+            !PyUnicode_Check(fixed_global_name)) {
+            PyErr_Format(PyExc_RuntimeError,
+                         "_compat_pickle.REVERSE_NAME_MAPPING values "
+                         "should be pairs of str, not (%.200s, %.200s)",
+                         Py_TYPE(fixed_module_name)->tp_name,
+                         Py_TYPE(fixed_global_name)->tp_name);
+            return -1;
+        }
+
+        /* Take the new references before dropping the old names. */
+        Py_INCREF(fixed_module_name);
+        Py_INCREF(fixed_global_name);
+        Py_CLEAR(*module_name);
+        Py_CLEAR(*global_name);
+        *module_name = fixed_module_name;
+        *global_name = fixed_global_name;
+    }
+    else if (PyErr_Occurred()) {
+        return -1;
+    }
+
+    item = PyDict_GetItemWithError(import_mapping_3to2, *module_name);
+    if (item) {
+        if (!PyUnicode_Check(item)) {
+            PyErr_Format(PyExc_RuntimeError,
+                         "_compat_pickle.REVERSE_IMPORT_MAPPING values "
+                         "should be strings, not %.200s",
+                         Py_TYPE(item)->tp_name);
+            return -1;
+        }
+        Py_INCREF(item);
+        Py_CLEAR(*module_name);
+        *module_name = item;
+    }
+    else if (PyErr_Occurred()) {
+        return -1;
+    }
+
+    return 0;
+}
+
+static int
 save_global(PicklerObject *self, PyObject *obj, PyObject *name)
 {
     static PyObject *name_str = NULL;
@@ -2715,120 +2976,79 @@
             goto error;
     }
     else {
-        /* Generate a normal global opcode if we are using a pickle
-           protocol <= 2, or if the object is not registered in the
-           extension registry. */
-        PyObject *encoded;
-        PyObject *(*unicode_encoder)(PyObject *);
-
   gen_global:
-        if (_Pickler_Write(self, &global_op, 1) < 0)
-            goto error;
-
-        /* Since Python 3.0 now supports non-ASCII identifiers, we encode both
-           the module name and the global name using UTF-8. We do so only when
-           we are using the pickle protocol newer than version 3. This is to
-           ensure compatibility with older Unpickler running on Python 2.x. */
-        if (self->proto >= 3) {
-            unicode_encoder = PyUnicode_AsUTF8String;
+        if (self->proto >= 4) {
+            const char stack_global_op = STACK_GLOBAL;
+
+            /* Push both names (unmemoized); a failed push must abort. */
+            if (save_unicode(self, module_name, 0 /* memoize */) < 0 ||
+                save_unicode(self, global_name, 0 /* memoize */) < 0 ||
+                _Pickler_Write(self, &stack_global_op, 1) < 0)
+                goto error;
         }
         else {
-            unicode_encoder = PyUnicode_AsASCIIString;
-        }
-
-        /* For protocol < 3 and if the user didn't request against doing so,
-           we convert module names to the old 2.x module names. */
-        if (self->fix_imports) {
-            PyObject *key;
-            PyObject *item;
-
-            key = PyTuple_Pack(2, module_name, global_name);
-            if (key == NULL)
+            /* Generate a normal global opcode if we are using a pickle
+               protocol < 4, or if the object is not registered in the
+               extension registry. */
+            PyObject *encoded;
+            PyObject *(*unicode_encoder)(PyObject *);
+
+            if (_Pickler_Write(self, &global_op, 1) < 0)
                 goto error;
-            item = PyDict_GetItemWithError(name_mapping_3to2, key);
-            Py_DECREF(key);
-            if (item) {
-                if (!PyTuple_Check(item) || PyTuple_GET_SIZE(item) != 2) {
-                    PyErr_Format(PyExc_RuntimeError,
-                                 "_compat_pickle.REVERSE_NAME_MAPPING values "
-                                 "should be 2-tuples, not %.200s",
-                                 Py_TYPE(item)->tp_name);
+
+            /* For protocol < 3 and if the user didn't request against doing so,
+               we convert module names to the old 2.x module names. */
+            if (self->proto < 3 && self->fix_imports) {
+                if (fix_imports(&module_name, &global_name) < 0) {
                     goto error;
                 }
-                Py_CLEAR(module_name);
-                Py_CLEAR(global_name);
-                module_name = PyTuple_GET_ITEM(item, 0);
-                global_name = PyTuple_GET_ITEM(item, 1);
-                if (!PyUnicode_Check(module_name) ||
-                    !PyUnicode_Check(global_name)) {
-                    PyErr_Format(PyExc_RuntimeError,
-                                 "_compat_pickle.REVERSE_NAME_MAPPING values "
-                                 "should be pairs of str, not (%.200s, %.200s)",
-                                 Py_TYPE(module_name)->tp_name,
-                                 Py_TYPE(global_name)->tp_name);
-                    goto error;
-                }
-                Py_INCREF(module_name);
-                Py_INCREF(global_name);
             }
-            else if (PyErr_Occurred()) {
+
+            /* Since Python 3.0 now supports non-ASCII identifiers, we encode both
+               the module name and the global name using UTF-8. We do so only for
+               pickle protocol 3; older protocols keep using ASCII to ensure
+               compatibility with older Unpicklers running on Python 2.x. */
+            if (self->proto == 3) {
+                unicode_encoder = PyUnicode_AsUTF8String;
+            }
+            else {
+                unicode_encoder = PyUnicode_AsASCIIString;
+            }
+            encoded = unicode_encoder(module_name);
+            if (encoded == NULL) {
+                if (PyErr_ExceptionMatches(PyExc_UnicodeEncodeError))
+                    PyErr_Format(PicklingError,
+                                 "can't pickle module identifier '%S' using "
+                                 "pickle protocol %i", module_name, self->proto);
                 goto error;
             }
-
-            item = PyDict_GetItemWithError(import_mapping_3to2, module_name);
-            if (item) {
-                if (!PyUnicode_Check(item)) {
-                    PyErr_Format(PyExc_RuntimeError,
-                                 "_compat_pickle.REVERSE_IMPORT_MAPPING values "
-                                 "should be strings, not %.200s",
-                                 Py_TYPE(item)->tp_name);
-                    goto error;
-                }
-                Py_CLEAR(module_name);
-                module_name = item;
-                Py_INCREF(module_name);
-            }
-            else if (PyErr_Occurred()) {
+            if (_Pickler_Write(self, PyBytes_AS_STRING(encoded),
+                               PyBytes_GET_SIZE(encoded)) < 0) {
+                Py_DECREF(encoded);
                 goto error;
             }
+            Py_DECREF(encoded);
+            if(_Pickler_Write(self, "\n", 1) < 0)
+                goto error;
+
+            /* Save the name of the global. */
+            encoded = unicode_encoder(global_name);
+            if (encoded == NULL) {
+                if (PyErr_ExceptionMatches(PyExc_UnicodeEncodeError))
+                    PyErr_Format(PicklingError,
+                                 "can't pickle global identifier '%S' using "
+                                 "pickle protocol %i", global_name, self->proto);
+                goto error;
+            }
+            if (_Pickler_Write(self, PyBytes_AS_STRING(encoded),
+                               PyBytes_GET_SIZE(encoded)) < 0) {
+                Py_DECREF(encoded);
+                goto error;
+            }
+            Py_DECREF(encoded);
+            if(_Pickler_Write(self, "\n", 1) < 0)
+                goto error;
         }
-
-        /* Save the name of the module. */
-        encoded = unicode_encoder(module_name);
-        if (encoded == NULL) {
-            if (PyErr_ExceptionMatches(PyExc_UnicodeEncodeError))
-                PyErr_Format(PicklingError,
-                             "can't pickle module identifier '%S' using "
-                             "pickle protocol %i", module_name, self->proto);
-            goto error;
-        }
-        if (_Pickler_Write(self, PyBytes_AS_STRING(encoded),
-                          PyBytes_GET_SIZE(encoded)) < 0) {
-            Py_DECREF(encoded);
-            goto error;
-        }
-        Py_DECREF(encoded);
-        if(_Pickler_Write(self, "\n", 1) < 0)
-            goto error;
-
-        /* Save the name of the module. */
-        encoded = unicode_encoder(global_name);
-        if (encoded == NULL) {
-            if (PyErr_ExceptionMatches(PyExc_UnicodeEncodeError))
-                PyErr_Format(PicklingError,
-                             "can't pickle global identifier '%S' using "
-                             "pickle protocol %i", global_name, self->proto);
-            goto error;
-        }
-        if (_Pickler_Write(self, PyBytes_AS_STRING(encoded),
-                          PyBytes_GET_SIZE(encoded)) < 0) {
-            Py_DECREF(encoded);
-            goto error;
-        }
-        Py_DECREF(encoded);
-        if(_Pickler_Write(self, "\n", 1) < 0)
-            goto error;
-
         /* Memoize the object. */
         if (memo_put(self, obj) < 0)
             goto error;
@@ -3118,8 +3338,23 @@
        the caller do not want to memoize the object. Not particularly useful,
        but that is to mimic the behavior save_reduce() in pickle.py when
        obj is None. */
-    if (obj && memo_put(self, obj) < 0)
-        return -1;
+    if (obj != NULL) {
+        /* If the object is already in the memo, this means it is
+           recursive. In this case, throw away everything we put on the
+           stack, and fetch the object back from the memo. */
+        if (PyMemoTable_Get(self->memo, obj)) {
+            const char pop_op = POP;
+
+            if (_Pickler_Write(self, &pop_op, 1) < 0)
+                return -1;
+            if (memo_get(self, obj) < 0)
+                return -1;
+
+            return 0;
+        }
+        else if (memo_put(self, obj) < 0)
+            return -1;
+    }
 
     if (listitems && batch_list(self, listitems) < 0)
         return -1;
@@ -3207,13 +3442,21 @@
         goto done;
     }
     else if (type == &PyUnicode_Type) {
-        status = save_unicode(self, obj);
+        status = save_unicode(self, obj, 1 /* memoize */);
         goto done;
     }
     else if (type == &PyDict_Type) {
         status = save_dict(self, obj);
         goto done;
     }
+    else if (type == &PySet_Type) {
+        status = save_set(self, obj);
+        goto done;
+    }
+    else if (type == &PyFrozenSet_Type) {
+        status = save_frozenset(self, obj);
+        goto done;
+    }
     else if (type == &PyList_Type) {
         status = save_list(self, obj);
         goto done;
@@ -3479,9 +3722,9 @@
 "This takes a binary file for writing a pickle data stream.\n"
 "\n"
 "The optional protocol argument tells the pickler to use the\n"
-"given protocol; supported protocols are 0, 1, 2, 3.  The default\n"
-"protocol is 3; a backward-incompatible protocol designed for\n"
-"Python 3.0.\n"
+"given protocol; supported protocols are 0, 1, 2, 3 and 4.  The\n"
+"default protocol is 3; a backward-incompatible protocol designed for\n"
+"Python 3.\n"
 "\n"
 "Specifying a negative protocol version selects the highest\n"
 "protocol version supported.  The higher the protocol used, the\n"
@@ -3494,8 +3737,8 @@
 "meets this interface.\n"
 "\n"
 "If fix_imports is True and protocol is less than 3, pickle will try to\n"
-"map the new Python 3.x names to the old module names used in Python\n"
-"2.x, so that the pickle data stream is readable with Python 2.x.\n");
+"map the new Python 3 names to the old module names used in Python 2,\n"
+"so that the pickle data stream is readable with Python 2.\n");
 
 static int
 Pickler_init(PicklerObject *self, PyObject *args, PyObject *kwds)
@@ -3988,17 +4231,25 @@
  * as a C Py_ssize_t, or -1 if it's higher than PY_SSIZE_T_MAX.
  */
 static Py_ssize_t
-calc_binsize(char *bytes, int size)
+calc_binsize(char *bytes, int nbytes)
 {
     unsigned char *s = (unsigned char *)bytes;
+    int i;
     size_t x = 0;
 
-    assert(size == 4);
-
-    x =  (size_t) s[0];
-    x |= (size_t) s[1] << 8;
-    x |= (size_t) s[2] << 16;
-    x |= (size_t) s[3] << 24;
+    if (nbytes > (int)sizeof(size_t)) {
+        /* The count is wider than size_t here.  Shifting by the width
+         * of size_t or more is undefined, so only accept the count
+         * when every byte that does not fit is zero. */
+        for (i = (int)sizeof(size_t); i < nbytes; i++) {
+            if (s[i])
+                return -1;
+        }
+        nbytes = (int)sizeof(size_t);
+    }
+    for (i = 0; i < nbytes; i++) {
+        x |= (size_t) s[i] << (8 * i);
+    }
 
     if (x > PY_SSIZE_T_MAX)
         return -1;
@@ -4012,21 +4253,21 @@
  * of x-platform bugs.
  */
 static long
-calc_binint(char *bytes, int size)
+calc_binint(char *bytes, int nbytes)
 {
     unsigned char *s = (unsigned char *)bytes;
-    int i = size;
+    int i;
     long x = 0;
 
-    for (i = 0; i < size; i++) {
-        x |= (long)s[i] << (i * 8);
+    for (i = 0; i < nbytes; i++) {
+        x |= (long)s[i] << (8 * i);
     }
 
     /* Unlike BININT1 and BININT2, BININT (more accurately BININT4)
      * is signed, so on a box with longs bigger than 4 bytes we need
      * to extend a BININT's sign bit to the full width.
      */
-    if (SIZEOF_LONG > 4 && size == 4) {
+    if (SIZEOF_LONG > 4 && nbytes == 4) {
         x |= -(x & (1L << 31));
     }
 
@@ -4234,26 +4475,27 @@
 }
 
 static int
-load_binbytes(UnpicklerObject *self)
+load_counted_binbytes(UnpicklerObject *self, int nbytes)
 {
     PyObject *bytes;
-    Py_ssize_t x;
+    Py_ssize_t size;
     char *s;
 
-    if (_Unpickler_Read(self, &s, 4) < 0)
-        return -1;
-
-    x = calc_binsize(s, 4);
-    if (x < 0) {
+    if (_Unpickler_Read(self, &s, nbytes) < 0)
+        return -1;
+
+    size = calc_binsize(s, nbytes);
+    if (size < 0) {
         PyErr_Format(PyExc_OverflowError,
                      "BINBYTES exceeds system's maximum size of %zd bytes",
                      PY_SSIZE_T_MAX);
         return -1;
     }
 
-    if (_Unpickler_Read(self, &s, x) < 0)
-        return -1;
-    bytes = PyBytes_FromStringAndSize(s, x);
+    if (_Unpickler_Read(self, &s, size) < 0)
+        return -1;
+
+    bytes = PyBytes_FromStringAndSize(s, size);
     if (bytes == NULL)
         return -1;
 
@@ -4262,74 +4504,27 @@
 }
 
 static int
-load_short_binbytes(UnpicklerObject *self)
-{
-    PyObject *bytes;
-    Py_ssize_t x;
+load_counted_binstring(UnpicklerObject *self, int nbytes)
+{
+    PyObject *str;
+    Py_ssize_t size;
     char *s;
 
-    if (_Unpickler_Read(self, &s, 1) < 0)
-        return -1;
-
-    x = (unsigned char)s[0];
-
-    if (_Unpickler_Read(self, &s, x) < 0)
-        return -1;
-
-    bytes = PyBytes_FromStringAndSize(s, x);
-    if (bytes == NULL)
-        return -1;
-
-    PDATA_PUSH(self->stack, bytes, -1);
-    return 0;
-}
-
-static int
-load_binstring(UnpicklerObject *self)
-{
-    PyObject *str;
-    Py_ssize_t x;
-    char *s;
-
-    if (_Unpickler_Read(self, &s, 4) < 0)
-        return -1;
-
-    x = calc_binint(s, 4);
-    if (x < 0) {
-        PyErr_SetString(UnpicklingError,
-                        "BINSTRING pickle has negative byte count");
-        return -1;
-    }
-
-    if (_Unpickler_Read(self, &s, x) < 0)
-        return -1;
-
+    if (_Unpickler_Read(self, &s, nbytes) < 0)
+        return -1;
+
+    size = calc_binsize(s, nbytes);
+    if (size < 0) {
+        PyErr_Format(UnpicklingError,
+                     "BINSTRING exceeds system's maximum size of %zd bytes",
+                     PY_SSIZE_T_MAX);
+        return -1;
+    }
+
+    if (_Unpickler_Read(self, &s, size) < 0)
+        return -1;
     /* Convert Python 2.x strings to unicode. */
-    str = PyUnicode_Decode(s, x, self->encoding, self->errors);
-    if (str == NULL)
-        return -1;
-
-    PDATA_PUSH(self->stack, str, -1);
-    return 0;
-}
-
-static int
-load_short_binstring(UnpicklerObject *self)
-{
-    PyObject *str;
-    Py_ssize_t x;
-    char *s;
-
-    if (_Unpickler_Read(self, &s, 1) < 0)
-        return -1;
-
-    x = (unsigned char)s[0];
-
-    if (_Unpickler_Read(self, &s, x) < 0)
-        return -1;
-
-    /* Convert Python 2.x strings to unicode. */
-    str = PyUnicode_Decode(s, x, self->encoding, self->errors);
+    str = PyUnicode_Decode(s, size, self->encoding, self->errors);
     if (str == NULL)
         return -1;
 
@@ -4358,16 +4553,16 @@
 }
 
 static int
-load_binunicode(UnpicklerObject *self)
+load_counted_binunicode(UnpicklerObject *self, int nbytes)
 {
     PyObject *str;
     Py_ssize_t size;
     char *s;
 
-    if (_Unpickler_Read(self, &s, 4) < 0)
-        return -1;
-
-    size = calc_binsize(s, 4);
+    if (_Unpickler_Read(self, &s, nbytes) < 0)
+        return -1;
+
+    size = calc_binsize(s, nbytes);
     if (size < 0) {
         PyErr_Format(PyExc_OverflowError,
                      "BINUNICODE exceeds system's maximum size of %zd bytes",
@@ -4375,7 +4570,6 @@
         return -1;
     }
 
-
     if (_Unpickler_Read(self, &s, size) < 0)
         return -1;
 
@@ -4447,6 +4641,32 @@
 }
 
+/* EMPTY_SET: push an empty set on the unpickler stack.
+   Returns 0 on success, -1 on failure. */
 static int
+load_empty_set(UnpicklerObject *self)
+{
+    PyObject *empty = PySet_New(NULL);
+
+    if (empty == NULL)
+        return -1;
+    PDATA_PUSH(self->stack, empty, -1);
+    return 0;
+}
+
+/* EMPTY_FROZENSET: push an empty frozenset on the unpickler stack.
+   Returns 0 on success, -1 on failure. */
+static int
+load_empty_frozenset(UnpicklerObject *self)
+{
+    PyObject *empty = PyFrozenSet_New(NULL);
+
+    if (empty == NULL)
+        return -1;
+    PDATA_PUSH(self->stack, empty, -1);
+    return 0;
+}
+
+static int
 load_list(UnpicklerObject *self)
 {
     PyObject *list;
@@ -4488,6 +4704,31 @@
     return 0;
 }
 
+/* FROZENSET: pop the stack items from the position returned by marker()
+   upwards, build a frozenset from them and push it.  Returns 0 or -1. */
+static int
+load_frozenset(UnpicklerObject *self)
+{
+    PyObject *members;
+    PyObject *result;
+    Py_ssize_t mark_pos = marker(self);
+
+    if (mark_pos < 0)
+        return -1;
+
+    members = Pdata_poptuple(self->stack, mark_pos);
+    if (members == NULL)
+        return -1;
+
+    result = PyFrozenSet_New(members);
+    Py_DECREF(members);
+    if (result == NULL)
+        return -1;
+
+    PDATA_PUSH(self->stack, result, -1);
+    return 0;
+}
+
 static PyObject *
 instantiate(PyObject *cls, PyObject *args)
 {
@@ -4675,6 +4914,36 @@
 }
 
+/* STACK_GLOBAL: pop a global name and then a module name off the stack,
+   look the pair up with find_class() and push the result.  Both popped
+   values must be exact str objects.  Returns 0 on success, -1 on failure. */
 static int
+load_stack_global(UnpicklerObject *self)
+{
+    PyObject *found;
+    PyObject *module_name;
+    PyObject *global_name;
+    int names_ok;
+
+    PDATA_POP(self->stack, global_name);
+    PDATA_POP(self->stack, module_name);
+    names_ok = module_name != NULL && PyUnicode_CheckExact(module_name) &&
+               global_name != NULL && PyUnicode_CheckExact(global_name);
+    if (!names_ok) {
+        PyErr_SetString(UnpicklingError, "STACK_GLOBAL requires str");
+        Py_XDECREF(global_name);
+        Py_XDECREF(module_name);
+        return -1;
+    }
+    found = find_class(self, module_name, global_name);
+    Py_DECREF(global_name);
+    Py_DECREF(module_name);
+    if (found == NULL)
+        return -1;
+    PDATA_PUSH(self->stack, found, -1);
+    return 0;
+}
+
+static int
 load_persid(UnpicklerObject *self)
 {
     PyObject *pid;
@@ -5129,6 +5393,88 @@
 }
 
+/* Add the stack items at positions x and above to the object stored at
+ * position x - 1, then drop those items from the stack.
+ *
+ * An object passing PySet_Check() is updated in one step with
+ * _PySet_Update(); any other object has its add() method called once per
+ * item.
+ *
+ * Returns 0 on success, -1 with an exception set on failure.
+ */
 static int
+do_additems(UnpicklerObject *self, Py_ssize_t x)
+{
+    PyObject *set;
+    Py_ssize_t len, i;
+
+    len = Py_SIZE(self->stack);
+    if (x > len || x <= 0)
+        return stack_underflow();
+    if (len == x)  /* nothing to do */
+        return 0;
+
+    set = self->stack->data[x - 1];
+
+    if (PySet_Check(set)) {
+        PyObject *items;
+        int status;
+
+        items = Pdata_poptuple(self->stack, x);
+        if (items == NULL)
+            return -1;
+
+        status = _PySet_Update(set, items);
+        Py_DECREF(items);
+        return status;
+    }
+    else {
+        PyObject *add_func;
+        _Py_IDENTIFIER(add);
+
+        add_func = _PyObject_GetAttrId(set, &PyId_add);
+        if (add_func == NULL)
+            return -1;
+        for (i = x; i < len; i++) {
+            PyObject *result;
+            PyObject *item;
+
+            /* NOTE(review): the stack entry is not released here, so
+               _Unpickler_FastCall() presumably consumes the reference to
+               item -- verify against its definition. */
+            item = self->stack->data[i];
+            result = _Unpickler_FastCall(self, add_func, item);
+            if (result == NULL) {
+                Pdata_clear(self->stack, i + 1);
+                Py_SIZE(self->stack) = x;
+                Py_DECREF(add_func);
+                return -1;
+            }
+            Py_DECREF(result);
+        }
+        Py_SIZE(self->stack) = x;
+        /* Release the bound add() method obtained above. */
+        Py_DECREF(add_func);
+    }
+
+    return 0;
+}
+
+/* ADDITEM: add the topmost stack item to the object just below it. */
+static int
+load_additem(UnpicklerObject *self)
+{
+    return do_additems(self, Py_SIZE(self->stack) - 1);
+}
+
+/* ADDITEMS: add the stack items above the position reported by marker()
+   to the object just below that position. */
+static int
+load_additems(UnpicklerObject *self)
+{
+    return do_additems(self, marker(self));
+}
+
+static int
 load_build(UnpicklerObject *self)
 {
     PyObject *state, *inst, *slotstate;
@@ -5362,13 +5690,16 @@
         OP_ARG(LONG4, load_counted_long, 4)
         OP(FLOAT, load_float)
         OP(BINFLOAT, load_binfloat)
-        OP(BINBYTES, load_binbytes)
-        OP(SHORT_BINBYTES, load_short_binbytes)
-        OP(BINSTRING, load_binstring)
-        OP(SHORT_BINSTRING, load_short_binstring)
+        OP_ARG(SHORT_BINBYTES, load_counted_binbytes, 1)
+        OP_ARG(BINBYTES, load_counted_binbytes, 4)
+        OP_ARG(BINBYTES8, load_counted_binbytes, 8)
+        OP_ARG(SHORT_BINSTRING, load_counted_binstring, 1)
+        OP_ARG(BINSTRING, load_counted_binstring, 4)
         OP(STRING, load_string)
         OP(UNICODE, load_unicode)
-        OP(BINUNICODE, load_binunicode)
+        OP_ARG(SHORT_BINUNICODE, load_counted_binunicode, 1)
+        OP_ARG(BINUNICODE, load_counted_binunicode, 4)
+        OP_ARG(BINUNICODE8, load_counted_binunicode, 8)
         OP_ARG(EMPTY_TUPLE, load_counted_tuple, 0)
         OP_ARG(TUPLE1, load_counted_tuple, 1)
         OP_ARG(TUPLE2, load_counted_tuple, 2)
@@ -5378,10 +5709,16 @@
         OP(LIST, load_list)
         OP(EMPTY_DICT, load_empty_dict)
         OP(DICT, load_dict)
+        OP(EMPTY_SET, load_empty_set)
+        OP(ADDITEM, load_additem)
+        OP(ADDITEMS, load_additems)
+        OP(EMPTY_FROZENSET, load_empty_frozenset)
+        OP(FROZENSET, load_frozenset)
         OP(OBJ, load_obj)
         OP(INST, load_inst)
         OP(NEWOBJ, load_newobj)
         OP(GLOBAL, load_global)
+        OP(STACK_GLOBAL, load_stack_global)
         OP(APPEND, load_append)
         OP(APPENDS, load_appends)
         OP(BUILD, load_build)