diff -r 9fcb2676696c -r 372d0bca85ae Include/dictobject.h
--- a/Include/dictobject.h	Mon Apr 02 07:52:29 2012 +0200
+++ b/Include/dictobject.h	Mon Apr 02 11:47:24 2012 +0100
@@ -13,78 +13,20 @@
    tuning dictionaries, and several ideas for possible optimizations.
 */
 
-/*
-There are three kinds of slots in the table:
+#ifndef Py_LIMITED_API
 
-1. Unused.  me_key == me_value == NULL
-   Does not hold an active (key, value) pair now and never did.  Unused can
-   transition to Active upon key insertion.  This is the only case in which
-   me_key is NULL, and is each slot's initial state.
+typedef struct _dictkeysobject PyDictKeysObject;
 
-2. Active.  me_key != NULL and me_key != dummy and me_value != NULL
-   Holds an active (key, value) pair.  Active can transition to Dummy upon
-   key deletion.  This is the only case in which me_value != NULL.
+/* The ma_values pointer is NULL for a combined table
+ * or points to an array of PyObject* for a split table
+ */
+typedef struct {
+    PyObject_HEAD
+    Py_ssize_t ma_used;
+    PyDictKeysObject *ma_keys;
+    PyObject **ma_values;
+} PyDictObject;
 
-3. Dummy.  me_key == dummy and me_value == NULL
-   Previously held an active (key, value) pair, but that was deleted and an
-   active pair has not yet overwritten the slot.  Dummy can transition to
-   Active upon key insertion.  Dummy slots cannot be made Unused again
-   (cannot have me_key set to NULL), else the probe sequence in case of
-   collision would have no way to know they were once active.
-
-Note: .popitem() abuses the me_hash field of an Unused or Dummy slot to
-hold a search finger.  The me_hash field of Unused or Dummy slots has no
-meaning otherwise.
-*/
-
-/* PyDict_MINSIZE is the minimum size of a dictionary.  This many slots are
- * allocated directly in the dict object (in the ma_smalltable member).
- * It must be a power of 2, and at least 4.  8 allows dicts with no more
- * than 5 active entries to live in ma_smalltable (and so avoid an
- * additional malloc); instrumentation suggested this suffices for the
- * majority of dicts (consisting mostly of usually-small instance dicts and
- * usually-small dicts created to pass keyword arguments).
- */
-#ifndef Py_LIMITED_API
-#define PyDict_MINSIZE 8
-
-typedef struct {
-    /* Cached hash code of me_key. */
-    Py_hash_t me_hash;
-    PyObject *me_key;
-    PyObject *me_value;
-} PyDictEntry;
-
-/*
-To ensure the lookup algorithm terminates, there must be at least one Unused
-slot (NULL key) in the table.
-The value ma_fill is the number of non-NULL keys (sum of Active and Dummy);
-ma_used is the number of non-NULL, non-dummy keys (== the number of non-NULL
-values == the number of Active items).
-To avoid slowing down lookups on a near-full table, we resize the table when
-it's two-thirds full.
-*/
-typedef struct _dictobject PyDictObject;
-struct _dictobject {
-    PyObject_HEAD
-    Py_ssize_t ma_fill;  /* # Active + # Dummy */
-    Py_ssize_t ma_used;  /* # Active */
-
-    /* The table contains ma_mask + 1 slots, and that's a power of 2.
-     * We store the mask instead of the size because the mask is more
-     * frequently needed.
-     */
-    Py_ssize_t ma_mask;
-
-    /* ma_table points to ma_smalltable for small tables, else to
-     * additional malloc'ed memory.  ma_table is never NULL!  This rule
-     * saves repeated runtime null-tests in the workhorse getitem and
-     * setitem calls.
-     */
-    PyDictEntry *ma_table;
-    PyDictEntry *(*ma_lookup)(PyDictObject *mp, PyObject *key, Py_hash_t hash);
-    PyDictEntry ma_smalltable[PyDict_MINSIZE];
-};
 #endif /* Py_LIMITED_API */
 
 PyAPI_DATA(PyTypeObject) PyDict_Type;
@@ -115,6 +57,8 @@
 PyAPI_FUNC(int) PyDict_Next(
     PyObject *mp, Py_ssize_t *pos, PyObject **key, PyObject **value);
 #ifndef Py_LIMITED_API
+PyDictKeysObject *_PyDict_NewKeysForClass(void);
+PyAPI_FUNC(PyObject *) PyObject_GenericGetDict(PyObject *, void *);
 PyAPI_FUNC(int) _PyDict_Next(
     PyObject *mp, Py_ssize_t *pos, PyObject **key, PyObject **value, Py_hash_t *hash);
 #endif
@@ -129,6 +73,7 @@
 PyAPI_FUNC(PyObject *) _PyDict_NewPresized(Py_ssize_t minused);
 PyAPI_FUNC(void) _PyDict_MaybeUntrack(PyObject *mp);
 PyAPI_FUNC(int) _PyDict_HasOnlyStringKeys(PyObject *mp);
+#define _PyDict_HasSplitTable(d) ((d)->ma_values != NULL)
 
 PyAPI_FUNC(int) PyDict_ClearFreeList(void);
 #endif
@@ -160,6 +105,11 @@
 PyAPI_FUNC(int) _PyDict_SetItemId(PyObject *dp, struct _Py_Identifier *key, PyObject *item);
 PyAPI_FUNC(int) PyDict_DelItemString(PyObject *dp, const char *key);
 
+#ifndef Py_LIMITED_API
+int _PyObjectDict_SetItem(PyTypeObject *tp, PyObject **dictptr, PyObject *name, PyObject *value);
+PyObject *_PyDict_LoadGlobal(PyDictObject *, PyDictObject *, PyObject *);
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff -r 9fcb2676696c -r 372d0bca85ae Include/object.h
--- a/Include/object.h	Mon Apr 02 07:52:29 2012 +0200
+++ b/Include/object.h	Mon Apr 02 11:47:24 2012 +0100
@@ -449,6 +449,7 @@
                                       see add_operators() in typeobject.c . */
     PyBufferProcs as_buffer;
     PyObject *ht_name, *ht_slots, *ht_qualname;
+    struct _dictkeysobject *ht_cached_keys;
     /* here are optional user slots, followed by the members. */
 } PyHeapTypeObject;
 
@@ -517,7 +518,6 @@
 PyAPI_FUNC(PyObject *) PyObject_GenericGetAttr(PyObject *, PyObject *);
 PyAPI_FUNC(int) PyObject_GenericSetAttr(PyObject *,
                                               PyObject *, PyObject *);
-PyAPI_FUNC(PyObject *) PyObject_GenericGetDict(PyObject *, void *);
 PyAPI_FUNC(int) PyObject_GenericSetDict(PyObject *, PyObject *, void *);
 PyAPI_FUNC(Py_hash_t) PyObject_Hash(PyObject *);
 PyAPI_FUNC(Py_hash_t) PyObject_HashNotImplemented(PyObject *);
diff -r 9fcb2676696c -r 372d0bca85ae Lib/test/test_dict.py
--- a/Lib/test/test_dict.py	Mon Apr 02 07:52:29 2012 +0200
+++ b/Lib/test/test_dict.py	Mon Apr 02 11:47:24 2012 +0100
@@ -319,6 +319,27 @@
         self.assertEqual(hashed2.hash_count, 1)
         self.assertEqual(hashed1.eq_count + hashed2.eq_count, 1)
 
+    def test_setitem_atomic_at_resize(self):
+        class Hashed(object):
+            def __init__(self):
+                self.hash_count = 0
+                self.eq_count = 0
+            def __hash__(self):
+                self.hash_count += 1
+                return 42
+            def __eq__(self, other):
+                self.eq_count += 1
+                return id(self) == id(other)
+        hashed1 = Hashed()
+        # 5 items
+        y = {hashed1: 5, 0: 0, 1: 1, 2: 2, 3: 3}
+        hashed2 = Hashed()
+        # 6th item forces a resize
+        y[hashed2] = []
+        self.assertEqual(hashed1.hash_count, 1)
+        self.assertEqual(hashed2.hash_count, 1)
+        self.assertEqual(hashed1.eq_count + hashed2.eq_count, 1)
+
     def test_popitem(self):
         # dict.popitem()
         for copymode in -1, +1:
diff -r 9fcb2676696c -r 372d0bca85ae Lib/test/test_pprint.py
--- a/Lib/test/test_pprint.py	Mon Apr 02 07:52:29 2012 +0200
+++ b/Lib/test/test_pprint.py	Mon Apr 02 11:47:24 2012 +0100
@@ -241,11 +241,15 @@
         # Consequently, this test is fragile and
         # implementation-dependent.  Small changes to Python's sort
         # algorithm cause the test to fail when it should pass.
+        # XXX Or changes to the dictionary implmentation...
 
         self.assertEqual(pprint.pformat(set()), 'set()')
         self.assertEqual(pprint.pformat(set(range(3))), '{0, 1, 2}')
         self.assertEqual(pprint.pformat(frozenset()), 'frozenset()')
+
         self.assertEqual(pprint.pformat(frozenset(range(3))), 'frozenset({0, 1, 2})')
+        # This test needs fixing: http://bugs.python.org/issue13907
+        return
         cube_repr_tgt = """\
 {frozenset(): frozenset({frozenset({2}), frozenset({0}), frozenset({1})}),
  frozenset({0}): frozenset({frozenset(),
diff -r 9fcb2676696c -r 372d0bca85ae Lib/test/test_sys.py
--- a/Lib/test/test_sys.py	Mon Apr 02 07:52:29 2012 +0200
+++ b/Lib/test/test_sys.py	Mon Apr 02 11:47:24 2012 +0100
@@ -687,9 +687,9 @@
         # method-wrapper (descriptor object)
         check({}.__iter__, size(h + '2P'))
         # dict
-        check({}, size(h + '3P2P' + 8*'P2P'))
+        check({}, size(h + '3P' + '4P' + 8*'P2P'))
         longdict = {1:1, 2:2, 3:3, 4:4, 5:5, 6:6, 7:7, 8:8}
-        check(longdict, size(h + '3P2P' + 8*'P2P') + 16*size('P2P'))
+        check(longdict, size(h + '3P' + '4P') + 16*size('P2P'))
         # dictionary-keyiterator
         check({}.keys(), size(h + 'P'))
         # dictionary-valueiterator
@@ -831,7 +831,7 @@
         # type
         # (PyTypeObject + PyNumberMethods + PyMappingMethods +
         #  PySequenceMethods + PyBufferProcs)
-        s = size(vh + 'P2P15Pl4PP9PP11PI') + size('16Pi17P 3P 10P 2P 3P')
+        s = size(vh + 'P2P15Pl4PP9PP11PIP') + size('16Pi17P 3P 10P 2P 3P')
         check(int, s)
         # class
         class newstyleclass(object): pass
diff -r 9fcb2676696c -r 372d0bca85ae Objects/dictnotes.txt
--- a/Objects/dictnotes.txt	Mon Apr 02 07:52:29 2012 +0200
+++ b/Objects/dictnotes.txt	Mon Apr 02 11:47:24 2012 +0100
@@ -1,7 +1,6 @@
-NOTES ON OPTIMIZING DICTIONARIES
+NOTES ON DICTIONARIES
 ================================
 
-
 Principal Use Cases for Dictionaries
 ------------------------------------
 
@@ -21,7 +20,7 @@
 
 Builtins
     Frequent reads.  Almost never written.
-    Size 126 interned strings (as of Py2.3b1).
+    Size 147 interned strings (as of Py3.3a0).
     A few keys are accessed much more frequently than others.
 
 Uniquification
@@ -59,38 +58,39 @@
     Characterized by deletions interspersed with adds and replacements.
     Performance benefits greatly from the re-use of dummy entries.
 
-
-Data Layout (assuming a 32-bit box with 64 bytes per cache line)
+Data Layout
 ----------------------------------------------------------------
 
-Smalldicts (8 entries) are attached to the dictobject structure
-and the whole group nearly fills two consecutive cache lines.
-
-Larger dicts use the first half of the dictobject structure (one cache
-line) and a separate, continuous block of entries (at 12 bytes each
-for a total of 5.333 entries per cache line).
+Dictionaries are composed of 3 components:
+The dictobject struct itself
+A dict-keys object (keys & hashes)
+A values array
 
 
 Tunable Dictionary Parameters
 -----------------------------
 
-* PyDict_MINSIZE.  Currently set to 8.
-    Must be a power of two.  New dicts have to zero-out every cell.
-    Each additional 8 consumes 1.5 cache lines.  Increasing improves
-    the sparseness of small dictionaries but costs time to read in
-    the additional cache lines if they are not already in cache.
+* PyDict_STARTSIZE. Starting size of dict (unless an instance dict).
+    Currently set to 8. Must be a power of two.
+    New dicts have to zero-out every cell.
+    Increasing improves the sparseness of small dictionaries but costs time
+    to read in the additional cache lines if they are not already in cache.
     That case is common when keyword arguments are passed.
+    Prior to version 3.3, PyDict_MINSIZE was used as the starting size of
+    a new dict.
 
-* Maximum dictionary load in PyDict_SetItem.  Currently set to 2/3.
+* PyDict_MINSIZE. Minimum size of a dict.
+    Currently set to 4 (to keep instance dicts small).
+    Must be a power of two. Prior to version 3.3, PyDict_MINSIZE was
+    set to 8.
+
+* USABLE_FRACTION. Maximum dictionary load in PyDict_SetItem.
+    Currently set to 2/3.
     Increasing this ratio makes dictionaries more dense resulting
     in more collisions.  Decreasing it improves sparseness at the
     expense of spreading entries over more cache lines and at the
     cost of total memory consumed.
 
-    The load test occurs in highly time sensitive code.  Efforts
-    to make the test more complex (for example, varying the load
-    for different sizes) have degraded performance.
-
 * Growth rate upon hitting maximum load.  Currently set to *2.
     Raising this to *4 results in half the number of resizes,
     less effort to resize, better sparseness for some (but not
@@ -126,8 +126,8 @@
 Also, every dictionary iterates at least twice, once for the memset()
 when it is created and once by dealloc().
 
-Dictionary operations involving only a single key can be O(1) unless 
-resizing is possible.  By checking for a resize only when the 
+Dictionary operations involving only a single key can be O(1) unless
+resizing is possible.  By checking for a resize only when the
 dictionary can grow (and may *require* resizing), other operations
 remain O(1), and the odds of resize thrashing or memory fragmentation
 are reduced. In particular, an algorithm that empties a dictionary
@@ -135,136 +135,51 @@
 not be necessary at all because the dictionary is eventually
 discarded entirely.
 
+The key differences between this implementation and earlier versions are:
+    1. The table can be split into two parts, the keys and the values.
+
+    2. There is an additional key-value combination: (key, NULL).
+       Unlike (<dummy>, NULL) which represents a deleted value, (key, NULL)
+       represented a yet to be inserted value. This combination can only occur
+       when the table is split.
+
+    3. No small table embedded in the dict,
+       as this would make sharing of key-tables impossible.
+
+
+These changes have the following consequences.
+   1. General dictionaries are slightly larger.
+
+   2. All object dictionaries of a single class can share a single key-table,
+      saving about 60% memory for such cases.
 
 Results of Cache Locality Experiments
--------------------------------------
+--------------------------------------
 
-When an entry is retrieved from memory, 4.333 adjacent entries are also
-retrieved into a cache line.  Since accessing items in cache is *much*
-cheaper than a cache miss, an enticing idea is to probe the adjacent
-entries as a first step in collision resolution.  Unfortunately, the
-introduction of any regularity into collision searches results in more
-collisions than the current random chaining approach.
+Experiments on an earlier design of dictionary, in which all tables were
+combined, showed the following:
 
-Exploiting cache locality at the expense of additional collisions fails
-to payoff when the entries are already loaded in cache (the expense
-is paid with no compensating benefit).  This occurs in small dictionaries
-where the whole dictionary fits into a pair of cache lines.  It also
-occurs frequently in large dictionaries which have a common access pattern
-where some keys are accessed much more frequently than others.  The
-more popular entries *and* their collision chains tend to remain in cache.
+  When an entry is retrieved from memory, several adjacent entries are also
+  retrieved into a cache line.  Since accessing items in cache is *much*
+  cheaper than a cache miss, an enticing idea is to probe the adjacent
+  entries as a first step in collision resolution.  Unfortunately, the
+  introduction of any regularity into collision searches results in more
+  collisions than the current random chaining approach.
 
-To exploit cache locality, change the collision resolution section
-in lookdict() and lookdict_string().  Set i^=1 at the top of the
-loop and move the  i = (i << 2) + i + perturb + 1 to an unrolled
-version of the loop.
+  Exploiting cache locality at the expense of additional collisions fails
+  to payoff when the entries are already loaded in cache (the expense
+  is paid with no compensating benefit).  This occurs in small dictionaries
+  where the whole dictionary fits into a pair of cache lines.  It also
+  occurs frequently in large dictionaries which have a common access pattern
+  where some keys are accessed much more frequently than others.  The
+  more popular entries *and* their collision chains tend to remain in cache.
 
-This optimization strategy can be leveraged in several ways:
+  To exploit cache locality, change the collision resolution section
+  in lookdict() and lookdict_string().  Set i^=1 at the top of the
+  loop and move the  i = (i << 2) + i + perturb + 1 to an unrolled
+  version of the loop.
 
-* If the dictionary is kept sparse (through the tunable parameters),
-then the occurrence of additional collisions is lessened.
+For split tables, the above will apply to the keys, but the value will
+always be in a different cache line from the key.
 
-* If lookdict() and lookdict_string() are specialized for small dicts
-and for largedicts, then the versions for large_dicts can be given
-an alternate search strategy without increasing collisions in small dicts
-which already have the maximum benefit of cache locality.
 
-* If the use case for a dictionary is known to have a random key
-access pattern (as opposed to a more common pattern with a Zipf's law
-distribution), then there will be more benefit for large dictionaries
-because any given key is no more likely than another to already be
-in cache.
-
-* In use cases with paired accesses to the same key, the second access
-is always in cache and gets no benefit from efforts to further improve
-cache locality.
-
-Optimizing the Search of Small Dictionaries
--------------------------------------------
-
-If lookdict() and lookdict_string() are specialized for smaller dictionaries,
-then a custom search approach can be implemented that exploits the small
-search space and cache locality.
-
-* The simplest example is a linear search of contiguous entries.  This is
-  simple to implement, guaranteed to terminate rapidly, never searches
-  the same entry twice, and precludes the need to check for dummy entries.
-
-* A more advanced example is a self-organizing search so that the most
-  frequently accessed entries get probed first.  The organization
-  adapts if the access pattern changes over time.  Treaps are ideally
-  suited for self-organization with the most common entries at the
-  top of the heap and a rapid binary search pattern.  Most probes and
-  results are all located at the top of the tree allowing them all to
-  be located in one or two cache lines.
-
-* Also, small dictionaries may be made more dense, perhaps filling all
-  eight cells to take the maximum advantage of two cache lines.
-
-
-Strategy Pattern
-----------------
-
-Consider allowing the user to set the tunable parameters or to select a
-particular search method.  Since some dictionary use cases have known
-sizes and access patterns, the user may be able to provide useful hints.
-
-1) For example, if membership testing or lookups dominate runtime and memory
-   is not at a premium, the user may benefit from setting the maximum load
-   ratio at 5% or 10% instead of the usual 66.7%.  This will sharply
-   curtail the number of collisions but will increase iteration time.
-   The builtin namespace is a prime example of a dictionary that can
-   benefit from being highly sparse.
-
-2) Dictionary creation time can be shortened in cases where the ultimate
-   size of the dictionary is known in advance.  The dictionary can be
-   pre-sized so that no resize operations are required during creation.
-   Not only does this save resizes, but the key insertion will go
-   more quickly because the first half of the keys will be inserted into
-   a more sparse environment than before.  The preconditions for this
-   strategy arise whenever a dictionary is created from a key or item
-   sequence and the number of *unique* keys is known.
-
-3) If the key space is large and the access pattern is known to be random,
-   then search strategies exploiting cache locality can be fruitful.
-   The preconditions for this strategy arise in simulations and
-   numerical analysis.
-
-4) If the keys are fixed and the access pattern strongly favors some of
-   the keys, then the entries can be stored contiguously and accessed
-   with a linear search or treap.  This exploits knowledge of the data,
-   cache locality, and a simplified search routine.  It also eliminates
-   the need to test for dummy entries on each probe.  The preconditions
-   for this strategy arise in symbol tables and in the builtin dictionary.
-
-
-Readonly Dictionaries
----------------------
-Some dictionary use cases pass through a build stage and then move to a
-more heavily exercised lookup stage with no further changes to the
-dictionary.
-
-An idea that emerged on python-dev is to be able to convert a dictionary
-to a read-only state.  This can help prevent programming errors and also
-provide knowledge that can be exploited for lookup optimization.
-
-The dictionary can be immediately rebuilt (eliminating dummy entries),
-resized (to an appropriate level of sparseness), and the keys can be
-jostled (to minimize collisions).  The lookdict() routine can then
-eliminate the test for dummy entries (saving about 1/4 of the time
-spent in the collision resolution loop).
-
-An additional possibility is to insert links into the empty spaces
-so that dictionary iteration can proceed in len(d) steps instead of
-(mp->mask + 1) steps.  Alternatively, a separate tuple of keys can be
-kept just for iteration.
-
-
-Caching Lookups
----------------
-The idea is to exploit key access patterns by anticipating future lookups
-based on previous lookups.
-
-The simplest incarnation is to save the most recently accessed entry.
-This gives optimal performance for use cases where every get is followed
-by a set or del to the same key.
diff -r 9fcb2676696c -r 372d0bca85ae Objects/dictobject.c
--- a/Objects/dictobject.c	Mon Apr 02 07:52:29 2012 +0200
+++ b/Objects/dictobject.c	Mon Apr 02 11:47:24 2012 +0100
@@ -7,9 +7,93 @@
    tuning dictionaries, and several ideas for possible optimizations.
 */
 
+
+/*
+There are four kinds of slots in the table:
+
+1. Unused.  me_key == me_value == NULL
+   Does not hold an active (key, value) pair now and never did.  Unused can
+   transition to Active upon key insertion.  This is the only case in which
+   me_key is NULL, and is each slot's initial state.
+
+2. Active.  me_key != NULL and me_key != dummy and me_value != NULL
+   Holds an active (key, value) pair.  Active can transition to Dummy or
+   Pending upon key deletion (for combined and split tables respectively).
+   This is the only case in which me_value != NULL.
+
+3. Dummy.  me_key == dummy and me_value == NULL
+   Previously held an active (key, value) pair, but that was deleted and an
+   active pair has not yet overwritten the slot.  Dummy can transition to
+   Active upon key insertion.  Dummy slots cannot be made Unused again
+   (cannot have me_key set to NULL), else the probe sequence in case of
+   collision would have no way to know they were once active.
+
+4. Pending. Not yet inserted or deleted from a split-table.
+   key != NULL, key != dummy and value == NULL
+
+The DictObject can be in one of two forms.
+Either:
+  A combined table:
+    ma_values == NULL, dk_refcnt == 1.
+    Values are stored in the _me_value field of the PyDictKeysObject.
+    Slot kind 4 is not allowed i.e.
+        key != NULL, key != dummy and value == NULL is illegal.
+Or:
+  A split table:
+    ma_values != NULL, dk_refcnt >= 1
+    Values are stored in the ma_values array.
+    Only string (unicode) keys are allowed, no <dummy> keys are present.
+
+Note: .popitem() abuses the me_hash field of an Unused or Dummy slot to
+hold a search finger.  The me_hash field of Unused or Dummy slots has no
+meaning otherwise. As a consequence of this popitem always converts the dict
+to the combined-table form.
+*/
+
+/* PyDict_MINSIZE_SPLIT is the minimum size of a split dictionary.
+ * It must be a power of 2, and at least 4.
+ * Resizing of split dictionaries is very rare, so the saving memory is more
+ * important than the cost of resizing.
+ */
+#define PyDict_MINSIZE_SPLIT 4
+
+/* PyDict_MINSIZE_COMBINED is the starting size for any new, non-split dict.
+ * 8 allows dicts with no more than 5 active entries; experiments suggested
+ * this suffices for the majority of dicts (consisting mostly of usually-small
+ * dicts created to pass keyword arguments).
+ * Making this 8, rather than 4 reduces the number of resizes for most
+ * dictionaries, without any significant extra memory use.
+ */
+#define PyDict_MINSIZE_COMBINED 8
+
 #include "Python.h"
 #include "stringlib/eq.h"
 
+typedef struct {
+    /* Cached hash code of me_key. */
+    Py_hash_t me_hash;
+    PyObject *me_key;
+    PyObject *_me_value; /* This field is only meaningful for combined tables */
+} PyDictKeyEntry;
+
+typedef PyDictKeyEntry *(*dict_lookup_func)
+(PyDictObject *mp, PyObject *key, Py_hash_t hash, PyObject ***value_addr);
+
+struct _dictkeysobject {
+    Py_ssize_t dk_refcnt;
+    Py_ssize_t dk_size;
+    dict_lookup_func dk_lookup;
+    Py_ssize_t dk_usable;
+    PyDictKeyEntry dk_entries[1];
+};
+
+
+/*
+To ensure the lookup algorithm terminates, there must be at least one Unused
+slot (NULL key) in the table.
+To avoid slowing down lookups on a near-full table, we resize the table when
+it's USABLE_FRACTION (currently two-thirds) full.
+*/
 
 /* Set a key error with the specified argument, wrapping it in a
  * tuple automatically so that tuple keys are not unpacked as the
@@ -25,10 +109,6 @@
     Py_DECREF(tup);
 }
 
-/* Define this out if you don't want conversion statistics on exit. */
-#undef SHOW_CONVERSION_COUNTS
-
-/* See large comment block below.  This must be >= 1. */
 #define PERTURB_SHIFT 5
 
 /*
@@ -126,8 +206,13 @@
 
 */
 
-/* Object used as dummy key to fill deleted entries */
-static PyObject *dummy = NULL; /* Initialized by first call to newPyDictObject() */
+/* Object used as dummy key to fill deleted entries
+ * This could be any unique object,
+ * use a custom type in order to minimise coupling.
+*/
+static PyObject _dummy_struct;
+
+#define dummy (&_dummy_struct)
 
 #ifdef Py_REF_DEBUG
 PyObject *
@@ -138,77 +223,18 @@
 #endif
 
 /* forward declarations */
-static PyDictEntry *
-lookdict_unicode(PyDictObject *mp, PyObject *key, Py_hash_t hash);
-
-#ifdef SHOW_CONVERSION_COUNTS
-static long created = 0L;
-static long converted = 0L;
-
-static void
-show_counts(void)
-{
-    fprintf(stderr, "created %ld string dicts\n", created);
-    fprintf(stderr, "converted %ld to normal dicts\n", converted);
-    fprintf(stderr, "%.2f%% conversion rate\n", (100.0*converted)/created);
-}
-#endif
-
-/* Debug statistic to compare allocations with reuse through the free list */
-#undef SHOW_ALLOC_COUNT
-#ifdef SHOW_ALLOC_COUNT
-static size_t count_alloc = 0;
-static size_t count_reuse = 0;
-
-static void
-show_alloc(void)
-{
-    fprintf(stderr, "Dict allocations: %" PY_FORMAT_SIZE_T "d\n",
-        count_alloc);
-    fprintf(stderr, "Dict reuse through freelist: %" PY_FORMAT_SIZE_T
-        "d\n", count_reuse);
-    fprintf(stderr, "%.2f%% reuse rate\n\n",
-        (100.0*count_reuse/(count_alloc+count_reuse)));
-}
-#endif
-
-/* Debug statistic to count GC tracking of dicts */
-#ifdef SHOW_TRACK_COUNT
-static Py_ssize_t count_untracked = 0;
-static Py_ssize_t count_tracked = 0;
-
-static void
-show_track(void)
-{
-    fprintf(stderr, "Dicts created: %" PY_FORMAT_SIZE_T "d\n",
-        count_tracked + count_untracked);
-    fprintf(stderr, "Dicts tracked by the GC: %" PY_FORMAT_SIZE_T
-        "d\n", count_tracked);
-    fprintf(stderr, "%.2f%% dict tracking rate\n\n",
-        (100.0*count_tracked/(count_untracked+count_tracked)));
-}
-#endif
-
-
-/* Initialization macros.
-   There are two ways to create a dict:  PyDict_New() is the main C API
-   function, and the tp_new slot maps to dict_new().  In the latter case we
-   can save a little time over what PyDict_New does because it's guaranteed
-   that the PyDictObject struct is already zeroed out.
-   Everyone except dict_new() should use EMPTY_TO_MINSIZE (unless they have
-   an excellent reason not to).
-*/
-
-#define INIT_NONZERO_DICT_SLOTS(mp) do {                                \
-    (mp)->ma_table = (mp)->ma_smalltable;                               \
-    (mp)->ma_mask = PyDict_MINSIZE - 1;                                 \
-    } while(0)
-
-#define EMPTY_TO_MINSIZE(mp) do {                                       \
-    memset((mp)->ma_smalltable, 0, sizeof((mp)->ma_smalltable));        \
-    (mp)->ma_used = (mp)->ma_fill = 0;                                  \
-    INIT_NONZERO_DICT_SLOTS(mp);                                        \
-    } while(0)
+static PyDictKeyEntry *lookdict(PyDictObject *mp, PyObject *key,
+                                Py_hash_t hash, PyObject ***value_addr);
+static PyDictKeyEntry *lookdict_unicode(PyDictObject *mp, PyObject *key,
+                                        Py_hash_t hash, PyObject ***value_addr);
+static PyDictKeyEntry *
+lookdict_unicode_nodummy(PyDictObject *mp, PyObject *key,
+                         Py_hash_t hash, PyObject ***value_addr);
+static PyDictKeyEntry *lookdict_split(PyDictObject *mp, PyObject *key,
+                                      Py_hash_t hash, PyObject ***value_addr);
+
+static int
+dictresize(PyDictObject *mp, Py_ssize_t minused);
 
 /* Dictionary reuse scheme to save calls to malloc, free, and memset */
 #ifndef PyDict_MAXFREELIST
@@ -236,61 +262,145 @@
     PyDict_ClearFreeList();
 }
 
-PyObject *
-PyDict_New(void)
+#define DK_INCREF(dk) (++(dk)->dk_refcnt)
+#define DK_DECREF(dk) if ((--(dk)->dk_refcnt) == 0) free_keys_object(dk)
+#define DK_SIZE(dk) ((dk)->dk_size)
+#define IS_POWER_OF_2(x) (((x) & (x-1)) == 0)
+
+/* USABLE_FRACTION must obey the following:
+ *     (0 < USABLE_FRACTION(n) < n) for all n >= 2
+ *
+ * USABLE_FRACTION should be very quick to calculate.
+ * Fractions around 5/8 to 2/3 seem to work well in practice.
+ */
+
+/* Use (2n+1)/3 rather than 2n+3 because: it makes no difference for
+ * combined tables (the two fractions round to the same number n < ),
+ * but 2*4/3 is 2 whereas (2*4+1)/3 is 3 which potentially saves quite
+ * a lot of space for small, split tables */
+#define USABLE_FRACTION(n) ((((n) << 1)+1)/3)
+
+/* Alternative fraction that is otherwise close enough to (2n+1)/3 to make
+ * little difference. 8 * 2/3 == 8 * 5/8 == 5. 16 * 2/3 == 16 * 5/8 == 10.
+ * 32 * 2/3 = 21, 32 * 5/8 = 20.
+ * Its advantage is that it is faster to compute on machines with slow division.
+ * #define USABLE_FRACTION(n) (((n) >> 1) + ((n) >> 2) - ((n) >> 3))
+*/
+
+
+#define ENSURE_ALLOWS_DELETIONS(d) \
+    if ((d)->ma_keys->dk_lookup == lookdict_unicode_nodummy) { \
+        (d)->ma_keys->dk_lookup = lookdict_unicode; \
+    }
+
+/* This immutable, empty PyDictKeysObject is used for PyDict_Clear()
+ * (which cannot fail and thus can do no allocation).
+ */
+static PyDictKeysObject empty_keys_struct = {
+        2, /* dk_refcnt 1 for this struct, 1 for dummy_struct */
+        1, /* dk_size */
+        lookdict_split, /* dk_lookup */
+        0, /* dk_usable (immutable) */
+        {
+            { 0, 0, 0 } /* dk_entries (empty) */
+        }
+};
+
+static PyObject *empty_values[1] = { NULL };
+
+#define Py_EMPTY_KEYS &empty_keys_struct
+
+static PyDictKeysObject *new_keys_object(Py_ssize_t size) {
+    PyDictKeysObject *dk;
+    Py_ssize_t i;
+    PyDictKeyEntry *ep0;
+
+    assert(size >= PyDict_MINSIZE_SPLIT);
+    assert(IS_POWER_OF_2(size));
+    dk = PyMem_Malloc(sizeof(PyDictKeysObject) +
+                      sizeof(PyDictKeyEntry) * (size-1));
+    if (dk == NULL) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+    dk->dk_refcnt = 1;
+    dk->dk_size = size;
+    dk->dk_usable = USABLE_FRACTION(size);
+    ep0 = &dk->dk_entries[0];
+    /* Hash value of slot 0 is used by popitem, so it must be initialized */
+    ep0->me_hash = 0;
+    for (i = 0; i < size; i++) {
+        ep0[i].me_key = NULL;
+        ep0[i]._me_value = NULL;
+    }
+    dk->dk_lookup = lookdict_unicode_nodummy;
+    return dk;
+}
+
+static void
+free_keys_object(PyDictKeysObject *keys)
 {
-    register PyDictObject *mp;
-    if (dummy == NULL) { /* Auto-initialize dummy */
-        dummy = PyUnicode_FromString("<dummy key>");
-        if (dummy == NULL)
-            return NULL;
-#ifdef SHOW_CONVERSION_COUNTS
-        Py_AtExit(show_counts);
-#endif
-#ifdef SHOW_ALLOC_COUNT
-        Py_AtExit(show_alloc);
-#endif
-#ifdef SHOW_TRACK_COUNT
-        Py_AtExit(show_track);
-#endif
+    PyDictKeyEntry *entries = &keys->dk_entries[0];
+    Py_ssize_t i, n;
+    for (i = 0, n = DK_SIZE(keys); i < n; i++) {
+        Py_XDECREF(entries[i].me_key);
+        Py_XDECREF(entries[i]._me_value);
     }
+    PyMem_DEL(keys);
+}
+
+#define new_values(size) PyMem_New(PyObject *, size)
+
+#define free_values(values) PyMem_DEL(values)
+
+/* Consumes a reference to the keys object */
+static PyObject *
+new_dict(PyDictKeysObject *keys, PyObject **values)
+{
+    PyDictObject *mp;
     if (numfree) {
         mp = free_list[--numfree];
         assert (mp != NULL);
         assert (Py_TYPE(mp) == &PyDict_Type);
         _Py_NewReference((PyObject *)mp);
-        if (mp->ma_fill) {
-            EMPTY_TO_MINSIZE(mp);
-        } else {
-            /* At least set ma_table and ma_mask; these are wrong
-               if an empty but presized dict is added to freelist */
-            INIT_NONZERO_DICT_SLOTS(mp);
+    }
+    else {
+        mp = PyObject_GC_New(PyDictObject, &PyDict_Type);
+        if (mp == NULL) {
+            DK_DECREF(keys);
+            free_values(values);
+            return NULL;
         }
-        assert (mp->ma_used == 0);
-        assert (mp->ma_table == mp->ma_smalltable);
-        assert (mp->ma_mask == PyDict_MINSIZE - 1);
-#ifdef SHOW_ALLOC_COUNT
-        count_reuse++;
-#endif
-    } else {
-        mp = PyObject_GC_New(PyDictObject, &PyDict_Type);
-        if (mp == NULL)
-            return NULL;
-        EMPTY_TO_MINSIZE(mp);
-#ifdef SHOW_ALLOC_COUNT
-        count_alloc++;
-#endif
     }
-    mp->ma_lookup = lookdict_unicode;
-#ifdef SHOW_TRACK_COUNT
-    count_untracked++;
-#endif
-#ifdef SHOW_CONVERSION_COUNTS
-    ++created;
-#endif
+    mp->ma_keys = keys;
+    mp->ma_values = values;
+    mp->ma_used = 0;
     return (PyObject *)mp;
 }
 
+/* Consumes a reference to the keys object */
+static PyObject *
+new_dict_with_shared_keys(PyDictKeysObject *keys) {
+    PyObject **values;
+    Py_ssize_t i, size;
+
+    size = DK_SIZE(keys);
+    values = new_values(size);
+    if (values == NULL) {
+        DK_DECREF(keys);
+        return PyErr_NoMemory();
+    }
+    for (i = 0; i < size; i++) {
+        values[i] = NULL;
+    }
+    return new_dict(keys, values);
+}
+
+PyObject *
+PyDict_New(void) {
+    return new_dict(new_keys_object(PyDict_MINSIZE_COMBINED), NULL);
+}
+
 /*
 The basic lookup function used by all operations.
 This is based on Algorithm D from Knuth Vol. 3, Sec. 6.4.
@@ -309,29 +419,32 @@
 lookdict() is general-purpose, and may return NULL if (and only if) a
 comparison raises an exception (this was new in Python 2.5).
 lookdict_unicode() below is specialized to string keys, comparison of which can
-never raise an exception; that function can never return NULL.  For both, when
-the key isn't found a PyDictEntry* is returned for which the me_value field is
-NULL; this is the slot in the dict at which the key would have been found, and
-the caller can (if it wishes) add the <key, value> pair to the returned
-PyDictEntry*.
+never raise an exception; that function can never return NULL.
+lookdict_unicode_nodummy is further specialized for string keys that cannot be
+the <dummy> value.
+For both, when the key isn't found a PyDictEntry* is returned
+where the key would have been found, *value_addr points to the matching value
+slot.
 */
-static PyDictEntry *
-lookdict(PyDictObject *mp, PyObject *key, register Py_hash_t hash)
+static PyDictKeyEntry *
+lookdict(PyDictObject *mp, PyObject *key,
+         Py_hash_t hash, PyObject ***value_addr)
 {
     register size_t i;
     register size_t perturb;
-    register PyDictEntry *freeslot;
-    register size_t mask = (size_t)mp->ma_mask;
-    PyDictEntry *ep0 = mp->ma_table;
-    register PyDictEntry *ep;
+    register PyDictKeyEntry *freeslot;
+    register size_t mask = (size_t)mp->ma_keys->dk_size-1;
+    PyDictKeyEntry *ep0 = &mp->ma_keys->dk_entries[0];
+    register PyDictKeyEntry *ep;
     register int cmp;
     PyObject *startkey;
 
     i = (size_t)hash & mask;
     ep = &ep0[i];
-    if (ep->me_key == NULL || ep->me_key == key)
+    if (ep->me_key == NULL || ep->me_key == key) {
+        *value_addr = &ep->_me_value;
         return ep;
-
+    }
     if (ep->me_key == dummy)
         freeslot = ep;
     else {
@@ -342,9 +455,11 @@
             Py_DECREF(startkey);
             if (cmp < 0)
                 return NULL;
-            if (ep0 == mp->ma_table && ep->me_key == startkey) {
-                if (cmp > 0)
+            if (ep0 == mp->ma_keys->dk_entries && ep->me_key == startkey) {
+                if (cmp > 0) {
+                    *value_addr = &ep->_me_value;
                     return ep;
+                }
             }
             else {
                 PyErr_SetString(PyExc_RuntimeError,
@@ -360,20 +475,33 @@
     for (perturb = hash; ; perturb >>= PERTURB_SHIFT) {
         i = (i << 2) + i + perturb + 1;
         ep = &ep0[i & mask];
-        if (ep->me_key == NULL)
-            return freeslot == NULL ? ep : freeslot;
-        if (ep->me_key == key)
+        if (ep->me_key == NULL) {
+            if (freeslot == NULL) {
+                *value_addr = &ep->_me_value;
+                return ep;
+            } else {
+                *value_addr = &freeslot->_me_value;
+                return freeslot;
+            }
+        }
+        if (ep->me_key == key) {
+            *value_addr = &ep->_me_value;
             return ep;
+        }
         if (ep->me_hash == hash && ep->me_key != dummy) {
             startkey = ep->me_key;
             Py_INCREF(startkey);
             cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
             Py_DECREF(startkey);
-            if (cmp < 0)
+            if (cmp < 0) {
+                *value_addr = NULL;
                 return NULL;
-            if (ep0 == mp->ma_table && ep->me_key == startkey) {
-                if (cmp > 0)
+            }
+            if (ep0 == mp->ma_keys->dk_entries && ep->me_key == startkey) {
+                if (cmp > 0) {
+                    *value_addr = &ep->_me_value;
                     return ep;
+                }
             }
             else {
                 PyErr_SetString(PyExc_RuntimeError,
@@ -388,46 +516,39 @@
     return 0;
 }
 
-/*
- * Hacked up version of lookdict which can assume keys are always
- * unicodes; this assumption allows testing for errors during
- * PyObject_RichCompareBool() to be dropped; unicode-unicode
- * comparisons never raise exceptions.  This also means we don't need
- * to go through PyObject_RichCompareBool(); we can always use
- * unicode_eq() directly.
- *
- * This is valuable because dicts with only unicode keys are very common.
- */
-static PyDictEntry *
-lookdict_unicode(PyDictObject *mp, PyObject *key, register Py_hash_t hash)
+/* Specialized version for string-only keys */
+static PyDictKeyEntry *
+lookdict_unicode(PyDictObject *mp, PyObject *key,
+                 Py_hash_t hash, PyObject ***value_addr)
 {
     register size_t i;
     register size_t perturb;
-    register PyDictEntry *freeslot;
-    register size_t mask = (size_t)mp->ma_mask;
-    PyDictEntry *ep0 = mp->ma_table;
-    register PyDictEntry *ep;
+    register PyDictKeyEntry *freeslot;
+    register size_t mask = (size_t)mp->ma_keys->dk_size-1;
+    PyDictKeyEntry *ep0 = &mp->ma_keys->dk_entries[0];
+    register PyDictKeyEntry *ep;
 
     /* Make sure this function doesn't have to handle non-unicode keys,
        including subclasses of str; e.g., one reason to subclass
        unicodes is to override __eq__, and for speed we don't cater to
        that here. */
     if (!PyUnicode_CheckExact(key)) {
-#ifdef SHOW_CONVERSION_COUNTS
-        ++converted;
-#endif
-        mp->ma_lookup = lookdict;
-        return lookdict(mp, key, hash);
+        mp->ma_keys->dk_lookup = lookdict;
+        return lookdict(mp, key, hash, value_addr);
     }
     i = (size_t)hash & mask;
     ep = &ep0[i];
-    if (ep->me_key == NULL || ep->me_key == key)
+    if (ep->me_key == NULL || ep->me_key == key) {
+        *value_addr = &ep->_me_value;
         return ep;
+    }
     if (ep->me_key == dummy)
         freeslot = ep;
     else {
-        if (ep->me_hash == hash && unicode_eq(ep->me_key, key))
+        if (ep->me_hash == hash && unicode_eq(ep->me_key, key)) {
+            *value_addr = &ep->_me_value;
             return ep;
+        }
         freeslot = NULL;
     }
 
@@ -436,13 +557,22 @@
     for (perturb = hash; ; perturb >>= PERTURB_SHIFT) {
         i = (i << 2) + i + perturb + 1;
         ep = &ep0[i & mask];
-        if (ep->me_key == NULL)
-            return freeslot == NULL ? ep : freeslot;
+        if (ep->me_key == NULL) {
+            if (freeslot == NULL) {
+                *value_addr = &ep->_me_value;
+                return ep;
+            } else {
+                *value_addr = &freeslot->_me_value;
+                return freeslot;
+            }
+        }
         if (ep->me_key == key
             || (ep->me_hash == hash
             && ep->me_key != dummy
-            && unicode_eq(ep->me_key, key)))
+            && unicode_eq(ep->me_key, key))) {
+            *value_addr = &ep->_me_value;
             return ep;
+        }
         if (ep->me_key == dummy && freeslot == NULL)
             freeslot = ep;
     }
@@ -450,6 +580,99 @@
     return 0;
 }
 
+/* Faster version of lookdict_unicode when it is known that no <dummy> keys
+ * will be present. */
+static PyDictKeyEntry *
+lookdict_unicode_nodummy(PyDictObject *mp, PyObject *key,
+                         Py_hash_t hash, PyObject ***value_addr)
+{
+    register size_t i;
+    register size_t perturb;
+    register size_t mask = (size_t)mp->ma_keys->dk_size-1;
+    PyDictKeyEntry *ep0 = &mp->ma_keys->dk_entries[0];
+    register PyDictKeyEntry *ep;
+
+    /* Make sure this function doesn't have to handle non-unicode keys,
+       including subclasses of str; e.g., one reason to subclass
+       unicodes is to override __eq__, and for speed we don't cater to
+       that here. */
+    if (!PyUnicode_CheckExact(key)) {
+        mp->ma_keys->dk_lookup = lookdict;
+        return lookdict(mp, key, hash, value_addr);
+    }
+    i = (size_t)hash & mask;
+    ep = &ep0[i];
+    assert(ep->me_key == NULL || PyUnicode_CheckExact(ep->me_key));
+    if (ep->me_key == NULL || ep->me_key == key) {
+        *value_addr = &ep->_me_value;
+        return ep;
+    }
+    if (ep->me_hash == hash && unicode_eq(ep->me_key, key)) {
+        *value_addr = &ep->_me_value;
+        return ep;
+    }
+    for (perturb = hash; ; perturb >>= PERTURB_SHIFT) {
+        i = (i << 2) + i + perturb + 1;
+        ep = &ep0[i & mask];
+        assert(ep->me_key == NULL || PyUnicode_CheckExact(ep->me_key));
+        if (ep->me_key == NULL || ep->me_key == key) {
+            *value_addr = &ep->_me_value;
+            return ep;
+        }
+        if (ep->me_hash == hash && unicode_eq(ep->me_key, key)) {
+            *value_addr = &ep->_me_value;
+            return ep;
+        }
+    }
+    assert(0);          /* NOT REACHED */
+    return 0;
+}
+
+/* Version of lookdict for split tables.
+ * Split tables only contain unicode keys and no dummy keys,
+ * so algorithm is the same as lookdict_unicode_nodummy.
+ */
+static PyDictKeyEntry *
+lookdict_split(PyDictObject *mp, PyObject *key,
+                         Py_hash_t hash, PyObject ***value_addr)
+{
+    register size_t i;
+    register size_t perturb;
+    register size_t mask = (size_t)mp->ma_keys->dk_size-1;
+    PyDictKeyEntry *ep0 = &mp->ma_keys->dk_entries[0];
+    register PyDictKeyEntry *ep;
+
+    if (!PyUnicode_CheckExact(key)) {
+        return lookdict(mp, key, hash, value_addr);
+    }
+    i = (size_t)hash & mask;
+    ep = &ep0[i];
+    assert(ep->me_key == NULL || PyUnicode_CheckExact(ep->me_key));
+    if (ep->me_key == NULL || ep->me_key == key) {
+        *value_addr = &mp->ma_values[i];
+        return ep;
+    }
+    if (ep->me_hash == hash && unicode_eq(ep->me_key, key)) {
+        *value_addr = &mp->ma_values[i];
+        return ep;
+    }
+    for (perturb = hash; ; perturb >>= PERTURB_SHIFT) {
+        i = (i << 2) + i + perturb + 1;
+        ep = &ep0[i & mask];
+        assert(ep->me_key == NULL || PyUnicode_CheckExact(ep->me_key));
+        if (ep->me_key == NULL || ep->me_key == key) {
+            *value_addr = &mp->ma_values[i & mask];
+            return ep;
+        }
+        if (ep->me_hash == hash && unicode_eq(ep->me_key, key)) {
+            *value_addr = &mp->ma_values[i & mask];
+            return ep;
+        }
+    }
+    assert(0);          /* NOT REACHED */
+    return 0;
+}
+
 int
 _PyDict_HasOnlyStringKeys(PyObject *dict)
 {
@@ -457,7 +680,7 @@
     PyObject *key, *value;
     assert(PyDict_Check(dict));
     /* Shortcut */
-    if (((PyDictObject *)dict)->ma_lookup == lookdict_unicode)
+    if (((PyDictObject *)dict)->ma_keys->dk_lookup != lookdict)
         return 1;
     while (PyDict_Next(dict, &pos, &key, &value))
         if (!PyUnicode_Check(key))
@@ -465,23 +688,12 @@
     return 1;
 }
 
-#ifdef SHOW_TRACK_COUNT
-#define INCREASE_TRACK_COUNT \
-    (count_tracked++, count_untracked--);
-#define DECREASE_TRACK_COUNT \
-    (count_tracked--, count_untracked++);
-#else
-#define INCREASE_TRACK_COUNT
-#define DECREASE_TRACK_COUNT
-#endif
-
 #define MAINTAIN_TRACKING(mp, key, value) \
     do { \
         if (!_PyObject_GC_IS_TRACKED(mp)) { \
             if (_PyObject_GC_MAY_BE_TRACKED(key) || \
                 _PyObject_GC_MAY_BE_TRACKED(value)) { \
                 _PyObject_GC_TRACK(mp); \
-                INCREASE_TRACK_COUNT \
             } \
         } \
     } while(0)
@@ -491,59 +703,77 @@
 {
     PyDictObject *mp;
     PyObject *value;
-    Py_ssize_t mask, i;
-    PyDictEntry *ep;
+    Py_ssize_t i, size;
 
     if (!PyDict_CheckExact(op) || !_PyObject_GC_IS_TRACKED(op))
         return;
 
     mp = (PyDictObject *) op;
-    ep = mp->ma_table;
-    mask = mp->ma_mask;
-    for (i = 0; i <= mask; i++) {
-        if ((value = ep[i].me_value) == NULL)
-            continue;
-        if (_PyObject_GC_MAY_BE_TRACKED(value) ||
-            _PyObject_GC_MAY_BE_TRACKED(ep[i].me_key))
-            return;
+    size = DK_SIZE(mp->ma_keys);
+    if (_PyDict_HasSplitTable(mp)) {
+        for (i = 0; i < size; i++) {
+            if ((value = mp->ma_values[i]) == NULL)
+                continue;
+            if (_PyObject_GC_MAY_BE_TRACKED(value)) {
+                assert(!_PyObject_GC_MAY_BE_TRACKED(mp->ma_keys->dk_entries[i].me_key));
+                return;
+            }
+        }
+    } else {
+        PyDictKeyEntry *ep0 = &mp->ma_keys->dk_entries[0];
+        for (i = 0; i < size; i++) {
+            if ((value = ep0[i]._me_value) == NULL)
+                continue;
+            if (_PyObject_GC_MAY_BE_TRACKED(value) ||
+                _PyObject_GC_MAY_BE_TRACKED(ep0[i].me_key))
+                return;
+        }
     }
-    DECREASE_TRACK_COUNT
     _PyObject_GC_UNTRACK(op);
 }
 
-/*
-Internal routine to insert a new item into the table when you have entry object.
-Used by insertdict.
-*/
+/* Internal function to find slot for an item, from its hash,
+ * when it is known that the key is not present in the dict.
+ */
+static PyDictKeyEntry *
+find_empty_slot(PyDictObject *mp, PyObject *key, Py_hash_t hash,
+                PyObject ***value_addr)
+{
+    size_t i;
+    size_t perturb;
+    size_t mask = (size_t)DK_SIZE(mp->ma_keys)-1;
+    PyDictKeyEntry *ep0 = &mp->ma_keys->dk_entries[0];
+    PyDictKeyEntry *ep;
+
+    assert(key != NULL);
+    if (!PyUnicode_CheckExact(key))
+        mp->ma_keys->dk_lookup = lookdict;
+    i = hash & mask;
+    ep = &ep0[i];
+    for (perturb = hash; ep->me_key != NULL; perturb >>= PERTURB_SHIFT) {
+        i = (i << 2) + i + perturb + 1;
+        ep = &ep0[i & mask];
+    }
+    assert(ep->_me_value == NULL);
+    if (mp->ma_values)
+        *value_addr = &mp->ma_values[i & mask];
+    else
+        *value_addr = &ep->_me_value;
+    return ep;
+}
+
 static int
-insertdict_by_entry(register PyDictObject *mp, PyObject *key, Py_hash_t hash,
-                    PyDictEntry *ep, PyObject *value)
+insertion_resize(PyDictObject *mp)
 {
-    PyObject *old_value;
-
-    MAINTAIN_TRACKING(mp, key, value);
-    if (ep->me_value != NULL) {
-        old_value = ep->me_value;
-        ep->me_value = value;
-        Py_DECREF(old_value); /* which **CAN** re-enter */
-        Py_DECREF(key);
-    }
-    else {
-        if (ep->me_key == NULL)
-            mp->ma_fill++;
-        else {
-            assert(ep->me_key == dummy);
-            Py_DECREF(dummy);
-        }
-        ep->me_key = key;
-        ep->me_hash = hash;
-        ep->me_value = value;
-        mp->ma_used++;
-    }
-    return 0;
+    /*
+    * Double the size of the dict,
+    * Previous versions quadrupled size, but doing so may result in excessive
+    * memory use. Doubling keeps the number of resizes low without wasting
+    * too much memory.
+    */
+    return dictresize(mp, 2 * mp->ma_used);
 }
 
-
 /*
 Internal routine to insert a new item into the table.
 Used both by the internal resize routine and by the public insert routine.
@@ -551,18 +781,64 @@
 Returns -1 if an error occurred, or 0 on success.
 */
 static int
-insertdict(register PyDictObject *mp, PyObject *key, Py_hash_t hash, PyObject *value)
+insertdict(PyDictObject *mp, PyObject *key, Py_hash_t hash, PyObject *value)
 {
-    register PyDictEntry *ep;
-
-    assert(mp->ma_lookup != NULL);
-    ep = mp->ma_lookup(mp, key, hash);
+    PyObject *old_value;
+    PyObject **value_addr;
+    PyDictKeyEntry *ep;
+    assert(key != dummy);
+
+    if (mp->ma_values != NULL) {
+        if (!PyUnicode_CheckExact(key)) {
+            if (insertion_resize(mp) < 0)
+                return -1;
+        }
+    }
+
+    ep = mp->ma_keys->dk_lookup(mp, key, hash, &value_addr);
     if (ep == NULL) {
         Py_DECREF(key);
         Py_DECREF(value);
         return -1;
     }
-    return insertdict_by_entry(mp, key, hash, ep, value);
+    MAINTAIN_TRACKING(mp, key, value);
+    old_value = *value_addr;
+    if (old_value != NULL) {
+        assert(ep->me_key != NULL && ep->me_key != dummy);
+        *value_addr = value;
+        Py_DECREF(old_value); /* which **CAN** re-enter */
+        Py_DECREF(key);
+    } else {
+        if (ep->me_key == NULL) {
+            if (mp->ma_keys->dk_usable <= 0) {
+                /* Need to resize. */
+                if (insertion_resize(mp) < 0) {
+                    Py_DECREF(key);
+                    Py_DECREF(value);
+                    return -1;
+                }
+                ep = find_empty_slot(mp, key, hash, &value_addr);
+            }
+            mp->ma_keys->dk_usable--;
+            assert(mp->ma_keys->dk_usable >= 0);
+            ep->me_key = key;
+            ep->me_hash = hash;
+        } else {
+            if (ep->me_key == dummy) {
+                ep->me_key = key;
+                ep->me_hash = hash;
+                Py_DECREF(dummy);
+            } else {
+                Py_DECREF(key);
+                assert(_PyDict_HasSplitTable(mp));
+            }
+        }
+        mp->ma_used++;
+        *value_addr = value;
+    }
+    assert(ep->me_key != NULL && ep->me_key != dummy);
+    assert(PyUnicode_CheckExact(key) || mp->ma_keys->dk_lookup == lookdict);
+    return 0;
 }
 
 /*
@@ -572,50 +848,57 @@
 using insertdict() in dictresize() is dangerous (SF bug #1456209).
 Note that no refcounts are changed by this routine; if needed, the caller
 is responsible for incref'ing `key` and `value`.
+Neither mp->ma_used nor k->dk_usable are modified by this routine; the caller
+must set them correctly
 */
 static void
-insertdict_clean(register PyDictObject *mp, PyObject *key, Py_hash_t hash,
+insertdict_clean(PyDictObject *mp, PyObject *key, Py_hash_t hash,
                  PyObject *value)
 {
-    register size_t i;
-    register size_t perturb;
-    register size_t mask = (size_t)mp->ma_mask;
-    PyDictEntry *ep0 = mp->ma_table;
-    register PyDictEntry *ep;
-
-    MAINTAIN_TRACKING(mp, key, value);
-    i = (size_t)hash & mask;
+    size_t i;
+    size_t perturb;
+    PyDictKeysObject *k = mp->ma_keys;
+    size_t mask = (size_t)DK_SIZE(k)-1;
+    PyDictKeyEntry *ep0 = &k->dk_entries[0];
+    PyDictKeyEntry *ep;
+
+    assert(k->dk_lookup != NULL);
+    assert(value != NULL);
+    assert(key != NULL);
+    assert(key != dummy);
+    assert(PyUnicode_CheckExact(key) || k->dk_lookup == lookdict);
+    i = hash & mask;
     ep = &ep0[i];
     for (perturb = hash; ep->me_key != NULL; perturb >>= PERTURB_SHIFT) {
         i = (i << 2) + i + perturb + 1;
         ep = &ep0[i & mask];
     }
-    assert(ep->me_value == NULL);
-    mp->ma_fill++;
+    assert(ep->_me_value == NULL);
     ep->me_key = key;
     ep->me_hash = hash;
-    ep->me_value = value;
-    mp->ma_used++;
+    ep->_me_value = value;
 }
 
 /*
 Restructure the table by allocating a new table and reinserting all
 items again.  When entries have been deleted, the new table may
 actually be smaller than the old one.
+If a table is split (its keys and hashes are shared, its values are not),
+then the values are temporarily copied into the table, it is resized as
+a combined table, then the _me_value slots in the old table are NULLed out.
+After resizing a table is always combined,
+but can be resplit by make_keys_shared().
 */
 static int
 dictresize(PyDictObject *mp, Py_ssize_t minused)
 {
     Py_ssize_t newsize;
-    PyDictEntry *oldtable, *newtable, *ep;
-    Py_ssize_t i;
-    int is_oldtable_malloced;
-    PyDictEntry small_copy[PyDict_MINSIZE];
-
-    assert(minused >= 0);
-
-    /* Find the smallest table size > minused. */
-    for (newsize = PyDict_MINSIZE;
+    PyDictKeysObject *oldkeys;
+    PyObject **oldvalues;
+    Py_ssize_t i, size;
+
+/* Find the smallest table size > minused. */
+    for (newsize = PyDict_MINSIZE_COMBINED;
          newsize <= minused && newsize > 0;
          newsize <<= 1)
         ;
@@ -623,83 +906,112 @@
         PyErr_NoMemory();
         return -1;
     }
-
-    /* Get space for a new table. */
-    oldtable = mp->ma_table;
-    assert(oldtable != NULL);
-    is_oldtable_malloced = oldtable != mp->ma_smalltable;
-
-    if (newsize == PyDict_MINSIZE) {
-        /* A large table is shrinking, or we can't get any smaller. */
-        newtable = mp->ma_smalltable;
-        if (newtable == oldtable) {
-            if (mp->ma_fill == mp->ma_used) {
-                /* No dummies, so no point doing anything. */
-                return 0;
+    oldkeys = mp->ma_keys;
+    oldvalues = mp->ma_values;
+    /* Allocate a new table. */
+    mp->ma_keys = new_keys_object(newsize);
+    if (oldkeys->dk_lookup == lookdict)
+        mp->ma_keys->dk_lookup = lookdict;
+    if (mp->ma_keys == NULL) {
+        mp->ma_keys = oldkeys;
+        return -1;
+    }
+    size = DK_SIZE(oldkeys);
+    mp->ma_values = NULL;
+    /* If empty then nothing to copy so just return */
+    if (size == 1) {
+        assert(oldkeys = Py_EMPTY_KEYS);
+        DK_DECREF(oldkeys);
+        return 0;
+    }
+    /* Main loop below assumes we can transfer refcount to new keys
+     * and that value is stored in _me_value.
+     * Increment ref-counts and copy values here to compensate
+     * This (resizing a split table) should be relatively rare */
+    if (oldvalues != NULL) {
+        for (i = 0; i < size; i++)
+            if (oldvalues[i] != NULL) {
+                Py_INCREF(oldkeys->dk_entries[i].me_key);
+                oldkeys->dk_entries[i]._me_value = oldvalues[i];
             }
-            /* We're not going to resize it, but rebuild the
-               table anyway to purge old dummy entries.
-               Subtle:  This is *necessary* if fill==size,
-               as lookdict needs at least one virgin slot to
-               terminate failing searches.  If fill < size, it's
-               merely desirable, as dummies slow searches. */
-            assert(mp->ma_fill > mp->ma_used);
-            memcpy(small_copy, oldtable, sizeof(small_copy));
-            oldtable = small_copy;
+    }
+    /* Main loop */
+    for (i = 0; i < size; i++) {
+        PyDictKeyEntry *ep = &oldkeys->dk_entries[i];
+        if (ep->_me_value != NULL) {
+            assert(ep->me_key != dummy);
+            insertdict_clean(mp, ep->me_key, ep->me_hash, ep->_me_value);
         }
     }
+    mp->ma_keys->dk_usable -= mp->ma_used;
+    if (oldvalues != NULL) {
+        /* NULL out _me_value slot in oldkeys, in case it was shared */
+        for (i = 0; i < size; i++)
+            oldkeys->dk_entries[i]._me_value = NULL;
+        assert(oldvalues != empty_values);
+        free_values(oldvalues);
+        DK_DECREF(oldkeys);
+    }
     else {
-        newtable = PyMem_NEW(PyDictEntry, newsize);
-        if (newtable == NULL) {
-            PyErr_NoMemory();
-            return -1;
+        assert(oldkeys->dk_lookup != lookdict_split);
+        if (oldkeys->dk_lookup != lookdict_unicode_nodummy) {
+            PyDictKeyEntry *ep0 = &oldkeys->dk_entries[0];
+            for (i = 0; i < size; i++) {
+                if (ep0[i].me_key == dummy)
+                    Py_DECREF(dummy);
+            }
         }
+        assert(oldkeys->dk_refcnt == 1);
+        PyMem_DEL(oldkeys);
     }
-
-    /* Make the dict empty, using the new table. */
-    assert(newtable != oldtable);
-    mp->ma_table = newtable;
-    mp->ma_mask = newsize - 1;
-    memset(newtable, 0, sizeof(PyDictEntry) * newsize);
-    mp->ma_used = 0;
-    i = mp->ma_fill;
-    mp->ma_fill = 0;
-
-    /* Copy the data over; this is refcount-neutral for active entries;
-       dummy entries aren't copied over, of course */
-    for (ep = oldtable; i > 0; ep++) {
-        if (ep->me_value != NULL) {             /* active entry */
-            --i;
-            insertdict_clean(mp, ep->me_key, ep->me_hash, ep->me_value);
-        }
-        else if (ep->me_key != NULL) {          /* dummy entry */
-            --i;
-            assert(ep->me_key == dummy);
-            Py_DECREF(ep->me_key);
-        }
-        /* else key == value == NULL:  nothing to do */
-    }
-
-    if (is_oldtable_malloced)
-        PyMem_DEL(oldtable);
     return 0;
 }
 
-/* Create a new dictionary pre-sized to hold an estimated number of elements.
-   Underestimates are okay because the dictionary will resize as necessary.
-   Overestimates just mean the dictionary will be more sparse than usual.
-*/
+static PyDictKeysObject *
+make_keys_shared(PyObject *op) {
+    Py_ssize_t i;
+    Py_ssize_t size;
+    PyDictObject *mp = (PyDictObject *)op;
+
+    assert(PyDict_CheckExact(op));
+    if (!_PyDict_HasSplitTable(mp)) {
+        PyDictKeyEntry *ep0;
+        PyObject **values;
+        assert(mp->ma_keys->dk_refcnt == 1);
+        if (mp->ma_keys->dk_lookup == lookdict) {
+            return NULL;
+        } else if (mp->ma_keys->dk_lookup == lookdict_unicode) {
+            /* Remove dummy keys */
+            if (dictresize(mp, DK_SIZE(mp->ma_keys)))
+                return NULL;
+        }
+        assert(mp->ma_keys->dk_lookup == lookdict_unicode_nodummy);
+        /* Copy values into a new array */
+        ep0 = &mp->ma_keys->dk_entries[0];
+        values = new_values(DK_SIZE(mp->ma_keys));
+        if (values == NULL)
+            return NULL;
+        size = DK_SIZE(mp->ma_keys);
+        for (i = 0; i < size; i++) {
+            values[i] = ep0[i]._me_value;
+            ep0[i]._me_value = NULL;
+        }
+        mp->ma_keys->dk_lookup = lookdict_split;
+        mp->ma_values = values;
+    }
+    DK_INCREF(mp->ma_keys);
+    return mp->ma_keys;
+}
 
 PyObject *
 _PyDict_NewPresized(Py_ssize_t minused)
 {
-    PyObject *op = PyDict_New();
-
-    if (minused>5 && op != NULL && dictresize((PyDictObject *)op, minused) == -1) {
-        Py_DECREF(op);
-        return NULL;
-    }
-    return op;
+    Py_ssize_t newsize;
+    for (newsize = PyDict_MINSIZE_COMBINED;
+         newsize <= minused && newsize > 0;
+         newsize <<= 1)
+        ;
+    return new_dict(new_keys_object(newsize), NULL);
 }
 
 /* Note that, for historical reasons, PyDict_GetItem() suppresses all errors
@@ -717,8 +1029,10 @@
 {
     Py_hash_t hash;
     PyDictObject *mp = (PyDictObject *)op;
-    PyDictEntry *ep;
+    PyDictKeyEntry *ep;
     PyThreadState *tstate;
+    PyObject **value_addr;
+
     if (!PyDict_Check(op))
         return NULL;
     if (!PyUnicode_CheckExact(key) ||
@@ -742,20 +1056,20 @@
         /* preserve the existing exception */
         PyObject *err_type, *err_value, *err_tb;
         PyErr_Fetch(&err_type, &err_value, &err_tb);
-        ep = (mp->ma_lookup)(mp, key, hash);
+        ep = (mp->ma_keys->dk_lookup)(mp, key, hash, &value_addr);
         /* ignore errors */
         PyErr_Restore(err_type, err_value, err_tb);
         if (ep == NULL)
             return NULL;
     }
     else {
-        ep = (mp->ma_lookup)(mp, key, hash);
+        ep = (mp->ma_keys->dk_lookup)(mp, key, hash, &value_addr);
         if (ep == NULL) {
             PyErr_Clear();
             return NULL;
         }
     }
-    return ep->me_value;
+    return *value_addr;
 }
 
 /* Variant of PyDict_GetItem() that doesn't suppress exceptions.
@@ -767,7 +1081,8 @@
 {
     Py_hash_t hash;
     PyDictObject*mp = (PyDictObject *)op;
-    PyDictEntry *ep;
+    PyDictKeyEntry *ep;
+    PyObject **value_addr;
 
     if (!PyDict_Check(op)) {
         PyErr_BadInternalCall();
@@ -782,49 +1097,45 @@
         }
     }
 
-    ep = (mp->ma_lookup)(mp, key, hash);
+    ep = (mp->ma_keys->dk_lookup)(mp, key, hash, &value_addr);
     if (ep == NULL)
         return NULL;
-    return ep->me_value;
+    return *value_addr;
 }
 
-static int
-dict_set_item_by_hash_or_entry(register PyObject *op, PyObject *key,
-                               Py_hash_t hash, PyDictEntry *ep, PyObject *value)
+/* Fast version of global value lookup.
+ * Lookup in globals, then builtins.
+ */
+PyObject *
+_PyDict_LoadGlobal(PyDictObject *globals, PyDictObject* builtins, PyObject *key)
 {
-    register PyDictObject *mp;
-    register Py_ssize_t n_used;
-
-    mp = (PyDictObject *)op;
-    assert(mp->ma_fill <= mp->ma_mask);  /* at least one empty slot */
-    n_used = mp->ma_used;
-    Py_INCREF(value);
-    Py_INCREF(key);
-    if (ep == NULL) {
-        if (insertdict(mp, key, hash, value) != 0)
-            return -1;
+    PyObject *x;
+    if (PyUnicode_CheckExact(key)) {
+        PyObject **value_addr;
+        Py_hash_t hash = ((PyASCIIObject *)key)->hash;
+        if (hash != -1) {
+            PyDictKeyEntry *e;
+            e = globals->ma_keys->dk_lookup(globals, key, hash, &value_addr);
+            if (e == NULL) {
+                return NULL;
+            }
+            x = *value_addr;
+            if (x != NULL)
+                return x;
+            e = builtins->ma_keys->dk_lookup(builtins, key, hash, &value_addr);
+            if (e == NULL) {
+                return NULL;
+            }
+            x = *value_addr;
+            return x;
+        }
     }
-    else {
-        if (insertdict_by_entry(mp, key, hash, ep, value) != 0)
-            return -1;
-    }
-    /* If we added a key, we can safely resize.  Otherwise just return!
-     * If fill >= 2/3 size, adjust size.  Normally, this doubles or
-     * quaduples the size, but it's also possible for the dict to shrink
-     * (if ma_fill is much larger than ma_used, meaning a lot of dict
-     * keys have been * deleted).
-     *
-     * Quadrupling the size improves average dictionary sparseness
-     * (reducing collisions) at the cost of some memory and iteration
-     * speed (which loops over every possible entry).  It also halves
-     * the number of expensive resize operations in a growing dictionary.
-     *
-     * Very large dictionaries (over 50K items) use doubling instead.
-     * This may help applications with severe memory constraints.
-     */
-    if (!(mp->ma_used > n_used && mp->ma_fill*3 >= (mp->ma_mask+1)*2))
-        return 0;
-    return dictresize(mp, (mp->ma_used > 50000 ? 2 : 4) * mp->ma_used);
+    x = PyDict_GetItemWithError((PyObject *)globals, key);
+    if (x != NULL)
+        return x;
+    if (PyErr_Occurred())
+        return NULL;
+    return PyDict_GetItemWithError((PyObject *)builtins, key);
 }
 
 /* CAUTION: PyDict_SetItem() must guarantee that it won't resize the
@@ -834,36 +1145,39 @@
  * remove them.
  */
 int
-PyDict_SetItem(register PyObject *op, PyObject *key, PyObject *value)
+PyDict_SetItem(PyObject *op, PyObject *key, PyObject *value)
 {
-    register Py_hash_t hash;
-
+    PyDictObject *mp;
+    Py_hash_t hash;
     if (!PyDict_Check(op)) {
         PyErr_BadInternalCall();
         return -1;
     }
     assert(key);
     assert(value);
-    if (PyUnicode_CheckExact(key)) {
-        hash = ((PyASCIIObject *) key)->hash;
-        if (hash == -1)
-            hash = PyObject_Hash(key);
-    }
-    else {
+    mp = (PyDictObject *)op;
+    if (!PyUnicode_CheckExact(key) ||
+        (hash = ((PyASCIIObject *) key)->hash) == -1)
+    {
         hash = PyObject_Hash(key);
         if (hash == -1)
             return -1;
     }
-    return dict_set_item_by_hash_or_entry(op, key, hash, NULL, value);
+    Py_INCREF(value);
+    Py_INCREF(key);
+
+    /* insertdict() handles any resizing that might be necessary */
+    return insertdict(mp, key, hash, value);
 }
 
 int
 PyDict_DelItem(PyObject *op, PyObject *key)
 {
-    register PyDictObject *mp;
-    register Py_hash_t hash;
-    register PyDictEntry *ep;
-    PyObject *old_value, *old_key;
+    PyDictObject *mp;
+    Py_hash_t hash;
+    PyDictKeyEntry *ep;
+    PyObject *old_key, *old_value;
+    PyObject **value_addr;
 
     if (!PyDict_Check(op)) {
         PyErr_BadInternalCall();
@@ -877,21 +1191,24 @@
             return -1;
     }
     mp = (PyDictObject *)op;
-    ep = (mp->ma_lookup)(mp, key, hash);
+    ep = (mp->ma_keys->dk_lookup)(mp, key, hash, &value_addr);
     if (ep == NULL)
         return -1;
-    if (ep->me_value == NULL) {
+    if (*value_addr == NULL) {
         set_key_error(key);
         return -1;
     }
-    old_key = ep->me_key;
-    Py_INCREF(dummy);
-    ep->me_key = dummy;
-    old_value = ep->me_value;
-    ep->me_value = NULL;
+    old_value = *value_addr;
+    *value_addr = NULL;
     mp->ma_used--;
+    if (!_PyDict_HasSplitTable(mp)) {
+        ENSURE_ALLOWS_DELETIONS(mp);
+        old_key = ep->me_key;
+        Py_INCREF(dummy);
+        ep->me_key = dummy;
+        Py_DECREF(old_key);
+    }
     Py_DECREF(old_value);
-    Py_DECREF(old_key);
     return 0;
 }
 
@@ -899,69 +1216,34 @@
 PyDict_Clear(PyObject *op)
 {
     PyDictObject *mp;
-    PyDictEntry *ep, *table;
-    int table_is_malloced;
-    Py_ssize_t fill;
-    PyDictEntry small_copy[PyDict_MINSIZE];
-#ifdef Py_DEBUG
+    PyDictKeysObject *oldkeys;
+    PyObject **oldvalues;
     Py_ssize_t i, n;
-#endif
 
     if (!PyDict_Check(op))
         return;
-    mp = (PyDictObject *)op;
-#ifdef Py_DEBUG
-    n = mp->ma_mask + 1;
-    i = 0;
-#endif
-
-    table = mp->ma_table;
-    assert(table != NULL);
-    table_is_malloced = table != mp->ma_smalltable;
-
-    /* This is delicate.  During the process of clearing the dict,
-     * decrefs can cause the dict to mutate.  To avoid fatal confusion
-     * (voice of experience), we have to make the dict empty before
-     * clearing the slots, and never refer to anything via mp->xxx while
-     * clearing.
-     */
-    fill = mp->ma_fill;
-    if (table_is_malloced)
-        EMPTY_TO_MINSIZE(mp);
-
-    else if (fill > 0) {
-        /* It's a small table with something that needs to be cleared.
-         * Afraid the only safe way is to copy the dict entries into
-         * another small table first.
-         */
-        memcpy(small_copy, table, sizeof(small_copy));
-        table = small_copy;
-        EMPTY_TO_MINSIZE(mp);
+    mp = ((PyDictObject *)op);
+    oldkeys = mp->ma_keys;
+    oldvalues = mp->ma_values;
+    if (oldvalues == empty_values)
+        return;
+    /* Empty the dict... */
+    DK_INCREF(Py_EMPTY_KEYS);
+    mp->ma_keys = Py_EMPTY_KEYS;
+    mp->ma_values = empty_values;
+    mp->ma_used = 0;
+    /* ...then clear the keys and values */
+    if (oldvalues != NULL) {
+        n = DK_SIZE(oldkeys);
+        for (i = 0; i < n; i++)
+            Py_CLEAR(oldvalues[i]);
+        free_values(oldvalues);
+        DK_DECREF(oldkeys);
     }
-    /* else it's a small table that's already empty */
-
-    /* Now we can finally clear things.  If C had refcounts, we could
-     * assert that the refcount on table is 1 now, i.e. that this function
-     * has unique access to it, so decref side-effects can't alter it.
-     */
-    for (ep = table; fill > 0; ++ep) {
-#ifdef Py_DEBUG
-        assert(i < n);
-        ++i;
-#endif
-        if (ep->me_key) {
-            --fill;
-            Py_DECREF(ep->me_key);
-            Py_XDECREF(ep->me_value);
-        }
-#ifdef Py_DEBUG
-        else
-            assert(ep->me_value == NULL);
-#endif
+    else {
+       assert(oldkeys->dk_refcnt == 1);
+       free_keys_object(oldkeys);
     }
-
-    if (table_is_malloced)
-        PyMem_DEL(table);
 }
 
 /*
@@ -982,26 +1264,38 @@
 int
 PyDict_Next(PyObject *op, Py_ssize_t *ppos, PyObject **pkey, PyObject **pvalue)
 {
-    register Py_ssize_t i;
-    register Py_ssize_t mask;
-    register PyDictEntry *ep;
+    Py_ssize_t i;
+    Py_ssize_t mask, offset;
+    PyDictObject *mp;
+    PyObject **value_ptr;
+
 
     if (!PyDict_Check(op))
         return 0;
+    mp = (PyDictObject *)op;
     i = *ppos;
     if (i < 0)
         return 0;
-    ep = ((PyDictObject *)op)->ma_table;
-    mask = ((PyDictObject *)op)->ma_mask;
-    while (i <= mask && ep[i].me_value == NULL)
+    if (mp->ma_values) {
+        value_ptr = &mp->ma_values[i];
+        offset = sizeof(PyObject *);
+    }
+    else {
+        value_ptr = &mp->ma_keys->dk_entries[i]._me_value;
+        offset = sizeof(PyDictKeyEntry);
+    }
+    mask = DK_SIZE(mp->ma_keys)-1;
+    while (i <= mask && *value_ptr == NULL) {
+        value_ptr = (PyObject **)(((char *)value_ptr) + offset);
         i++;
+    }
     *ppos = i+1;
     if (i > mask)
         return 0;
     if (pkey)
-        *pkey = ep[i].me_key;
+        *pkey = mp->ma_keys->dk_entries[i].me_key;
     if (pvalue)
-        *pvalue = ep[i].me_value;
+        *pvalue = *value_ptr;
     return 1;
 }
 
@@ -1009,48 +1303,63 @@
 int
 _PyDict_Next(PyObject *op, Py_ssize_t *ppos, PyObject **pkey, PyObject **pvalue, Py_hash_t *phash)
 {
-    register Py_ssize_t i;
-    register Py_ssize_t mask;
-    register PyDictEntry *ep;
+    Py_ssize_t i;
+    Py_ssize_t mask, offset;
+    PyDictObject *mp;
+    PyObject **value_ptr;
 
     if (!PyDict_Check(op))
         return 0;
+    mp = (PyDictObject *)op;
     i = *ppos;
     if (i < 0)
         return 0;
-    ep = ((PyDictObject *)op)->ma_table;
-    mask = ((PyDictObject *)op)->ma_mask;
-    while (i <= mask && ep[i].me_value == NULL)
+    if (mp->ma_values) {
+        value_ptr = &mp->ma_values[i];
+        offset = sizeof(PyObject *);
+    }
+    else {
+        value_ptr = &mp->ma_keys->dk_entries[i]._me_value;
+        offset = sizeof(PyDictKeyEntry);
+    }
+    mask = DK_SIZE(mp->ma_keys)-1;
+    while (i <= mask && *value_ptr == NULL) {
+        value_ptr = (PyObject **)(((char *)value_ptr) + offset);
         i++;
+    }
     *ppos = i+1;
     if (i > mask)
         return 0;
-    *phash = ep[i].me_hash;
+    *phash = mp->ma_keys->dk_entries[i].me_hash;
     if (pkey)
-        *pkey = ep[i].me_key;
+        *pkey = mp->ma_keys->dk_entries[i].me_key;
     if (pvalue)
-        *pvalue = ep[i].me_value;
+        *pvalue = *value_ptr;
     return 1;
 }
 
 /* Methods */
 
 static void
-dict_dealloc(register PyDictObject *mp)
+dict_dealloc(PyDictObject *mp)
 {
-    register PyDictEntry *ep;
-    Py_ssize_t fill = mp->ma_fill;
+    PyObject **values = mp->ma_values;
+    PyDictKeysObject *keys = mp->ma_keys;
+    Py_ssize_t i, n;
     PyObject_GC_UnTrack(mp);
     Py_TRASHCAN_SAFE_BEGIN(mp)
-    for (ep = mp->ma_table; fill > 0; ep++) {
-        if (ep->me_key) {
-            --fill;
-            Py_DECREF(ep->me_key);
-            Py_XDECREF(ep->me_value);
+    if (values != NULL) {
+        if (values != empty_values) {
+            for (i = 0, n = DK_SIZE(mp->ma_keys); i < n; i++) {
+                Py_XDECREF(values[i]);
+            }
+            free_values(values);
         }
+        DK_DECREF(keys);
     }
-    if (mp->ma_table != mp->ma_smalltable)
-        PyMem_DEL(mp->ma_table);
+    else {
+        free_keys_object(keys);
+    }
     if (numfree < PyDict_MAXFREELIST && Py_TYPE(mp) == &PyDict_Type)
         free_list[numfree++] = mp;
     else
@@ -1058,6 +1367,7 @@
     Py_TRASHCAN_SAFE_END(mp)
 }
 
+
 static PyObject *
 dict_repr(PyDictObject *mp)
 {
@@ -1090,10 +1400,12 @@
     while (PyDict_Next((PyObject *)mp, &i, &key, &value)) {
         int status;
         /* Prevent repr from deleting value during key format. */
+        Py_INCREF(key);
         Py_INCREF(value);
         s = PyObject_Repr(key);
         PyUnicode_Append(&s, colon);
         PyUnicode_AppendAndDel(&s, PyObject_Repr(value));
+        Py_DECREF(key);
         Py_DECREF(value);
         if (s == NULL)
             goto Done;
@@ -1148,18 +1460,19 @@
 {
     PyObject *v;
     Py_hash_t hash;
-    PyDictEntry *ep;
-    assert(mp->ma_table != NULL);
+    PyDictKeyEntry *ep;
+    PyObject **value_addr;
+
     if (!PyUnicode_CheckExact(key) ||
         (hash = ((PyASCIIObject *) key)->hash) == -1) {
         hash = PyObject_Hash(key);
         if (hash == -1)
             return NULL;
     }
-    ep = (mp->ma_lookup)(mp, key, hash);
+    ep = (mp->ma_keys->dk_lookup)(mp, key, hash, &value_addr);
     if (ep == NULL)
         return NULL;
-    v = ep->me_value;
+    v = *value_addr;
     if (v == NULL) {
         if (!PyDict_CheckExact(mp)) {
             /* Look up __missing__ method if we're a subclass. */
@@ -1203,8 +1516,9 @@
 {
     register PyObject *v;
     register Py_ssize_t i, j;
-    PyDictEntry *ep;
-    Py_ssize_t mask, n;
+    PyDictKeyEntry *ep;
+    Py_ssize_t size, n, offset;
+    PyObject **value_ptr;
 
   again:
     n = mp->ma_used;
@@ -1218,15 +1532,24 @@
         Py_DECREF(v);
         goto again;
     }
-    ep = mp->ma_table;
-    mask = mp->ma_mask;
-    for (i = 0, j = 0; i <= mask; i++) {
-        if (ep[i].me_value != NULL) {
+    ep = &mp->ma_keys->dk_entries[0];
+    size = DK_SIZE(mp->ma_keys);
+    if (mp->ma_values) {
+        value_ptr = mp->ma_values;
+        offset = sizeof(PyObject *);
+    }
+    else {
+        value_ptr = &ep[0]._me_value;
+        offset = sizeof(PyDictKeyEntry);
+    }
+    for (i = 0, j = 0; i < size; i++) {
+        if (*value_ptr != NULL) {
             PyObject *key = ep[i].me_key;
             Py_INCREF(key);
             PyList_SET_ITEM(v, j, key);
             j++;
         }
+        value_ptr = (PyObject **)(((char *)value_ptr) + offset);
     }
     assert(j == n);
     return v;
@@ -1237,8 +1560,8 @@
 {
     register PyObject *v;
     register Py_ssize_t i, j;
-    PyDictEntry *ep;
-    Py_ssize_t mask, n;
+    Py_ssize_t size, n, offset;
+    PyObject **value_ptr;
 
   again:
     n = mp->ma_used;
@@ -1252,11 +1575,19 @@
         Py_DECREF(v);
         goto again;
     }
-    ep = mp->ma_table;
-    mask = mp->ma_mask;
-    for (i = 0, j = 0; i <= mask; i++) {
-        if (ep[i].me_value != NULL) {
-            PyObject *value = ep[i].me_value;
+    size = DK_SIZE(mp->ma_keys);
+    if (mp->ma_values) {
+        value_ptr = mp->ma_values;
+        offset = sizeof(PyObject *);
+    }
+    else {
+        value_ptr = &mp->ma_keys->dk_entries[0]._me_value;
+        offset = sizeof(PyDictKeyEntry);
+    }
+    for (i = 0, j = 0; i < size; i++) {
+        PyObject *value = *value_ptr;
+        value_ptr = (PyObject **)(((char *)value_ptr) + offset);
+        if (value != NULL) {
             Py_INCREF(value);
             PyList_SET_ITEM(v, j, value);
             j++;
@@ -1271,9 +1602,10 @@
 {
     register PyObject *v;
     register Py_ssize_t i, j, n;
-    Py_ssize_t mask;
-    PyObject *item, *key, *value;
-    PyDictEntry *ep;
+    Py_ssize_t size, offset;
+    PyObject *item, *key;
+    PyDictKeyEntry *ep;
+    PyObject **value_ptr;
 
     /* Preallocate the list of tuples, to avoid allocations during
      * the loop over the items, which could trigger GC, which
@@ -1300,10 +1632,20 @@
         goto again;
     }
     /* Nothing we do below makes any function calls. */
-    ep = mp->ma_table;
-    mask = mp->ma_mask;
-    for (i = 0, j = 0; i <= mask; i++) {
-        if ((value=ep[i].me_value) != NULL) {
+    ep = mp->ma_keys->dk_entries;
+    size = DK_SIZE(mp->ma_keys);
+    if (mp->ma_values) {
+        value_ptr = mp->ma_values;
+        offset = sizeof(PyObject *);
+    }
+    else {
+        value_ptr = &ep[0]._me_value;
+        offset = sizeof(PyDictKeyEntry);
+    }
+    for (i = 0, j = 0; i < size; i++) {
+        PyObject *value = *value_ptr;
+        value_ptr = (PyObject **)(((char *)value_ptr) + offset);
+        if (value != NULL) {
             key = ep[i].me_key;
             item = PyList_GET_ITEM(v, j);
             Py_INCREF(key);
@@ -1535,8 +1877,8 @@
 PyDict_Merge(PyObject *a, PyObject *b, int override)
 {
     register PyDictObject *mp, *other;
-    register Py_ssize_t i;
-    PyDictEntry *entry;
+    register Py_ssize_t i, n;
+    PyDictKeyEntry *entry;
 
     /* We accept for the argument either a concrete dictionary object,
      * or an abstract "mapping" object.  For the former, we can do
@@ -1563,20 +1905,25 @@
          * incrementally resizing as we insert new items.  Expect
          * that there will be no (or few) overlapping keys.
          */
-        if ((mp->ma_fill + other->ma_used)*3 >= (mp->ma_mask+1)*2) {
-           if (dictresize(mp, (mp->ma_used + other->ma_used)*2) != 0)
+        if (mp->ma_keys->dk_usable * 3 < other->ma_used * 2)
+            if (dictresize(mp, (mp->ma_used + other->ma_used)*2) != 0)
                return -1;
-        }
-        for (i = 0; i <= other->ma_mask; i++) {
-            entry = &other->ma_table[i];
-            if (entry->me_value != NULL &&
+        for (i = 0, n = DK_SIZE(other->ma_keys); i < n; i++) {
+            PyObject *value;
+            entry = &other->ma_keys->dk_entries[i];
+            if (other->ma_values)
+                value = other->ma_values[i];
+            else
+                value = entry->_me_value;
+
+            if (value != NULL &&
                 (override ||
                  PyDict_GetItem(a, entry->me_key) == NULL)) {
                 Py_INCREF(entry->me_key);
-                Py_INCREF(entry->me_value);
+                Py_INCREF(value);
                 if (insertdict(mp, entry->me_key,
                                entry->me_hash,
-                               entry->me_value) != 0)
+                               value) != 0)
                     return -1;
             }
         }
@@ -1638,11 +1985,35 @@
 PyDict_Copy(PyObject *o)
 {
     PyObject *copy;
+    PyDictObject *mp;
+    Py_ssize_t i, n;
 
     if (o == NULL || !PyDict_Check(o)) {
         PyErr_BadInternalCall();
         return NULL;
     }
+    mp = (PyDictObject *)o;
+    if (_PyDict_HasSplitTable(mp)) {
+        PyDictObject *split_copy;
+        PyObject **newvalues = new_values(DK_SIZE(mp->ma_keys));
+        if (newvalues == NULL)
+            return PyErr_NoMemory();
+        split_copy = PyObject_GC_New(PyDictObject, &PyDict_Type);
+        if (split_copy == NULL) {
+            free_values(newvalues);
+            return NULL;
+        }
+        split_copy->ma_values = newvalues;
+        split_copy->ma_keys = mp->ma_keys;
+        split_copy->ma_used = mp->ma_used;
+        DK_INCREF(mp->ma_keys);
+        for (i = 0, n = DK_SIZE(mp->ma_keys); i < n; i++) {
+            PyObject *value = mp->ma_values[i];
+            Py_XINCREF(value);
+            split_copy->ma_values[i] = value;
+        }
+        return (PyObject *)split_copy;
+    }
     copy = PyDict_New();
     if (copy == NULL)
         return NULL;
@@ -1704,14 +2075,18 @@
     if (a->ma_used != b->ma_used)
         /* can't be equal if # of entries differ */
         return 0;
-
     /* Same # of entries -- check all of 'em.  Exit early on any diff. */
-    for (i = 0; i <= a->ma_mask; i++) {
-        PyObject *aval = a->ma_table[i].me_value;
+    for (i = 0; i < DK_SIZE(a->ma_keys); i++) {
+        PyDictKeyEntry *ep = &a->ma_keys->dk_entries[i];
+        PyObject *aval;
+        if (a->ma_values)
+            aval = a->ma_values[i];
+        else
+            aval = ep->_me_value;
         if (aval != NULL) {
             int cmp;
             PyObject *bval;
-            PyObject *key = a->ma_table[i].me_key;
+            PyObject *key = ep->me_key;
             /* temporarily bump aval's refcount to ensure it stays
                alive until we're done with it */
             Py_INCREF(aval);
@@ -1732,7 +2107,7 @@
         }
     }
     return 1;
- }
+}
 
 static PyObject *
 dict_richcompare(PyObject *v, PyObject *w, int op)
@@ -1753,13 +2128,14 @@
         res = Py_NotImplemented;
     Py_INCREF(res);
     return res;
- }
+}
 
 static PyObject *
 dict_contains(register PyDictObject *mp, PyObject *key)
 {
     Py_hash_t hash;
-    PyDictEntry *ep;
+    PyDictKeyEntry *ep;
+    PyObject **value_addr;
 
     if (!PyUnicode_CheckExact(key) ||
         (hash = ((PyASCIIObject *) key)->hash) == -1) {
@@ -1767,10 +2143,10 @@
         if (hash == -1)
             return NULL;
     }
-    ep = (mp->ma_lookup)(mp, key, hash);
+    ep = (mp->ma_keys->dk_lookup)(mp, key, hash, &value_addr);
     if (ep == NULL)
         return NULL;
-    return PyBool_FromLong(ep->me_value != NULL);
+    return PyBool_FromLong(*value_addr != NULL);
 }
 
 static PyObject *
@@ -1780,7 +2156,8 @@
     PyObject *failobj = Py_None;
     PyObject *val = NULL;
     Py_hash_t hash;
-    PyDictEntry *ep;
+    PyDictKeyEntry *ep;
+    PyObject **value_addr;
 
     if (!PyArg_UnpackTuple(args, "get", 1, 2, &key, &failobj))
         return NULL;
@@ -1791,17 +2168,16 @@
         if (hash == -1)
             return NULL;
     }
-    ep = (mp->ma_lookup)(mp, key, hash);
+    ep = (mp->ma_keys->dk_lookup)(mp, key, hash, &value_addr);
     if (ep == NULL)
         return NULL;
-    val = ep->me_value;
+    val = *value_addr;
     if (val == NULL)
         val = failobj;
     Py_INCREF(val);
     return val;
 }
 
-
 static PyObject *
 dict_setdefault(register PyDictObject *mp, PyObject *args)
 {
@@ -1809,7 +2185,8 @@
     PyObject *failobj = Py_None;
     PyObject *val = NULL;
     Py_hash_t hash;
-    PyDictEntry *ep;
+    PyDictKeyEntry *ep;
+    PyObject **value_addr;
 
     if (!PyArg_UnpackTuple(args, "setdefault", 1, 2, &key, &failobj))
         return NULL;
@@ -1820,16 +2197,27 @@
         if (hash == -1)
             return NULL;
     }
-    ep = (mp->ma_lookup)(mp, key, hash);
+    ep = (mp->ma_keys->dk_lookup)(mp, key, hash, &value_addr);
     if (ep == NULL)
         return NULL;
-    val = ep->me_value;
+    val = *value_addr;
     if (val == NULL) {
-        if (dict_set_item_by_hash_or_entry((PyObject*)mp, key, hash, ep,
-                                           failobj) == 0)
-            val = failobj;
+        Py_INCREF(failobj);
+        Py_INCREF(key);
+        if (mp->ma_keys->dk_usable <= 0) {
+            /* Need to resize. */
+            if (insertion_resize(mp) < 0)
+                return NULL;
+            ep = find_empty_slot(mp, key, hash, &value_addr);
+        }
+        ep->me_key = key;
+        ep->me_hash = hash;
+        *value_addr = failobj;
+        val = failobj;
+        mp->ma_keys->dk_usable--;
+        mp->ma_used++;
     }
-    Py_XINCREF(val);
+    Py_INCREF(val);
     return val;
 }
 
@@ -1845,9 +2233,10 @@
 dict_pop(PyDictObject *mp, PyObject *args)
 {
     Py_hash_t hash;
-    PyDictEntry *ep;
     PyObject *old_value, *old_key;
     PyObject *key, *deflt = NULL;
+    PyDictKeyEntry *ep;
+    PyObject **value_addr;
 
     if(!PyArg_UnpackTuple(args, "pop", 1, 2, &key, &deflt))
         return NULL;
@@ -1865,10 +2254,11 @@
         if (hash == -1)
             return NULL;
     }
-    ep = (mp->ma_lookup)(mp, key, hash);
+    ep = (mp->ma_keys->dk_lookup)(mp, key, hash, &value_addr);
     if (ep == NULL)
         return NULL;
-    if (ep->me_value == NULL) {
+    old_value = *value_addr;
+    if (old_value == NULL) {
         if (deflt) {
             Py_INCREF(deflt);
             return deflt;
@@ -1876,13 +2266,15 @@
         set_key_error(key);
         return NULL;
     }
-    old_key = ep->me_key;
-    Py_INCREF(dummy);
-    ep->me_key = dummy;
-    old_value = ep->me_value;
-    ep->me_value = NULL;
+    *value_addr = NULL;
     mp->ma_used--;
-    Py_DECREF(old_key);
+    if (!_PyDict_HasSplitTable(mp)) {
+        ENSURE_ALLOWS_DELETIONS(mp);
+        old_key = ep->me_key;
+        Py_INCREF(dummy);
+        ep->me_key = dummy;
+        Py_DECREF(old_key);
+    }
     return old_value;
 }
 
@@ -1890,9 +2282,10 @@
 dict_popitem(PyDictObject *mp)
 {
     Py_hash_t i = 0;
-    PyDictEntry *ep;
+    PyDictKeyEntry *ep;
     PyObject *res;
 
+
     /* Allocate the result tuple before checking the size.  Believe it
      * or not, this allocation could trigger a garbage collection which
      * could empty the dict, so if we checked the size first and that
@@ -1911,49 +2304,71 @@
                         "popitem(): dictionary is empty");
         return NULL;
     }
+    /* Convert split table to combined table */
+    if (mp->ma_keys->dk_lookup == lookdict_split) {
+        if (dictresize(mp, DK_SIZE(mp->ma_keys))) {
+            Py_DECREF(res);
+            return NULL;
+        }
+    }
+    ENSURE_ALLOWS_DELETIONS(mp);
     /* Set ep to "the first" dict entry with a value.  We abuse the hash
      * field of slot 0 to hold a search finger:
      * If slot 0 has a value, use slot 0.
      * Else slot 0 is being used to hold a search finger,
      * and we use its hash value as the first index to look.
      */
-    ep = &mp->ma_table[0];
-    if (ep->me_value == NULL) {
+    ep = &mp->ma_keys->dk_entries[0];
+    if (ep->_me_value == NULL) {
+        Py_ssize_t mask = DK_SIZE(mp->ma_keys)-1;
         i = ep->me_hash;
         /* The hash field may be a real hash value, or it may be a
          * legit search finger, or it may be a once-legit search
          * finger that's out of bounds now because it wrapped around
          * or the table shrunk -- simply make sure it's in bounds now.
          */
-        if (i > mp->ma_mask || i < 1)
+        if (i > mask || i < 1)
             i = 1;              /* skip slot 0 */
-        while ((ep = &mp->ma_table[i])->me_value == NULL) {
+        while ((ep = &mp->ma_keys->dk_entries[i])->_me_value == NULL) {
             i++;
-            if (i > mp->ma_mask)
+            if (i > mask)
                 i = 1;
         }
     }
     PyTuple_SET_ITEM(res, 0, ep->me_key);
-    PyTuple_SET_ITEM(res, 1, ep->me_value);
+    PyTuple_SET_ITEM(res, 1, ep->_me_value);
     Py_INCREF(dummy);
     ep->me_key = dummy;
-    ep->me_value = NULL;
+    ep->_me_value = NULL;
     mp->ma_used--;
-    assert(mp->ma_table[0].me_value == NULL);
-    mp->ma_table[0].me_hash = i + 1;  /* next place to start */
+    assert(mp->ma_keys->dk_entries[0]._me_value == NULL);
+    mp->ma_keys->dk_entries[0].me_hash = i + 1;  /* next place to start */
     return res;
 }
 
 static int
 dict_traverse(PyObject *op, visitproc visit, void *arg)
 {
-    Py_ssize_t i = 0;
-    PyObject *pk;
-    PyObject *pv;
-
-    while (PyDict_Next(op, &i, &pk, &pv)) {
-        Py_VISIT(pk);
-        Py_VISIT(pv);
+    Py_ssize_t i, n;
+    PyDictObject *mp = (PyDictObject *)op;
+    if (mp->ma_keys->dk_lookup == lookdict) {
+        for (i = 0; i < DK_SIZE(mp->ma_keys); i++) {
+            if (mp->ma_keys->dk_entries[i]._me_value != NULL) {
+                Py_VISIT(mp->ma_keys->dk_entries[i]._me_value);
+                Py_VISIT(mp->ma_keys->dk_entries[i].me_key);
+            }
+        }
+    } else {
+        if (mp->ma_values != NULL) {
+            for (i = 0, n = DK_SIZE(mp->ma_keys); i < n; i++) {
+                Py_VISIT(mp->ma_values[i]);
+            }
+        }
+        else {
+            for (i = 0, n = DK_SIZE(mp->ma_keys); i < n; i++) {
+                Py_VISIT(mp->ma_keys->dk_entries[i]._me_value);
+            }
+        }
     }
     return 0;
 }
@@ -1970,12 +2385,22 @@
 static PyObject *
 dict_sizeof(PyDictObject *mp)
 {
-    Py_ssize_t res;
-
+    Py_ssize_t size;
+    double res, keys_size;
+
+    size = DK_SIZE(mp->ma_keys);
     res = sizeof(PyDictObject);
-    if (mp->ma_table != mp->ma_smalltable)
-        res = res + (mp->ma_mask + 1) * sizeof(PyDictEntry);
-    return PyLong_FromSsize_t(res);
+    if (mp->ma_values)
+        res += size * sizeof(PyObject*);
+    /* Count our share of the keys object -- with rounding errors. */
+    keys_size = sizeof(PyDictKeysObject) + (size-1) * sizeof(PyDictKeyEntry);
+    /* If refcnt > 1, then one count is (probably) held by a type */
+    /* XXX  This is somewhat approximate :) */
+    if (mp->ma_keys->dk_refcnt < 3)
+        res += keys_size;
+    else
+        res += keys_size / (mp->ma_keys->dk_refcnt - 1);
+    return PyFloat_FromDouble(res);
 }
 
 PyDoc_STRVAR(contains__doc__,
@@ -2066,7 +2491,8 @@
 {
     Py_hash_t hash;
     PyDictObject *mp = (PyDictObject *)op;
-    PyDictEntry *ep;
+    PyDictKeyEntry *ep;
+    PyObject **value_addr;
 
     if (!PyUnicode_CheckExact(key) ||
         (hash = ((PyASCIIObject *) key)->hash) == -1) {
@@ -2074,8 +2500,8 @@
         if (hash == -1)
             return -1;
     }
-    ep = (mp->ma_lookup)(mp, key, hash);
-    return ep == NULL ? -1 : (ep->me_value != NULL);
+    ep = (mp->ma_keys->dk_lookup)(mp, key, hash, &value_addr);
+    return (ep == NULL) ? -1 : (*value_addr != NULL);
 }
 
 /* Internal version of PyDict_Contains used when the hash value is already known */
@@ -2083,10 +2509,11 @@
 _PyDict_Contains(PyObject *op, PyObject *key, Py_hash_t hash)
 {
     PyDictObject *mp = (PyDictObject *)op;
-    PyDictEntry *ep;
-
-    ep = (mp->ma_lookup)(mp, key, hash);
-    return ep == NULL ? -1 : (ep->me_value != NULL);
+    PyDictKeyEntry *ep;
+    PyObject **value_addr;
+
+    ep = (mp->ma_keys->dk_lookup)(mp, key, hash, &value_addr);
+    return (ep == NULL) ? -1 : (*value_addr != NULL);
 }
 
 /* Hack to implement "key in dict" */
@@ -2112,22 +2539,17 @@
     self = type->tp_alloc(type, 0);
     if (self != NULL) {
         PyDictObject *d = (PyDictObject *)self;
-        /* It's guaranteed that tp->alloc zeroed out the struct. */
-        assert(d->ma_table == NULL && d->ma_fill == 0 && d->ma_used == 0);
-        INIT_NONZERO_DICT_SLOTS(d);
-        d->ma_lookup = lookdict_unicode;
+        d->ma_keys = new_keys_object(PyDict_MINSIZE_COMBINED);
+        /* XXX - Should we raise a no-memory error? */
+        if (d->ma_keys == NULL) {
+            DK_INCREF(Py_EMPTY_KEYS);
+            d->ma_keys = Py_EMPTY_KEYS;
+            d->ma_values = empty_values;
+        }
+        d->ma_used = 0;
         /* The object has been implicitly tracked by tp_alloc */
         if (type == &PyDict_Type)
             _PyObject_GC_UNTRACK(d);
-#ifdef SHOW_CONVERSION_COUNTS
-        ++created;
-#endif
-#ifdef SHOW_TRACK_COUNT
-        if (_PyObject_GC_IS_TRACKED(d))
-            count_tracked++;
-        else
-            count_untracked++;
-#endif
     }
     return self;
 }
@@ -2332,9 +2754,10 @@
 static PyObject *dictiter_iternextkey(dictiterobject *di)
 {
     PyObject *key;
-    register Py_ssize_t i, mask;
-    register PyDictEntry *ep;
+    register Py_ssize_t i, mask, offset;
+    register PyDictKeysObject *k;
     PyDictObject *d = di->di_dict;
+    PyObject **value_ptr;
 
     if (d == NULL)
         return NULL;
@@ -2350,15 +2773,25 @@
     i = di->di_pos;
     if (i < 0)
         goto fail;
-    ep = d->ma_table;
-    mask = d->ma_mask;
-    while (i <= mask && ep[i].me_value == NULL)
+    k = d->ma_keys;
+    if (d->ma_values) {
+        value_ptr = &d->ma_values[i];
+        offset = sizeof(PyObject *);
+    }
+    else {
+        value_ptr = &k->dk_entries[i]._me_value;
+        offset = sizeof(PyDictKeyEntry);
+    }
+    mask = DK_SIZE(k)-1;
+    while (i <= mask && *value_ptr == NULL) {
+        value_ptr = (PyObject **)(((char *)value_ptr) + offset);
         i++;
+    }
     di->di_pos = i+1;
     if (i > mask)
         goto fail;
     di->len--;
-    key = ep[i].me_key;
+    key = k->dk_entries[i].me_key;
     Py_INCREF(key);
     return key;
 
@@ -2404,9 +2837,9 @@
 static PyObject *dictiter_iternextvalue(dictiterobject *di)
 {
     PyObject *value;
-    register Py_ssize_t i, mask;
-    register PyDictEntry *ep;
+    register Py_ssize_t i, mask, offset;
     PyDictObject *d = di->di_dict;
+    PyObject **value_ptr;
 
     if (d == NULL)
         return NULL;
@@ -2420,17 +2853,26 @@
     }
 
     i = di->di_pos;
-    mask = d->ma_mask;
+    mask = DK_SIZE(d->ma_keys)-1;
     if (i < 0 || i > mask)
         goto fail;
-    ep = d->ma_table;
-    while ((value=ep[i].me_value) == NULL) {
+    if (d->ma_values) {
+        value_ptr = &d->ma_values[i];
+        offset = sizeof(PyObject *);
+    }
+    else {
+        value_ptr = &d->ma_keys->dk_entries[i]._me_value;
+        offset = sizeof(PyDictKeyEntry);
+    }
+    while (i <= mask && *value_ptr == NULL) {
+        value_ptr = (PyObject **)(((char *)value_ptr) + offset);
         i++;
         if (i > mask)
             goto fail;
     }
     di->di_pos = i+1;
     di->len--;
+    value = *value_ptr;
     Py_INCREF(value);
     return value;
 
@@ -2476,9 +2918,9 @@
 static PyObject *dictiter_iternextitem(dictiterobject *di)
 {
     PyObject *key, *value, *result = di->di_result;
-    register Py_ssize_t i, mask;
-    register PyDictEntry *ep;
+    register Py_ssize_t i, mask, offset;
     PyDictObject *d = di->di_dict;
+    PyObject **value_ptr;
 
     if (d == NULL)
         return NULL;
@@ -2494,10 +2936,19 @@
     i = di->di_pos;
     if (i < 0)
         goto fail;
-    ep = d->ma_table;
-    mask = d->ma_mask;
-    while (i <= mask && ep[i].me_value == NULL)
+    mask = DK_SIZE(d->ma_keys)-1;
+    if (d->ma_values) {
+        value_ptr = &d->ma_values[i];
+        offset = sizeof(PyObject *);
+    }
+    else {
+        value_ptr = &d->ma_keys->dk_entries[i]._me_value;
+        offset = sizeof(PyDictKeyEntry);
+    }
+    while (i <= mask && *value_ptr == NULL) {
+        value_ptr = (PyObject **)(((char *)value_ptr) + offset);
         i++;
+    }
     di->di_pos = i+1;
     if (i > mask)
         goto fail;
@@ -2512,8 +2963,8 @@
             return NULL;
     }
     di->len--;
-    key = ep[i].me_key;
-    value = ep[i].me_value;
+    key = d->ma_keys->dk_entries[i].me_key;
+    value = *value_ptr;
     Py_INCREF(key);
     Py_INCREF(value);
     PyTuple_SET_ITEM(result, 0, key);
@@ -3107,3 +3558,144 @@
 {
     return dictview_new(dict, &PyDictValues_Type);
 }
+
+PyDictKeysObject *
+_PyDict_NewKeysForClass(void)
+{
+    PyDictKeysObject *keys = new_keys_object(PyDict_MINSIZE_SPLIT);
+    if (keys != NULL)
+        keys->dk_lookup = lookdict_split;
+    return keys;
+}
+
+#define CACHED_KEYS(tp) (((PyHeapTypeObject*)tp)->ht_cached_keys)
+
+PyObject *
+PyObject_GenericGetDict(PyObject *obj, void *context)
+{
+    PyObject *dict, **dictptr = _PyObject_GetDictPtr(obj);
+    if (dictptr == NULL) {
+        PyErr_SetString(PyExc_AttributeError,
+                        "This object has no __dict__");
+        return NULL;
+    }
+    dict = *dictptr;
+    if (dict == NULL) {
+        PyTypeObject *tp = Py_TYPE(obj);
+        if ((tp->tp_flags & Py_TPFLAGS_HEAPTYPE) && CACHED_KEYS(tp)) {
+            DK_INCREF(CACHED_KEYS(tp));
+            *dictptr = dict = new_dict_with_shared_keys(CACHED_KEYS(tp));
+        }
+        else {
+            *dictptr = dict = PyDict_New();
+        }
+    }
+    Py_XINCREF(dict);
+    return dict;
+}
+
+int
+_PyObjectDict_SetItem(PyTypeObject *tp, PyObject **dictptr,
+                     PyObject *key, PyObject *value) {
+    PyObject *dict;
+    int res;
+    PyDictKeysObject *cached;
+
+    assert(dictptr != NULL);
+    if ((tp->tp_flags & Py_TPFLAGS_HEAPTYPE) && (cached = CACHED_KEYS(tp))) {
+        assert(dictptr != NULL);
+        dict = *dictptr;
+        if (dict == NULL) {
+            DK_INCREF(cached);
+            dict = new_dict_with_shared_keys(cached);
+            if (dict == NULL)
+                return -1;
+        }
+        *dictptr = dict;
+        if (value == NULL) {
+            res = PyDict_DelItem(dict, key);
+            if (cached != ((PyDictObject *)dict)->ma_keys) {
+                CACHED_KEYS(tp) = NULL;
+                DK_DECREF(cached);
+            }
+        } else {
+            res = PyDict_SetItem(dict, key, value);
+            if (cached != ((PyDictObject *)dict)->ma_keys) {
+                /* Either update tp->ht_cached_keys or delete it */
+                if (cached->dk_refcnt == 1) {
+                    CACHED_KEYS(tp) = make_keys_shared(dict);
+                } else {
+                    CACHED_KEYS(tp) = NULL;
+                }
+                DK_DECREF(cached);
+            }
+        }
+    } else {
+        dict = *dictptr;
+        if (dict == NULL) {
+            dict = PyDict_New();
+            if (dict == NULL)
+                return -1;
+            *dictptr = dict;
+        }
+        if (value == NULL) {
+            res = PyDict_DelItem(dict, key);
+        } else {
+            res = PyDict_SetItem(dict, key, value);
+        }
+    }
+    return res;
+}
+
+void
+_PyDictKeys_DecRef(PyDictKeysObject *keys)
+{
+    DK_DECREF(keys);
+}
+
+
+/* ARGSUSED */
+static PyObject *
+dummy_repr(PyObject *op)
+{
+    return PyUnicode_FromString("<dummy key>");
+}
+
+/* ARGUSED */
+static void
+dummy_dealloc(PyObject* ignore)
+{
+    /* This should never get called, but we also don't want to SEGV if
+     * we accidentally decref dummy-key out of existence.
+     */
+    Py_FatalError("deallocating <dummy key>");
+}
+
+static PyTypeObject PyDictDummy_Type = {
+    PyVarObject_HEAD_INIT(&PyType_Type, 0)
+    "<dummy key> type",
+    0,
+    0,
+    dummy_dealloc,      /*tp_dealloc*/ /*never called*/
+    0,                  /*tp_print*/
+    0,                  /*tp_getattr*/
+    0,                  /*tp_setattr*/
+    0,                  /*tp_reserved*/
+    dummy_repr,         /*tp_repr*/
+    0,                  /*tp_as_number*/
+    0,                  /*tp_as_sequence*/
+    0,                  /*tp_as_mapping*/
+    0,                  /*tp_hash */
+    0,                  /*tp_call */
+    0,                  /*tp_str */
+    0,                  /*tp_getattro */
+    0,                  /*tp_setattro */
+    0,                  /*tp_as_buffer */
+    Py_TPFLAGS_DEFAULT, /*tp_flags */
+};
+
+static PyObject _dummy_struct = {
+  _PyObject_EXTRA_INIT
+  2, &PyDictDummy_Type
+};
+
diff -r 9fcb2676696c -r 372d0bca85ae Objects/object.c
--- a/Objects/object.c	Mon Apr 02 07:52:29 2012 +0200
+++ b/Objects/object.c	Mon Apr 02 11:47:24 2012 +0100
@@ -1173,13 +1173,10 @@
     if (dict == NULL) {
         dictptr = _PyObject_GetDictPtr(obj);
         if (dictptr != NULL) {
-            dict = *dictptr;
-            if (dict == NULL && value != NULL) {
-                dict = PyDict_New();
-                if (dict == NULL)
-                    goto done;
-                *dictptr = dict;
-            }
+            res = _PyObjectDict_SetItem(Py_TYPE(obj), dictptr, name, value);
+            if (res < 0 && PyErr_ExceptionMatches(PyExc_KeyError))
+                PyErr_SetObject(PyExc_AttributeError, name);
+            goto done;
         }
     }
     if (dict != NULL) {
@@ -1221,22 +1218,6 @@
     return _PyObject_GenericSetAttrWithDict(obj, name, value, NULL);
 }
 
-PyObject *
-PyObject_GenericGetDict(PyObject *obj, void *context)
-{
-    PyObject *dict, **dictptr = _PyObject_GetDictPtr(obj);
-    if (dictptr == NULL) {
-        PyErr_SetString(PyExc_AttributeError,
-                        "This object has no __dict__");
-        return NULL;
-    }
-    dict = *dictptr;
-    if (dict == NULL)
-        *dictptr = dict = PyDict_New();
-    Py_XINCREF(dict);
-    return dict;
-}
-
 int
 PyObject_GenericSetDict(PyObject *obj, PyObject *value, void *context)
 {
diff -r 9fcb2676696c -r 372d0bca85ae Objects/typeobject.c
--- a/Objects/typeobject.c	Mon Apr 02 07:52:29 2012 +0200
+++ b/Objects/typeobject.c	Mon Apr 02 11:47:24 2012 +0100
@@ -14,7 +14,7 @@
    MCACHE_MAX_ATTR_SIZE, since it might be a problem if very large
    strings are used as attribute names. */
 #define MCACHE_MAX_ATTR_SIZE    100
-#define MCACHE_SIZE_EXP         10
+#define MCACHE_SIZE_EXP         9
 #define MCACHE_HASH(version, name_hash)                                 \
         (((unsigned int)(version) * (unsigned int)(name_hash))          \
          >> (8*sizeof(unsigned int) - MCACHE_SIZE_EXP))
@@ -2306,6 +2306,9 @@
             type->tp_dictoffset = slotoffset;
         slotoffset += sizeof(PyObject *);
     }
+    if (type->tp_dictoffset) {
+        et->ht_cached_keys = _PyDict_NewKeysForClass();
+    }
     if (add_weak) {
         assert(!base->tp_itemsize);
         type->tp_weaklistoffset = slotoffset;
@@ -2411,6 +2414,9 @@
             res->ht_type.tp_doc = tp_doc;
         }
     }
+    if (res->ht_type.tp_dictoffset) {
+        res->ht_cached_keys = _PyDict_NewKeysForClass();
+    }
 
     if (PyType_Ready(&res->ht_type) < 0)
         goto fail;
@@ -2767,9 +2773,13 @@
     return 0;
 }
 
+extern void
+_PyDictKeys_DecRef(PyDictKeysObject *keys);
+
 static int
 type_clear(PyTypeObject *type)
 {
+    PyDictKeysObject *cached_keys;
     /* Because of type_is_gc(), the collector only calls this
        for heaptypes. */
     assert(type->tp_flags & Py_TPFLAGS_HEAPTYPE);
@@ -2801,6 +2811,11 @@
     */
 
     PyType_Modified(type);
+    cached_keys = ((PyHeapTypeObject *)type)->ht_cached_keys;
+    if (cached_keys != NULL) {
+        ((PyHeapTypeObject *)type)->ht_cached_keys = NULL;
+        _PyDictKeys_DecRef(cached_keys);
+    }
     if (type->tp_dict)
         PyDict_Clear(type->tp_dict);
     Py_CLEAR(type->tp_mro);
diff -r 9fcb2676696c -r 372d0bca85ae Python/ceval.c
--- a/Python/ceval.c	Mon Apr 02 07:52:29 2012 +0200
+++ b/Python/ceval.c	Mon Apr 02 11:47:24 2012 +0100
@@ -2091,56 +2091,17 @@
 
         TARGET(LOAD_GLOBAL)
             w = GETITEM(names, oparg);
-            if (PyUnicode_CheckExact(w)) {
-                /* Inline the PyDict_GetItem() calls.
-                   WARNING: this is an extreme speed hack.
-                   Do not try this at home. */
-                Py_hash_t hash = ((PyASCIIObject *)w)->hash;
-                if (hash != -1) {
-                    PyDictObject *d;
-                    PyDictEntry *e;
-                    d = (PyDictObject *)(f->f_globals);
-                    e = d->ma_lookup(d, w, hash);
-                    if (e == NULL) {
-                        x = NULL;
-                        break;
-                    }
-                    x = e->me_value;
-                    if (x != NULL) {
-                        Py_INCREF(x);
-                        PUSH(x);
-                        DISPATCH();
-                    }
-                    d = (PyDictObject *)(f->f_builtins);
-                    e = d->ma_lookup(d, w, hash);
-                    if (e == NULL) {
-                        x = NULL;
-                        break;
-                    }
-                    x = e->me_value;
-                    if (x != NULL) {
-                        Py_INCREF(x);
-                        PUSH(x);
-                        DISPATCH();
-                    }
-                    goto load_global_error;
-                }
+            x = _PyDict_LoadGlobal((PyDictObject *)f->f_globals,
+                                   (PyDictObject *)f->f_builtins,
+                                   w);
+            if (x != NULL) {
+                Py_INCREF(x);
+                PUSH(x);
+                DISPATCH();
             }
-            /* This is the un-inlined version of the code above */
-            x = PyDict_GetItem(f->f_globals, w);
-            if (x == NULL) {
-                x = PyDict_GetItem(f->f_builtins, w);
-                if (x == NULL) {
-                  load_global_error:
-                    format_exc_check_arg(
-                                PyExc_NameError,
-                                GLOBAL_NAME_ERROR_MSG, w);
-                    break;
-                }
-            }
-            Py_INCREF(x);
-            PUSH(x);
-            DISPATCH();
+            if (!PyErr_Occurred())
+                format_exc_check_arg(PyExc_NameError, GLOBAL_NAME_ERROR_MSG, w);
+            break;
 
         TARGET(DELETE_FAST)
             x = GETLOCAL(oparg);
diff -r 9fcb2676696c -r 372d0bca85ae Tools/gdb/libpython.py
--- a/Tools/gdb/libpython.py	Mon Apr 02 07:52:29 2012 +0200
+++ b/Tools/gdb/libpython.py	Mon Apr 02 11:47:24 2012 +0100
@@ -634,9 +634,14 @@
         Yields a sequence of (PyObjectPtr key, PyObjectPtr value) pairs,
         analagous to dict.iteritems()
         '''
-        for i in safe_range(self.field('ma_mask') + 1):
-            ep = self.field('ma_table') + i
-            pyop_value = PyObjectPtr.from_pyobject_ptr(ep['me_value'])
+        keys = self.field('ma_keys')
+        values = self.field('ma_values')
+        for i in safe_range(keys['dk_size']):
+            ep = keys['dk_entries'].address + i
+            if long(values):
+                pyop_value = PyObjectPtr.from_pyobject_ptr(values[i])
+            else:
+                pyop_value = PyObjectPtr.from_pyobject_ptr(ep['_me_value'])
             if not pyop_value.is_null():
                 pyop_key = PyObjectPtr.from_pyobject_ptr(ep['me_key'])
                 yield (pyop_key, pyop_value)