Index: Python/ceval.c
===================================================================
--- Python/ceval.c	(revision 53197)
+++ Python/ceval.c	(working copy)
@@ -1,4 +1,3 @@
-
 /* Execute compiled code */
 
 /* XXX TO DO:
@@ -418,7 +417,41 @@
     return 0;
 }
 
+#ifdef CACHED_LOOKUPS
+#ifdef CACHED_MODULE_LOOKUPS
+/* BAD HACK ... replicating module structure here! */
+typedef struct {
+    PyObject_HEAD
+    PyObject *md_dict;
+} PyModuleObject;
+#endif
+
+/*#define CACHED_LOOKUP_STATS*/
+#ifdef CACHED_LOOKUP_STATS
+
+static int loadglobal_total = 0;
+static int loadglobal_hit = 0;
+static int loadattr_total = 0;
+static int loadattr_module = 0;
+static int loadattr_hit = 0;
+static int loadattr_init = 0;
+
+static void dumpstats(void)
+{
+    printf("LOAD_GLOBAL total  = %i\n", loadglobal_total);
+    printf("LOAD_GLOBAL hit    = %i\n", loadglobal_hit);
+    printf("LOAD_ATTR total    = %i\n", loadattr_total);
+    printf("LOAD_ATTR module   = %i\n", loadattr_module);
+    printf("LOAD_ATTR hit      = %i\n", loadattr_hit);
+}
+
+#define CACHED_LOOKUP_STAT(x) do{(x)++;}while(0)
+#else
+#define CACHED_LOOKUP_STAT(x)
+#endif
+#endif
+
 /* The interpreter's recursion limit */
 
 #ifndef Py_DEFAULT_RECURSION_LIMIT
@@ -531,6 +564,13 @@
     PyObject *retval = NULL;            /* Return value */
     PyThreadState *tstate = PyThreadState_GET();
     PyCodeObject *co;
+#ifdef CACHED_LOOKUP_STATS
+    if (!loadattr_init)
+    {
+        loadattr_init = 1;
+        atexit(dumpstats);
+    }
+#endif
 
     /* when tracing we set things up so that
@@ -1847,6 +1887,41 @@
             continue;
 
         case LOAD_GLOBAL:
+#ifdef CACHED_LOOKUPS
+            CACHED_LOOKUP_STAT(loadglobal_total);
+            {
+                /* Check for cached values */
+                LookupDCacheEntry *entry = co->co_lookup_dcache + oparg;
+                if (LOOKUP_D_CACHE_HIT(entry,
+                                       f->f_globals,
+                                       f->f_builtins)) {
+                    /* Cache hit */
+                    CACHED_LOOKUP_STAT(loadglobal_hit);
+                    x = entry->value;
+                } else {
+                    /* Cache miss, do a regular lookup */
+                    w = GETITEM(names, oparg);
+                    x = PyDict_GetItem(f->f_globals, w);
+                    if (x == NULL)
+                        x = PyDict_GetItem(f->f_builtins, w);
+                    /* Update the cache */
+                    LOOKUP_D_CACHE_STORE(entry,
+                                         f->f_globals,
+                                         f->f_builtins,
+                                         x);
+                }
+                if (x == NULL) {
+                    /* Lookup failure (may be from cache!) */
+                    w = GETITEM(names, oparg);
+                    format_exc_check_arg(PyExc_NameError,
+                                         GLOBAL_NAME_ERROR_MSG, w);
+                    break;
+                }
+                Py_INCREF(x);
+                PUSH(x);
+                continue;
+            }
+#else
             w = GETITEM(names, oparg);
             if (PyString_CheckExact(w)) {
                 /* Inline the PyDict_GetItem() calls.
@@ -1898,7 +1973,7 @@
             Py_INCREF(x);
             PUSH(x);
             continue;
-
+#endif
         case DELETE_FAST:
             x = GETLOCAL(oparg);
             if (x != NULL) {
@@ -1986,6 +2061,42 @@
             break;
 
         case LOAD_ATTR:
+#ifdef CACHED_MODULE_LOOKUPS
+            v = TOP();
+            CACHED_LOOKUP_STAT(loadattr_total);
+            if (PyModule_CheckExact(v)) {
+                LookupDCacheEntry *entry = co->co_lookup_dcache + oparg;
+                CACHED_LOOKUP_STAT(loadattr_module);
+                PyDictObject *d = (PyDictObject *)
+                    ((PyModuleObject *)v)->md_dict;
+                if (d && LOOKUP_CACHE_HIT2(entry, d)) {
+                    /* Cache hit */
+                    CACHED_LOOKUP_STAT(loadattr_hit);
+                    x = entry->value;
+                    /* Note that we know for sure x != NULL */
+                    Py_INCREF(x);
+                    Py_DECREF(v);
+                    SET_TOP(x);
+                    if (x != NULL) continue;
+                    break;
+                } else {
+                    /* Cache miss, do a regular lookup */
+                    w = GETITEM(names, oparg);
+                    x = PyObject_GetAttr(v, w);
+                    if (x != NULL && d != NULL) {
+                        /* Note: failed lookups are not cached! */
+                        LOOKUP_CACHE_STORE2(entry, d, x);
+                    }
+                }
+            } else {
+                w = GETITEM(names, oparg);
+                x = PyObject_GetAttr(v, w);
+            }
+            Py_DECREF(v);
+            SET_TOP(x);
+            if (x != NULL) continue;
+            break;
+#else
             w = GETITEM(names, oparg);
             v = TOP();
             x = PyObject_GetAttr(v, w);
@@ -1993,7 +2104,7 @@
             SET_TOP(x);
             if (x != NULL) continue;
             break;
-
+#endif
         case COMPARE_OP:
             w = POP();
             v = TOP();
Index: Python/compile.c
===================================================================
--- Python/compile.c	(revision 53197)
+++ Python/compile.c	(working copy)
@@ -3315,7 +3315,74 @@
     int a_lineno_off;       /* bytecode offset of last lineno */
 };
 
+#ifdef CACHED_LOOKUPS
+
+/* During the dfs traversal, all names that are used in
+ * LOAD_GLOBAL or LOAD_ATTR instructions are marked so that
+ * they can be moved to the front of the co_names list.
+ * This lowers memory consumption and prevents cache
+ * thrashing.  The attrnames parameter points to an array of
+ * integers indexed by oparg; msb=1 means the name is
+ * used in LOAD_ATTR/LOAD_GLOBAL.
+ */
+static void
+dfs(struct compiler *c, basicblock *b, struct assembler *a, int *attrnames)
+{
+    int i;
+    struct instr *instr = NULL;
+
+    if (b->b_seen)
+        return;
+    b->b_seen = 1;
+    if (b->b_next != NULL)
+        dfs(c, b->b_next, a, attrnames);
+    for (i = 0; i < b->b_iused; i++) {
+        instr = &b->b_instr[i];
+        if (instr->i_opcode == LOAD_GLOBAL ||
+            instr->i_opcode == LOAD_ATTR) {
+            /* Flag the name as used */
+            attrnames[instr->i_oparg] |= 0x80000000;
+        }
+        if (instr->i_jrel || instr->i_jabs)
+            dfs(c, instr->i_target, a, attrnames);
+    }
+    a->a_postorder[a->a_nblocks++] = b;
+}
+
+/* This function fixes the argument of LOAD_GLOBAL/LOAD_ATTR
+ * opcodes using the indexes specified in inames.
+ */
+static void
+reorder_names(struct assembler *a, int *inames)
+{
+    int bi, i;
+    struct instr *instr = NULL;
+    for (bi = 0; bi < a->a_nblocks; bi++) {
+        basicblock *b = a->a_postorder[bi];
+        for (i = 0; i < b->b_iused; i++) {
+            instr = &b->b_instr[i];
+            if (instr->i_opcode == STORE_NAME ||
+                instr->i_opcode == DELETE_NAME ||
+                instr->i_opcode == STORE_ATTR ||
+                instr->i_opcode == DELETE_ATTR ||
+                instr->i_opcode == STORE_GLOBAL ||
+                instr->i_opcode == DELETE_GLOBAL ||
+                instr->i_opcode == LOAD_NAME ||
+                instr->i_opcode == LOAD_ATTR ||
+                instr->i_opcode == IMPORT_NAME ||
+                instr->i_opcode == IMPORT_FROM ||
+                instr->i_opcode == LOAD_GLOBAL) {
+                instr->i_oparg = inames[instr->i_oparg];
+            }
+        }
+    }
+}
+
+#else
+
+static void
 dfs(struct compiler *c, basicblock *b, struct assembler *a)
 {
     int i;
@@ -3334,6 +3401,8 @@
     a->a_postorder[a->a_nblocks++] = b;
 }
 
+#endif
+
 static int
 stackdepth_walk(struct compiler *c, basicblock *b, int depth, int maxdepth)
 {
@@ -3754,7 +3823,13 @@
     consts = PySequence_List(tmp); /* optimize_code requires a list */
     Py_DECREF(tmp);
 
+#ifdef CACHED_LOOKUPS
+    /* u_names is already a tuple! */
+    Py_INCREF(c->u->u_names);
+    names = c->u->u_names;
+#else
     names = dict_keys_inorder(c->u->u_names, 0);
+#endif
     varnames = dict_keys_inorder(c->u->u_varnames, 0);
     if (!consts || !names || !varnames)
         goto error;
@@ -3872,8 +3947,72 @@
     }
     if (!assemble_init(&a, nblocks, c->u->u_firstlineno))
         goto error;
+#ifdef CACHED_LOOKUPS
+    {
+        /* Sort names so that the ones used by LOAD_GLOBAL
+         * and LOAD_ATTR opcodes come first in co_names.
+         * To do this, use the dfs visit to mark all interesting
+         * names in the attrnames array.
+         */
+        int n, rp, wp;
+        PyObject *names = dict_keys_inorder(c->u->u_names, 0);
+        if (!names) {
+            PyErr_NoMemory();
+            goto error;
+        }
+        n = PyTuple_GET_SIZE(names);
+        int *attrnames = (int *) PyObject_Malloc(n*2*sizeof(int));
+        if (!attrnames) {
+            Py_DECREF(names);
+            PyErr_NoMemory();
+            goto error;
+        }
+        for (i=0; iu->u_names);
+        c->u->u_names = names;
+    }
+#else
     dfs(c, entryblock, &a);
-
+#endif
     /* Can't modify the bytecode after computing jump offsets. */
     assemble_jump_offsets(&a, c);
Index: Include/code.h
===================================================================
--- Include/code.h	(revision 53197)
+++ Include/code.h	(working copy)
@@ -7,7 +7,8 @@
 #endif
 
 /* Bytecode object */
-typedef struct {
+typedef struct _codeobject PyCodeObject;
+struct _codeobject {
     PyObject_HEAD
     int co_argcount;        /* #arguments, except *args */
     int co_nlocals;         /* #local variables */
@@ -25,8 +26,18 @@
     int co_firstlineno;     /* first source line number */
     PyObject *co_lnotab;    /* string (encoding addr<->lineno mapping) */
     void *co_zombieframe;   /* for optimization only (see frameobject.c) */
-} PyCodeObject;
+#if defined(CACHED_LOOKUPS)
+    /* Dictionary lookup caching optimization; see dictobject.h */
+    LookupDCacheEntry *co_lookup_dcache;
+    int co_lookup_cache_size;   /* Number of entries in lookup cache */
+    PyCodeObject *prev, *next;  /* doubly-linked list of all code objects */
+#endif
+};
 
+#if defined(CACHED_LOOKUPS)
+void co_clear_lookup_cache(void);
+#endif
+
 /* Masks for co_flags above */
 #define CO_OPTIMIZED    0x0001
 #define CO_NEWLOCALS    0x0002
Index: Include/dictobject.h
===================================================================
--- Include/dictobject.h	(revision 53197)
+++ Include/dictobject.h	(working copy)
@@ -37,6 +37,78 @@
 meaning otherwise.
 */
 
+#ifdef CACHED_LOOKUPS
+/* Dicts also support timestamping, i.e. they carry a special timestamp
+ * field that is automatically changed at every modification of the
+ * dictionary's contents.  This timestamp field is used in a few
+ * speed-critical places to avoid repeating a lookup operation for the
+ * same key if the dictionary hasn't changed since the last lookup.
+ */
+#ifdef LONG_LONG_LOOKUP_TIMESTAMPS
+
+typedef unsigned long long lookup_tstamp_t;
+
+#else
+
+typedef size_t lookup_tstamp_t;
+
+#endif
+
+/* Cached lookups are stored as pairs of lookup_tstamp_t/PyObject* for
+ * normal lookups (LOAD_ATTR specialization for module objects) and as
+ * triplets with two lookup_tstamp_t for global+builtin lookups
+ * (LOAD_GLOBAL); the idea is that for LOAD_GLOBAL it is faster to
+ * check that neither of the two dicts has changed and reuse the result
+ * than to check the globals and (for builtins) builtins results
+ * separately.
+ *
+ * NOTE: for now only double entries are used and the cache is
+ *       allocated parallel to co_names and indexed by oparg in
+ *       both LOAD_ATTR and LOAD_GLOBAL.
+ *
+ * NOTE: the global timestamp is incremented until it wraps to 0,
+ *       and at that point a sweep over all existing dictionaries
+ *       and code objects is done to clear the caches.
+ */
+
+/* not used for now */
+typedef struct LOOKUP_CACHE_ENTRY_TAG {
+    lookup_tstamp_t timestamp;
+    PyObject *value;
+} LookupCacheEntry;
+
+/* abused also for LOAD_ATTR */
+typedef struct LOOKUP_D_CACHE_ENTRY_TAG {
+    lookup_tstamp_t timestamp1;
+    lookup_tstamp_t timestamp2;
+    PyObject *value;
+} LookupDCacheEntry;
+
+#define LOOKUP_CACHE_HIT(entry, d) \
+    ((entry)->timestamp == ((PyDictObject *)(d))->timestamp)
+
+#define LOOKUP_CACHE_STORE(entry, d, x)\
+    do{ (entry)->timestamp = ((PyDictObject *)(d))->timestamp;\
+        (entry)->value = (x); } while(0)
+
+#define LOOKUP_CACHE_HIT2(entry, d) \
+    ((entry)->timestamp1 == ((PyDictObject *)(d))->timestamp)
+
+#define LOOKUP_CACHE_STORE2(entry, d, x)\
+    do{ (entry)->timestamp1 = (entry)->timestamp2 =\
+            ((PyDictObject *)(d))->timestamp;\
+        (entry)->value = (x); } while(0)
+
+#define LOOKUP_D_CACHE_HIT(entry, d1, d2) \
+    ((entry)->timestamp1 == ((PyDictObject *)(d1))->timestamp &&\
+     (entry)->timestamp2 == ((PyDictObject *)(d2))->timestamp)
+
+#define LOOKUP_D_CACHE_STORE(entry, d1, d2, x)\
+    do{ (entry)->timestamp1 = ((PyDictObject *)(d1))->timestamp;\
+        (entry)->timestamp2 = ((PyDictObject *)(d2))->timestamp;\
+        (entry)->value = (x); } while(0)
+
+#endif
+
 /* PyDict_MINSIZE is the minimum size of a dictionary.  This many slots are
  * allocated directly in the dict object (in the ma_smalltable member).
  * It must be a power of 2, and at least 4.  8 allows dicts with no more
@@ -69,6 +141,14 @@
 typedef struct _dictobject PyDictObject;
 struct _dictobject {
     PyObject_HEAD
+#ifdef CACHED_LOOKUPS
+    /* timestamp is updated with an ever-incrementing counter
+     * each time the dictionary is modified; this field is used
+     * to validate cached lookups from Python/ceval.c
+     */
+    lookup_tstamp_t timestamp;
+    PyDictObject *prev, *next;
+#endif
     Py_ssize_t ma_fill;  /* # Active + # Dummy */
     Py_ssize_t ma_used;  /* # Active */
Index: Objects/codeobject.c
===================================================================
--- Objects/codeobject.c	(revision 53197)
+++ Objects/codeobject.c	(working copy)
@@ -2,6 +2,10 @@
 #include "code.h"
 #include "structmember.h"
 
+#ifdef CACHED_LOOKUPS
+#include "opcode.h"
+#endif
+
 #define NAME_CHARS \
     "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz"
 
@@ -39,7 +43,35 @@
     }
 }
 
+#ifdef CACHED_LOOKUPS
+/* Forward */
+static void
+code_dealloc(PyCodeObject *co);
+
+static
+PyCodeObject *first_code_object = NULL;
+
+void
+co_clear_lookup_cache(void)
+{
+    PyCodeObject *co = first_code_object;
+    int nn = 0;
+    while (co) {
+        int n = co->co_lookup_cache_size;
+        LookupDCacheEntry *e = co->co_lookup_dcache;
+        while (n-- > 0) {
+            e->timestamp1 = 0;
+            e->timestamp2 = 0;
+            e++;
+        }
+        nn++;
+        co = co->next;
+    }
+}
+
+#endif
+
 PyCodeObject *
 PyCode_New(int argcount, int nlocals, int stacksize, int flags,
            PyObject *code, PyObject *consts, PyObject *names,
@@ -103,11 +135,63 @@
     Py_INCREF(lnotab);
     co->co_lnotab = lnotab;
     co->co_zombieframe = NULL;
+#if defined(CACHED_LOOKUPS)
+    /* Lookup cache allocation */
+    {
+        /* Compute the maximum index in co_names used
+         * by LOAD_GLOBAL/LOAD_ATTR opcodes.  This number
+         * will be used to compute the size of the
+         * lookup cache.
+         */
+        Py_ssize_t n = PyString_GET_SIZE(code), j, mx = -1;
+        unsigned char *s = (unsigned char *)PyString_AS_STRING(code);
+        for (j = 0; j < n; ) {
+            int op = s[j++];
+            if (op >= HAVE_ARGUMENT && j + 2 <= n) {
+                if (op == LOAD_GLOBAL || op == LOAD_ATTR) {
+                    int ix = s[j] + (s[j+1] << 8);
+                    if (ix > mx) mx = ix;
+                }
+                j += 2;
+            } else {
+                /* NOP */
+            }
+        }
+        i = mx + 1;
+    }
+    /* Chain into global list */
+    co->next = first_code_object;
+    co->prev = NULL;
+    if (co->next) co->next->prev = co;
+    first_code_object = co;
+    /* Initialize cache entries */
+    co->co_lookup_dcache = NULL;
+    co->co_lookup_cache_size = i;
+    if (i > 0) {
+        LookupDCacheEntry *c = (LookupDCacheEntry *)
+            PyObject_Malloc(i*sizeof(LookupDCacheEntry));
+        if (c != NULL) {
+            while (i-- > 0) {
+                c[i].timestamp1 = 0;
+                c[i].timestamp2 = 0;
+                c[i].value = NULL;
+            }
+            co->co_lookup_dcache = c;
+        } else {
+            /* Whoops, are we running out of memory? */
+            code_dealloc(co);
+            co = NULL;
+        }
+    }
+#endif
     }
     return co;
 }
-
 #define OFF(x) offsetof(PyCodeObject, x)
 
 static PyMemberDef code_memberlist[] = {
@@ -268,6 +352,15 @@
     Py_XDECREF(co->co_lnotab);
     if (co->co_zombieframe != NULL)
         PyObject_GC_Del(co->co_zombieframe);
+#ifdef CACHED_LOOKUPS
+    if (co->co_lookup_dcache != NULL) {
+        /* Free lookup cache */
+        PyObject_Free(co->co_lookup_dcache);
+    }
+    /* Unchain from global list */
+    if (co->prev) co->prev->next = co->next; else first_code_object = co->next;
+    if (co->next) co->next->prev = co->prev;
+#endif
     PyObject_DEL(co);
 }
Index: Objects/dictobject.c
===================================================================
--- Objects/dictobject.c	(revision 53197)
+++ Objects/dictobject.c	(working copy)
@@ -147,6 +147,75 @@
 }
 #endif
 
+#ifdef CACHED_LOOKUPS
+
+/*
+Global counter used to mark updates to dictionaries, so that cached lookups
+can be checked for validity.  The counter is a global timestamp that is
+incremented at every change to a dictionary and stored as the dictionary's
+timestamp.  Cached lookups use borrowed pointers to the value; this is safe
+because for an object to be destroyed no dictionary can hold a reference
+to it, hence for a value to be deleted the dict must be touched first, thus
+invalidating the lookup cache entry.
+*/
+
+/*
+#define LOOKUP_DEBUG_TIMESTAMP_LIMIT
+*/
+#ifdef LOOKUP_DEBUG_TIMESTAMP_LIMIT
+
+static lookup_tstamp_t dict_timestamp_value = 0xFFFFFFFFU - 10000;
+
+#define TOUCHDICT(d) do{ if (++dict_timestamp_value == 0) { \
+        dict_clear_lookup_cache();\
+        co_clear_lookup_cache();\
+        printf("Lookup tstamp limit reached\n");\
+        dict_timestamp_value = 0xFFFFFFFFU - 10000;\
+    }\
+    (d)->timestamp = dict_timestamp_value; } while(0)
+
+#else
+
+static lookup_tstamp_t dict_timestamp_value = 1;
+
+#define TOUCHDICT(d) do{ if (++dict_timestamp_value == 0) { \
+        dict_clear_lookup_cache();\
+        co_clear_lookup_cache();\
+    }\
+    (d)->timestamp = dict_timestamp_value; } while(0)
+
+#endif
+
+/* All dictionaries are kept in a doubly-linked list so that
+ * cached lookups can be invalidated when the global timestamp
+ * wraps around to 0.
+ * Note that dictionaries are removed from the list only when
+ * the object is really destroyed; dictionaries in the free
+ * list are instead kept in the list.  This wastes some time
+ * during the cleaning sweep but saves some time when reusing
+ * a dictionary from the free list.
+ */
+static
+PyDictObject *first_dict = NULL;
+
+static
+void dict_clear_lookup_cache(void)
+{
+    PyDictObject *d = first_dict;
+    lookup_tstamp_t x = 1;
+    while (d) {
+        d->timestamp = ++x;
+        d = d->next;
+    }
+    dict_timestamp_value = x;
+}
+
+#else
+
+#define TOUCHDICT(d) /* nop */
+
+#endif
+
 /* forward declarations */
 static dictentry *
 lookdict_string(dictobject *mp, PyObject *key, long hash);
@@ -217,7 +286,17 @@
         if (mp == NULL)
             return NULL;
         EMPTY_TO_MINSIZE(mp);
+#ifdef CACHED_LOOKUPS
+        /* Chain into the list of all dictionaries */
+        mp->next = first_dict;
+        mp->prev = NULL;
+        if (mp->next) mp->next->prev = mp;
+        first_dict = mp;
+#endif
     }
+#ifdef CACHED_LOOKUPS
+    mp->timestamp = 1;
+#endif
     mp->ma_lookup = lookdict_string;
 #ifdef SHOW_CONVERSION_COUNTS
     ++created;
@@ -620,6 +699,7 @@
     assert(key);
     assert(value);
     mp = (dictobject *)op;
+    TOUCHDICT(mp);
     if (PyString_CheckExact(key)) {
         hash = ((PyStringObject *)key)->ob_shash;
         if (hash == -1)
@@ -682,6 +762,7 @@
         set_key_error(key);
         return -1;
     }
+    TOUCHDICT(mp);
     old_key = ep->me_key;
     Py_INCREF(dummy);
     ep->me_key = dummy;
@@ -713,6 +794,7 @@
     i = 0;
 #endif
 
+    TOUCHDICT(mp);
     table = mp->ma_table;
     assert(table != NULL);
     table_is_malloced = table != mp->ma_smalltable;
@@ -823,8 +905,14 @@
         PyMem_DEL(mp->ma_table);
     if (num_free_dicts < MAXFREEDICTS && mp->ob_type == &PyDict_Type)
         free_dicts[num_free_dicts++] = mp;
-    else
+    else {
+#ifdef CACHED_LOOKUPS
+        /* Unchain from the list of all dictionaries */
+        if (mp->prev) mp->prev->next = mp->next; else first_dict = mp->next;
+        if (mp->next) mp->next->prev = mp->prev;
+#endif
         mp->ob_type->tp_free((PyObject *)mp);
+    }
     Py_TRASHCAN_SAFE_END(mp)
 }
@@ -1763,6 +1851,7 @@
         set_key_error(key);
         return NULL;
     }
+    TOUCHDICT(mp);
     old_key = ep->me_key;
     Py_INCREF(dummy);
     ep->me_key = dummy;
@@ -1820,6 +1909,7 @@
             i = 1;
         }
     }
+    TOUCHDICT(mp);
    PyTuple_SET_ITEM(res, 0, ep->me_key);
    PyTuple_SET_ITEM(res, 1, ep->me_value);
    Py_INCREF(dummy);
@@ -2014,6 +2104,14 @@
     assert(d->ma_table == NULL && d->ma_fill == 0 && d->ma_used == 0);
     INIT_NONZERO_DICT_SLOTS(d);
     d->ma_lookup = lookdict_string;
+#ifdef CACHED_LOOKUPS
+    /* Chain into the list of all dictionaries */
+    d->next = first_dict;
+    d->prev = NULL;
+    if (d->next) d->next->prev = d;
+    first_dict = d;
+    d->timestamp = 1;
+#endif
 #ifdef SHOW_CONVERSION_COUNTS
     ++created;
 #endif
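
For readers who want the core mechanism of the patch in isolation, the sketch below is a minimal, self-contained illustration of the timestamp-validated caching scheme, not code from the patch: Dict, CacheEntry, dict_set and cached_lookup are simplified stand-ins for PyDictObject, LookupDCacheEntry, PyDict_SetItem and the LOOKUP_CACHE_HIT/LOOKUP_CACHE_STORE macros, and a one-slot toy dictionary stands in for the real hash table.

/* Sketch only: timestamp-validated lookup caching on a toy one-slot dict.
 * All names here are simplified stand-ins, not the patch's actual types. */
#include <stdio.h>
#include <stddef.h>
#include <string.h>

typedef size_t tstamp_t;
static tstamp_t global_tstamp = 1;

typedef struct { tstamp_t tstamp; const char *key; const char *value; } Dict;
typedef struct { tstamp_t tstamp; const char *value; } CacheEntry;

static void dict_set(Dict *d, const char *key, const char *value)
{
    d->tstamp = ++global_tstamp;   /* like TOUCHDICT: any mutation bumps the stamp */
    d->key = key;
    d->value = value;
}

static const char *cached_lookup(Dict *d, const char *key, CacheEntry *e)
{
    if (e->tstamp == d->tstamp)    /* like LOOKUP_CACHE_HIT: dict unchanged, reuse result */
        return e->value;
    /* miss: do the real lookup, then remember the dict's current stamp */
    const char *v = (d->key != NULL && strcmp(d->key, key) == 0) ? d->value : NULL;
    e->tstamp = d->tstamp;         /* like LOOKUP_CACHE_STORE */
    e->value = v;
    return v;
}

int main(void)
{
    Dict globals = {0, NULL, NULL};
    CacheEntry entry = {0, NULL};

    dict_set(&globals, "x", "1");
    printf("%s\n", cached_lookup(&globals, "x", &entry));  /* miss, result cached */
    printf("%s\n", cached_lookup(&globals, "x", &entry));  /* hit: stamps match */
    dict_set(&globals, "x", "2");                          /* touch invalidates the entry */
    printf("%s\n", cached_lookup(&globals, "x", &entry));  /* miss again, sees "2" */
    return 0;
}

The same reasoning about borrowed pointers applies here as in the patch: a cached value can only go stale after the dict is touched, and touching it bumps the timestamp, which forces the next lookup to miss.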
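
As an illustration of the name-reordering step described in the assemble() comment in compile.c (names used by LOAD_GLOBAL/LOAD_ATTR are moved to the front of co_names, and an old-index-to-new-index mapping is handed to reorder_names), here is a hypothetical sketch of how such a permutation can be computed; partition_names, used and old_to_new are invented names for illustration only, not identifiers from the patch.

/* Hypothetical sketch, not the patch's code: pack "used" names at the front
 * and record each name's new index so opcode arguments can be rewritten. */
#include <stdio.h>

static void partition_names(const int *used, int *old_to_new, int n)
{
    int next = 0, i;
    for (i = 0; i < n; i++)        /* names used by cached opcodes get the low indexes */
        if (used[i])
            old_to_new[i] = next++;
    for (i = 0; i < n; i++)        /* remaining names follow, keeping relative order */
        if (!used[i])
            old_to_new[i] = next++;
}

int main(void)
{
    int used[5] = {0, 1, 0, 1, 1};
    int old_to_new[5];
    int i;
    partition_names(used, old_to_new, 5);
    for (i = 0; i < 5; i++)
        printf("name %d -> %d\n", i, old_to_new[i]);   /* prints 3, 0, 4, 1, 2 */
    return 0;
}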