Index: Python/ceval.c
===================================================================
--- Python/ceval.c	(revision 53197)
+++ Python/ceval.c	(working copy)
@@ -1,4 +1,3 @@
-
 /* Execute compiled code */
 
 /* XXX TO DO:
@@ -418,7 +417,41 @@
     return 0;
 }
 
+#ifdef CACHED_LOOKUPS
+#ifdef CACHED_MODULE_LOOKUPS
+/* BAD HACK ... replicating module structure here! */
+typedef struct {
+    PyObject_HEAD
+    PyObject *md_dict;
+} PyModuleObject;
+#endif
+
+/*#define CACHED_LOOKUP_STATS*/
+#ifdef CACHED_LOOKUP_STATS
+
+static int loadglobal_total = 0;
+static int loadglobal_hit = 0;
+static int loadattr_total = 0;
+static int loadattr_module = 0;
+static int loadattr_hit = 0;
+static int loadattr_init = 0;
+
+static void dumpstats(void)
+{
+    printf("LOAD_GLOBAL total  = %i\n", loadglobal_total);
+    printf("LOAD_GLOBAL hit    = %i\n", loadglobal_hit);
+    printf("LOAD_ATTR total    = %i\n", loadattr_total);
+    printf("LOAD_ATTR module   = %i\n", loadattr_module);
+    printf("LOAD_ATTR hit      = %i\n", loadattr_hit);
+}
+
+#define CACHED_LOOKUP_STAT(x) do{(x)++;}while(0)
+#else
+#define CACHED_LOOKUP_STAT(x)
+#endif
+#endif
+
 /* The interpreter's recursion limit */
 
 #ifndef Py_DEFAULT_RECURSION_LIMIT
@@ -531,6 +564,13 @@
     PyObject *retval = NULL;            /* Return value */
     PyThreadState *tstate = PyThreadState_GET();
     PyCodeObject *co;
+#ifdef CACHED_LOOKUP_STATS
+    if (!loadattr_init)
+    {
+        loadattr_init = 1;
+        atexit(dumpstats);
+    }
+#endif
 
     /* when tracing we set things up so that
@@ -1847,6 +1887,41 @@
             continue;
 
         case LOAD_GLOBAL:
+#ifdef CACHED_LOOKUPS
+            CACHED_LOOKUP_STAT(loadglobal_total);
+            {
+                /* Check for cached values */
+                LookupDCacheEntry *entry = co->co_lookup_dcache + oparg;
+                if (LOOKUP_D_CACHE_HIT(entry,
+                                       f->f_globals,
+                                       f->f_builtins)) {
+                    /* Cache hit */
+                    CACHED_LOOKUP_STAT(loadglobal_hit);
+                    x = entry->value;
+                } else {
+                    /* Cache miss, do a regular lookup */
+                    w = GETITEM(names, oparg);
+                    x = PyDict_GetItem(f->f_globals, w);
+                    if (x == NULL)
+                        x = PyDict_GetItem(f->f_builtins, w);
+                    /* Update the cache */
+                    LOOKUP_D_CACHE_STORE(entry,
+                                         f->f_globals,
+                                         f->f_builtins,
+                                         x);
+                }
+                if (x == NULL) {
+                    /* Lookup failure (may be from cache!) */
+                    w = GETITEM(names, oparg);
+                    format_exc_check_arg(PyExc_NameError,
+                                         GLOBAL_NAME_ERROR_MSG, w);
+                    break;
+                }
+                Py_INCREF(x);
+                PUSH(x);
+                continue;
+            }
+#else
             w = GETITEM(names, oparg);
             if (PyString_CheckExact(w)) {
                 /* Inline the PyDict_GetItem() calls.
@@ -1898,7 +1973,7 @@
             Py_INCREF(x);
             PUSH(x);
             continue;
-
+#endif
         case DELETE_FAST:
             x = GETLOCAL(oparg);
             if (x != NULL) {
@@ -1986,6 +2061,42 @@
             break;
 
         case LOAD_ATTR:
+#ifdef CACHED_MODULE_LOOKUPS
+            v = TOP();
+            CACHED_LOOKUP_STAT(loadattr_total);
+            if (PyModule_CheckExact(v)) {
+                LookupDCacheEntry *entry = co->co_lookup_dcache + oparg;
+                CACHED_LOOKUP_STAT(loadattr_module);
+                PyDictObject *d = (PyDictObject *)
+                    ((PyModuleObject *)v)->md_dict;
+                if (d && LOOKUP_CACHE_HIT2(entry, d)) {
+                    /* Cache hit */
+                    CACHED_LOOKUP_STAT(loadattr_hit);
+                    x = entry->value;
+                    /* Note that we know for sure x != NULL */
+                    Py_INCREF(x);
+                    Py_DECREF(v);
+                    SET_TOP(x);
+                    if (x != NULL) continue;
+                    break;
+                } else {
+                    /* Cache miss, do a regular lookup */
+                    w = GETITEM(names, oparg);
+                    x = PyObject_GetAttr(v, w);
+                    if (x != NULL && d != NULL) {
+                        /* Note: failed lookups are not cached! */
+                        LOOKUP_CACHE_STORE2(entry, d, x);
+                    }
+                }
+            } else {
+                w = GETITEM(names, oparg);
+                x = PyObject_GetAttr(v, w);
+            }
+            Py_DECREF(v);
+            SET_TOP(x);
+            if (x != NULL) continue;
+            break;
+#else
             w = GETITEM(names, oparg);
             v = TOP();
             x = PyObject_GetAttr(v, w);
@@ -1993,7 +2104,7 @@
             SET_TOP(x);
             if (x != NULL) continue;
             break;
-
+#endif
         case COMPARE_OP:
             w = POP();
             v = TOP();
Index: Python/compile.c
===================================================================
--- Python/compile.c	(revision 53197)
+++ Python/compile.c	(working copy)
@@ -3315,7 +3315,74 @@
     int a_lineno_off;       /* bytecode offset of last lineno */
 };
 
+#ifdef CACHED_LOOKUPS
+
+/* During the dfs traversal, all names that are used in
+ * LOAD_GLOBAL or LOAD_ATTR instructions are marked so that
+ * they can be moved to the front of the co_names list.
+ * This lowers memory consumption and prevents cache
+ * thrashing.  The attrnames parameter points to an array of
+ * integers indexed by oparg; msb=1 means the name is
+ * used in LOAD_ATTR/LOAD_GLOBAL.
+ */
+static void
+dfs(struct compiler *c, basicblock *b, struct assembler *a, int *attrnames)
+{
+    int i;
+    struct instr *instr = NULL;
+
+    if (b->b_seen)
+        return;
+    b->b_seen = 1;
+    if (b->b_next != NULL)
+        dfs(c, b->b_next, a, attrnames);
+    for (i = 0; i < b->b_iused; i++) {
+        instr = &b->b_instr[i];
+        if (instr->i_opcode == LOAD_GLOBAL ||
+            instr->i_opcode == LOAD_ATTR) {
+            /* Flag the name as used */
+            attrnames[instr->i_oparg] |= 0x80000000;
+        }
+        if (instr->i_jrel || instr->i_jabs)
+            dfs(c, instr->i_target, a, attrnames);
+    }
+    a->a_postorder[a->a_nblocks++] = b;
+}
+
+/* This function fixes the argument of LOAD_GLOBAL/LOAD_ATTR
+ * opcodes using the indexes specified in inames.
+ */
+static void
+reorder_names(struct assembler *a, int *inames)
+{
+    int bi, i;
+    struct instr *instr = NULL;
+    for (bi = 0; bi < a->a_nblocks; bi++) {
+        basicblock *b = a->a_postorder[bi];
+        for (i = 0; i < b->b_iused; i++) {
+            instr = &b->b_instr[i];
+            if (instr->i_opcode == STORE_NAME ||
+                instr->i_opcode == DELETE_NAME ||
+                instr->i_opcode == STORE_ATTR ||
+                instr->i_opcode == DELETE_ATTR ||
+                instr->i_opcode == STORE_GLOBAL ||
+                instr->i_opcode == DELETE_GLOBAL ||
+                instr->i_opcode == LOAD_NAME ||
+                instr->i_opcode == LOAD_ATTR ||
+                instr->i_opcode == IMPORT_NAME ||
+                instr->i_opcode == IMPORT_FROM ||
+                instr->i_opcode == LOAD_GLOBAL) {
+                instr->i_oparg = inames[instr->i_oparg];
+            }
+        }
+    }
+}
+
+#else
+
+static void
 dfs(struct compiler *c, basicblock *b, struct assembler *a)
 {
     int i;
@@ -3334,6 +3401,8 @@
     a->a_postorder[a->a_nblocks++] = b;
 }
 
+#endif
+
 static int
 stackdepth_walk(struct compiler *c, basicblock *b, int depth, int maxdepth)
 {
@@ -3754,7 +3823,13 @@
     consts = PySequence_List(tmp); /* optimize_code requires a list */
     Py_DECREF(tmp);
 
+#ifdef CACHED_LOOKUPS
+    /* u_names is already a tuple! */
+    Py_INCREF(c->u->u_names);
+    names = c->u->u_names;
+#else
     names = dict_keys_inorder(c->u->u_names, 0);
+#endif
     varnames = dict_keys_inorder(c->u->u_varnames, 0);
     if (!consts || !names || !varnames)
         goto error;
@@ -3872,8 +3947,72 @@
     }
     if (!assemble_init(&a, nblocks, c->u->u_firstlineno))
         goto error;
+#ifdef CACHED_LOOKUPS
+    {
+        /* Sort names so that the ones used by LOAD_GLOBAL
+         * and LOAD_ATTR opcodes come first in co_names.
+         * To do this, use the dfs visit to mark all interesting
+         * names in the attrnames array.
+         */
+        int n, rp, wp;
+        PyObject *names = dict_keys_inorder(c->u->u_names, 0);
+        if (!names) {
+            PyErr_NoMemory();
+            goto error;
+        }
+        n = PyTuple_GET_SIZE(names);
+        int *attrnames = (int *) PyObject_Malloc(n*2*sizeof(int));
+        if (!attrnames) {
+            Py_DECREF(names);
+            PyErr_NoMemory();
+            goto error;
+        }
+        for (i=0; iu->u_names);
+        c->u->u_names = names;
+    }
+#else
     dfs(c, entryblock, &a);
-
+#endif
     /* Can't modify the bytecode after computing jump offsets. */
     assemble_jump_offsets(&a, c);
Index: Include/code.h
===================================================================
--- Include/code.h	(revision 53197)
+++ Include/code.h	(working copy)
@@ -7,7 +7,8 @@
 #endif
 
 /* Bytecode object */
-typedef struct {
+typedef struct _codeobject PyCodeObject;
+struct _codeobject {
     PyObject_HEAD
     int co_argcount;        /* #arguments, except *args */
     int co_nlocals;         /* #local variables */
@@ -25,8 +26,18 @@
     int co_firstlineno;     /* first source line number */
     PyObject *co_lnotab;    /* string (encoding addr<->lineno mapping) */
     void *co_zombieframe;   /* for optimization only (see frameobject.c) */
-} PyCodeObject;
+#if defined(CACHED_LOOKUPS)
+    /* Dictionary lookup caching optimization; see dictobject.h */
+    LookupDCacheEntry *co_lookup_dcache;
+    int co_lookup_cache_size;   /* Number of entries in lookup cache */
+    PyCodeObject *prev, *next;  /* doubly-linked list of all code objects */
+#endif
+};
 
+#if defined(CACHED_LOOKUPS)
+void co_clear_lookup_cache(void);
+#endif
+
 /* Masks for co_flags above */
 #define CO_OPTIMIZED    0x0001
 #define CO_NEWLOCALS    0x0002
Index: Include/dictobject.h
===================================================================
--- Include/dictobject.h	(revision 53197)
+++ Include/dictobject.h	(working copy)
@@ -37,6 +37,78 @@
 meaning otherwise.
 */
 
+#ifdef CACHED_LOOKUPS
+/* Dicts also support timestamping, i.e. they carry a special timestamp
+ * field that is automatically changed at every modification of the
+ * dictionary's contents.  This timestamp field is used in a few
+ * speed-critical places to avoid repeating a lookup operation for the
+ * same key if the dictionary hasn't changed since the last lookup.
+ */
+#ifdef LONG_LONG_LOOKUP_TIMESTAMPS
+
+typedef unsigned long long lookup_tstamp_t;
+
+#else
+
+typedef size_t lookup_tstamp_t;
+
+#endif
+
+/* Cached lookups are stored as pairs of lookup_tstamp_t/PyObject* for
+ * normal lookups (LOAD_ATTR specialization for module objects) and as
+ * triplets with two lookup_tstamp_t for global+builtin lookups
+ * (LOAD_GLOBAL); the idea is that for LOAD_GLOBAL it is faster to
+ * check that neither of the two dicts has changed and reuse the result
+ * than to check the globals and (for builtins) builtins results
+ * separately.
+ *
+ * NOTE: for now only double entries are used and the cache is
+ *       allocated parallel to co_names and indexed by oparg in
+ *       both LOAD_ATTR and LOAD_GLOBAL.
+ *
+ * NOTE: the global timestamp is incremented until it wraps to 0,
+ *       and at that point a sweep over all existing dictionaries
+ *       and code objects is done to clear the caches.
+ */
+
+/* not used for now */
+typedef struct LOOKUP_CACHE_ENTRY_TAG {
+    lookup_tstamp_t timestamp;
+    PyObject *value;
+} LookupCacheEntry;
+
+/* abused also for LOAD_ATTR */
+typedef struct LOOKUP_D_CACHE_ENTRY_TAG {
+    lookup_tstamp_t timestamp1;
+    lookup_tstamp_t timestamp2;
+    PyObject *value;
+} LookupDCacheEntry;
+
+#define LOOKUP_CACHE_HIT(entry, d) \
+    ((entry)->timestamp == ((PyDictObject *)(d))->timestamp)
+
+#define LOOKUP_CACHE_STORE(entry, d, x)\
+    do{ (entry)->timestamp = ((PyDictObject *)(d))->timestamp;\
+        (entry)->value = (x); } while(0)
+
+#define LOOKUP_CACHE_HIT2(entry, d) \
+    ((entry)->timestamp1 == ((PyDictObject *)(d))->timestamp)
+
+#define LOOKUP_CACHE_STORE2(entry, d, x)\
+    do{ (entry)->timestamp1 = (entry)->timestamp2 =\
+            ((PyDictObject *)(d))->timestamp;\
+        (entry)->value = (x); } while(0)
+
+#define LOOKUP_D_CACHE_HIT(entry, d1, d2) \
+    ((entry)->timestamp1 == ((PyDictObject *)(d1))->timestamp &&\
+     (entry)->timestamp2 == ((PyDictObject *)(d2))->timestamp)
+
+#define LOOKUP_D_CACHE_STORE(entry, d1, d2, x)\
+    do{ (entry)->timestamp1 = ((PyDictObject *)(d1))->timestamp;\
+        (entry)->timestamp2 = ((PyDictObject *)(d2))->timestamp;\
+        (entry)->value = (x); } while(0)
+
+#endif
+
 /* PyDict_MINSIZE is the minimum size of a dictionary.  This many slots are
  * allocated directly in the dict object (in the ma_smalltable member).
  * It must be a power of 2, and at least 4.  8 allows dicts with no more
@@ -69,6 +141,14 @@
 typedef struct _dictobject PyDictObject;
 struct _dictobject {
     PyObject_HEAD
+#ifdef CACHED_LOOKUPS
+    /* timestamp is updated with an ever-incrementing counter
+     * each time the dictionary is modified; this field is used
+     * to validate cached lookups from Python/ceval.c
+     */
+    lookup_tstamp_t timestamp;
+    PyDictObject *prev, *next;
+#endif
     Py_ssize_t ma_fill;  /* # Active + # Dummy */
     Py_ssize_t ma_used;  /* # Active */
Index: Objects/codeobject.c
===================================================================
--- Objects/codeobject.c	(revision 53197)
+++ Objects/codeobject.c	(working copy)
@@ -2,6 +2,10 @@
 #include "code.h"
 #include "structmember.h"
 
+#ifdef CACHED_LOOKUPS
+#include "opcode.h"
+#endif
+
 #define NAME_CHARS \
     "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz"
 
@@ -39,7 +43,35 @@
     }
 }
 
+#ifdef CACHED_LOOKUPS
+/* Forward */
+static void
+code_dealloc(PyCodeObject *co);
+
+static
+PyCodeObject *first_code_object = NULL;
+
+void
+co_clear_lookup_cache(void)
+{
+    PyCodeObject *co = first_code_object;
+    int nn = 0;
+    while (co) {
+        int n = co->co_lookup_cache_size;
+        LookupDCacheEntry *e = co->co_lookup_dcache;
+        while (n-- > 0) {
+            e->timestamp1 = 0;
+            e->timestamp2 = 0;
+            e++;
+        }
+        nn++;
+        co = co->next;
+    }
+}
+
+#endif
+
 PyCodeObject *
 PyCode_New(int argcount, int nlocals, int stacksize, int flags,
            PyObject *code, PyObject *consts, PyObject *names,
@@ -103,11 +135,63 @@
     Py_INCREF(lnotab);
     co->co_lnotab = lnotab;
     co->co_zombieframe = NULL;
+#if defined(CACHED_LOOKUPS)
+    /* Lookup cache allocation */
+    {
+        /* Compute the maximum index in co_names used
+         * by LOAD_GLOBAL/LOAD_ATTR opcodes.  This number
+         * will be used to compute the size of the
+         * lookup cache.
+         */
+        Py_ssize_t n = PyString_GET_SIZE(code), j, mx = -1;
+        unsigned char *s = (unsigned char *)PyString_AS_STRING(code);
+        for (j = 0; j < n; ) {
+            int op = s[j++];
+            if (op >= HAVE_ARGUMENT && j + 2 <= n) {
+                if (op == LOAD_GLOBAL || op == LOAD_ATTR) {
+                    int ix = s[j] + (s[j+1] << 8);
+                    if (ix > mx) mx = ix;
+                }
+                j += 2;
+            } else {
+                /* NOP */
+            }
+        }
+        i = mx + 1;
+    }
+    /* Chain into global list */
+    co->next = first_code_object;
+    co->prev = NULL;
+    if (co->next) co->next->prev = co;
+    first_code_object = co;
+    /* Initialize cache entries */
+    co->co_lookup_dcache = NULL;
+    co->co_lookup_cache_size = i;
+    if (i > 0) {
+        LookupDCacheEntry *c = (LookupDCacheEntry *)
+            PyObject_Malloc(i*sizeof(LookupDCacheEntry));
+        if (c != NULL) {
+            while (i-- > 0) {
+                c[i].timestamp1 = 0;
+                c[i].timestamp2 = 0;
+                c[i].value = NULL;
+            }
+            co->co_lookup_dcache = c;
+        } else {
+            /* Whoops, are we running out of memory? */
+            code_dealloc(co);
+            co = NULL;
+        }
+    }
+#endif
     }
     return co;
 }
-
 #define OFF(x) offsetof(PyCodeObject, x)
 
 static PyMemberDef code_memberlist[] = {
@@ -268,6 +352,15 @@
     Py_XDECREF(co->co_lnotab);
     if (co->co_zombieframe != NULL)
         PyObject_GC_Del(co->co_zombieframe);
+#ifdef CACHED_LOOKUPS
+    if (co->co_lookup_dcache != NULL) {
+        /* Free lookup cache */
+        PyObject_Free(co->co_lookup_dcache);
+    }
+    /* Unchain from global list */
+    if (co->prev) co->prev->next = co->next; else first_code_object = co->next;
+    if (co->next) co->next->prev = co->prev;
+#endif
     PyObject_DEL(co);
 }
Index: Objects/dictobject.c
===================================================================
--- Objects/dictobject.c	(revision 53197)
+++ Objects/dictobject.c	(working copy)
@@ -147,6 +147,75 @@
 }
 #endif
 
+#ifdef CACHED_LOOKUPS
+
+/*
+Global counter used to mark updates to dictionaries, so that cached lookups
+can be checked for validity.  The counter is a global timestamp that is
+incremented at every change to a dictionary and stored as the dictionary's
+timestamp.  Cached lookups use borrowed pointers to the value; this is safe
+because for an object to be destroyed no dictionary can hold a reference
+to it, hence for a value to be deleted the dict must be touched first, thus
+invalidating the lookup cache entry.
+*/
+
+/*
+#define LOOKUP_DEBUG_TIMESTAMP_LIMIT
+*/
+#ifdef LOOKUP_DEBUG_TIMESTAMP_LIMIT
+
+static lookup_tstamp_t dict_timestamp_value = 0xFFFFFFFFU - 10000;
+
+#define TOUCHDICT(d) do{ if (++dict_timestamp_value == 0) { \
+        dict_clear_lookup_cache();\
+        co_clear_lookup_cache();\
+        printf("Lookup tstamp limit reached\n");\
+        dict_timestamp_value = 0xFFFFFFFFU - 10000;\
+    }\
+    (d)->timestamp = dict_timestamp_value; } while(0)
+
+#else
+
+static lookup_tstamp_t dict_timestamp_value = 1;
+
+#define TOUCHDICT(d) do{ if (++dict_timestamp_value == 0) { \
+        dict_clear_lookup_cache();\
+        co_clear_lookup_cache();\
+    }\
+    (d)->timestamp = dict_timestamp_value; } while(0)
+
+#endif
+
+/* All dictionaries are kept in a doubly-linked list so that
+ * cached lookups can be invalidated when the global timestamp
+ * wraps around to 0.
+ * Note that dictionaries are removed from the list only when
+ * the object is really destroyed; dictionaries in the free
+ * list are instead kept in the list.  This wastes some time
+ * during the cleaning sweep but saves some time when reusing
+ * a dictionary from the free list.
+ */
+static
+PyDictObject *first_dict = NULL;
+
+static
+void dict_clear_lookup_cache(void)
+{
+    PyDictObject *d = first_dict;
+    lookup_tstamp_t x = 1;
+    while (d) {
+        d->timestamp = ++x;
+        d = d->next;
+    }
+    dict_timestamp_value = x;
+}
+
+#else
+
+#define TOUCHDICT(d) /* nop */
+
+#endif
+
 /* forward declarations */
 static dictentry *
 lookdict_string(dictobject *mp, PyObject *key, long hash);
@@ -217,7 +286,17 @@
         if (mp == NULL)
             return NULL;
         EMPTY_TO_MINSIZE(mp);
+#ifdef CACHED_LOOKUPS
+        /* Chain into the list of all dictionaries */
+        mp->next = first_dict;
+        mp->prev = NULL;
+        if (mp->next) mp->next->prev = mp;
+        first_dict = mp;
+#endif
     }
+#ifdef CACHED_LOOKUPS
+    mp->timestamp = 1;
+#endif
     mp->ma_lookup = lookdict_string;
 #ifdef SHOW_CONVERSION_COUNTS
     ++created;
@@ -620,6 +699,7 @@
     assert(key);
     assert(value);
     mp = (dictobject *)op;
+    TOUCHDICT(mp);
     if (PyString_CheckExact(key)) {
         hash = ((PyStringObject *)key)->ob_shash;
         if (hash == -1)
@@ -682,6 +762,7 @@
         set_key_error(key);
         return -1;
     }
+    TOUCHDICT(mp);
     old_key = ep->me_key;
     Py_INCREF(dummy);
     ep->me_key = dummy;
@@ -713,6 +794,7 @@
     i = 0;
 #endif
 
+    TOUCHDICT(mp);
     table = mp->ma_table;
     assert(table != NULL);
     table_is_malloced = table != mp->ma_smalltable;
@@ -823,8 +905,14 @@
         PyMem_DEL(mp->ma_table);
     if (num_free_dicts < MAXFREEDICTS && mp->ob_type == &PyDict_Type)
         free_dicts[num_free_dicts++] = mp;
-    else
+    else {
+#ifdef CACHED_LOOKUPS
+        /* Unchain from the list of all dictionaries */
+        if (mp->prev) mp->prev->next = mp->next; else first_dict = mp->next;
+        if (mp->next) mp->next->prev = mp->prev;
+#endif
         mp->ob_type->tp_free((PyObject *)mp);
+    }
     Py_TRASHCAN_SAFE_END(mp)
 }
@@ -1763,6 +1851,7 @@
         set_key_error(key);
         return NULL;
     }
+    TOUCHDICT(mp);
     old_key = ep->me_key;
     Py_INCREF(dummy);
     ep->me_key = dummy;
@@ -1820,6 +1909,7 @@
             i = 1;
         }
     }
+    TOUCHDICT(mp);
    PyTuple_SET_ITEM(res, 0, ep->me_key);
    PyTuple_SET_ITEM(res, 1, ep->me_value);
    Py_INCREF(dummy);
@@ -2014,6 +2104,14 @@
     assert(d->ma_table == NULL && d->ma_fill == 0 && d->ma_used == 0);
     INIT_NONZERO_DICT_SLOTS(d);
     d->ma_lookup = lookdict_string;
+#ifdef CACHED_LOOKUPS
+    /* Chain into the list of all dictionaries */
+    d->next = first_dict;
+    d->prev = NULL;
+    if (d->next) d->next->prev = d;
+    first_dict = d;
+    d->timestamp = 1;
+#endif
 #ifdef SHOW_CONVERSION_COUNTS
     ++created;
 #endif
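
For readers who want the core mechanism of the patch in isolation, the sketch below is a minimal, self-contained illustration of the timestamp-validated caching scheme, not code from the patch: Dict, CacheEntry, dict_set and cached_lookup are simplified stand-ins for PyDictObject, LookupDCacheEntry, PyDict_SetItem and the LOOKUP_CACHE_HIT/LOOKUP_CACHE_STORE macros, and a one-slot toy dictionary stands in for the real hash table.

/* Sketch only: timestamp-validated lookup caching on a toy one-slot dict.
 * All names here are simplified stand-ins, not the patch's actual types. */
#include <stdio.h>
#include <stddef.h>
#include <string.h>

typedef size_t tstamp_t;
static tstamp_t global_tstamp = 1;

typedef struct { tstamp_t tstamp; const char *key; const char *value; } Dict;
typedef struct { tstamp_t tstamp; const char *value; } CacheEntry;

static void dict_set(Dict *d, const char *key, const char *value)
{
    d->tstamp = ++global_tstamp;   /* like TOUCHDICT: any mutation bumps the stamp */
    d->key = key;
    d->value = value;
}

static const char *cached_lookup(Dict *d, const char *key, CacheEntry *e)
{
    if (e->tstamp == d->tstamp)    /* like LOOKUP_CACHE_HIT: dict unchanged, reuse result */
        return e->value;
    /* miss: do the real lookup, then remember the dict's current stamp */
    const char *v = (d->key != NULL && strcmp(d->key, key) == 0) ? d->value : NULL;
    e->tstamp = d->tstamp;         /* like LOOKUP_CACHE_STORE */
    e->value = v;
    return v;
}

int main(void)
{
    Dict globals = {0, NULL, NULL};
    CacheEntry entry = {0, NULL};

    dict_set(&globals, "x", "1");
    printf("%s\n", cached_lookup(&globals, "x", &entry));  /* miss, result cached */
    printf("%s\n", cached_lookup(&globals, "x", &entry));  /* hit: stamps match */
    dict_set(&globals, "x", "2");                          /* touch invalidates the entry */
    printf("%s\n", cached_lookup(&globals, "x", &entry));  /* miss again, sees "2" */
    return 0;
}

The same reasoning about borrowed pointers applies here as in the patch: a cached value can only go stale after the dict is touched, and touching it bumps the timestamp, which forces the next lookup to miss.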
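
As an illustration of the name-reordering step described in the assemble() comment in compile.c (names used by LOAD_GLOBAL/LOAD_ATTR are moved to the front of co_names, and an old-index-to-new-index mapping is handed to reorder_names), here is a hypothetical sketch of how such a permutation can be computed; partition_names, used and old_to_new are invented names for illustration only, not identifiers from the patch.

/* Hypothetical sketch, not the patch's code: pack "used" names at the front
 * and record each name's new index so opcode arguments can be rewritten. */
#include <stdio.h>

static void partition_names(const int *used, int *old_to_new, int n)
{
    int next = 0, i;
    for (i = 0; i < n; i++)        /* names used by cached opcodes get the low indexes */
        if (used[i])
            old_to_new[i] = next++;
    for (i = 0; i < n; i++)        /* remaining names follow, keeping relative order */
        if (!used[i])
            old_to_new[i] = next++;
}

int main(void)
{
    int used[5] = {0, 1, 0, 1, 1};
    int old_to_new[5];
    int i;
    partition_names(used, old_to_new, 5);
    for (i = 0; i < 5; i++)
        printf("name %d -> %d\n", i, old_to_new[i]);   /* prints 3, 0, 4, 1, 2 */
    return 0;
}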