diff -r 6c5f9c6c25ea Include/ceval.h
--- a/Include/ceval.h	Wed Sep 14 18:17:32 2016 +0300
+++ b/Include/ceval.h	Wed Sep 14 16:38:08 2016 -0400
@@ -219,6 +219,11 @@
 PyAPI_FUNC(void) _PyEval_SignalAsyncExc(void);
 #endif
 
+#ifndef Py_LIMITED_API
+/* Frees the ceval opcode cache stored in a code object's co_extra. */
+PyAPI_FUNC(void) _PyEval_FreeOpcodeCache(void *);
+#endif
+
 /* Masks and values used by FORMAT_VALUE opcode. */
 #define FVC_MASK      0x3
 #define FVC_NONE      0x0
diff -r 6c5f9c6c25ea Include/code.h
--- a/Include/code.h	Wed Sep 14 18:17:32 2016 +0300
+++ b/Include/code.h	Wed Sep 14 16:38:08 2016 -0400
@@ -26,6 +26,7 @@
     int co_stacksize;		/* #entries needed for evaluation stack */
     int co_flags;		/* CO_..., see below */
     int co_firstlineno;   /* first source line number */
+    int co_opt;           /* evaluations counted by the ceval opcode cache */
     PyObject *co_code;		/* instruction opcodes */
     PyObject *co_consts;	/* list (constants used) */
     PyObject *co_names;		/* list of strings (names used) */
diff -r 6c5f9c6c25ea Include/pylifecycle.h
--- a/Include/pylifecycle.h	Wed Sep 14 18:17:32 2016 +0300
+++ b/Include/pylifecycle.h	Wed Sep 14 16:38:08 2016 -0400
@@ -108,6 +108,7 @@
 PyAPI_FUNC(void) _PyType_Fini(void);
 PyAPI_FUNC(void) _PyRandom_Fini(void);
 PyAPI_FUNC(void) PyAsyncGen_Fini(void);
+PyAPI_FUNC(void) _PyEval_Fini(void);
 
 PyAPI_DATA(PyThreadState *) _Py_Finalizing;
 #endif
diff -r 6c5f9c6c25ea Makefile.pre.in
--- a/Makefile.pre.in	Wed Sep 14 18:17:32 2016 +0300
+++ b/Makefile.pre.in	Wed Sep 14 16:38:08 2016 -0400
@@ -863,7 +863,7 @@
 $(OPCODETARGETS_H): $(OPCODETARGETGEN_FILES)
 	$(PYTHON_FOR_GEN) $(OPCODETARGETGEN) $(OPCODETARGETS_H)
 
-Python/ceval.o: $(OPCODETARGETS_H) $(srcdir)/Python/ceval_gil.h
+Python/ceval.o: $(OPCODETARGETS_H) $(srcdir)/Python/ceval_gil.h $(srcdir)/Python/ceval_cache.h
 
 Python/frozen.o: Python/importlib.h Python/importlib_external.h
 
diff -r 6c5f9c6c25ea Objects/codeobject.c
--- a/Objects/codeobject.c	Wed Sep 14 18:17:32 2016 +0300
+++ b/Objects/codeobject.c	Wed Sep 14 16:38:08 2016 -0400
@@ -161,6 +161,7 @@
     co->co_zombieframe = NULL;
     co->co_weakreflist = NULL;
     co->co_extra = NULL;
+    co->co_opt = 0;
     return co;
 }
 
diff -r 6c5f9c6c25ea PCbuild/pythoncore.vcxproj
--- a/PCbuild/pythoncore.vcxproj	Wed Sep 14 18:17:32 2016 +0300
+++ b/PCbuild/pythoncore.vcxproj	Wed Sep 14 16:38:08 2016 -0400
@@ -209,0 +210,1 @@
+    <ClInclude Include="..\Python\ceval_cache.h" />
diff -r 6c5f9c6c25ea Python/ceval.c
--- a/Python/ceval.c	Wed Sep 14 18:17:32 2016 +0300
+++ b/Python/ceval.c	Wed Sep 14 16:38:08 2016 -0400
@@ -1,4 +1,3 @@
-
 /* Execute compiled code */
 
 /* XXX TO DO:
@@ -215,7 +214,8 @@
    Guarded by the GIL. */
 static int pending_async_exc = 0;
 
 #include "ceval_gil.h"
+#include "ceval_cache.h"
 
 int
 PyEval_ThreadsInitialized(void)
@@ -331,6 +331,16 @@
 static int pending_async_exc = 0;
 #endif /* WITH_THREAD */
 
+/* Finalization hook for ceval; it only prints the opcode cache statistics,
+   and only when OPCACHE_COLLECT_STATS is enabled. */
+void
+_PyEval_Fini(void)
+{
+#if OPCACHE_COLLECT_STATS
+    opcode_cache_print_stats();
+#endif
+}
+
 /* This function is used to signal that async exceptions are waiting to be
    raised, therefore it is also useful in non-threaded builds. */
 
@@ -746,6 +756,7 @@
     const _Py_CODEUNIT *first_instr;
     PyObject *names;
     PyObject *consts;
+    _PyCodeObjectCache *cache = NULL;
 
 #ifdef LLTRACE
     _Py_IDENTIFIER(__ltrace__);
@@ -866,6 +877,9 @@
 
 /* The integer overflow is checked by an assertion below. */
 #define INSTR_OFFSET()  (sizeof(_Py_CODEUNIT) * (int)(next_instr - first_instr))
+/* Index of the current instruction; assumes next_instr was already advanced. */
+#define OPCACHE_OFFSET()  ((int)(next_instr - first_instr) - 1)
+
 #define NEXTOPARG()  do { \
         _Py_CODEUNIT word = *next_instr; \
         opcode = _Py_OPCODE(word); \
@@ -1078,6 +1092,25 @@
     f->f_stacktop = NULL;       /* remains NULL unless yield suspends frame */
     f->f_executing = 1;
 
+    /* Build the opcode cache on the evaluation that reaches the threshold;
+       from then on look it up (co_extra index 0) on every evaluation. */
+    if (co->co_opt < OPCACHE_CALLS_THRESHOLD) {
+        co->co_opt++;
+        if (co->co_opt == OPCACHE_CALLS_THRESHOLD) {
+            if (init_opcode_cache(co)) {
+                goto exit_eval_frame;
+            }
+            if (_PyCode_GetExtra((PyObject *)co, 0, (void **)&cache)) {
+                goto exit_eval_frame;
+            }
+        }
+    }
+    else {
+        if (_PyCode_GetExtra((PyObject *)co, 0, (void **)&cache)) {
+            goto exit_eval_frame;
+        }
+    }
+
     if (co->co_flags & (CO_GENERATOR | CO_COROUTINE | CO_ASYNC_GENERATOR)) {
         if (!throwflag && f->f_exc_type != NULL && f->f_exc_type != Py_None) {
             /* We were in an except handler when we left,
@@ -2344,6 +2377,27 @@
             if (PyDict_CheckExact(f->f_globals)
                 && PyDict_CheckExact(f->f_builtins))
             {
+                _PyCodeObjectCache_LOAD_GLOBAL *lg_cache;
+                lg_cache = OPCACHE_GET_LOAD_GLOBAL(cache, OPCACHE_OFFSET());
+                /* Fast path: reuse the cached lookup while neither dict has
+                   changed since this entry was filled. */
+                if (lg_cache && lg_cache->optimized) {
+                    if (lg_cache->globals_tag ==
+                            ((PyDictObject *)f->f_globals)->ma_version_tag &&
+                        lg_cache->builtins_tag ==
+                            ((PyDictObject *)f->f_builtins)->ma_version_tag)
+                    {
+                        PyObject *res = lg_cache->ptr;
+                        OPCACHE_STATS_HIT(LOAD_GLOBAL);
+                        Py_INCREF(res);
+                        PUSH(res);
+                        DISPATCH();
+                    }
+                    else {
+                        OPCACHE_STATS_MISS(LOAD_GLOBAL);
+                    }
+                }
+
                 v = _PyDict_LoadGlobal((PyDictObject *)f->f_globals,
                                        (PyDictObject *)f->f_builtins,
                                        name);
@@ -2357,6 +2411,17 @@
                     goto error;
                 }
                 Py_INCREF(v);
+
+                if (!OPCACHE_UPDATE_LOAD_GLOBAL(lg_cache)) {
+                    /* `ptr` is stored without its own reference; the two
+                       version tags are what keep it from being used stale. */
+                    lg_cache->globals_tag =
+                        ((PyDictObject *)f->f_globals)->ma_version_tag;
+                    lg_cache->builtins_tag =
+                        ((PyDictObject *)f->f_builtins)->ma_version_tag;
+                    lg_cache->ptr = v;
+                }
+
             }
             else {
                 /* Slow-path if globals or builtins is not a dict */
diff -r 6c5f9c6c25ea Python/ceval_cache.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Python/ceval_cache.h	Wed Sep 14 16:38:08 2016 -0400
@@ -0,0 +1,309 @@
+#include <inttypes.h>
+#include <stdint.h>
+
+
+/* WARNING: This file is full of magic.
+
+   It is private to ceval.c, which #includes it once.  Each opcode listed in
+   OPCACHE_OPCODES gets, via X-macros, a per-code-object array of cache
+   entries plus OPCACHE_GET_<OPCODE>() and OPCACHE_UPDATE_<OPCODE>() helpers.
+   The opcode constants (e.g. LOAD_GLOBAL) must already be defined. */
+
+
+#define OPCACHE_OPCODES(XX) \
+    XX(LOAD_GLOBAL)
+
+
+/* Set to 1 to count hits/misses and print them from _PyEval_Fini(). */
+#define OPCACHE_COLLECT_STATS 0
+/* Evaluations of a code object before its cache is built. */
+#define OPCACHE_CALLS_THRESHOLD 1000
+/* Refills an entry is allowed before it is deoptimized for good. */
+#define OPCACHE_MISSES_BEFORE_DEOPT 20
+
+
+#define OPCACHE_OPCODE_HEAD \
+    int8_t optimized;   /* < 0 - deoptimized; \
+                           = 0 - not yet optimized; \
+                           > 0 - optimized */
+
+
+typedef struct {
+    OPCACHE_OPCODE_HEAD
+    /* Dict version tags observed when `ptr` was stored.  Both are kept per
+       entry: a tag shared by all entries would be refreshed by whichever
+       entry missed last and would then wrongly validate the stale ones. */
+    uint64_t globals_tag;
+    uint64_t builtins_tag;
+    PyObject *ptr;      /* not an owned reference; see the tags above */
+} _PyCodeObjectCache_LOAD_GLOBAL;
+
+
+/*
+- How to implement cache for a new opcode?
+
+Let's say we want to add cache to MY_OPCODE opcode:
+
+1. Define a `_PyCodeObjectCache_MY_OPCODE` struct.
+2. Add `XX(MY_OPCODE)` to OPCACHE_OPCODES macro.
+3. Everything else will be handled automatically.
+*/
+
+
+#define _OPCACHE_OPCODE_FIELD(OPCODE) \
+    uint8_t OPCODE##_size; \
+    _PyCodeObjectCache_##OPCODE *OPCODE##_cache;
+
+
+typedef struct {
+    /* One byte per instruction: 0 means "no cache entry", N > 0 selects
+       entry N - 1 in the array of that instruction's opcode. */
+    uint8_t *index;
+
+    OPCACHE_OPCODES(_OPCACHE_OPCODE_FIELD)
+} _PyCodeObjectCache;
+
+
+#if OPCACHE_COLLECT_STATS
+
+/* Indexed by opcode; an opcode is one byte, hence 256 slots. */
+static uint64_t opcode_stats_opts[256];
+static uint64_t opcode_stats_deopts[256];
+static uint64_t opcode_stats_hits[256];
+static uint64_t opcode_stats_misses[256];
+static uint64_t opcode_stats_memory = 0;
+
+#endif
+
+
+/* Build the opcode cache for `co` and register it with _PyCode_SetExtra()
+   at index 0.
+
+   Returns 0 on success.  On failure frees everything allocated here and
+   returns -1 with an exception set. */
+static int
+init_opcode_cache(PyCodeObject *co)
+{
+    const _Py_CODEUNIT *instr;
+    Py_ssize_t opcodes_num;
+    uint8_t *index = NULL;
+    _PyCodeObjectCache *cache = NULL;
+
+    /* Cache entries handed out so far, one counter per cached opcode. */
+#   define _OPCODE_PREPARE(OPCODE) \
+    uint8_t OPCODE##_size = 0;
+    OPCACHE_OPCODES(_OPCODE_PREPARE)
+#   undef _OPCODE_PREPARE
+
+    opcodes_num = PyBytes_GET_SIZE(co->co_code) /
+                  (Py_ssize_t)sizeof(_Py_CODEUNIT);
+
+    /* Zero-filled, so every OPCODE##_cache starts out NULL and the error
+       path can free them unconditionally. */
+    cache = (_PyCodeObjectCache *)PyMem_Calloc(1, sizeof(_PyCodeObjectCache));
+    if (cache == NULL) {
+        goto error;
+    }
+
+    index = (uint8_t *)PyMem_Calloc(opcodes_num, sizeof(uint8_t));
+    if (index == NULL) {
+        goto error;
+    }
+
+    instr = (const _Py_CODEUNIT *)PyBytes_AS_STRING(co->co_code);
+    for (Py_ssize_t offset = 0; offset < opcodes_num; offset++) {
+        uint8_t opcode = (uint8_t)_Py_OPCODE(instr[offset]);
+
+        /* Once an opcode has used up its UINT8_MAX entries, its remaining
+           occurrences keep index 0 and simply run uncached. */
+#       define _OPCODE_COUNT(OPCODE) \
+        if (opcode == OPCODE && OPCODE##_size < UINT8_MAX) { \
+            index[offset] = ++OPCODE##_size; \
+        }
+        OPCACHE_OPCODES(_OPCODE_COUNT)
+#       undef _OPCODE_COUNT
+    }
+
+#   define _OPCODE_INIT(OPCODE) \
+    if (OPCODE##_size) { \
+        cache->OPCODE##_cache = (_PyCodeObjectCache_##OPCODE *)PyMem_Calloc( \
+            OPCODE##_size, sizeof(_PyCodeObjectCache_##OPCODE)); \
+        if (cache->OPCODE##_cache == NULL) { \
+            goto error; \
+        } \
+        cache->OPCODE##_size = OPCODE##_size; \
+    }
+    OPCACHE_OPCODES(_OPCODE_INIT)
+#   undef _OPCODE_INIT
+
+    cache->index = index;
+    if (_PyCode_SetExtra((PyObject *)co, 0, cache)) {
+        goto error;
+    }
+
+#if OPCACHE_COLLECT_STATS
+    opcode_stats_memory += sizeof(_PyCodeObjectCache);
+#   define _OPCODE_SIZE(OPCODE) \
+    if (OPCODE##_size) { \
+        opcode_stats_memory += OPCODE##_size * \
+            sizeof(_PyCodeObjectCache_##OPCODE); \
+    }
+    OPCACHE_OPCODES(_OPCODE_SIZE)
+#   undef _OPCODE_SIZE
+#endif
+
+    return 0;
+
+error:
+    PyMem_Free(index);
+    if (cache != NULL) {
+#       define _OPCODE_CLEANUP(OPCODE) PyMem_Free(cache->OPCODE##_cache);
+        OPCACHE_OPCODES(_OPCODE_CLEANUP)
+#       undef _OPCODE_CLEANUP
+        PyMem_Free(cache);
+    }
+    /* PyMem_Calloc() only returns NULL on failure; make sure the caller,
+       which propagates this error, always finds an exception set. */
+    if (!PyErr_Occurred()) {
+        PyErr_NoMemory();
+    }
+    return -1;
+}
+
+
+/* Release a cache built by init_opcode_cache().  `co_extra` is the pointer
+   that was passed to _PyCode_SetExtra(); NULL is accepted and ignored. */
+void
+_PyEval_FreeOpcodeCache(void *co_extra)
+{
+    _PyCodeObjectCache *cache = (_PyCodeObjectCache *)co_extra;
+
+    if (cache == NULL) {
+        return;
+    }
+
+#   define _OPCODE_CLEANUP(OPCODE) PyMem_Free(cache->OPCODE##_cache);
+    OPCACHE_OPCODES(_OPCODE_CLEANUP)
+#   undef _OPCODE_CLEANUP
+
+    PyMem_Free(cache->index);
+    PyMem_Free(cache);
+}
+
+
+/* --- Stats --- */
+
+
+#if OPCACHE_COLLECT_STATS
+
+#define _OPCACHE_STATS_OPT(opcode) do { \
+        opcode_stats_opts[opcode]++; \
+    } while (0)
+
+#define _OPCACHE_STATS_DEOPT(opcode) do { \
+        opcode_stats_deopts[opcode]++; \
+    } while (0)
+
+#define OPCACHE_STATS_HIT(opcode) do { \
+        opcode_stats_hits[opcode]++; \
+    } while (0)
+
+#define OPCACHE_STATS_MISS(opcode) do { \
+        opcode_stats_misses[opcode]++; \
+    } while (0)
+
+
+/* Print the collected counters to stdout. */
+static void
+opcode_cache_print_stats(void)
+{
+    printf("=== OPCODE CACHE === \n");
+    printf("memory: %" PRIu64 "\n", opcode_stats_memory);
+
+#   define _OPCODE_PRINT_STAT(OPCODE) \
+    printf("--- " #OPCODE " ---\n"); \
+    printf("opts: %" PRIu64 "\n", opcode_stats_opts[OPCODE]); \
+    printf("deopts: %" PRIu64 "\n", opcode_stats_deopts[OPCODE]); \
+    printf("hits: %" PRIu64 "\n", opcode_stats_hits[OPCODE]); \
+    printf("misses: %" PRIu64 "\n\n", opcode_stats_misses[OPCODE]);
+
+    OPCACHE_OPCODES(_OPCODE_PRINT_STAT)
+#   undef _OPCODE_PRINT_STAT
+}
+
+
+#else
+
+#define _OPCACHE_STATS_OPT(opcode) ((void)0)
+#define _OPCACHE_STATS_DEOPT(opcode) ((void)0)
+
+#define OPCACHE_STATS_HIT(opcode) ((void)0)
+#define OPCACHE_STATS_MISS(opcode) ((void)0)
+
+#endif
+
+
+/* OPCACHE_GET_<OPCODE>(cache, offset): the entry assigned to the instruction
+   at index `offset`, or NULL when there is no cache, the instruction has no
+   entry, or its entry has been deoptimized. */
+#define _OPCACHE_DEFINE_GETTER(OPCODE) \
+    static inline _PyCodeObjectCache_##OPCODE * \
+    OPCACHE_GET_##OPCODE(_PyCodeObjectCache *cache, int offset) \
+    { \
+        uint8_t position; \
+        _PyCodeObjectCache_##OPCODE *opcache; \
+        if (cache == NULL) { \
+            return NULL; \
+        } \
+        position = cache->index[offset]; \
+        if (position == 0) { \
+            return NULL; \
+        } \
+        assert(position <= cache->OPCODE##_size); \
+        opcache = &cache->OPCODE##_cache[position - 1]; \
+        return opcache->optimized >= 0 ? opcache : NULL; \
+    }
+OPCACHE_OPCODES(_OPCACHE_DEFINE_GETTER)
+#undef _OPCACHE_DEFINE_GETTER
+
+
+/* OPCACHE_MAYBE_DEOPT_<OPCODE>(entry): charge one miss to a live entry and
+   mark it deoptimized (-1) once the count reaches zero. */
+#define _OPCACHE_DEFINE_MAYBE_DEOPT(OPCODE) \
+    static inline void \
+    OPCACHE_MAYBE_DEOPT_##OPCODE( \
+        _PyCodeObjectCache_##OPCODE *opcache) \
+    { \
+        if (opcache->optimized >= 0) { \
+            opcache->optimized--; \
+            if (opcache->optimized == 0) { \
+                opcache->optimized = -1; \
+                _OPCACHE_STATS_DEOPT(OPCODE); \
+            } \
+        } \
+    }
+OPCACHE_OPCODES(_OPCACHE_DEFINE_MAYBE_DEOPT)
+#undef _OPCACHE_DEFINE_MAYBE_DEOPT
+
+
+/* OPCACHE_UPDATE_<OPCODE>(entry): call before (re)filling an entry.  Returns
+   -1 when `entry` is NULL and 0 otherwise.  The first fill sets the miss
+   budget to OPCACHE_MISSES_BEFORE_DEOPT; every later refill spends one. */
+#define _OPCACHE_DEFINE_UPDATER(OPCODE) \
+    static inline int \
+    OPCACHE_UPDATE_##OPCODE(_PyCodeObjectCache_##OPCODE *opcache) \
+    { \
+        if (opcache == NULL) { \
+            return -1; \
+        } \
+        if (opcache->optimized == 0) {  /* first time */ \
+            opcache->optimized = OPCACHE_MISSES_BEFORE_DEOPT; \
+            _OPCACHE_STATS_OPT(OPCODE); \
+        } \
+        else { \
+            OPCACHE_MAYBE_DEOPT_##OPCODE(opcache); \
+        } \
+        return 0; \
+    }
+OPCACHE_OPCODES(_OPCACHE_DEFINE_UPDATER)
+#undef _OPCACHE_DEFINE_UPDATER
diff -r 6c5f9c6c25ea Python/pylifecycle.c
--- a/Python/pylifecycle.c	Wed Sep 14 18:17:32 2016 +0300
+++ b/Python/pylifecycle.c	Wed Sep 14 16:38:08 2016 -0400
@@ -739,6 +739,8 @@
     }
 #endif
 
+    _PyEval_Fini();
+
     call_ll_exitfuncs();
     return status;
 }
diff -r 6c5f9c6c25ea Python/pystate.c
--- a/Python/pystate.c	Wed Sep 14 18:17:32 2016 +0300
+++ b/Python/pystate.c	Wed Sep 14 16:38:08 2016 -0400
@@ -224,7 +224,10 @@
 
         tstate->coroutine_wrapper = NULL;
         tstate->in_coroutine_wrapper = 0;
-        tstate->co_extra_user_count = 0;
+
+        /* Index 0 is reserved for the ceval opcode cache */
+        tstate->co_extra_user_count = 1;
+        tstate->co_extra_freefuncs[0] = _PyEval_FreeOpcodeCache;
 
         tstate->async_gen_firstiter = NULL;
         tstate->async_gen_finalizer = NULL;