Index: Objects/longobject.c =================================================================== --- Objects/longobject.c (revision 61746) +++ Objects/longobject.c (working copy) @@ -15,6 +15,34 @@ #ifndef NSMALLNEGINTS #define NSMALLNEGINTS 5 #endif + +/* Free list for long objects with abs(size) <= FREELIST_MAX_DIGITS. + * The free list (ab)uses the ob_type field to chain free long objects in a + * singly linked, NULL terminated list. + */ +static PyLongObject *free_list = NULL; +static unsigned int numfree = 0; +#define PyLong_MAXFREELIST 4096 + +/* We keep a freelist for all of the smallest allocation size PyLongObjects. + * Our allocator pads allocations so we might as well treat all objects + * allocated the same physically as the same regardless of how many digits + * they actually contain. + * Typical results for 32-bit systems: (16-12) / 2 == 2 + * Typical results for 64-bit systems: (32-24) / 2 == 4 + * XXX(gregory.p.smith): This makes a bad assumption that compiler structure + * packing size will match the table of allocation block sizes shown in + * obmalloc.c. This is true for these sizes with gcc 4.0 on x86 and x86_64. + * To be safe these macros should be modified to compute the room for digits + * against the allocation sizes from obmalloc.c instead. + */ +/* The (int) casts must wrap the whole expression: sizeof yields an unsigned size_t, and only an int result makes -FREELIST_MAX_DIGITS work. */ +#define _1_DIGIT_PADDING_SIZE ((int)(sizeof(PyLongObject) - sizeof(PyVarObject))) +#define FREELIST_MAX_DIGITS ((int)(_1_DIGIT_PADDING_SIZE >= sizeof(digit) ? \ + (_1_DIGIT_PADDING_SIZE / sizeof(digit)) : \ + (1))) + + #if NSMALLNEGINTS + NSMALLPOSINTS > 0 /* Small integers are preallocated in this array so that they can be shared. 
@@ -39,6 +67,7 @@ #endif return v; } + #define CHECK_SMALL_INT(ival) \ do if (-NSMALLNEGINTS <= ival && ival < NSMALLPOSINTS) { \ return get_small_int(ival); \ @@ -46,7 +75,7 @@ #else #define CHECK_SMALL_INT(ival) -#endif +#endif /* NSMALLNEGINTS + NSMALLPOSINTS > 0 */ #define MEDIUM_VALUE(x) (Py_SIZE(x) < 0 ? -(x)->ob_digit[0] : (Py_SIZE(x) == 0 ? 0 : (x)->ob_digit[0])) /* If a freshly-allocated long is already shared, it must @@ -113,6 +142,14 @@ _PyLong_New(Py_ssize_t size) { PyLongObject *result; + assert(size > 0); + + /* Reuse an object from the free list when one is available and size <= FREELIST_MAX_DIGITS */ + if (free_list && size <= FREELIST_MAX_DIGITS) { + result = free_list; + free_list = (PyLongObject *)Py_TYPE(result); + numfree--; + } /* Can't use sizeof(PyLongObject) here, since the compiler takes padding at the end into account. As the consequence, this would waste 2 bytes on @@ -120,8 +157,10 @@ This computation would be incorrect on systems which have padding before the digits; with 16-bit digits this should not happen. */ - result = PyObject_MALLOC(sizeof(PyVarObject) + - size*sizeof(digit)); + else { + result = PyObject_MALLOC(sizeof(PyVarObject) + + size*sizeof(digit)); + } if (!result) { PyErr_NoMemory(); return NULL; @@ -2127,7 +2166,23 @@ static void long_dealloc(PyObject *v) { - Py_TYPE(v)->tp_free(v); + register Py_ssize_t size = Py_SIZE(v); + /* Exact longs with abs(size) <= FREELIST_MAX_DIGITS are pushed onto the free list while it holds fewer than PyLong_MAXFREELIST entries; everything else goes to tp_free. A single test of the form + * if ((abs(size) <= FREELIST_MAX_DIGITS) && + * would also work in place of the separate <= and >= checks against size, but abs requires 4 instructions + * for all cases (x86) rather than a common 2 instruction short circuit + * fall through. The perf difference is measurable. 
*/ + if ((size <= FREELIST_MAX_DIGITS) && + (size >= -FREELIST_MAX_DIGITS) && + PyLong_CheckExact(v) && + (numfree < PyLong_MAXFREELIST)) + { + Py_TYPE(v) = (PyTypeObject *)free_list; /* the ob_type slot stores the link to the next free object */ + free_list = (PyLongObject*)v; + numfree++; + return; + } + Py_TYPE(v)->tp_free(v); } static PyObject * @@ -3750,4 +3805,11 @@ _Py_ForgetReference((PyObject*)v); } #endif + while (free_list) { /* pop and free every object still chained on the free list */ + PyObject *v = (PyObject *)free_list; + free_list = (PyLongObject *)Py_TYPE(v); + PyLong_Type.tp_free(v); + numfree--; + } + assert(numfree == 0); }