# HG changeset patch
# User Kristján Valur Jónsson
# Date 1397499238 0
# Node ID f3778ba0d988d0a81edcdf58fbedd05c7db13ff6
# Parent 34ace7eb67e9de12d90fe84022d3e0d063488f84
Update obmalloc to use the "lowest address" strategy for fun and profit.

diff -r 34ace7eb67e9 -r f3778ba0d988 Objects/obmalloc.c
--- a/Objects/obmalloc.c	Mon Apr 07 11:20:22 2014 +0200
+++ b/Objects/obmalloc.c	Mon Apr 14 18:13:58 2014 +0000
@@ -482,6 +482,7 @@
  * allocator.
  */
+
 
 /*==========================================================================*/
 
 /*
@@ -568,6 +569,13 @@
 #define POOL_SIZE               SYSTEM_PAGE_SIZE        /* must be 2^N */
 #define POOL_SIZE_MASK          SYSTEM_PAGE_SIZE_MASK
 
+
+/* Which allocation strategies to use for arenas and pools, respectively.
+ * See below for detailed discussion.
+ */
+#define ARENA_STRATEGY 1
+#define POOL_STRATEGY 1
+
 /*
  * -- End of tunable settings section --
  */
@@ -736,6 +744,21 @@
 If the size class needed happens to be the same as the size class the pool
 last had, some pool initialization can be skipped.
 
+Pool allocation strategy
+
+In order to reduce fragmentation and increase the chance of memory being freed
+in a program that grows and shrinks its memory usage, a strategy tentatively
+known as the "Lowest Address Strategy" is employed.  Using this strategy, pools
+available for allocation ("used" pools above) are linked into the list
+sorted by their address.
+This tends to cluster memory usage into the low end of the address space
+and increases the odds that whole pools become free and can be released.
+This results in lower fragmentation and better use of the CPU cache.
+Previously, an MRU strategy was used, where free blocks were simply added
+to the head of the list.
+The new LAS strategy is selected by setting POOL_STRATEGY to 1 above.
+The effectiveness of the strategy can be measured using the script in
+Tools/scripts/memcrunch.py.
 
 Block Management
 
@@ -854,6 +877,22 @@
 Note that an arena_object associated with an arena all of whose pools are
 currently in use isn't on either list.
 
+
+Arena allocation strategy:
+
+Similarly to the pool strategy, usable arenas are allocated using the
+Lowest Address Strategy.  When a new block is required, it will be
+carved out of the lowest-address arena.
+This increases the odds that arenas at the high
+end of the virtual address space are freed first, so that virtual memory
+can be returned to the operating system.  It also reduces the
+fragmentation of virtual memory in the process by clustering the
+arenas into a tighter address range.  This is controlled by setting
+ARENA_STRATEGY to 1 above.
+The previous policy was to prefer to allocate pools out of the fullest
+arenas first.  While performing similarly, the new policy helps cluster
+virtual memory into lower addresses, reducing VM fragmentation.
+
 */
 
 /* Array of objects used to track chunks of memory (arenas). */
@@ -1389,13 +1428,6 @@
          * a pool, and there are 4 cases for arena mgmt:
          * 1. If all the pools are free, return the arena to
          *    the system free().
-         * 2. If this is the only free pool in the arena,
-         *    add the arena back to the `usable_arenas` list.
-         * 3. If the "next" arena has a smaller count of free
-         *    pools, we have to "slide this arena right" to
-         *    restore that usable_arenas is sorted in order of
-         *    nfreepools.
-         * 4. Else there's nothing more to do.
          */
         if (nf == ao->ntotalpools) {
             /* Case 1.  First unlink ao from usable_arenas.
@@ -1435,12 +1467,33 @@
                                  (void *)ao->address, ARENA_SIZE);
             ao->address = 0;                        /* mark unassociated */
             --narenas_currently_allocated;
+#if 0
+            fprintf(stderr, "freed %d\n", narenas_currently_allocated);
+#endif
 
             UNLOCK();
             return;
         }
+#if ARENA_STRATEGY == 0
+        /* Arena strategy 0 (the classic one).
+         * The list is kept sorted so that
+         * the "most full" arenas are used first, which allows
+         * the nearly empty arenas to be completely freed.  In
+         * a few un-scientific tests, it seems like this
+         * approach allowed a lot more memory to be freed.
+         *
+         * We have determined that we don't have case 1, so
+         * we continue with the other cases:
+         * 2. If this is the only free pool in the arena,
+         *    add the arena back to the `usable_arenas` list.
+         * 3. If the "next" arena has a smaller count of free
+         *    pools, we have to "slide this arena right" to
+         *    restore that usable_arenas is sorted in order of
+         *    nfreepools.
+         * 4. Else there's nothing more to do.
+         */
         if (nf == 1) {
             /* Case 2.  Put ao at the head of
              * usable_arenas.  Note that because
              * ao->nfreepools was 0 before, ao isn't
              * currently on the usable_arenas list.
@@ -1456,11 +1509,7 @@
             return;
         }
         /* If this arena is now out of order, we need to keep
-         * the list sorted.  The list is kept sorted so that
-         * the "most full" arenas are used first, which allows
-         * the nearly empty arenas to be completely freed.  In
-         * a few un-scientific tests, it seems like this
-         * approach allowed a lot more memory to be freed.
+         * the list sorted.
          */
         if (ao->nextarena == NULL ||
                 nf <= ao->nextarena->nfreepools) {
@@ -1508,26 +1557,89 @@
                nf <= ao->nextarena->nfreepools);
         assert(ao->prevarena == NULL ||
                nf > ao->prevarena->nfreepools);
+
+#elif ARENA_STRATEGY == 1
+        /* Strategy 1 is the new one.
+         * The list is kept sorted so that
+         * the lowest-address arenas are used first, which tends
+         * to cluster used memory into lower addresses, leaving higher
+         * address arenas more likely to get freed.
+         * In a few un-scientific tests, it seems like this
+         * approach works better than strategy 0.
+         *
+         * Case 2: If it wasn't in the list, we insert it according
+         * to address.
+         * Case 3: Otherwise, nothing to be done, since addresses don't
+         * change.
+         */
+        if (nf != 1) {
+            /* Case 3: it was already in the list.  There is no point
+             * in resorting, since addresses are constant.
+             */
+            UNLOCK();
+            return;
+        }
+        /* Case 2:
+         * Locate the new insertion point by iterating over
+         * the list, using our nextarena pointer.
+         */
+        {
+            struct arena_object *prev = NULL;
+            struct arena_object *next = usable_arenas;
+            while (next != NULL && ao->address > next->address) {
+                prev = next;
+                next = next->nextarena;
+            }
+            /* Insert ao at this point. */
+            ao->prevarena = prev;
+            ao->nextarena = next;
+            if (prev)
+                prev->nextarena = ao;
+            else
+                usable_arenas = ao;
+            if (next)
+                next->prevarena = ao;
+        }
+        /* Verify sorting criteria. */
+        assert(ao->nextarena == NULL ||
+               ao->address < ao->nextarena->address);
+        assert(ao->prevarena == NULL ||
+               ao->address > ao->prevarena->address);
+#endif
+        /* Verify linkage. */
         assert(ao->nextarena == NULL ||
                ao->nextarena->prevarena == ao);
         assert((usable_arenas == ao &&
                 ao->prevarena == NULL) ||
                ao->prevarena->nextarena == ao);
-
         UNLOCK();
         return;
     }
     /* Pool was full, so doesn't currently live in any list:
-     * link it to the front of the appropriate usedpools[] list.
-     * This mimics LRU pool usage for new allocations and
-     * targets optimal filling when several pools contain
-     * blocks of the same size class.
+     * link it into the appropriate usedpools[] list.
      */
     --pool->ref.count;
     assert(pool->ref.count > 0);            /* else the pool is empty */
     size = pool->szidx;
     next = usedpools[size + size];
     prev = next->prevpool;
+#if POOL_STRATEGY == 0
+    /* Link it at the head.
+     * This mimics LRU pool usage for new allocations and
+     * targets optimal filling when several pools contain
+     * blocks of the same size class.
+     */
+#elif POOL_STRATEGY == 1
+    /* Link it according to address in memory.  This allocates
+     * from low-address pools first, and tends to put unused
+     * pools together higher in memory, increasing the odds
+     * of releasing whole arenas.
+     */
+    assert(&prev->nextpool == &usedpools[size+size]);
+    while (pool > next && next != prev)
+        next = next->nextpool;
+    prev = next->prevpool;
+#endif
     /* insert pool before next:   prev <-> pool <-> next */
     pool->nextpool = next;
     pool->prevpool = prev;
diff -r 34ace7eb67e9 -r f3778ba0d988 Tools/scripts/memcrunch.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Tools/scripts/memcrunch.py	Mon Apr 14 18:13:58 2014 +0000
@@ -0,0 +1,175 @@
+# memcrunch.py
+"""
+This script tests the memory fragmentation in a python program that uses the object allocator.
+It does this by allocating and storing a lot of "objects" and then freeing some and recreating
+new ones in cycles, producing churn.
+The idea is to help tune python so that memory is released to the system when an application's
+memory use requirements go down.
+The fragmentation of memory is measured by performing statistics on the addresses of the objects
+stored.  This includes the total memory range in use, the average address relative to the start of
+the range, and the standard deviation.
+"""
+
+# test memory coherency and density
+from __future__ import print_function
+
+
+import random
+import sys
+import math
+import os
+
+# Because we use dicts, we need to disable hash randomization for consistent execution
+if sys.version_info[:2] >= (3,4) and os.environ.get("PYTHONHASHSEED", None) is None:
+    print("warning, set PYTHONHASHSEED=0 for consistency")
+
+random.seed(123)
+
+minsize = 1
+maxsize = 200
+try:
+    range = xrange
+    def values(d):
+        return d.itervalues()
+except NameError:
+    def values(d):
+        return d.values()
+
+
+stuff = {}  # allocated objects
+class mything(object):
+    """An object to allocate"""
+
+def add_item():
+    a = len(stuff)
+    k = random.randint(0, 2000000000)
+    #stuff[k] = [1]*random.randint(minsize, maxsize)
+    stuff[k] = mything()
+    if len(stuff) == a:
+        add_item()
+
+def pop_item():
+    stuff.popitem()
+
+def add_items(n):
+    for i in range(n):
+        add_item()
+
+def pop_items(n):
+    for i in range(n):
+        stuff.popitem()
+
+def poppush(n):
+    for i in range(n):
+        stuff.popitem()
+        add_item()
+
+
+def munge(m):
+    """Add m items, then remove m items"""
+    for j in range(m):
+        add_item()
+    for j in range(m):
+        pop_item()
+
+def thrash(m):
+    """Add and remove m times without growing"""
+    for i in range(m):
+        add_item()
+        pop_item()
+
+
+def stats(obj):
+    # compute running variance
+    A = 0.0
+    Q = 0.0
+    for i, e in enumerate(obj):
+        x = float(id(e))
+        a = A
+        A = a + (x - a)/(i+1)  # mean
+        Q = Q + (x - a)*(x - A)
+
+        if i == 0:
+            mi = ma = x
+        else:
+            mi = min(mi, x)
+            ma = max(ma, x)
+
+    # sample variance
+    s2 = Q / i
+    s = math.sqrt(s2)
+
+    # fragmentation
+    used = i * sys.getsizeof(e)
+    frag = 100.0 * ((ma - mi) - used) / (ma - mi)
+
+    return int(mi), int(ma), A, s, frag
+
+
+def printhead():
+    if not verbose:
+        return
+    print("range (kb)  mean (kb) stddev (kb)    mean(n)  frag(%)")
+
+def printstats(msg=""):
+    if not verbose:
+        return
+    mi, ma, mean, stddev, frag = stats(values(stuff))
+    # remove the min
+    ma -= mi
+    mean -= mi
+
+    # convert to Kb
+    ma /= 1024
+    mean /= 1024
+    stddev /= 1024
+
+    # normalize mean and stddev
+    nmean = mean / ma
+    nstddev = stddev / ma
+
+    # print range, normalized average, normalized stddev
+    n = len(stuff) / 1000
+
+    print("%10.2f %10.2f %10.2f %10.2f %5.1f %4dk %s"%(ma, mean, stddev, nmean, frag, n, msg))
+
+
+verbose = "-v" in sys.argv
+
+n = 100000  # number of things
+# first, burn a number of things, just to start up in a "full" environment
+add_items(n)
+burn = stuff
+stuff = {}
+
+# Fill the set up to n
+add_items(n)
+printhead()
+printstats("start")
+
+# remove stuff
+for i in range(3):
+    r = n // (2**(i+1))
+    pop_items(r)
+    printstats("removed %ik" % (r/1000,))
+
+# munge up to n // 2 a few times
+m = (n // 2) - len(stuff)
+for i in range(2):
+    munge(m)
+    printstats("munged %d"%(m, ))
+
+# Thrash memory keeping usage constant
+for i in range(2):
+    thrash(m)
+    printstats("thrashed %d"%(m, ))
+
+# output final memory use and fragmentation level
+mi, ma, mean, stddev, frag = stats(values(stuff))
+print((ma-mi), frag)
+#sys.stdin.readline()
+
+
+
+
+
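Note (not part of the patch): both POOL_STRATEGY == 1 and ARENA_STRATEGY == 1
above reduce to the same operation, a sorted insert into a doubly linked list
keyed on address, with allocation always taking place from the head (the
lowest address).  The following is a minimal standalone C sketch of that
insert; the `node` type and the `list_head`/`insert_by_address` names are
hypothetical stand-ins for obmalloc's pool_header/arena_object linkage, not
code taken from the patch.

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Minimal doubly linked node, standing in for a pool or arena header. */
    typedef struct node {
        struct node *prev;
        struct node *next;
    } node;

    static node *list_head = NULL;

    /* Insert n so that the list stays sorted by ascending address
     * ("lowest address" first), mirroring the insertion loops above. */
    static void
    insert_by_address(node *n)
    {
        node *prev = NULL;
        node *next = list_head;
        while (next != NULL && (uintptr_t)n > (uintptr_t)next) {
            prev = next;
            next = next->next;
        }
        n->prev = prev;
        n->next = next;
        if (prev)
            prev->next = n;
        else
            list_head = n;
        if (next)
            next->prev = n;
    }

    int
    main(void)
    {
        /* Insert a few heap-allocated nodes in whatever order malloc
         * returns them; traversal then yields ascending addresses.
         * (Error handling and freeing omitted for brevity.) */
        for (int i = 0; i < 5; i++)
            insert_by_address(malloc(sizeof(node)));
        for (node *p = list_head; p != NULL; p = p->next)
            printf("%p\n", (void *)p);
        return 0;
    }

Because allocation always starts at the head of such a list, activity clusters
at the low end of the address space, and the highest-address pools and arenas
are the most likely to become entirely free.  The effect can be observed with
the script added above, e.g. `PYTHONHASHSEED=0 python Tools/scripts/memcrunch.py -v`.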