# HG changeset patch
# User Kristján Valur Jónsson
# Date 1397499238 0
# Node ID f3778ba0d988d0a81edcdf58fbedd05c7db13ff6
# Parent 34ace7eb67e9de12d90fe84022d3e0d063488f84
Update obmalloc to use the "lowest address" strategy for fun and profit.

diff -r 34ace7eb67e9 -r f3778ba0d988 Objects/obmalloc.c
--- a/Objects/obmalloc.c	Mon Apr 07 11:20:22 2014 +0200
+++ b/Objects/obmalloc.c	Mon Apr 14 18:13:58 2014 +0000
@@ -482,6 +482,7 @@
  * allocator.
  */
+
 
 /*==========================================================================*/
 
 /*
@@ -568,6 +569,13 @@
 #define POOL_SIZE               SYSTEM_PAGE_SIZE        /* must be 2^N */
 #define POOL_SIZE_MASK          SYSTEM_PAGE_SIZE_MASK
 
+
+/* Which allocation strategies to use for arenas and pools, respectively.
+ * See below for detailed discussion.
+ */
+#define ARENA_STRATEGY 1
+#define POOL_STRATEGY 1
+
 /*
  * -- End of tunable settings section --
  */
@@ -736,6 +744,21 @@
 If the size class needed happens to be the same as the size class the pool
 last had, some pool initialization can be skipped.
 
+Pool allocation strategy
+
+In order to reduce fragmentation and increase the chance of memory being freed
+in a program that grows and shrinks its memory usage, a strategy tentatively
+known as the "Lowest Address Strategy" is employed.  Using this strategy, pools
+available for allocation ("used" pools above) are linked into the list
+sorted by their address.
+This tends to cluster memory usage into the low end of the address space
+and increases the odds that whole pools become free and can be released.
+This results in lower fragmentation and better use of the CPU cache.
+Previously, an MRU strategy was used, where free blocks were simply added
+to the head of the list.
+The new LAS strategy is selected by setting POOL_STRATEGY to 1 above.
+The effectiveness of the strategy can be measured using the script in
+Tools/scripts/memcrunch.py.
 
 Block Management
 
@@ -854,6 +877,22 @@
 Note that an arena_object associated with an arena all of whose pools are
 currently in use isn't on either list.
 
+
+Arena allocation strategy:
+
+Similarly to the pool strategy, usable arenas are allocated using the
+Lowest Address Strategy.  When a new block is required, it will be
+carved out of the lowest-address arena.
+This increases the odds that arenas at the high
+end of the virtual address space are freed first, so that virtual memory
+can be returned to the operating system.  It also reduces the
+fragmentation of virtual memory in the process by clustering the
+arenas into a tighter address range.  This is controlled by setting
+ARENA_STRATEGY to 1 above.
+The previous policy was to prefer to allocate pools out of the fullest
+arenas first.  While performing similarly, the new policy helps cluster
+virtual memory into lower addresses, reducing VM fragmentation.
+
 */
 
 /* Array of objects used to track chunks of memory (arenas). */
@@ -1389,13 +1428,6 @@
          * a pool, and there are 4 cases for arena mgmt:
          * 1. If all the pools are free, return the arena to
          *    the system free().
-         * 2. If this is the only free pool in the arena,
-         *    add the arena back to the `usable_arenas` list.
-         * 3. If the "next" arena has a smaller count of free
-         *    pools, we have to "slide this arena right" to
-         *    restore that usable_arenas is sorted in order of
-         *    nfreepools.
-         * 4. Else there's nothing more to do.
          */
         if (nf == ao->ntotalpools) {
             /* Case 1.  First unlink ao from usable_arenas.
@@ -1435,12 +1467,33 @@
                                  (void *)ao->address, ARENA_SIZE);
             ao->address = 0;                        /* mark unassociated */
             --narenas_currently_allocated;
+#if 0
+            fprintf(stderr, "freed %d\n", narenas_currently_allocated);
+#endif
 
             UNLOCK();
             return;
         }
+#if ARENA_STRATEGY == 0
+        /* Arena strategy 0 (the classic one).
+         * The list is kept sorted so that
+         * the "most full" arenas are used first, which allows
+         * the nearly empty arenas to be completely freed.  In
+         * a few un-scientific tests, it seems like this
+         * approach allowed a lot more memory to be freed.
+         *
+         * We have determined that we don't have case 1, so
+         * we continue with the other cases:
+         * 2. If this is the only free pool in the arena,
+         *    add the arena back to the `usable_arenas` list.
+         * 3. If the "next" arena has a smaller count of free
+         *    pools, we have to "slide this arena right" to
+         *    restore that usable_arenas is sorted in order of
+         *    nfreepools.
+         * 4. Else there's nothing more to do.
+         */
         if (nf == 1) {
             /* Case 2.  Put ao at the head of
              * usable_arenas.  Note that because
              * ao->nfreepools was 0 before, ao isn't
              * currently on the usable_arenas list.
@@ -1456,11 +1509,7 @@
             return;
         }
         /* If this arena is now out of order, we need to keep
-         * the list sorted.  The list is kept sorted so that
-         * the "most full" arenas are used first, which allows
-         * the nearly empty arenas to be completely freed.  In
-         * a few un-scientific tests, it seems like this
-         * approach allowed a lot more memory to be freed.
+         * the list sorted.
          */
         if (ao->nextarena == NULL ||
                 nf <= ao->nextarena->nfreepools) {
@@ -1508,26 +1557,89 @@
                nf <= ao->nextarena->nfreepools);
         assert(ao->prevarena == NULL ||
                nf > ao->prevarena->nfreepools);
+
+#elif ARENA_STRATEGY == 1
+        /* Strategy 1 is the new one.
+         * The list is kept sorted so that
+         * the lowest-address arenas are used first, which tends
+         * to cluster used memory into lower addresses, leaving higher
+         * address arenas more likely to get freed.
+         * In a few un-scientific tests, it seems like this
+         * approach works better than strategy 0.
+         *
+         * Case 2: If it wasn't in the list, we insert it according
+         * to address.
+         * Case 3: Otherwise, nothing to be done, since addresses don't
+         * change.
+         */
+        if (nf != 1) {
+            /* Case 3: it was already in the list.  There is no point
+             * in resorting, since addresses are constant.
+             */
+            UNLOCK();
+            return;
+        }
+        /* Case 2:
+         * Locate the new insertion point by iterating over
+         * the list, using our nextarena pointer.
+         */
+        {
+            struct arena_object *prev = NULL;
+            struct arena_object *next = usable_arenas;
+            while (next != NULL && ao->address > next->address) {
+                prev = next;
+                next = next->nextarena;
+            }
+            /* Insert ao at this point. */
+            ao->prevarena = prev;
+            ao->nextarena = next;
+            if (prev)
+                prev->nextarena = ao;
+            else
+                usable_arenas = ao;
+            if (next)
+                next->prevarena = ao;
+        }
+        /* Verify sorting criteria. */
+        assert(ao->nextarena == NULL ||
+               ao->address < ao->nextarena->address);
+        assert(ao->prevarena == NULL ||
+               ao->address > ao->prevarena->address);
+#endif
+        /* Verify linkage. */
         assert(ao->nextarena == NULL ||
                ao->nextarena->prevarena == ao);
         assert((usable_arenas == ao &&
                 ao->prevarena == NULL) ||
                ao->prevarena->nextarena == ao);
-
         UNLOCK();
         return;
     }
     /* Pool was full, so doesn't currently live in any list:
-     * link it to the front of the appropriate usedpools[] list.
-     * This mimics LRU pool usage for new allocations and
-     * targets optimal filling when several pools contain
-     * blocks of the same size class.
+     * link it into the appropriate usedpools[] list.
      */
     --pool->ref.count;
     assert(pool->ref.count > 0);            /* else the pool is empty */
     size = pool->szidx;
     next = usedpools[size + size];
     prev = next->prevpool;
+#if POOL_STRATEGY == 0
+    /* Link it at the head.
+     * This mimics LRU pool usage for new allocations and
+     * targets optimal filling when several pools contain
+     * blocks of the same size class.
+     */
+#elif POOL_STRATEGY == 1
+    /* Link it according to address in memory.  This allocates
+     * from low-address pools first, and tends to put unused
+     * pools together higher in memory, increasing the odds
+     * of releasing whole arenas.
+     */
+    assert(&prev->nextpool == &usedpools[size+size]);
+    while (pool > next && next != prev)
+        next = next->nextpool;
+    prev = next->prevpool;
+#endif
     /* insert pool before next:   prev <-> pool <-> next */
     pool->nextpool = next;
     pool->prevpool = prev;
diff -r 34ace7eb67e9 -r f3778ba0d988 Tools/scripts/memcrunch.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Tools/scripts/memcrunch.py	Mon Apr 14 18:13:58 2014 +0000
@@ -0,0 +1,175 @@
+# memcrunch.py
+"""
+This script tests the memory fragmentation in a python program that uses the object allocator.
+It does this by allocating and storing a lot of "objects" and then freeing some and recreating
+new ones in cycles, producing churn.
+The idea is to help tune python so that memory is released to the system when an application's
+memory use requirements go down.
+The fragmentation of memory is measured by performing statistics on the addresses of the objects
+stored.  This includes the total memory range in use, the average address relative to the start of
+the range, and the standard deviation.
+"""
+
+# test memory coherency and density
+from __future__ import print_function
+
+
+import random
+import sys
+import math
+import os
+
+# Because we use dicts, we need to disable hash randomization for consistent execution
+if sys.version_info[:2] >= (3,4) and os.environ.get("PYTHONHASHSEED", None) is None:
+    print("warning, set PYTHONHASHSEED=0 for consistency")
+
+random.seed(123)
+
+minsize = 1
+maxsize = 200
+try:
+    range = xrange
+    def values(d):
+        return d.itervalues()
+except NameError:
+    def values(d):
+        return d.values()
+
+
+stuff = {}  # allocated objects
+class mything(object):
+    """An object to allocate"""
+
+def add_item():
+    a = len(stuff)
+    k = random.randint(0, 2000000000)
+    #stuff[k] = [1]*random.randint(minsize, maxsize)
+    stuff[k] = mything()
+    if len(stuff) == a:
+        add_item()
+
+def pop_item():
+    stuff.popitem()
+
+def add_items(n):
+    for i in range(n):
+        add_item()
+
+def pop_items(n):
+    for i in range(n):
+        stuff.popitem()
+
+def poppush(n):
+    for i in range(n):
+        stuff.popitem()
+        add_item()
+
+
+def munge(m):
+    """Add m items, then remove m items"""
+    for j in range(m):
+        add_item()
+    for j in range(m):
+        pop_item()
+
+def thrash(m):
+    """Add and remove m times without growing"""
+    for i in range(m):
+        add_item()
+        pop_item()
+
+
+def stats(obj):
+    # compute running variance
+    A = 0.0
+    Q = 0.0
+    for i, e in enumerate(obj):
+        x = float(id(e))
+        a = A
+        A = a + (x - a)/(i+1)  # mean
+        Q = Q + (x - a)*(x - A)
+
+        if i == 0:
+            mi = ma = x
+        else:
+            mi = min(mi, x)
+            ma = max(ma, x)
+
+    # sample variance
+    s2 = Q / i
+    s = math.sqrt(s2)
+
+    # fragmentation
+    used = i * sys.getsizeof(e)
+    frag = 100.0 * ((ma - mi) - used) / (ma - mi)
+
+    return int(mi), int(ma), A, s, frag
+
+
+def printhead():
+    if not verbose:
+        return
+    print("range (kb)  mean (kb) stddev (kb)    mean(n)  frag(%)")
+
+def printstats(msg=""):
+    if not verbose:
+        return
+    mi, ma, mean, stddev, frag = stats(values(stuff))
+    # remove the min
+    ma -= mi
+    mean -= mi
+
+    # convert to Kb
+    ma /= 1024
+    mean /= 1024
+    stddev /= 1024
+
+    # normalize mean and stddev
+    nmean = mean / ma
+    nstddev = stddev / ma
+
+    # print range, normalized average, normalized stddev
+    n = len(stuff) / 1000
+
+    print("%10.2f %10.2f %10.2f %10.2f %5.1f %4dk %s"%(ma, mean, stddev, nmean, frag, n, msg))
+
+
+verbose = "-v" in sys.argv
+
+n = 100000  # number of things
+# first, burn a number of things, just to start up in a "full" environment
+add_items(n)
+burn = stuff
+stuff = {}
+
+# Fill the set up to n
+add_items(n)
+printhead()
+printstats("start")
+
+# remove stuff
+for i in range(3):
+    r = n // (2**(i+1))
+    pop_items(r)
+    printstats("removed %ik" % (r/1000,))
+
+# munge up to n // 2 a few times
+m = (n // 2) - len(stuff)
+for i in range(2):
+    munge(m)
+    printstats("munged %d"%(m, ))
+
+# Thrash memory keeping usage constant
+for i in range(2):
+    thrash(m)
+    printstats("thrashed %d"%(m, ))
+
+# output final memory use and fragmentation level
+mi, ma, mean, stddev, frag = stats(values(stuff))
+print((ma-mi), frag)
+#sys.stdin.readline()
+
+
+
+
+
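Note (not part of the patch): both POOL_STRATEGY == 1 and ARENA_STRATEGY == 1
above reduce to the same operation, a sorted insert into a doubly linked list
keyed on address, with allocation always taking place from the head (the
lowest address).  The following is a minimal standalone C sketch of that
insert; the `node` type and the `list_head`/`insert_by_address` names are
hypothetical stand-ins for obmalloc's pool_header/arena_object linkage, not
code taken from the patch.

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Minimal doubly linked node, standing in for a pool or arena header. */
    typedef struct node {
        struct node *prev;
        struct node *next;
    } node;

    static node *list_head = NULL;

    /* Insert n so that the list stays sorted by ascending address
     * ("lowest address" first), mirroring the insertion loops above. */
    static void
    insert_by_address(node *n)
    {
        node *prev = NULL;
        node *next = list_head;
        while (next != NULL && (uintptr_t)n > (uintptr_t)next) {
            prev = next;
            next = next->next;
        }
        n->prev = prev;
        n->next = next;
        if (prev)
            prev->next = n;
        else
            list_head = n;
        if (next)
            next->prev = n;
    }

    int
    main(void)
    {
        /* Insert a few heap-allocated nodes in whatever order malloc
         * returns them; traversal then yields ascending addresses.
         * (Error handling and freeing omitted for brevity.) */
        for (int i = 0; i < 5; i++)
            insert_by_address(malloc(sizeof(node)));
        for (node *p = list_head; p != NULL; p = p->next)
            printf("%p\n", (void *)p);
        return 0;
    }

Because allocation always starts at the head of such a list, activity clusters
at the low end of the address space, and the highest-address pools and arenas
are the most likely to become entirely free.  The effect can be observed with
the script added above, e.g. `PYTHONHASHSEED=0 python Tools/scripts/memcrunch.py -v`.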