diff --git a/Include/mutexed.h b/Include/mutexed.h new file mode 100644 --- /dev/null +++ b/Include/mutexed.h @@ -0,0 +1,156 @@ +/* + * Mutex related macros for the BFS scheduler - taken from ceval_gil.h. + */ + +#include <unistd.h> + +#ifndef _POSIX_THREADS +/* This means pthreads are not implemented in libc headers, hence the macro + not present in unistd.h. But they still can be implemented as an external + library (e.g. gnu pth in pthread emulation) */ +# ifdef HAVE_PTHREAD_H +# include <pthread.h> /* _POSIX_THREADS */ +# endif +#endif + +#ifdef _POSIX_THREADS + +/* + * POSIX support + */ + +#include <pthread.h> + +#define ADD_MICROSECONDS(tv, interval) \ +do { \ + tv.tv_usec += (long) interval; \ + tv.tv_sec += tv.tv_usec / 1000000; \ + tv.tv_usec %= 1000000; \ +} while (0) + +/* We assume all modern POSIX systems have gettimeofday() */ +#ifdef GETTIMEOFDAY_NO_TZ +#define GETTIMEOFDAY(ptv) gettimeofday(ptv) +#else +#define GETTIMEOFDAY(ptv) gettimeofday(ptv, (struct timezone *)NULL) +#endif + +#define MUTEX_T pthread_mutex_t +#define MUTEX_INIT(mut) \ + if (pthread_mutex_init(&mut, NULL)) { \ + Py_FatalError("pthread_mutex_init(" #mut ") failed"); }; +#define MUTEX_LOCK(mut) \ + if (pthread_mutex_lock(&mut)) { \ + Py_FatalError("pthread_mutex_lock(" #mut ") failed"); }; +#define MUTEX_UNLOCK(mut) \ + if (pthread_mutex_unlock(&mut)) { \ + Py_FatalError("pthread_mutex_unlock(" #mut ") failed"); }; + +#define COND_T pthread_cond_t +#define COND_INIT(cond) \ + if (pthread_cond_init(&cond, NULL)) { \ + Py_FatalError("pthread_cond_init(" #cond ") failed"); }; +#define COND_DESTROY(cond) \ + if (pthread_cond_destroy(&cond)) { \ + Py_FatalError("pthread_cond_destroy(" #cond ") failed"); }; +#define COND_RESET(cond) +#define COND_SIGNAL(cond) \ + if (pthread_cond_signal(&cond)) { \ + Py_FatalError("pthread_cond_signal(" #cond ") failed"); }; +#define COND_WAIT(cond, mut) \ + if (pthread_cond_wait(&cond, &mut)) { \ + Py_FatalError("pthread_cond_wait(" #cond ") failed"); }; +#define COND_TIMED_WAIT(cond, mut, seconds, timestamp_now, timeout_result) \ + { \ + int r; \ + struct timespec ts; \ + struct timeval deadline; \ + \ + if (timestamp_now == 0) { \ + GETTIMEOFDAY(&deadline); \ + } \ + else { \ + deadline.tv_sec = (int) timestamp_now; \ + deadline.tv_usec = (int) ((timestamp_now - deadline.tv_sec) * 1000000); \ + } \ + ADD_MICROSECONDS(deadline, ((int)(seconds * 1000000))); \ + ts.tv_sec = deadline.tv_sec; \ + ts.tv_nsec = deadline.tv_usec * 1000; \ + \ + r = pthread_cond_timedwait(&cond, &mut, &ts); \ + if (r == ETIMEDOUT) \ + timeout_result = 1; \ + else if (r) \ + Py_FatalError("pthread_cond_timedwait(" #cond ") failed"); \ + else \ + timeout_result = 0; \ + } \ + +#elif defined(NT_THREADS) + +/* + * Windows (2000 and later, as well as (hopefully) CE) support + */ + +#include <windows.h> + +/* Use a critical section since it is significantly faster than a mutex object. + * It should be enough for the needs of the BFS scheduler since on any + * signaled event there is exactly one waiting thread. However, the macros + * are not suited for the general case, so the code should probably move + * somewhere else.
*/ + +#define MUTEX_T CRITICAL_SECTION +#define MUTEX_INIT(mut) \ + InitializeCriticalSectionAndSpinCount(&mut, 4000); +#define MUTEX_LOCK(mut) \ + EnterCriticalSection(&mut); +#define MUTEX_UNLOCK(mut) \ + LeaveCriticalSection(&mut); + +#undef COND_T +#define COND_T HANDLE +#define COND_INIT(cond) \ + /* auto-reset, non-signalled */ \ + if (!(cond = CreateEvent(NULL, FALSE, FALSE, NULL))) { \ + Py_FatalError("CreateEvent(" #cond ") failed"); }; +#define COND_DESTROY(mut) \ + if (!CloseHandle(mut)) { \ + Py_FatalError("CloseHandle(" #mut ") failed"); }; +#define COND_RESET(cond) \ + if (!ResetEvent(cond)) { \ + Py_FatalError("ResetEvent(" #cond ") failed"); }; +#define COND_SIGNAL(cond) \ + if (!SetEvent(cond)) { \ + Py_FatalError("SetEvent(" #cond ") failed"); }; +#define COND_WAIT(cond, mut) \ + { \ + MUTEX_UNLOCK(mut); \ + if (WaitForSingleObject(cond, INFINITE) != WAIT_OBJECT_0) \ + Py_FatalError("WaitForSingleObject(" #cond ") failed"); \ + MUTEX_LOCK(mut); \ + } +#define COND_TIMED_WAIT(cond, mut, seconds, timestamp_now, timeout_result) \ + { \ + DWORD r; \ + MUTEX_UNLOCK(mut); \ + r = WaitForSingleObject(cond, (int)(seconds * 1000)); \ + if (r == WAIT_TIMEOUT) { \ + MUTEX_LOCK(mut); \ + timeout_result = 1; \ + } \ + else if (r != WAIT_OBJECT_0) \ + Py_FatalError("WaitForSingleObject(" #cond ") failed"); \ + else { \ + MUTEX_LOCK(mut); \ + timeout_result = 0; \ + } \ + } + +#else + +#error You need either a POSIX-compatible or a Windows system! + +#endif /* _POSIX_THREADS, NT_THREADS */ + + diff --git a/Include/pystate.h b/Include/pystate.h --- a/Include/pystate.h +++ b/Include/pystate.h @@ -8,6 +8,21 @@ extern "C" { #endif +#include "pythread.h" + +/* Temporarily work around cross-platform (nt/posix) compilation issues. + * Can't include mutexed.h directly on NT since it includes windows.h + * and breaks compilation solution-wide. */ + +#undef COND_T +#ifdef MS_WINDOWS +typedef void *COND_HANDLE; +#define COND_T COND_HANDLE +#else +#include "mutexed.h" +#endif + + /* State shared between threads */ struct _ts; /* Forward */ @@ -55,6 +70,46 @@ #define PyTrace_C_EXCEPTION 5 #define PyTrace_C_RETURN 6 +#define CONTAINER(type, member, ptr) \ + ((type *)((char *)(ptr) - offsetof(type, member))) + +struct _list { + struct _list *next; + struct _list *prev; +}; + +/* Py_LOCAL_INLINE() does not work - see issue bugs.python.org/issue5553 */ + +#ifdef MS_WINDOWS +#define _LOCAL(type) static type +#else +#define _LOCAL(type) static inline type +#endif + +_LOCAL(void) _list_init(struct _list *l) { + l->next = l; + l->prev = l; +} + +_LOCAL(int) _list_empty(struct _list *l) { + return l->next == l; +} + +_LOCAL(void) _list_append(struct _list *l, struct _list *item) { + struct _list *tail = l->prev; + tail->next = item; + item->prev = tail; + item->next = l; + l->prev = item; +} + +_LOCAL(void) _list_pop(struct _list *item) { + item->next->prev = item->prev; + item->prev->next = item->next; + item->prev = NULL; + item->next = NULL; +} + typedef struct _ts { /* See Python/ceval.c for comments explaining most fields */ @@ -88,8 +143,6 @@ PyObject *dict; /* Stores per-thread state */ - /* XXX doesn't mean anything anymore (the comment below is obsolete) - => deprecate or remove? */ /* tick_counter is incremented whenever the check_interval ticker * reaches zero. The purpose is to give a useful measure of the number * of interpreted bytecode instructions in a given thread.
This @@ -98,15 +151,21 @@ */ int tick_counter; + COND_T bfs_cond; + struct _list bfs_list; + long double bfs_slice; + long double bfs_deadline; + long double bfs_timestamp; + int gilstate_counter; PyObject *async_exc; /* Asynchronous exception to raise */ long thread_id; /* Thread id where this tstate was created */ /* XXX signal handlers should also be here */ - } PyThreadState; +#define THREAD_STATE(ptr) CONTAINER(PyThreadState, bfs_list, ptr) PyAPI_FUNC(PyInterpreterState *) PyInterpreterState_New(void); PyAPI_FUNC(void) PyInterpreterState_Clear(PyInterpreterState *); diff --git a/Python/bfs.c b/Python/bfs.c new file mode 100644 --- /dev/null +++ b/Python/bfs.c @@ -0,0 +1,432 @@ +/* + * Python/bfs.c + * + * Simplified implementation of the Brain F**k Scheduler by Con Kolivas + * http://ck.kolivas.org/patches/bfs/sched-BFS.txt + * + * "The goal of the Brain F**k Scheduler, referred to as BFS from here on, is to + * completely do away with the complex designs of the past for the cpu process + * scheduler and instead implement one that is very simple in basic design. + * The main focus of BFS is to achieve excellent desktop interactivity and + * responsiveness without heuristics and tuning knobs that are difficult to + * understand, impossible to model and predict the effect of, and when tuned to + * one workload cause massive detriment to another." - Con Kolivas + * + * Notes: + * There are several mechanisms in this implementation which are particular to + * Python: + * + * 1) Clock - Except for Linux, most platforms do not provide an API to query + * a thread's running time with high precision. Therefore timing is done with a + * wall clock. The scheduler seems to behave reasonably with a wall clock even + * under load, probably since the OS scheduler does not tend to preempt IO + * bound threads. + * + * 2) Clock precision - The code works best with a high precision clock (1ms or + * better). With a low precision clock, the scheduler becomes sensitive to tasks + * which are synchronized with the interval size. There is a related discussion + * about this in the BFS design doc under sub-tick accounting. + * + * 3) Slow ticks - Querying the clock between ticks is too expensive, so it + * is done once every 1000 ticks. However, since even a single slow tick can + * deplete a slice (10ms), a special flag exists (bfs_check_depleted) which + * is set by waiting threads and which forces the running thread to check + * its remaining slice by the next tick. + * + * 4) Fast reschedule - Allow a thread to reschedule itself quickly after a yield + * if the signaled next-to-run thread has not yet been woken up by the OS. Since + * a wall clock is used, care must be taken not to allow too many such + * reschedules in a row, since that would make an "IO" bound thread look like + * a CPU bound thread and lose the OS 'protection' (see discussion about + * clock above). Therefore, fast reschedule is limited by + * FAST_RESCHEDULE_THRESHOLD. + * + * 5) Force switch - A thread which yields to BFS may signal the next thread + * but continue to run (e.g. in C library code). If the OS schedules the + * next thread on the same core, it will take a while before it actually gets + * to run. Therefore, to prevent this situation, when a thread yields to BFS, + * it will block if the next thread has a more urgent deadline, until that + * thread starts running. This improves throughput and latency. + * + * 6) Multi-core clock - On older multi-core CPUs, dynamic frequency scaling + * may result in unsynchronized timestamp readings between cores.
The code + * alleviates this problem with capping and by using a "Python" clock + * which is created by accumulating timestamp diffs. + * + * 7) Multi-core contention - On 4-core machines, a thread will spin a little + * before being scheduled to allow the previous yielding thread to reschedule + * itself if it is more urgent. This reduces context switching considerably + * on Windows and increases fairness and throughput. + */ + + +#include "time.h" +#include "mutexed.h" + + +/* Round robin interval - thread time slice for running (in seconds). + * It is set at 10ms to match Windows tick resolution. */ +static const long double rr_interval = 0.010; + +/* Magic number taken from Con's implementation. There it is computed as + * 1.1 ^ 20 (the nice level of a regular thread). Basically it controls the load + * beyond which threads tend to expire their deadline. Expired threads are + * handled in FIFO order, which means that at such a load IO bound threads + * will no longer preempt running threads. */ +#define DEADLINE_FACTOR 8 + +/* Default slice to consume on a negative timestamp diff, 100usec. */ +#define MIN_SLICE 0.000100 + +/* Protect access to data structures. */ +static MUTEX_T bfs_mutex; + +/* List of all threads waiting to be scheduled. */ +static struct _list bfs_rq; + +/* Flag that threads are waiting to be scheduled. */ +static volatile int bfs_threads_waiting = 0; + +/* Pointer to the currently running thread. */ +static volatile PyThreadState *bfs_thread = NULL; + +/* Fast reschedule variables. */ +static volatile PyThreadState *bfs_thread_prev = NULL; +#define FAST_RESCHEDULE_THRESHOLD 0.001 + +static volatile PyThreadState *bfs_thread_switch = NULL; +static volatile long double bfs_last_switch = 0; + +/* Used as a "clock" shared across CPU cores. */ +static volatile long double bfs_python_time = 0; + +/* Flag the currently running thread to yield immediately. */ +static volatile int bfs_preempt = 0; + +/* Flag the currently running thread to check its remaining slice immediately. + * This is required since even a single slow Python tick can deplete + * a 10ms slice, let alone 1000 ticks. */ +static volatile int bfs_check_depleted = 0; + +static volatile int bfs_spin = 0; + +static int bfs_initialized = 0; + +//#define VERBOSE +#undef VERBOSE +#ifdef VERBOSE +#define TRACE(v) printf v +#else +#define TRACE(v) +#endif + + +#ifdef MS_WINDOWS + +/* Return a system-wide timestamp in seconds (with usec precision). */ +_LOCAL(long double) get_timestamp(void) { + static LARGE_INTEGER ctrStart; + static long double divisor = 0.0; + LARGE_INTEGER now; + + if (divisor == 0.0) { + LARGE_INTEGER freq; + QueryPerformanceCounter(&ctrStart); + QueryPerformanceFrequency(&freq); + divisor = (long double) freq.QuadPart; + } + + QueryPerformanceCounter(&now); + return (now.QuadPart - ctrStart.QuadPart) / divisor; +} + +#elif defined(HAVE_GETTIMEOFDAY) + +/* Return a system-wide timestamp in seconds (with usec precision). */ +_LOCAL(long double) get_timestamp(void) { + struct timeval tv; + GETTIMEOFDAY(&tv); + return (long double) (tv.tv_sec + tv.tv_usec * 0.000001); +} + +#else + +#error You need either a POSIX-compatible or a Windows system! + +#endif /* MS_WINDOWS, HAVE_GETTIMEOFDAY */ + + +static void bfs_init(void) { + MUTEX_INIT(bfs_mutex); + _list_init(&bfs_rq); + bfs_initialized = 1; +} + + +/* Pick the next thread to schedule based on earliest deadline.
*/ +static PyThreadState *bfs_find_task(long double python_time) { + struct _list *item; + PyThreadState *task; + + if (_list_empty(&bfs_rq)) + return NULL; + + item = bfs_rq.next; + task = THREAD_STATE(item); + + /* Search for the thread with the earliest deadline or the first expired deadline. + * IO bound threads typically have expired deadlines since they naturally + * take longer than their original deadline to deplete their slice. */ + for (item = item->next; item != &bfs_rq && task->bfs_deadline > python_time; item = item->next) { + PyThreadState *tstate = THREAD_STATE(item); + if (tstate->bfs_deadline < task->bfs_deadline) { + task = tstate; + } + } + + return task; +} + + +/* Wait for the currently running thread to signal this thread (tstate) + * to take over the schedule. Call with the mutex locked. */ +static void _bfs_timed_wait(PyThreadState *tstate, long double timestamp) { + /* Use a timeout to guard against slow ticks of the running thread, but + * multiply by the number of waiting threads to prevent a growing number of + * context switches. */ + long double timeout = bfs_threads_waiting * rr_interval / 2; + int timed_out = 0; + + COND_TIMED_WAIT(tstate->bfs_cond, bfs_mutex, timeout, timestamp, timed_out); + + /* Flag the running thread to check slice depletion immediately. It has possibly + * been doing slow ticks (e.g. regular expression searches) and depleted its + * slice. */ + if (timed_out) { + bfs_check_depleted = 1; + COMPUTE_EVAL_BREAKER(); + } +} + + +/* Diff timestamps, capping the result to protect against clock differences + * between cores. */ +_LOCAL(long double) _bfs_diff_ts(long double ts1, long double ts0) { + if (ts1 < ts0) + return MIN_SLICE; + + if (ts1 - ts0 > rr_interval) + return rr_interval; + + return ts1 - ts0; +} + + +/* Schedule thread (tstate) for running. */ +static void bfs_schedule(PyThreadState *tstate) { +#ifdef VERBOSE + long double _slice = tstate->bfs_slice; + long double _deadline = tstate->bfs_deadline; +#endif + + long double timestamp = get_timestamp(); + int spin; + + MUTEX_LOCK(bfs_mutex); + + /* Refill a depleted slice and reset the scheduling deadline. */ + if (tstate->bfs_slice <= 0) { + tstate->bfs_slice = rr_interval; + tstate->bfs_deadline = bfs_python_time + rr_interval * DEADLINE_FACTOR; + } + + TRACE(("SCHD %p - %Lf, pt=%Lf, slice=%Lf, deadline-in=%Lf, deadline-on=%Lf\n", tstate, timestamp, bfs_python_time, _slice, _deadline - bfs_python_time, tstate->bfs_deadline)); + + do { + /* Schedule immediately if no thread is currently running. */ + if (bfs_thread == NULL) { + bfs_thread = tstate; + bfs_thread_prev = NULL; + bfs_last_switch = timestamp; + break; + } + + /* Schedule immediately if the next thread (bfs_thread) signaled by + * this thread (tstate) did not take over yet and has a less urgent + * deadline. */ + if (bfs_thread_prev == tstate && + tstate->bfs_deadline <= bfs_thread->bfs_deadline && + bfs_thread->bfs_deadline >= bfs_python_time && + timestamp - bfs_last_switch < FAST_RESCHEDULE_THRESHOLD) { + bfs_thread = tstate; + bfs_thread_prev = NULL; + break; + } + + /* Schedule immediately if the next thread (bfs_thread) did not take + * over yet and has a less urgent deadline.
*/ + if (bfs_thread_prev != tstate && bfs_thread_prev != NULL && + tstate->bfs_deadline <= bfs_thread->bfs_deadline && + bfs_thread->bfs_deadline >= bfs_python_time) { + bfs_thread = tstate; + bfs_thread_prev = NULL; + bfs_last_switch = timestamp; + break; + } + + /* Preempt running thread if its deadline is less urgent, unless + * its deadline has already expired, in which case handle + * request as FIFO. */ + if (tstate->bfs_deadline < bfs_thread->bfs_deadline && + bfs_thread->bfs_deadline >= bfs_python_time) { + bfs_preempt = 1; + COMPUTE_EVAL_BREAKER(); + } + + /* Flag running thread to check depletion of its slice. */ + else if (bfs_thread->bfs_slice <= timestamp - bfs_thread->bfs_timestamp) { + bfs_check_depleted = 1; + COMPUTE_EVAL_BREAKER(); + } + + /* Queue and wait for schedule */ + + _list_append(&bfs_rq, &tstate->bfs_list); + bfs_threads_waiting++; + + COND_RESET(tstate->bfs_cond); + while (bfs_thread != tstate) { + _bfs_timed_wait(tstate, timestamp); + /* Reduce multi-core contention on fast yield-schedule cycles. + * If yielding thread is more urgent give it a chance to + * reschedule by spinning a little. */ + if (bfs_thread == tstate && bfs_thread_prev != NULL && + tstate->bfs_deadline > bfs_thread_prev->bfs_deadline && + tstate->bfs_deadline >= bfs_python_time) { + MUTEX_UNLOCK(bfs_mutex); + for (spin = 1000; spin > 0 && bfs_thread == tstate; spin--) { + bfs_spin++; + } + MUTEX_LOCK(bfs_mutex); + } + timestamp = get_timestamp(); + } + + _list_pop(&tstate->bfs_list); + bfs_threads_waiting--; + + bfs_thread_prev = NULL; + bfs_last_switch = timestamp; + break; + } while(0); + + /* Signal previous thread which is waiting for the switch. */ + if (bfs_thread_switch != NULL) { + COND_SIGNAL(((PyThreadState*)bfs_thread_switch)->bfs_cond); + bfs_thread_switch = NULL; + } + + tstate->bfs_timestamp = timestamp; + + TRACE(("TAKE %p - %Lf, pt=%Lf\n", tstate, timestamp, bfs_python_time)); + MUTEX_UNLOCK(bfs_mutex); +} + + +/* Yield schedule to another thread. */ +static void _bfs_yield(PyThreadState *tstate, int fast_reschedule) { + long double timestamp = get_timestamp(); + long double interval; + + MUTEX_LOCK(bfs_mutex); + + /* Update running time slice (book keeping). */ + if (tstate != NULL) { + interval = _bfs_diff_ts(timestamp, tstate->bfs_timestamp); + bfs_python_time += interval; + tstate->bfs_slice -= interval; + } + + TRACE(("DROP %p - %Lf, pt=%Lf\n", tstate, timestamp, bfs_python_time)); + + /* Reset preemption flags - we heard the shout. */ + bfs_preempt = 0; + bfs_check_depleted = 0; + COMPUTE_EVAL_BREAKER(); + + /* Allow re-grabbing schedule back before next thread in fast + * yield-schedule cycles. */ + if (fast_reschedule) { + bfs_thread_prev = tstate; + } + + /* Find earliest deadline thread to run */ + bfs_thread = bfs_find_task(bfs_python_time); + if (bfs_thread != NULL) { + TRACE(("SGNL %p - %Lf, signal %p, bfs_thread_switch=%p\n", tstate, timestamp, bfs_thread, bfs_thread_switch)); + COND_SIGNAL(((PyThreadState*)bfs_thread)->bfs_cond); + } + + /* Force yield of OS slice if next thread is more urgent. This + * is required since OS may schedule next thread to run on same core. 
*/ + if (bfs_thread != NULL && bfs_thread_switch == NULL && tstate != NULL && + (tstate->bfs_slice <= 0 || bfs_thread->bfs_deadline < bfs_python_time || + tstate->bfs_deadline >= bfs_thread->bfs_deadline)) { + bfs_thread_switch = tstate; + COND_RESET(tstate->bfs_cond); + while (bfs_thread_switch == tstate) { + COND_WAIT(tstate->bfs_cond, bfs_mutex); + } + } + + MUTEX_UNLOCK(bfs_mutex); +} + + +/* Yield schedule to another thread. */ +static void bfs_yield(PyThreadState *tstate) { + _bfs_yield(tstate, 1); +} + + +/* Check if thread depleted its running time slice. */ +_LOCAL(int) bfs_depleted_slice(PyThreadState *tstate) { + /* A reading is about 1 usec on a modern CPU. */ + return tstate->bfs_slice <= _bfs_diff_ts(get_timestamp(), tstate->bfs_timestamp); +} + + +/* Propose scheduler to switch to a different thread - a la cooperative + * multitasking. */ +_LOCAL(void) bfs_checkpoint(PyThreadState *tstate) { + TRACE(("CHCK %p - %Lf, preempt=%d, check_depleted=%d\n", tstate, get_timestamp(), bfs_preempt, bfs_check_depleted)); + + bfs_check_depleted = 0; + COMPUTE_EVAL_BREAKER(); + + if (bfs_preempt || bfs_depleted_slice(tstate)) { + _bfs_yield(tstate, 0); + bfs_schedule(tstate); + } +} + + +/* Cancel thread request for schedule. + * Assumes canceled thread is not running or being scheduled concurrently.*/ +void bfs_cancel(PyThreadState *tstate) { + if (tstate->bfs_list.next == NULL) + return; + + MUTEX_LOCK(bfs_mutex); + + if (bfs_thread == tstate) + Py_FatalError("Attempt to cancel scheduled thread."); + + if (tstate->bfs_list.next != NULL) { + _list_pop(&tstate->bfs_list); + } + + MUTEX_UNLOCK(bfs_mutex); +} + + diff --git a/Python/ceval.c b/Python/ceval.c --- a/Python/ceval.c +++ b/Python/ceval.c @@ -217,13 +217,7 @@ #define COMPUTE_EVAL_BREAKER() \ - (eval_breaker = gil_drop_request | pendingcalls_to_do | pending_async_exc) - -#define SET_GIL_DROP_REQUEST() \ - do { gil_drop_request = 1; eval_breaker = 1; } while (0) - -#define RESET_GIL_DROP_REQUEST() \ - do { gil_drop_request = 0; COMPUTE_EVAL_BREAKER(); } while (0) + (eval_breaker = bfs_preempt | bfs_check_depleted | pendingcalls_to_do | pending_async_exc) #define SIGNAL_PENDING_CALLS() \ do { pendingcalls_to_do = 1; eval_breaker = 1; } while (0) @@ -247,34 +241,42 @@ static PyThread_type_lock pending_lock = 0; /* for pending calls */ static long main_thread = 0; +static int threads_init = 0; /* This single variable consolidates all requests to break out of the fast path in the eval loop. 
*/ static volatile int eval_breaker = 0; -/* Request for droppping the GIL */ -static volatile int gil_drop_request = 0; /* Request for running pending calls */ static volatile int pendingcalls_to_do = 0; /* Request for looking at the `async_exc` field of the current thread state */ static volatile int pending_async_exc = 0; -#include "ceval_gil.h" +#define SCHEDULER_INTERVAL_TICKS 2000 +static volatile int ticks = SCHEDULER_INTERVAL_TICKS; + +#include "bfs.c" + +#define EVAL_BREAKER() (eval_breaker) int PyEval_ThreadsInitialized(void) { - return gil_created(); + return threads_init == 1; } void PyEval_InitThreads(void) { - if (gil_created()) - return; - create_gil(); - take_gil(PyThreadState_GET()); + if (PyEval_ThreadsInitialized()) + return; + + bfs_init(); + bfs_schedule(PyThreadState_GET()); + main_thread = PyThread_get_thread_ident(); if (!pending_lock) pending_lock = PyThread_allocate_lock(); + + threads_init = 1; } void @@ -283,7 +285,8 @@ PyThreadState *tstate = PyThreadState_GET(); if (tstate == NULL) Py_FatalError("PyEval_AcquireLock: current thread state is NULL"); - take_gil(tstate); + + bfs_schedule(tstate); } void @@ -293,7 +296,7 @@ We therefore avoid PyThreadState_GET() which dumps a fatal error in debug mode. */ - drop_gil(_PyThreadState_Current); + bfs_yield(_PyThreadState_Current); } void @@ -301,9 +304,11 @@ { if (tstate == NULL) Py_FatalError("PyEval_AcquireThread: NULL new thread state"); - /* Check someone has called PyEval_InitThreads() to create the lock */ - assert(gil_created()); - take_gil(tstate); + + /* Check someone has called PyEval_InitThreads() to create the lock */ + assert(PyEval_ThreadsInitialized()); + + bfs_schedule(tstate); if (PyThreadState_Swap(tstate) != NULL) Py_FatalError( "PyEval_AcquireThread: non-NULL old thread state"); @@ -316,7 +321,8 @@ Py_FatalError("PyEval_ReleaseThread: NULL thread state"); if (PyThreadState_Swap(NULL) != tstate) Py_FatalError("PyEval_ReleaseThread: wrong thread state"); - drop_gil(tstate); + + bfs_yield(tstate); } /* This function is called from PyOS_AfterFork to ensure that newly @@ -330,15 +336,17 @@ PyObject *threading, *result; PyThreadState *tstate = PyThreadState_GET(); - if (!gil_created()) - return; + if (!PyEval_ThreadsInitialized()) + return; + /*XXX Can't use PyThread_free_lock here because it does too much error-checking. Doing this cleanly would require adding a new function to each thread_*.h. Instead, just create a new lock and waste a little bit of memory */ - recreate_gil(); + bfs_init(); + pending_lock = PyThread_allocate_lock(); - take_gil(tstate); + bfs_schedule(tstate); main_thread = PyThread_get_thread_ident(); /* Update the threading module with the new state. 
@@ -361,8 +369,10 @@ #else static int eval_breaker = 0; -static int gil_drop_request = 0; static int pending_async_exc = 0; + +#define EVAL_BREAKER() (eval_breaker) + #endif /* WITH_THREAD */ /* This function is used to signal that async exceptions are waiting to be @@ -384,11 +394,14 @@ PyThreadState *tstate = PyThreadState_Swap(NULL); if (tstate == NULL) Py_FatalError("PyEval_SaveThread: NULL tstate"); -#ifdef WITH_THREAD - if (gil_created()) - drop_gil(tstate); -#endif - return tstate; + +#ifdef WITH_THREAD + if (PyEval_ThreadsInitialized()) { + bfs_yield(tstate); + } +#endif + + return tstate; } void @@ -396,14 +409,16 @@ { if (tstate == NULL) Py_FatalError("PyEval_RestoreThread: NULL tstate"); -#ifdef WITH_THREAD - if (gil_created()) { + +#ifdef WITH_THREAD + if (PyEval_ThreadsInitialized()) { int err = errno; - take_gil(tstate); + bfs_schedule(tstate); errno = err; } #endif - PyThreadState_Swap(tstate); + + PyThreadState_Swap(tstate); } @@ -843,7 +858,10 @@ #define DISPATCH() \ { \ - if (!eval_breaker) { \ + if (bfs_threads_waiting) { \ + ticks--; \ + } \ + if (!eval_breaker && ticks) { \ FAST_DISPATCH(); \ } \ continue; \ @@ -1217,7 +1235,16 @@ async I/O handler); see Py_AddPendingCall() and Py_MakePendingCalls() above. */ - if (eval_breaker) { +#ifndef USE_COMPUTED_GOTOS + /* gcc will segfault on an attempt to increment ticks without + * the if clause. */ + if (bfs_threads_waiting) + ticks--; +#endif + + if (EVAL_BREAKER() || !ticks) { + ticks = SCHEDULER_INTERVAL_TICKS; + if (*next_instr == SETUP_FINALLY) { /* Make the last opcode before a try: finally: block uninterruptable. */ @@ -1233,20 +1260,18 @@ goto on_error; } } - if (gil_drop_request) { -#ifdef WITH_THREAD - /* Give another thread a chance */ - if (PyThreadState_Swap(NULL) != tstate) - Py_FatalError("ceval: tstate mix-up"); - drop_gil(tstate); - - /* Other threads may run now */ - - take_gil(tstate); - if (PyThreadState_Swap(tstate) != NULL) - Py_FatalError("ceval: orphan tstate"); -#endif - } + +#ifdef WITH_THREAD + /* Give another thread a chance */ + if (PyThreadState_Swap(NULL) != tstate) + Py_FatalError("ceval: tstate mix-up"); + + bfs_checkpoint(tstate); + + if (PyThreadState_Swap(tstate) != NULL) + Py_FatalError("ceval: orphan tstate"); +#endif + /* Check for asynchronous exceptions. 
*/ if (tstate->async_exc != NULL) { x = tstate->async_exc; diff --git a/Python/pystate.c b/Python/pystate.c --- a/Python/pystate.c +++ b/Python/pystate.c @@ -2,6 +2,7 @@ /* Thread and interpreter state structures and their interfaces */ #include "Python.h" +#include "mutexed.h" /* -------------------------------------------------------------------------- CAUTION @@ -198,6 +199,13 @@ tstate->c_profileobj = NULL; tstate->c_traceobj = NULL; + COND_INIT(tstate->bfs_cond); + tstate->bfs_list.next = NULL; + tstate->bfs_list.prev = NULL; + tstate->bfs_slice = 0; + tstate->bfs_deadline = 0; + tstate->bfs_timestamp = 0; + if (init) _PyThreadState_Init(tstate); @@ -271,8 +279,8 @@ if (Py_VerboseFlag && tstate->frame != NULL) fprintf(stderr, "PyThreadState_Clear: warning: thread still has a frame\n"); - - Py_CLEAR(tstate->frame); + + Py_CLEAR(tstate->frame); Py_CLEAR(tstate->dict); Py_CLEAR(tstate->async_exc); @@ -304,7 +312,8 @@ interp = tstate->interp; if (interp == NULL) Py_FatalError("PyThreadState_Delete: NULL interp"); - HEAD_LOCK(); + + HEAD_LOCK(); for (p = &interp->tstate_head; ; p = &(*p)->next) { if (*p == NULL) Py_FatalError( @@ -345,6 +354,10 @@ #ifdef WITH_THREAD + +/* Defined in Python/bfs.c */ +PyAPI_FUNC(void) bfs_cancel(PyThreadState *tstate); + void PyThreadState_DeleteCurrent() { @@ -354,6 +367,11 @@ "PyThreadState_DeleteCurrent: no current tstate"); _PyThreadState_Current = NULL; tstate_delete_common(tstate); + + bfs_cancel(tstate); + COND_SIGNAL(tstate->bfs_cond); + COND_DESTROY(tstate->bfs_cond); + if (autoTLSkey && PyThread_get_key_value(autoTLSkey) == tstate) PyThread_delete_key_value(autoTLSkey); PyEval_ReleaseLock(); diff --git a/Python/sysmodule.c b/Python/sysmodule.c --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -490,7 +490,7 @@ static PyObject * sys_setswitchinterval(PyObject *self, PyObject *args) { - double d; + /*double d; if (!PyArg_ParseTuple(args, "d:setswitchinterval", &d)) return NULL; if (d <= 0.0) { @@ -498,7 +498,7 @@ "switch interval must be strictly positive"); return NULL; } - _PyEval_SetSwitchInterval((unsigned long) (1e6 * d)); + _PyEval_SetSwitchInterval((unsigned long) (1e6 * d));*/ Py_INCREF(Py_None); return Py_None; } @@ -518,7 +518,7 @@ static PyObject * sys_getswitchinterval(PyObject *self, PyObject *args) { - return PyFloat_FromDouble(1e-6 * _PyEval_GetSwitchInterval()); + return PyFloat_FromDouble(1000.0);//1e-6 * _PyEval_GetSwitchInterval()); } PyDoc_STRVAR(getswitchinterval_doc,