diff --git a/Include/bfs.h b/Include/bfs.h new file mode 100644 --- /dev/null +++ b/Include/bfs.h @@ -0,0 +1,32 @@ +/* + * Python/bfs.h + * + * Simplified implementation of the Brain F**k Scheduler by Con Kolivas + * http://ck.kolivas.org/patches/bfs/sched-BFS.txt + * + * "The goal of the Brain F**k Scheduler, referred to as BFS from here on, is to + * completely do away with the complex designs of the past for the cpu process + * scheduler and instead implement one that is very simple in basic design. + * The main focus of BFS is to achieve excellent desktop interactivity and + * responsiveness without heuristics and tuning knobs that are difficult to + * understand, impossible to model and predict the effect of, and when tuned to + * one workload cause massive detriment to another." - Con Kolivas + */ + + +#ifndef Py_BFS_H +#define Py_BFS_H +#ifdef __cplusplus +extern "C" { +#endif + + +PyAPI_FUNC(void) bfs_clear(PyThreadState *tstate); + + +#ifdef __cplusplus +} +#endif +#endif /* !Py_BFS_H */ + + diff --git a/Include/cycle.h b/Include/cycle.h new file mode 100644 --- /dev/null +++ b/Include/cycle.h @@ -0,0 +1,515 @@ +/* + * Copyright (c) 2003, 2007-8 Matteo Frigo + * Copyright (c) 2003, 2007-8 Massachusetts Institute of Technology + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + */ + + +/* machine-dependent cycle counters code. Needs to be inlined. */ + +/***************************************************************************/ +/* To use the cycle counters in your code, simply #include "cycle.h" (this + file), and then use the functions/macros: + + ticks getticks(void); + + ticks is an opaque typedef defined below, representing the current time. + You extract the elapsed time between two calls to gettick() via: + + double elapsed(ticks t1, ticks t0); + + which returns a double-precision variable in arbitrary units. You + are not expected to convert this into human units like seconds; it + is intended only for *comparisons* of time intervals. + + (In order to use some of the OS-dependent timer routines like + Solaris' gethrtime, you need to paste the autoconf snippet below + into your configure.ac file and #include "config.h" before cycle.h, + or define the relevant macros manually if you are not using autoconf.) 
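   As a usage sketch (illustrative only; do_work() is a placeholder, not part
   of this header), timing a region of code looks like:

       ticks t0, t1;
       double dt;

       t0 = getticks();
       do_work();
       t1 = getticks();
       dt = elapsed(t1, t0);

   where dt is in arbitrary units and is meaningful only when compared with
   other intervals measured the same way.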
+*/ + +/***************************************************************************/ +/* This file uses macros like HAVE_GETHRTIME that are assumed to be + defined according to whether the corresponding function/type/header + is available on your system. The necessary macros are most + conveniently defined if you are using GNU autoconf, via the tests: + + dnl --------------------------------------------------------------------- + + AC_C_INLINE + AC_HEADER_TIME + AC_CHECK_HEADERS([sys/time.h c_asm.h intrinsics.h mach/mach_time.h]) + + AC_CHECK_TYPE([hrtime_t],[AC_DEFINE(HAVE_HRTIME_T, 1, [Define to 1 if hrtime_t is defined in ])],,[#if HAVE_SYS_TIME_H +#include +#endif]) + + AC_CHECK_FUNCS([gethrtime read_real_time time_base_to_time clock_gettime mach_absolute_time]) + + dnl Cray UNICOS _rtc() (real-time clock) intrinsic + AC_MSG_CHECKING([for _rtc intrinsic]) + rtc_ok=yes + AC_TRY_LINK([#ifdef HAVE_INTRINSICS_H +#include +#endif], [_rtc()], [AC_DEFINE(HAVE__RTC,1,[Define if you have the UNICOS _rtc() intrinsic.])], [rtc_ok=no]) + AC_MSG_RESULT($rtc_ok) + + dnl --------------------------------------------------------------------- +*/ + +/***************************************************************************/ + +#if TIME_WITH_SYS_TIME +# include +# include +#else +# if HAVE_SYS_TIME_H +# include +# else +# include +# endif +#endif + +#define INLINE_ELAPSED(INL) static INL double elapsed(ticks t1, ticks t0) \ +{ \ + return (double)t1 - (double)t0; \ +} + +/*----------------------------------------------------------------*/ +/* Solaris */ +#if defined(HAVE_GETHRTIME) && defined(HAVE_HRTIME_T) && !defined(HAVE_TICK_COUNTER) +typedef hrtime_t ticks; + +#define getticks gethrtime + +INLINE_ELAPSED(inline) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* AIX v. 4+ routines to read the real-time clock or time-base register */ +#if defined(HAVE_READ_REAL_TIME) && defined(HAVE_TIME_BASE_TO_TIME) && !defined(HAVE_TICK_COUNTER) +typedef timebasestruct_t ticks; + +static __inline ticks getticks(void) +{ + ticks t; + read_real_time(&t, TIMEBASE_SZ); + return t; +} + +static __inline double elapsed(ticks t1, ticks t0) /* time in nanoseconds */ +{ + time_base_to_time(&t1, TIMEBASE_SZ); + time_base_to_time(&t0, TIMEBASE_SZ); + return (((double)t1.tb_high - (double)t0.tb_high) * 1.0e9 + + ((double)t1.tb_low - (double)t0.tb_low)); +} + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* + * PowerPC ``cycle'' counter using the time base register. + */ +#if ((((defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__))) || (defined(__MWERKS__) && defined(macintosh)))) || (defined(__IBM_GCC_ASM) && (defined(__powerpc__) || defined(__ppc__)))) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long long ticks; + +static __inline__ ticks getticks(void) +{ + unsigned int tbl, tbu0, tbu1; + + do { + __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0)); + __asm__ __volatile__ ("mftb %0" : "=r"(tbl)); + __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1)); + } while (tbu0 != tbu1); + + return (((unsigned long long)tbu0) << 32) | tbl; +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif + +/* MacOS/Mach (Darwin) time-base register interface (unlike UpTime, + from Carbon, requires no additional libraries to be linked). 
*/ +#if defined(HAVE_MACH_ABSOLUTE_TIME) && defined(HAVE_MACH_MACH_TIME_H) && !defined(HAVE_TICK_COUNTER) +#include +typedef uint64_t ticks; +#define getticks mach_absolute_time +INLINE_ELAPSED(__inline__) +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* + * Pentium cycle counter + */ +#if (defined(__GNUC__) || defined(__ICC)) && defined(__i386__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long long ticks; + +static __inline__ ticks getticks(void) +{ + ticks ret; + + __asm__ __volatile__("rdtsc": "=A" (ret)); + /* no input, nothing else clobbered */ + return ret; +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#define TIME_MIN 5000.0 /* unreliable pentium IV cycle counter */ +#endif + +/* Visual C++ -- thanks to Morten Nissov for his help with this */ +#if _MSC_VER >= 1200 && _M_IX86 >= 500 && !defined(HAVE_TICK_COUNTER) +#include +typedef LARGE_INTEGER ticks; +#define RDTSC __asm __emit 0fh __asm __emit 031h /* hack for VC++ 5.0 */ + +static __inline ticks getticks(void) +{ + ticks retval; + + __asm { + RDTSC + mov retval.HighPart, edx + mov retval.LowPart, eax + } + return retval; +} + +static __inline double elapsed(ticks t1, ticks t0) +{ + return (double)t1.QuadPart - (double)t0.QuadPart; +} + +#define HAVE_TICK_COUNTER_LARGE_INTEGER +#define HAVE_TICK_COUNTER +#define TIME_MIN 5000.0 /* unreliable pentium IV cycle counter */ +#endif + +/*----------------------------------------------------------------*/ +/* + * X86-64 cycle counter + */ +#if (defined(__GNUC__) || defined(__ICC) || defined(__SUNPRO_C)) && defined(__x86_64__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long long ticks; + +static __inline__ ticks getticks(void) +{ + unsigned a, d; + asm volatile("rdtsc" : "=a" (a), "=d" (d)); + return ((ticks)a) | (((ticks)d) << 32); +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif + +/* PGI compiler, courtesy Cristiano Calonaci, Andrea Tarsi, & Roberto Gori. + NOTE: this code will fail to link unless you use the -Masmkeyword compiler + option (grrr). */ +#if defined(__PGI) && defined(__x86_64__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long long ticks; +static ticks getticks(void) +{ + asm(" rdtsc; shl $0x20,%rdx; mov %eax,%eax; or %rdx,%rax; "); +} +INLINE_ELAPSED(__inline__) +#define HAVE_TICK_COUNTER +#endif + +/* Visual C++, courtesy of Dirk Michaelis */ +#if _MSC_VER >= 1400 && (defined(_M_AMD64) || defined(_M_X64)) && !defined(HAVE_TICK_COUNTER) + +#include +#pragma intrinsic(__rdtsc) +typedef unsigned __int64 ticks; +#define getticks __rdtsc +INLINE_ELAPSED(__inline) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* + * IA64 cycle counter + */ + +/* intel's icc/ecc compiler */ +#if (defined(__EDG_VERSION) || defined(__ECC)) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long ticks; +#include + +static __inline__ ticks getticks(void) +{ + return __getReg(_IA64_REG_AR_ITC); +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif + +/* gcc */ +#if defined(__GNUC__) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long ticks; + +static __inline__ ticks getticks(void) +{ + ticks ret; + + __asm__ __volatile__ ("mov %0=ar.itc" : "=r"(ret)); + return ret; +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif + +/* HP/UX IA64 compiler, courtesy Teresa L. 
Johnson: */ +#if defined(__hpux) && defined(__ia64) && !defined(HAVE_TICK_COUNTER) +#include +typedef unsigned long ticks; + +static inline ticks getticks(void) +{ + ticks ret; + + ret = _Asm_mov_from_ar (_AREG_ITC); + return ret; +} + +INLINE_ELAPSED(inline) + +#define HAVE_TICK_COUNTER +#endif + +/* Microsoft Visual C++ */ +#if defined(_MSC_VER) && defined(_M_IA64) && !defined(HAVE_TICK_COUNTER) +typedef unsigned __int64 ticks; + +# ifdef __cplusplus +extern "C" +# endif +ticks __getReg(int whichReg); +#pragma intrinsic(__getReg) + +static __inline ticks getticks(void) +{ + volatile ticks temp; + temp = __getReg(3116); + return temp; +} + +INLINE_ELAPSED(inline) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* + * PA-RISC cycle counter + */ +#if defined(__hppa__) || defined(__hppa) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long ticks; + +# ifdef __GNUC__ +static __inline__ ticks getticks(void) +{ + ticks ret; + + __asm__ __volatile__("mfctl 16, %0": "=r" (ret)); + /* no input, nothing else clobbered */ + return ret; +} +# else +# include +static inline unsigned long getticks(void) +{ + register ticks ret; + _MFCTL(16, ret); + return ret; +} +# endif + +INLINE_ELAPSED(inline) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* S390, courtesy of James Treacy */ +#if defined(__GNUC__) && defined(__s390__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long long ticks; + +static __inline__ ticks getticks(void) +{ + ticks cycles; + __asm__("stck 0(%0)" : : "a" (&(cycles)) : "memory", "cc"); + return cycles; +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif +/*----------------------------------------------------------------*/ +#if defined(__GNUC__) && defined(__alpha__) && !defined(HAVE_TICK_COUNTER) +/* + * The 32-bit cycle counter on alpha overflows pretty quickly, + * unfortunately. A 1GHz machine overflows in 4 seconds. 
+ */ +typedef unsigned int ticks; + +static __inline__ ticks getticks(void) +{ + unsigned long cc; + __asm__ __volatile__ ("rpcc %0" : "=r"(cc)); + return (cc & 0xFFFFFFFF); +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +#if defined(__GNUC__) && defined(__sparc_v9__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long ticks; + +static __inline__ ticks getticks(void) +{ + ticks ret; + __asm__ __volatile__("rd %%tick, %0" : "=r" (ret)); + return ret; +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +#if (defined(__DECC) || defined(__DECCXX)) && defined(__alpha) && defined(HAVE_C_ASM_H) && !defined(HAVE_TICK_COUNTER) +# include +typedef unsigned int ticks; + +static __inline ticks getticks(void) +{ + unsigned long cc; + cc = asm("rpcc %v0"); + return (cc & 0xFFFFFFFF); +} + +INLINE_ELAPSED(__inline) + +#define HAVE_TICK_COUNTER +#endif +/*----------------------------------------------------------------*/ +/* SGI/Irix */ +#if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_SGI_CYCLE) && !defined(HAVE_TICK_COUNTER) +typedef struct timespec ticks; + +static inline ticks getticks(void) +{ + struct timespec t; + clock_gettime(CLOCK_SGI_CYCLE, &t); + return t; +} + +static inline double elapsed(ticks t1, ticks t0) +{ + return ((double)t1.tv_sec - (double)t0.tv_sec) * 1.0E9 + + ((double)t1.tv_nsec - (double)t0.tv_nsec); +} +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* Cray UNICOS _rtc() intrinsic function */ +#if defined(HAVE__RTC) && !defined(HAVE_TICK_COUNTER) +#ifdef HAVE_INTRINSICS_H +# include +#endif + +typedef long long ticks; + +#define getticks _rtc + +INLINE_ELAPSED(inline) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* MIPS ZBus */ +#if HAVE_MIPS_ZBUS_TIMER +#if defined(__mips__) && !defined(HAVE_TICK_COUNTER) +#include +#include +#include + +typedef uint64_t ticks; + +static inline ticks getticks(void) +{ + static uint64_t* addr = 0; + + if (addr == 0) + { + uint32_t rq_addr = 0x10030000; + int fd; + int pgsize; + + pgsize = getpagesize(); + fd = open ("/dev/mem", O_RDONLY | O_SYNC, 0); + if (fd < 0) { + perror("open"); + return NULL; + } + addr = mmap(0, pgsize, PROT_READ, MAP_SHARED, fd, rq_addr); + close(fd); + if (addr == (uint64_t *)-1) { + perror("mmap"); + return NULL; + } + } + + return *addr; +} + +INLINE_ELAPSED(inline) + +#define HAVE_TICK_COUNTER +#endif +#endif /* HAVE_MIPS_ZBUS_TIMER */ + diff --git a/Include/intrusive.h b/Include/intrusive.h new file mode 100644 --- /dev/null +++ b/Include/intrusive.h @@ -0,0 +1,62 @@ +/* + * Python/intrusive.h + * + * Definitions for intrusive list. 
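   A usage sketch (illustrative; struct task, queue and t are hypothetical
   names, only the _list_*() helpers and CONTAINER() below are defined here):

       struct task {
           int id;
           struct _list node;
       };

       struct _list queue;
       struct task t;

       _list_init(&queue);
       _list_append(&queue, &t.node);
       if (!_list_empty(&queue)) {
           struct task *first = CONTAINER(struct task, node, queue.next);
           _list_pop(&first->node);
       }

   The node member is the embedded link; CONTAINER() recovers the address of
   the enclosing struct task from a pointer to that link.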
+ */ + + +#ifndef Py_INTRUSIVE_H +#define Py_INTRUSIVE_H +#ifdef __cplusplus +extern "C" { +#endif + + +#define CONTAINER(type, member, ptr) \ + ((type *)((char *)(ptr) - offsetof(type, member))) + +struct _list { + struct _list *next; + struct _list *prev; +}; + +/* Py_LOCAL_INLINE() does not work - see issue bugs.python.org/issue5553 */ + +#ifdef MS_WINDOWS +#define _LOCAL(type) static type +#else +#define _LOCAL(type) static inline type +#endif + +_LOCAL(void) _list_init(struct _list *l) { + l->next = l; + l->prev = l; +} + +_LOCAL(int) _list_empty(struct _list *l) { + return l->next == l; +} + +_LOCAL(void) _list_append(struct _list *l, struct _list *item) { + struct _list *tail = l->prev; + tail->next = item; + item->prev = tail; + item->next = l; + l->prev = item; +} + +_LOCAL(void) _list_pop(struct _list *item) { + item->next->prev = item->prev; + item->prev->next = item->next; + item->prev = NULL; + item->next = NULL; +} + + + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTRUSIVE_H */ + + diff --git a/Include/mutexed.h b/Include/mutexed.h new file mode 100644 --- /dev/null +++ b/Include/mutexed.h @@ -0,0 +1,156 @@ +/* + * Mutex related macros for the BFS scheduler - taken from ceval_gil.h. + */ + +#include + +#ifndef _POSIX_THREADS +/* This means pthreads are not implemented in libc headers, hence the macro + not present in unistd.h. But they still can be implemented as an external + library (e.g. gnu pth in pthread emulation) */ +# ifdef HAVE_PTHREAD_H +# include /* _POSIX_THREADS */ +# endif +#endif + +#ifdef _POSIX_THREADS + +/* + * POSIX support + */ + +#include + +#define ADD_MICROSECONDS(tv, interval) \ +do { \ + tv.tv_usec += (long) interval; \ + tv.tv_sec += tv.tv_usec / 1000000; \ + tv.tv_usec %= 1000000; \ +} while (0) + +/* We assume all modern POSIX systems have gettimeofday() */ +#ifdef GETTIMEOFDAY_NO_TZ +#define GETTIMEOFDAY(ptv) gettimeofday(ptv) +#else +#define GETTIMEOFDAY(ptv) gettimeofday(ptv, (struct timezone *)NULL) +#endif + +#define MUTEX_T pthread_mutex_t +#define MUTEX_INIT(mut) \ + if (pthread_mutex_init(&mut, NULL)) { \ + Py_FatalError("pthread_mutex_init(" #mut ") failed"); }; +#define MUTEX_LOCK(mut) \ + if (pthread_mutex_lock(&mut)) { \ + Py_FatalError("pthread_mutex_lock(" #mut ") failed"); }; +#define MUTEX_UNLOCK(mut) \ + if (pthread_mutex_unlock(&mut)) { \ + Py_FatalError("pthread_mutex_unlock(" #mut ") failed"); }; + +#define COND_T pthread_cond_t +#define COND_INIT(cond) \ + if (pthread_cond_init(&cond, NULL)) { \ + Py_FatalError("pthread_cond_init(" #cond ") failed"); }; +#define COND_DESTROY(cond) \ + if (pthread_cond_destroy(&cond)) { \ + Py_FatalError("pthread_cond_destroy(" #cond ") failed"); }; +#define COND_RESET(cond) +#define COND_SIGNAL(cond) \ + if (pthread_cond_signal(&cond)) { \ + Py_FatalError("pthread_cond_signal(" #cond ") failed"); }; +#define COND_WAIT(cond, mut) \ + if (pthread_cond_wait(&cond, &mut)) { \ + Py_FatalError("pthread_cond_wait(" #cond ") failed"); }; +#define COND_TIMED_WAIT(cond, mut, seconds, timestamp_now, timeout_result) \ + { \ + int r; \ + struct timespec ts; \ + struct timeval deadline; \ + \ + if (timestamp_now == 0) { \ + GETTIMEOFDAY(&deadline); \ + } \ + else { \ + deadline.tv_sec = (int) timestamp_now; \ + deadline.tv_usec = (int) ((timestamp_now - deadline.tv_sec) * 1000000); \ + } \ + ADD_MICROSECONDS(deadline, ((int)(seconds * 1000000))); \ + ts.tv_sec = deadline.tv_sec; \ + ts.tv_nsec = deadline.tv_usec * 1000; \ + \ + r = pthread_cond_timedwait(&cond, &mut, &ts); \ + if (r == ETIMEDOUT) \ + 
timeout_result = 1; \ + else if (r) \ + Py_FatalError("pthread_cond_timedwait(" #cond ") failed"); \ + else \ + timeout_result = 0; \ + } \ + +#elif defined(NT_THREADS) + +/* + * Windows (2000 and later, as well as (hopefully) CE) support + */ + +#include + +/* Use critical section since it is significantly faster than a mutex object. + * It should be enough for the needs of the BFS scheduler since on any + * signaled event there is exactly one waiting thread. However the macros + * are not suited for the general case so the code should probably move + * somewhere else. */ + +#define MUTEX_T CRITICAL_SECTION +#define MUTEX_INIT(mut) \ + InitializeCriticalSectionAndSpinCount(&mut, 4000); +#define MUTEX_LOCK(mut) \ + EnterCriticalSection(&mut); +#define MUTEX_UNLOCK(mut) \ + LeaveCriticalSection(&mut); + +#undef COND_T +#define COND_T HANDLE +#define COND_INIT(cond) \ + /* auto-reset, non-signalled */ \ + if (!(cond = CreateEvent(NULL, FALSE, FALSE, NULL))) { \ + Py_FatalError("CreateMutex(" #cond ") failed"); }; +#define COND_DESTROY(mut) \ + if (!CloseHandle(mut)) { \ + Py_FatalError("CloseHandle(" #mut ") failed"); }; +#define COND_RESET(cond) \ + if (!ResetEvent(cond)) { \ + Py_FatalError("ResetEvent(" #cond ") failed"); }; +#define COND_SIGNAL(cond) \ + if (!SetEvent(cond)) { \ + Py_FatalError("SetEvent(" #cond ") failed"); }; +#define COND_WAIT(cond, mut) \ + { \ + MUTEX_UNLOCK(mut); \ + if (WaitForSingleObject(cond, INFINITE) != WAIT_OBJECT_0) \ + Py_FatalError("WaitForSingleObject(" #cond ") failed"); \ + MUTEX_LOCK(mut); \ + } +#define COND_TIMED_WAIT(cond, mut, seconds, timestamp_now, timeout_result) \ + { \ + DWORD r; \ + MUTEX_UNLOCK(mut); \ + r = WaitForSingleObject(cond, (int)(seconds * 1000)); \ + if (r == WAIT_TIMEOUT) { \ + MUTEX_LOCK(mut); \ + timeout_result = 1; \ + } \ + else if (r != WAIT_OBJECT_0) \ + Py_FatalError("WaitForSingleObject(" #cond ") failed"); \ + else { \ + MUTEX_LOCK(mut); \ + timeout_result = 0; \ + } \ + } + +#else + +#error You need either a POSIX-compatible or a Windows system! + +#endif /* _POSIX_THREADS, NT_THREADS */ + + diff --git a/Include/pystate.h b/Include/pystate.h --- a/Include/pystate.h +++ b/Include/pystate.h @@ -8,6 +8,22 @@ extern "C" { #endif +#include "pythread.h" +#include "intrusive.h" + +/* Temporarily work around cross platform (nt/posix) compilation issues. + * Can't include mutexed.h directly on NT since it includes windows.h + * and breaks compliation solution-wide. */ + +#undef COND_T +#ifdef MS_WINDOWS +typedef void *COND_HANDLE; +#define COND_T COND_HANDLE +#else +#include "mutexed.h" +#endif + + /* State shared between threads */ struct _ts; /* Forward */ @@ -88,8 +104,6 @@ PyObject *dict; /* Stores per-thread state */ - /* XXX doesn't mean anything anymore (the comment below is obsolete) - => deprecate or remove? */ /* tick_counter is incremented whenever the check_interval ticker * reaches zero. The purpose is to give a useful measure of the number * of interpreted bytecode instructions in a given thread. 
This @@ -98,15 +112,21 @@ */ int tick_counter; + COND_T bfs_cond; + struct _list bfs_list; + long double bfs_slice; + long double bfs_deadline; + long double bfs_timestamp; + int gilstate_counter; PyObject *async_exc; /* Asynchronous exception to raise */ long thread_id; /* Thread id where this tstate was created */ /* XXX signal handlers should also be here */ - } PyThreadState; +#define THREAD_STATE(ptr) CONTAINER(PyThreadState, bfs_list, ptr) PyAPI_FUNC(PyInterpreterState *) PyInterpreterState_New(void); PyAPI_FUNC(void) PyInterpreterState_Clear(PyInterpreterState *); diff --git a/Python/bfs.c b/Python/bfs.c new file mode 100644 --- /dev/null +++ b/Python/bfs.c @@ -0,0 +1,570 @@ +/* + * Python/bfs.c + * + * Simplified implementation of the Brain F**k Scheduler by Con Kolivas + * http://ck.kolivas.org/patches/bfs/sched-BFS.txt + * + * Copyright (c) 2010 Nir Aides and individual contributors. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of Nir Aides nor the names of other contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Simplified implementation of the Brain F**k Scheduler by Con Kolivas + * http://ck.kolivas.org/patches/bfs/sched-BFS.txt + * + * "The goal of the Brain F**k Scheduler, referred to as BFS from here on, is to + * completely do away with the complex designs of the past for the cpu process + * scheduler and instead implement one that is very simple in basic design. + * The main focus of BFS is to achieve excellent desktop interactivity and + * responsiveness without heuristics and tuning knobs that are difficult to + * understand, impossible to model and predict the effect of, and when tuned to + * one workload cause massive detriment to another." - Con Kolivas + * + * Notes: + * There are several mechanisms in this implementation which are particular to + * Python: + * + * 1) Clock - Except for Linux, most platforms do not provide API to query + * thread's running time with high precision. Therefore timing is done with + * wall clock. The scheduler seems to behave reasonably with wall clock even + * under load since the OS scheduler does not tend to preempt IO bound + * threads. 
+ * + * 2) Clock precision - Code works best with high precision clock 1ms--. + * With low precision clock, scheduler will become sensitive to tasks which + * are synchronized with the interval size. There is a related discussion + * about this in the BFS design doc under sub-tick accounting. + * + * 3) TSC - When available the code will try to use TSC for timing. TSC is + * high precision and fast to read and eliminates the timestamp reading + * overhead. Other time sources such as HPET typically cost around 1usec per + * call, sometimes even 3usec or more. Code is written such that it is + * generally indifferent to multi-core counter sync and CPU-cycles/sec + * ratio swings. + * + * 3) Slow Python ticks - Querying the clock between Python ticks is too + * expensive, so it is done once every 5000 ticks. However, since even a + * single slow Python tick can deplete a slice (10ms), a special flag exists + * (bfs_check_depleted) which is set by waiting threads and which forces the + * running thread to check its remaining slice by next tick. + * + * 4) Schedule snatching - Allow thread to snatch schedule if the signaled + * next-to-run thread did not take over yet and is less urgent. This + * reduces context switching, for example in case of fast yield-schedule + * cycles, such as with IO call that ends being non-blocking. + * + * 5) Force switch - A thread which yields to BFS may signal the next thread + * but continue to run (e.g. in C library code). If the OS schedules the + * next thread on the same core, it will take a while before it actually gets + * to run. Therefore to prevent this situation, when a thread yields to BFS, + * it will block if the next thread has more urgent deadline, until that + * thread starts running. This improves throughput and latency. + * + * 6) Multi-core clock - On older multi core CPU, dynamic frequency scaling + * may result in unsynchronized timetamp readings between cores. The code + * addresses this problem with by using a "Python" clock created by + * accumulating timestamp (capped) diffs. + * + * 7) Disable thread boost - On Windows code disables thread boosting + * associated with scheduler-condition for CPU bound threads. This prevents + * them from getting higher priority, messing up scheduling across the system. + */ + + +#include "time.h" +#include "mutexed.h" +#include "cycle.h" + + +/* Round robin interval - thread time slice for running (in seconds). + * It is set at 10ms to match Windows tick resolution. */ +static const long double rr_interval = 0.010; + +/* Magic number taken from Con's implementation. There it is computed as + * 1.1 ^ 20 (nice level of regular thread). Basically it controls the load + * beyond which threads tend to expire their deadline. Expired threads are + * handled in FIFO order, which means that at such load IO bound threads + * will no longer preempt running threads. */ +#define DEADLINE_FACTOR 8 + +/* Protect access to data structures. */ +static MUTEX_T bfs_mutex; + +/* List all threads waiting for schedule. */ +static struct _list bfs_rq; + +/* Number of threads waiting for schedule */ +static volatile int bfs_threads_waiting = 0; + +/* Current running thread. */ +static volatile PyThreadState *bfs_thread = NULL; + +/* Mark current thread pointer as opaque so code does not try to access it. + * This is relevant to shutdown. */ +static int bfs_thread_opaque = 0; + +/* Previous running thread. */ +static PyThreadState *bfs_thread_prev = NULL; + +/* Pointer to yielding thread waiting for switch confirmation. 
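   (Editorial aside, illustrative arithmetic only: with rr_interval = 0.010 s
   and DEADLINE_FACTOR = 8 above, a thread whose slice is refilled in
   bfs_schedule() gets

       bfs_slice    = 0.010 s                        ... 10 ms of running time
       bfs_deadline = bfs_python_time + 0.080 s      ... 10 ms * 8

   so once roughly eight CPU bound threads are runnable, deadlines expire
   before a thread gets rescheduled and expired threads are then handled in
   FIFO order, as noted in the DEADLINE_FACTOR comment above.)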
*/ +static volatile PyThreadState *bfs_thread_switch = NULL; + +/* Time to way for urgent thread to reaquire schedule in fast + * yield, schedule scenarios (non-blocking IO). */ +#define SWITCH_BACK_TIMEOUT 0.000012 + +/* Use as cross-multiple-CPU-cores "clock". */ +static volatile long double bfs_python_time = 0; + +/* Number of TSC ticks per second. */ +static long double bfs_tsc_freq = 0; +static int bfs_tsc_disabled = 0; + +/* Flag currently running thread to yield immediately. */ +static int bfs_preempt = 0; + +/* Flag currently running thread to check remaining slice immediately. */ +static int bfs_check_depleted = 0; + +static int bfs_initialized = 0; + +//#define VERBOSE +#undef VERBOSE +#ifdef VERBOSE +#define TRACE(v) printf v +#else +#define TRACE(v) +#endif + + +#ifdef MS_WINDOWS + +#define DISABLE_CPU_BOUND_THREAD_BOOST(tstate) \ + if (tstate == NULL || tstate->bfs_slice <= 0 || tstate->bfs_deadline > bfs_python_time) { \ + SetThreadPriorityBoost(GetCurrentThread(), TRUE); \ + } + +#define RESTORE_THREAD_BOOST() \ + SetThreadPriorityBoost(GetCurrentThread(), FALSE); + +/* Return system wide timestamp in seconds (with usec precision). */ +_LOCAL(long double) get_timestamp(void) { + static LARGE_INTEGER ctrStart; + static long double divisor = 0.0; + LARGE_INTEGER now; + + if (divisor == 0.0) { + LARGE_INTEGER freq; + QueryPerformanceCounter(&ctrStart); + QueryPerformanceFrequency(&freq); + divisor = (long double) freq.QuadPart; + } + + QueryPerformanceCounter(&now); + return (now.QuadPart - ctrStart.QuadPart) / divisor; +} + +#elif defined(HAVE_GETTIMEOFDAY) + +#define DISABLE_CPU_BOUND_THREAD_BOOST(tstate) +#define RESTORE_THREAD_BOOST() + +/* Return system wide timestamp in seconds (with usec precision). */ +_LOCAL(long double) get_timestamp(void) { + struct timeval tv; + GETTIMEOFDAY(&tv); + return (long double) (tv.tv_sec + tv.tv_usec * 0.000001); +} + +#else + +#error You need either a POSIX-compatible or a Windows system! + +#endif /* MS_WINDOWS, HAVE_GETTIMEOFDAY */ + + +#ifdef HAVE_TICK_COUNTER_LARGE_INTEGER +#define GETTICKS() ((long double)getticks().QuadPart) +#elif defined(HAVE_TICK_COUNTER) +#define GETTICKS() ((long double)getticks()) +#else +#define GETTICKS() 0 +#endif + + +/* Calibrate TSC counter frequency. */ +long double bfs_tsc_calibrate0(void) { + long double tk0, tk1, ts0, ts1; + int i; + + tk0 = GETTICKS(); + ts0 = get_timestamp(); + do { + for(i = 0; i < 1000; i++); + } while (get_timestamp() - ts0 < 0.000030); + tk1 = GETTICKS(); + ts1 = get_timestamp(); + return (tk1 - tk0) / (ts1 - ts0); +} + + +/* Calibrate TSC counter frequency. */ +long double bfs_tsc_calibrate1(void) { + long double f0 = bfs_tsc_calibrate0(); + long double f1 = bfs_tsc_calibrate0(); + + if (0.6 * f0 < f1 && f1 < 1.5 * f0) + return (f0 + f1) / 2; + + return 0; +} + + +/* Calibrate TSC counter frequency. */ +void bfs_tsc_calibrate(void) { + if (GETTICKS() != 0) { + bfs_tsc_freq = bfs_tsc_calibrate1(); + if (bfs_tsc_freq != 0) + return; + + bfs_tsc_freq = bfs_tsc_calibrate1(); + if (bfs_tsc_freq != 0) + return; + } + + bfs_tsc_disabled = 1; +} + + +/* Query TSC or fall back to gettimeofday() */ +_LOCAL(long double) get_cpu_timestamp(void) { + if (bfs_tsc_disabled) + return get_timestamp(); + + return GETTICKS() / bfs_tsc_freq; +} + + +/* Lock bfs mutex before fork() to make sure it is not held by an uncopied + * thread */ +void bfs_fork_prepare(void) { + MUTEX_LOCK(bfs_mutex); +} + + +/* Unlock mutex after fork() at parent to allow normal operation. 
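   (For context: bfs_init() below registers the three fork handlers with

       pthread_atfork(bfs_fork_prepare, bfs_fork_parent, bfs_fork_child);

   so the scheduler mutex is held across fork() and, in the child, where only
   the forking thread survives, the run queue is re-initialized before the
   mutex is released.)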
*/ +void bfs_fork_parent(void) { + MUTEX_UNLOCK(bfs_mutex); +} + + +/* Initialize bfs and unlock mutex after fork() at child to allow normal + * operation. */ +void bfs_fork_child(void) { + _list_init(&bfs_rq); + MUTEX_UNLOCK(bfs_mutex); +} + + +static void bfs_init(void) { +#ifdef _POSIX_THREADS + pthread_atfork(bfs_fork_prepare, bfs_fork_parent, bfs_fork_child); +#endif + MUTEX_INIT(bfs_mutex); + _list_init(&bfs_rq); + bfs_tsc_calibrate(); + bfs_initialized = 1; +} + + +/* Pick next thread to schedule based on earliest deadline. */ +static PyThreadState *bfs_find_task(long double python_time) { + struct _list *item; + PyThreadState *task; + + if (_list_empty(&bfs_rq)) + return NULL; + + item = bfs_rq.next; + task = THREAD_STATE(item); + + /* Search for thread with earliest deadline or first expired deadline. + * IO bound threads typically have expired deadlines since they naturally + * take longer than their original deadline to deplete their slice. */ + for (item = item->next; item != &bfs_rq && task->bfs_deadline > python_time; item = item->next) { + PyThreadState *tstate = THREAD_STATE(item); + if (tstate->bfs_deadline < task->bfs_deadline) { + task = tstate; + } + } + + return task; +} + + +/* Wait for currently running thread to signal this thread (tstate) + * to take over the schedule. Call with locked mutex */ +static void _bfs_timed_wait(PyThreadState *tstate, long double timestamp) { + /* Use timeout to guard against slow ticks of running thread, but + * multiply by number of waiting threads to prevent growing number of + * context switches. */ + long double timeout = bfs_threads_waiting * rr_interval * 0.55; + int timed_out = 0; + + COND_TIMED_WAIT(tstate->bfs_cond, bfs_mutex, timeout, timestamp, timed_out); + + /* Flag running thread to check slice depletion immediately. it has possibly + * been doing slow ticks (e.g. regular expression searches) and depleted its + * slice. */ + if (timed_out) { + bfs_check_depleted = 1; + COMPUTE_EVAL_BREAKER(); + } +} + + +/* Diff timestamp capping results to protect against clock differences + * between cores. */ +_LOCAL(long double) _bfs_diff_ts(long double ts1, long double ts0) { + if (ts1 < ts0) + return 0; + + if (ts1 - ts0 > rr_interval) + return rr_interval; + + return ts1 - ts0; +} + + +/* Schedule (tstate) thread for running. */ +static void bfs_schedule(PyThreadState *tstate) { +#ifdef VERBOSE + long double _slice = tstate->bfs_slice; + long double _deadline = tstate->bfs_deadline; +#endif + + int is_urgent; + volatile int spin; + long double ts0; + + MUTEX_LOCK(bfs_mutex); + + /* Disable boost inside mutex due to tstate lifetime trickiness during + * shutdown. */ + DISABLE_CPU_BOUND_THREAD_BOOST(tstate); + + /* Refill depleted slice and reset scheduling deadline. */ + if (tstate->bfs_slice <= 0) { + tstate->bfs_slice = rr_interval; + tstate->bfs_deadline = bfs_python_time + rr_interval * DEADLINE_FACTOR; + } + + TRACE(("SCHD %p - %Lf, pt=%Lf, slice=%Lf, deadline-in=%Lf, deadline-on=%Lf\n", tstate, get_timestamp(), bfs_python_time, _slice, _deadline - bfs_python_time, tstate->bfs_deadline)); + + do { + /* Schedule immediately if no thread is currently running. */ + if (bfs_thread == NULL) + break; + + /* Determine urgency based on deadline. If deadline expired handle + * in FIFO order. */ + is_urgent = !bfs_thread_opaque && + tstate->bfs_deadline <= bfs_thread->bfs_deadline && + bfs_thread->bfs_deadline >= bfs_python_time; + + /* Schedule immediately if next thread (bfs_thread) did not take + * over yet and is less urgent. 
*/ + if (is_urgent && bfs_thread_prev != NULL) + break; + + /* Preempt running thread if it is less urgent. */ + if (is_urgent) { + bfs_preempt = 1; + COMPUTE_EVAL_BREAKER(); + } + + /* Flag running thread to check depletion of its slice. */ + else { + bfs_check_depleted = 1; + COMPUTE_EVAL_BREAKER(); + } + + /* Queue and wait for schedule */ + + _list_append(&bfs_rq, &tstate->bfs_list); + bfs_threads_waiting++; + + COND_RESET(tstate->bfs_cond); + while (bfs_thread != tstate) { + _bfs_timed_wait(tstate, 0); + /* If yielding thread is more urgent give it a chance to + * reschedule by spinning a little. Reduces multi-core switching + * on fast yield-schedule cycles. */ + if (bfs_thread == tstate && bfs_thread_prev != NULL && + tstate->bfs_deadline > bfs_thread_prev->bfs_deadline && + tstate->bfs_deadline >= bfs_python_time) { + MUTEX_UNLOCK(bfs_mutex); + ts0 = get_timestamp(); + do { + for (spin = 500; spin > 0 && bfs_thread == tstate; spin--); + } while (get_timestamp() - ts0 < SWITCH_BACK_TIMEOUT); + MUTEX_LOCK(bfs_mutex); + } + } + + _list_pop(&tstate->bfs_list); + bfs_threads_waiting--; + break; + } while(0); + + bfs_thread = tstate; + bfs_thread_prev = NULL; + + /* Signal previous thread if it is waiting for the switch. */ + if (bfs_thread_switch != NULL) { + COND_SIGNAL(((PyThreadState*)bfs_thread_switch)->bfs_cond); + bfs_thread_switch = NULL; + } + + tstate->bfs_timestamp = get_cpu_timestamp(); + + TRACE(("TAKE %p - %Lf, pt=%Lf, tsc=%Lf\n", tstate, get_timestamp(), bfs_python_time, tstate->bfs_timestamp)); + MUTEX_UNLOCK(bfs_mutex); + RESTORE_THREAD_BOOST(); +} + + +/* Yield schedule to another thread. */ +static void _bfs_yield(PyThreadState *tstate, int fast_reschedule) { + long double interval; + + /* Update running time slice (book keeping). */ + if (!bfs_thread_opaque && tstate != NULL) { + interval = _bfs_diff_ts(get_cpu_timestamp(), tstate->bfs_timestamp); + bfs_python_time += interval; + tstate->bfs_slice -= interval; + } + + DISABLE_CPU_BOUND_THREAD_BOOST(tstate); + MUTEX_LOCK(bfs_mutex); + TRACE(("DROP %p - %Lf, pt=%Lf, tsc=%Lf\n", tstate, get_timestamp(), bfs_python_time, get_cpu_timestamp())); + + /* Reset preemption flags - we heard the shout. */ + bfs_preempt = 0; + bfs_check_depleted = 0; + COMPUTE_EVAL_BREAKER(); + + /* Allow re-grabbing schedule back before next thread in fast + * yield-schedule cycles. */ + if (!bfs_thread_opaque && fast_reschedule) { + bfs_thread_prev = tstate; + } + + /* Find earliest deadline thread to run */ + bfs_thread = bfs_find_task(bfs_python_time); + if (bfs_thread != NULL) { + TRACE(("SGNL %p - %Lf, signal %p\n", tstate, get_timestamp(), bfs_thread)); + COND_SIGNAL(((PyThreadState*)bfs_thread)->bfs_cond); + } + + /* Force yield of OS slice if next thread is more urgent. This + * is required since OS may schedule next thread to run on same core. */ + if (!bfs_thread_opaque && bfs_thread != NULL && + bfs_thread_switch == NULL && tstate != NULL && + (tstate->bfs_slice <= 0 || bfs_thread->bfs_deadline < bfs_python_time || + tstate->bfs_deadline >= bfs_thread->bfs_deadline)) { + bfs_thread_switch = tstate; + COND_RESET(tstate->bfs_cond); + while (bfs_thread_switch == tstate) { + COND_WAIT(tstate->bfs_cond, bfs_mutex); + } + } + + bfs_thread_opaque = 0; + + MUTEX_UNLOCK(bfs_mutex); + RESTORE_THREAD_BOOST(); +} + + +/* Yield schedule to another thread. */ +static void bfs_yield(PyThreadState *tstate) { + _bfs_yield(tstate, 1); +} + + +/* Check if thread depleted its running time slice. 
*/ +_LOCAL(int) bfs_depleted_slice(PyThreadState *tstate) { + /* A reading is about 1 usec on a modern CPU. */ + return tstate->bfs_slice <= _bfs_diff_ts(get_cpu_timestamp(), tstate->bfs_timestamp); +} + + +/* Propose scheduler to switch to a different thread - a la cooperative + * multitasking. */ +_LOCAL(void) bfs_checkpoint(PyThreadState *tstate) { + TRACE(("CHCK %p - %Lf, preempt=%d, check_depleted=%d\n", tstate, get_timestamp(), bfs_preempt, bfs_check_depleted)); + + bfs_check_depleted = 0; + COMPUTE_EVAL_BREAKER(); + + if (bfs_preempt || bfs_depleted_slice(tstate)) { + _bfs_yield(tstate, 0); + bfs_schedule(tstate); + } +} + + +/* Clean up, called when clearing up a Python thread state. */ +void bfs_clear(PyThreadState *tstate) { + if (bfs_initialized == 0) + return; + + MUTEX_LOCK(bfs_mutex); + + /* If attempt to clear current thread, mark it as opaque so it is + * not accessed by other threads attempting to schedule. */ + if (bfs_thread == tstate) { + bfs_thread_opaque = 1; + } + /* If attempt to clear prev-thread, reset it to NULL so it is not accessed + * by other threads. */ + else if (bfs_thread_prev == tstate) { + bfs_thread_prev = NULL; + } + + /* Remove cleared thread from waiting list. */ + if (tstate->bfs_list.next != NULL) { + _list_pop(&tstate->bfs_list); + } + else { + COND_DESTROY(tstate->bfs_cond); + } + + MUTEX_UNLOCK(bfs_mutex); +} + + diff --git a/Python/ceval.c b/Python/ceval.c --- a/Python/ceval.c +++ b/Python/ceval.c @@ -222,22 +222,10 @@ #define COMPUTE_EVAL_BREAKER() \ _Py_atomic_store_relaxed( \ &eval_breaker, \ - _Py_atomic_load_relaxed(&gil_drop_request) | \ + bfs_preempt | bfs_check_depleted | \ _Py_atomic_load_relaxed(&pendingcalls_to_do) | \ pending_async_exc) -#define SET_GIL_DROP_REQUEST() \ - do { \ - _Py_atomic_store_relaxed(&gil_drop_request, 1); \ - _Py_atomic_store_relaxed(&eval_breaker, 1); \ - } while (0) - -#define RESET_GIL_DROP_REQUEST() \ - do { \ - _Py_atomic_store_relaxed(&gil_drop_request, 0); \ - COMPUTE_EVAL_BREAKER(); \ - } while (0) - /* Pending calls are only modified under pending_lock */ #define SIGNAL_PENDING_CALLS() \ do { \ @@ -270,35 +258,43 @@ static PyThread_type_lock pending_lock = 0; /* for pending calls */ static long main_thread = 0; +static int threads_init = 0; /* This single variable consolidates all requests to break out of the fast path in the eval loop. */ static _Py_atomic_int eval_breaker = {0}; -/* Request for dropping the GIL */ -static _Py_atomic_int gil_drop_request = {0}; /* Request for running pending calls. */ static _Py_atomic_int pendingcalls_to_do = {0}; /* Request for looking at the `async_exc` field of the current thread state. Guarded by the GIL. */ static int pending_async_exc = 0; -#include "ceval_gil.h" +#define SCHEDULER_INTERVAL_TICKS 5000 +static volatile int bfs_ticks = SCHEDULER_INTERVAL_TICKS; + +#include "bfs.c" int PyEval_ThreadsInitialized(void) { - return gil_created(); + return threads_init == 1; } void PyEval_InitThreads(void) { - if (gil_created()) - return; - create_gil(); - take_gil(PyThreadState_GET()); + if (PyEval_ThreadsInitialized()) + return; + + /* PyEval_InitThreads() either initializes the scheduler or is called + * by a scheduled thread - "has the lock". 
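   (Illustrative embedding sequence, not part of the patch; under this patch
   the second call performs the bfs_init() and bfs_schedule() seen below
   instead of creating and taking the GIL, as in the removed code above:

       Py_Initialize();
       PyEval_InitThreads();
   )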
*/ + bfs_init(); + bfs_schedule(PyThreadState_GET()); + main_thread = PyThread_get_thread_ident(); if (!pending_lock) pending_lock = PyThread_allocate_lock(); + + threads_init = 1; } void @@ -307,7 +303,7 @@ PyThreadState *tstate = PyThreadState_GET(); if (tstate == NULL) Py_FatalError("PyEval_AcquireLock: current thread state is NULL"); - take_gil(tstate); + bfs_schedule(tstate); } void @@ -317,7 +313,7 @@ We therefore avoid PyThreadState_GET() which dumps a fatal error in debug mode. */ - drop_gil((PyThreadState*)_Py_atomic_load_relaxed( + bfs_yield((PyThreadState*)_Py_atomic_load_relaxed( &_PyThreadState_Current)); } @@ -326,9 +322,11 @@ { if (tstate == NULL) Py_FatalError("PyEval_AcquireThread: NULL new thread state"); + /* Check someone has called PyEval_InitThreads() to create the lock */ - assert(gil_created()); - take_gil(tstate); + assert(PyEval_ThreadsInitialized()); + + bfs_schedule(tstate); if (PyThreadState_Swap(tstate) != NULL) Py_FatalError( "PyEval_AcquireThread: non-NULL old thread state"); @@ -341,7 +339,10 @@ Py_FatalError("PyEval_ReleaseThread: NULL thread state"); if (PyThreadState_Swap(NULL) != tstate) Py_FatalError("PyEval_ReleaseThread: wrong thread state"); - drop_gil(tstate); + + /* Call yield with NULL since tstate not guaranteed to be valid pointer: + * http://docs.python.org/py3k/c-api/init.html?highlight=gil#PyEval_ReleaseThread */ + bfs_yield(NULL); } /* This function is called from PyOS_AfterFork to ensure that newly @@ -355,15 +356,10 @@ PyObject *threading, *result; PyThreadState *tstate = PyThreadState_GET(); - if (!gil_created()) - return; - /*XXX Can't use PyThread_free_lock here because it does too - much error-checking. Doing this cleanly would require - adding a new function to each thread_*.h. Instead, just - create a new lock and waste a little bit of memory */ - recreate_gil(); + if (!PyEval_ThreadsInitialized()) + return; + pending_lock = PyThread_allocate_lock(); - take_gil(tstate); main_thread = PyThread_get_thread_ident(); /* Update the threading module with the new state. @@ -386,8 +382,8 @@ #else static _Py_atomic_int eval_breaker = {0}; -static _Py_atomic_int gil_drop_request = {0}; static int pending_async_exc = 0; + #endif /* WITH_THREAD */ /* This function is used to signal that async exceptions are waiting to be @@ -410,8 +406,9 @@ if (tstate == NULL) Py_FatalError("PyEval_SaveThread: NULL tstate"); #ifdef WITH_THREAD - if (gil_created()) - drop_gil(tstate); + if (PyEval_ThreadsInitialized()) { + bfs_yield(tstate); + } #endif return tstate; } @@ -422,9 +419,9 @@ if (tstate == NULL) Py_FatalError("PyEval_RestoreThread: NULL tstate"); #ifdef WITH_THREAD - if (gil_created()) { + if (PyEval_ThreadsInitialized()) { int err = errno; - take_gil(tstate); + bfs_schedule(tstate); errno = err; } #endif @@ -868,7 +865,10 @@ #define DISPATCH() \ { \ - if (!_Py_atomic_load_relaxed(&eval_breaker)) { \ + if (bfs_threads_waiting) { \ + bfs_ticks--; \ + } \ + if (!_Py_atomic_load_relaxed(&eval_breaker) && bfs_ticks) { \ FAST_DISPATCH(); \ } \ continue; \ @@ -1242,7 +1242,15 @@ async I/O handler); see Py_AddPendingCall() and Py_MakePendingCalls() above. */ - if (_Py_atomic_load_relaxed(&eval_breaker)) { +#ifndef USE_COMPUTED_GOTOS + /* gcc will segfault on an attempt to increment bfs_ticks without + * the if clause. 
*/ + if (bfs_threads_waiting) + bfs_ticks--; +#endif + + if (_Py_atomic_load_relaxed(&eval_breaker) || !bfs_ticks) { + bfs_ticks = SCHEDULER_INTERVAL_TICKS; if (*next_instr == SETUP_FINALLY) { /* Make the last opcode before a try: finally: block uninterruptable. */ @@ -1258,20 +1266,16 @@ goto on_error; } } - if (_Py_atomic_load_relaxed(&gil_drop_request)) { #ifdef WITH_THREAD /* Give another thread a chance */ if (PyThreadState_Swap(NULL) != tstate) Py_FatalError("ceval: tstate mix-up"); - drop_gil(tstate); - - /* Other threads may run now */ - - take_gil(tstate); + + bfs_checkpoint(tstate); + if (PyThreadState_Swap(tstate) != NULL) Py_FatalError("ceval: orphan tstate"); #endif - } /* Check for asynchronous exceptions. */ if (tstate->async_exc != NULL) { x = tstate->async_exc; diff --git a/Python/pystate.c b/Python/pystate.c --- a/Python/pystate.c +++ b/Python/pystate.c @@ -2,6 +2,7 @@ /* Thread and interpreter state structures and their interfaces */ #include "Python.h" +#include "mutexed.h" /* -------------------------------------------------------------------------- CAUTION @@ -24,6 +25,7 @@ #ifdef WITH_THREAD +#include "bfs.h" #include "pythread.h" static PyThread_type_lock head_mutex = NULL; /* Protects interp->tstate_head */ #define HEAD_INIT() (void)(head_mutex || (head_mutex = PyThread_allocate_lock())) @@ -200,6 +202,13 @@ tstate->c_profileobj = NULL; tstate->c_traceobj = NULL; + COND_INIT(tstate->bfs_cond); + tstate->bfs_list.next = NULL; + tstate->bfs_list.prev = NULL; + tstate->bfs_slice = 0; + tstate->bfs_deadline = 0; + tstate->bfs_timestamp = 0; + if (init) _PyThreadState_Init(tstate); @@ -291,6 +300,10 @@ tstate->c_tracefunc = NULL; Py_CLEAR(tstate->c_profileobj); Py_CLEAR(tstate->c_traceobj); + +#ifdef WITH_THREAD + bfs_clear(tstate); +#endif } @@ -347,6 +360,7 @@ #ifdef WITH_THREAD + void PyThreadState_DeleteCurrent() { diff --git a/Python/sysmodule.c b/Python/sysmodule.c --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -490,7 +490,7 @@ static PyObject * sys_setswitchinterval(PyObject *self, PyObject *args) { - double d; + /*double d; if (!PyArg_ParseTuple(args, "d:setswitchinterval", &d)) return NULL; if (d <= 0.0) { @@ -498,7 +498,7 @@ "switch interval must be strictly positive"); return NULL; } - _PyEval_SetSwitchInterval((unsigned long) (1e6 * d)); + _PyEval_SetSwitchInterval((unsigned long) (1e6 * d));*/ Py_INCREF(Py_None); return Py_None; } @@ -518,7 +518,7 @@ static PyObject * sys_getswitchinterval(PyObject *self, PyObject *args) { - return PyFloat_FromDouble(1e-6 * _PyEval_GetSwitchInterval()); + return PyFloat_FromDouble(1000.0);//1e-6 * _PyEval_GetSwitchInterval()); } PyDoc_STRVAR(getswitchinterval_doc,
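A closing usage note (illustrative, not part of the patch): extension code
keeps releasing and re-acquiring the interpreter exactly as it did with the
GIL; with the ceval.c changes above, PyEval_SaveThread() now calls bfs_yield()
and PyEval_RestoreThread() calls bfs_schedule(), so the usual macros route
into the scheduler. The fd and buf names below are placeholders.

    ssize_t n;
    Py_BEGIN_ALLOW_THREADS          /* calls PyEval_SaveThread()    */
    n = read(fd, buf, sizeof(buf)); /* blocking call runs unscheduled */
    Py_END_ALLOW_THREADS            /* calls PyEval_RestoreThread() */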