Index: Objects/unicodeobject.c
===================================================================
--- Objects/unicodeobject.c	(revision 68145)
+++ Objects/unicodeobject.c	(working copy)
@@ -2001,6 +2001,13 @@
     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
 }
 
+#define LONG_MASK ((size_t) (SIZEOF_LONG - 1))  /* sub-long address bits */
+#if (SIZEOF_LONG == 8)  /* ASCII_MASK: high bit of every byte in a long */
+#define ASCII_MASK 0x8080808080808080UL
+#else  /* NOTE(review): this branch assumes SIZEOF_LONG == 4 */
+#define ASCII_MASK 0x80808080UL
+#endif
+
 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
 			                Py_ssize_t size,
 			                const char *errors,
@@ -2011,7 +2018,7 @@
     Py_ssize_t startinpos;
     Py_ssize_t endinpos;
     Py_ssize_t outpos;
-    const char *e;
+    const char *e, *aligned_end;
     PyUnicodeObject *unicode;
     Py_UNICODE *p;
     const char *errmsg = "";
@@ -2032,11 +2039,43 @@
     /* Unpack UTF-8 encoded data */
     p = unicode->str;
     e = s + size;
+    aligned_end = (const char *) ((size_t) e & ~LONG_MASK); /* e rounded down to long alignment */
 
     while (s < e) {
         Py_UCS4 ch = (unsigned char)*s;
 
         if (ch < 0x80) {
+            /* Fast path for ASCII runs: scan one aligned long at a time. */
+            if (!((size_t) s & LONG_MASK)) {
+                /* Local copies of s and p to help register allocation */
+                register const char *_s = s;
+                register Py_UNICODE *_p = p;
+                while (_s < aligned_end) {
+                    unsigned long word = *(const unsigned long *) _s;
+                    if (word & ASCII_MASK) /* some byte is >= 0x80 */
+                        break;
+                    _p[0] = (unsigned char) _s[0];
+                    _p[1] = (unsigned char) _s[1];
+                    _p[2] = (unsigned char) _s[2];
+                    _p[3] = (unsigned char) _s[3];
+#if (SIZEOF_LONG == 8)
+                    _p[4] = (unsigned char) _s[4];
+                    _p[5] = (unsigned char) _s[5];
+                    _p[6] = (unsigned char) _s[6];
+                    _p[7] = (unsigned char) _s[7];
+#endif
+                    _s += SIZEOF_LONG;
+                    _p += SIZEOF_LONG;
+                }
+                s = _s;
+                p = _p;
+                if (s == e) /* whole input consumed */
+                    break;
+                ch = (unsigned char)*s; /* reload: s may have advanced */
+            }
+        }
+
+        if (ch < 0x80) {
             *p++ = (Py_UNICODE)ch;
             s++;
             continue;
@@ -2169,6 +2208,7 @@
 	     &starts, &e, &startinpos, &endinpos, &exc, &s,
 	     &unicode, &outpos, &p))
 	goto onError;
+	aligned_end = (const char *) ((size_t) e & ~LONG_MASK); /* &e was passed above; e may differ */
     }
     if (consumed)
 	*consumed = s-starts;
@@ -2188,6 +2228,9 @@
     return NULL;
 }
 
+#undef LONG_MASK  /* drop the fast-path helper macros from here on */
+#undef ASCII_MASK
+
 /* Allocation strategy:  if the string is short, convert into a stack buffer
    and allocate exactly as much space needed at the end.  Else allocate the
    maximum possible needed (4 result bytes per Unicode character), and return