diff -r 59fb79d0411e Objects/stringlib/codecs.h
--- a/Objects/stringlib/codecs.h	Wed Dec 11 21:26:36 2013 +0200
+++ b/Objects/stringlib/codecs.h	Wed Dec 11 23:52:15 2013 +0200
@@ -718,6 +718,93 @@
     return len - (end - in + 1);
 #endif
 }
+
+#if STRINGLIB_SIZEOF_CHAR == 1
+# define SWAB4(CH, tmp)  ((CH) << 24) /* high bytes are zero */
+#elif STRINGLIB_SIZEOF_CHAR == 2
+# define SWAB4(CH, tmp)  (tmp = (CH), \
+            ((tmp & 0x00FFu) << 24) + ((tmp & 0xFF00u) << 8))
+            /* high bytes are zero */
+#else
+# define SWAB4(CH, tmp)  (tmp = (CH), \
+            tmp = ((tmp & 0x00FF00FFu) << 8) + ((tmp >> 8) & 0x00FF00FFu), \
+            ((tmp & 0x0000FFFFu) << 16) + ((tmp >> 16) & 0x0000FFFFu))
+#endif
+Py_LOCAL_INLINE(Py_ssize_t)
+STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in,
+                        Py_ssize_t len,
+                        PY_UINT32_T **outptr,
+                        int native_ordering)
+{
+    PY_UINT32_T *out = *outptr;
+    const STRINGLIB_CHAR *end = in + len;
+    if (native_ordering) {
+        const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
+        while (in < unrolled_end) {
+#if STRINGLIB_SIZEOF_CHAR > 1
+            /* check if any character is a surrogate character */
+            if (((in[0] ^ 0xd800) &
+                 (in[1] ^ 0xd800) &
+                 (in[2] ^ 0xd800) &
+                 (in[3] ^ 0xd800) & 0xf800) == 0)
+                break;
+#endif
+            out[0] = in[0];
+            out[1] = in[1];
+            out[2] = in[2];
+            out[3] = in[3];
+            in += 4; out += 4;
+        }
+        while (in < end) {
+            Py_UCS4 ch;
+            ch = *in++;
+#if STRINGLIB_SIZEOF_CHAR > 1
+            if (Py_UNICODE_IS_SURROGATE(ch)) {
+                /* reject surrogate characters (U+DC800-U+DFFF) */
+                goto fail;
+            }
+#endif
+            *out++ = ch;
+        }
+    } else {
+        const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
+        while (in < unrolled_end) {
+#if STRINGLIB_SIZEOF_CHAR > 1
+            Py_UCS4 ch1, ch2, ch3, ch4;
+            /* check if any character is a surrogate character */
+            if (((in[0] ^ 0xd800) &
+                 (in[1] ^ 0xd800) &
+                 (in[2] ^ 0xd800) &
+                 (in[3] ^ 0xd800) & 0xf800) == 0)
+                break;
+#endif
+            out[0] = SWAB4(in[0], ch1);
+            out[1] = SWAB4(in[1], ch2);
+            out[2] = SWAB4(in[2], ch3);
+            out[3] = SWAB4(in[3], ch4);
+            in += 4; out += 4;
+        }
+        while (in < end) {
+            Py_UCS4 ch = *in++;
+#if STRINGLIB_SIZEOF_CHAR > 1
+            if (Py_UNICODE_IS_SURROGATE(ch)) {
+                /* reject surrogate characters (U+DC800-U+DFFF) */
+                goto fail;
+            }
+#endif
+            *out++ = SWAB4(ch, ch);
+        }
+    }
+    *outptr = out;
+    return len;
+#if STRINGLIB_SIZEOF_CHAR > 1
+  fail:
+    *outptr = out;
+    return len - (end - in + 1);
+#endif
+}
+#undef SWAB4
+
 #endif
 
 #endif /* STRINGLIB_IS_UNICODE */
diff -r 59fb79d0411e Objects/unicodeobject.c
--- a/Objects/unicodeobject.c	Wed Dec 11 21:26:36 2013 +0200
+++ b/Objects/unicodeobject.c	Wed Dec 11 23:52:15 2013 +0200
@@ -5100,32 +5100,22 @@
                        const char *errors,
                        int byteorder)
 {
-    int kind;
-    void *data;
+    enum PyUnicode_Kind kind;
+    const void *data;
     Py_ssize_t len;
     PyObject *v;
-    unsigned char *p;
-    Py_ssize_t nsize, i;
-    /* Offsets from p for storing byte pairs in the right order. */
+    PY_UINT32_T *out;
 #if PY_LITTLE_ENDIAN
-    int iorder[] = {0, 1, 2, 3};
+    int native_ordering = byteorder <= 0;
 #else
-    int iorder[] = {3, 2, 1, 0};
+    int native_ordering = byteorder >= 0;
 #endif
     const char *encoding;
+    Py_ssize_t nsize, pos;
     PyObject *errorHandler = NULL;
     PyObject *exc = NULL;
     PyObject *rep = NULL;
 
-#define STORECHAR(CH)                           \
-    do {                                        \
-        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
-        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
-        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
-        p[iorder[0]] = (CH) & 0xff;             \
-        p += 4;                                 \
-    } while(0)
-
     if (!PyUnicode_Check(str)) {
         PyErr_BadArgument();
         return NULL;
@@ -5136,59 +5126,53 @@
     data = PyUnicode_DATA(str);
     len = PyUnicode_GET_LENGTH(str);
 
+    if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
+        return PyErr_NoMemory();
     nsize = len + (byteorder == 0);
-    if (nsize > PY_SSIZE_T_MAX / 4)
-        return PyErr_NoMemory();
     v = PyBytes_FromStringAndSize(NULL, nsize * 4);
     if (v == NULL)
         return NULL;
 
-    p = (unsigned char *)PyBytes_AS_STRING(v);
+    /* output buffer is 4-bytes aligned */
+    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
+    out = (PY_UINT32_T *)PyBytes_AS_STRING(v);
     if (byteorder == 0)
-        STORECHAR(0xFEFF);
+        *out++ = 0xFEFF;
     if (len == 0)
-        return v;
-
-    if (byteorder == -1) {
-        /* force LE */
-        iorder[0] = 0;
-        iorder[1] = 1;
-        iorder[2] = 2;
-        iorder[3] = 3;
+        goto done;
+
+    if (byteorder == -1)
         encoding = "utf-32-le";
-    }
-    else if (byteorder == 1) {
-        /* force BE */
-        iorder[0] = 3;
-        iorder[1] = 2;
-        iorder[2] = 1;
-        iorder[3] = 0;
+    else if (byteorder == 1)
         encoding = "utf-32-be";
-    }
     else
         encoding = "utf-32";
 
     if (kind == PyUnicode_1BYTE_KIND) {
-        for (i = 0; i < len; i++)
-            STORECHAR(PyUnicode_READ(kind, data, i));
-        return v;
-    }
-
-    for (i = 0; i < len;) {
+        ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
+        goto done;
+    }
+
+    pos = 0;
+    while (pos < len) {
         Py_ssize_t repsize, moreunits;
-        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
-        i++;
-        assert(ch <= MAX_UNICODE);
-        if (!Py_UNICODE_IS_SURROGATE(ch)) {
-            STORECHAR(ch);
-            continue;
-        }
+
+        if (kind == PyUnicode_2BYTE_KIND) {
+            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
+                                        &out, native_ordering);
+        }
+        else {
+            assert(kind == PyUnicode_4BYTE_KIND);
+            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
+                                        &out, native_ordering);
+        }
+        if (pos == len)
+            break;
 
         rep = unicode_encode_call_errorhandler(
                 errors, &errorHandler,
                 encoding, "surrogates not allowed",
-                str, &exc, i-1, i, &i);
-
+                str, &exc, pos, pos + 1, &pos);
         if (!rep)
             goto error;
 
@@ -5196,7 +5180,7 @@
             repsize = PyBytes_GET_SIZE(rep);
             if (repsize & 3) {
                 raise_encode_exception(&exc, encoding,
-                                       str, i - 1, i,
+                                       str, pos - 1, pos,
                                        "surrogates not allowed");
                 goto error;
             }
@@ -5209,7 +5193,7 @@
             moreunits = repsize = PyUnicode_GET_LENGTH(rep);
             if (!PyUnicode_IS_ASCII(rep)) {
                 raise_encode_exception(&exc, encoding,
-                                       str, i - 1, i,
+                                       str, pos - 1, pos,
                                        "surrogates not allowed");
                 goto error;
             }
@@ -5217,7 +5201,7 @@
 
         /* four bytes are reserved for each surrogate */
         if (moreunits > 1) {
-            Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v);
+            Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v);
             Py_ssize_t morebytes = 4 * (moreunits - 1);
             if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
                 /* integer overflow */
@@ -5226,20 +5210,16 @@
             }
             if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
                 goto error;
-            p = (unsigned char*) PyBytes_AS_STRING(v) + outpos;
+            out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos;
         }
 
         if (PyBytes_Check(rep)) {
-            Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize);
-            p += repsize;
+            Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
+            out += moreunits;
         } else /* rep is unicode */ {
-            const Py_UCS1 *repdata;
             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
-            repdata = PyUnicode_1BYTE_DATA(rep);
-            while (repsize--) {
-                Py_UCS4 ch = *repdata++;
-                STORECHAR(ch);
-            }
+            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
+                                 &out, native_ordering);
         }
 
         Py_CLEAR(rep);
@@ -5248,11 +5228,12 @@
     /* Cut back to size actually needed. This is necessary for, for example,
        encoding of a string containing isolated surrogates and the 'ignore'
        handler is used. */
-    nsize = p - (unsigned char*) PyBytes_AS_STRING(v);
+    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
     if (nsize != PyBytes_GET_SIZE(v))
       _PyBytes_Resize(&v, nsize);
     Py_XDECREF(errorHandler);
     Py_XDECREF(exc);
+  done:
     return v;
   error:
     Py_XDECREF(rep);
@@ -5260,7 +5241,6 @@
     Py_XDECREF(exc);
     Py_XDECREF(v);
     return NULL;
-#undef STORECHAR
 }
 
 PyObject *