diff -r bfe042a363e6 Objects/stringlib/codecs.h
--- a/Objects/stringlib/codecs.h	Sat Oct 20 20:13:42 2012 +1000
+++ b/Objects/stringlib/codecs.h	Sat Oct 20 20:48:29 2012 +0300
@@ -626,4 +626,40 @@
 #endif
 #undef SWAB2
 }
+
+/* Encode len code points into 32-bit units, swapped if !native_ordering. */
+Py_LOCAL_INLINE(void)
+STRINGLIB(utf32_encode)(PY_UINT32_T *out,
+                        const STRINGLIB_CHAR *in,
+                        Py_ssize_t len,
+                        int native_ordering)
+{
+    const STRINGLIB_CHAR *end = in + len;
+    if (native_ordering) {
+#if STRINGLIB_SIZEOF_CHAR == 4
+        Py_MEMCPY(out, in, 4 * len);  /* same unit size: raw copy */
+#else
+        _PyUnicode_CONVERT_BYTES(STRINGLIB_CHAR, PY_UINT32_T, in, end, out);
+#endif
+    } else {  /* opposite byte order: reverse the 4 bytes of every unit */
+        const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
+#define SWAB4(CH, tmp)  (tmp = (CH), \
+            tmp = ((tmp & 0x00FF00FFu) << 8) + ((tmp >> 8) & 0x00FF00FFu), \
+            ((tmp & 0x0000FFFFu) << 16) + ((tmp >> 16) & 0x0000FFFFu))
+        while (in < unrolled_end) {  /* four units per iteration */
+            Py_UCS4 ch1, ch2, ch3, ch4;  /* scratch temporaries for SWAB4 */
+            out[0] = SWAB4(in[0], ch1);
+            out[1] = SWAB4(in[1], ch2);
+            out[2] = SWAB4(in[2], ch3);
+            out[3] = SWAB4(in[3], ch4);
+            in += 4;
+            out += 4;
+        }
+        while (in < end) {  /* units left over after the unrolled loop */
+            Py_UCS4 ch;
+            *out++ = SWAB4(*in++, ch);
+        }
+#undef SWAB4
+    }
+}
 #endif /* STRINGLIB_IS_UNICODE */
diff -r bfe042a363e6 Objects/unicodeobject.c
--- a/Objects/unicodeobject.c	Sat Oct 20 20:13:42 2012 +1000
+++ b/Objects/unicodeobject.c	Sat Oct 20 20:48:29 2012 +0300
@@ -4934,27 +4934,17 @@
                        const char *errors,
                        int byteorder)
 {
-    int kind;
-    void *data;
+    enum PyUnicode_Kind kind;
+    const void *data;
     Py_ssize_t len;
     PyObject *v;
-    unsigned char *p;
-    Py_ssize_t nsize, i;
-    /* Offsets from p for storing byte pairs in the right order. */
-#if PY_LITTLE_ENDIAN
-    int iorder[] = {0, 1, 2, 3};
+    PY_UINT32_T *out;  /* write cursor into the result buffer */
+    Py_ssize_t bytesize;  /* size of the result in bytes */
+#ifdef WORDS_BIGENDIAN
+    int native_ordering = byteorder >= 0;  /* 0 = native, 1 = big-endian */
 #else
-    int iorder[] = {3, 2, 1, 0};
-#endif
-
-#define STORECHAR(CH)                           \
-    do {                                        \
-        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
-        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
-        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
-        p[iorder[0]] = (CH) & 0xff;             \
-        p += 4;                                 \
-    } while(0)
+    int native_ordering = byteorder <= 0;  /* 0 = native, -1 = little-endian */
+#endif
 
     if (!PyUnicode_Check(str)) {
         PyErr_BadArgument();
@@ -4966,40 +4956,37 @@
     data = PyUnicode_DATA(str);
     len = PyUnicode_GET_LENGTH(str);
 
-    nsize = len + (byteorder == 0);
-    if (nsize > PY_SSIZE_T_MAX / 4)
+    if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))  /* overflow guard */
         return PyErr_NoMemory();
-    v = PyBytes_FromStringAndSize(NULL, nsize * 4);
+    bytesize = (len + (byteorder == 0)) * 4;  /* one extra unit for the BOM */
+    v = PyBytes_FromStringAndSize(NULL, bytesize);
     if (v == NULL)
         return NULL;
 
-    p = (unsigned char *)PyBytes_AS_STRING(v);
+    /* The bytes payload must be 4-byte aligned to be written as PY_UINT32_T. */
+    assert(((Py_uintptr_t)PyBytes_AS_STRING(v) & 3) == 0);
+    out = (PY_UINT32_T *)PyBytes_AS_STRING(v);
     if (byteorder == 0)
-        STORECHAR(0xFEFF);
+        *out++ = 0xFEFF;  /* BOM, stored in native byte order */
     if (len == 0)
         goto done;
 
-    if (byteorder == -1) {
-        /* force LE */
-        iorder[0] = 0;
-        iorder[1] = 1;
-        iorder[2] = 2;
-        iorder[3] = 3;
-    }
-    else if (byteorder == 1) {
-        /* force BE */
-        iorder[0] = 3;
-        iorder[1] = 2;
-        iorder[2] = 1;
-        iorder[3] = 0;
-    }
-
-    for (i = 0; i < len; i++)
-        STORECHAR(PyUnicode_READ(kind, data, i));
-
-  done:
+    switch (kind) {  /* dispatch on kind: 1-, 2- or 4-byte input units */
+    case PyUnicode_1BYTE_KIND:
+        ucs1lib_utf32_encode(out, (const Py_UCS1 *)data, len, native_ordering);
+        break;
+    case PyUnicode_2BYTE_KIND:
+        ucs2lib_utf32_encode(out, (const Py_UCS2 *)data, len, native_ordering);
+        break;
+    case PyUnicode_4BYTE_KIND:
+        ucs4lib_utf32_encode(out, (const Py_UCS4 *)data, len, native_ordering);
+        break;
+    default:
+        assert(0);  /* no other kind is handled here */
+    }
+
+done:
     return v;
-#undef STORECHAR
 }
 
 PyObject *