diff -r 36cd1cf5a160 Include/unicodeobject.h --- a/Include/unicodeobject.h Wed Jun 06 15:57:18 2012 +0200 +++ b/Include/unicodeobject.h Thu Jun 07 14:54:22 2012 +0300 @@ -188,9 +188,9 @@ (((((Py_UCS4)(high) & 0x03FF) << 10) | \ ((Py_UCS4)(low) & 0x03FF)) + 0x10000) /* high surrogate = top 10 bits added to D800 */ -#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800 | (((ch) - 0x10000) >> 10)) +#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800 - (0x10000 >> 10) + ((ch) >> 10)) /* low surrogate = bottom 10 bits added to DC00 */ -#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 | (((ch) - 0x10000) & 0x3FF)) +#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 + ((ch) & 0x3FF)) /* Check if substring matches at given offset. The offset must be valid, and the substring must not be empty. */ diff -r 36cd1cf5a160 Objects/stringlib/codecs.h --- a/Objects/stringlib/codecs.h Wed Jun 06 15:57:18 2012 +0200 +++ b/Objects/stringlib/codecs.h Thu Jun 07 14:54:22 2012 +0300 @@ -562,4 +562,68 @@ #undef STRIPPED_MASK #undef SWAB #undef LONG_PTR_MASK + + +Py_LOCAL_INLINE(void) +STRINGLIB(utf16_encode)(unsigned short *out, + const STRINGLIB_CHAR *in, + Py_ssize_t len, + int native_ordering) +{ + const STRINGLIB_CHAR *end = in + len; +#if STRINGLIB_SIZEOF_CHAR == 1 +# define SWAB2(CH) ((CH) << 8) +#else +# define SWAB2(CH) (((CH) << 8) | ((CH) >> 8)) +#endif +#if STRINGLIB_MAX_CHAR < 0x10000 + if (native_ordering) { +# if STRINGLIB_SIZEOF_CHAR == 2 + Py_MEMCPY(out, in, 2 * len); +# else + _PyUnicode_CONVERT_BYTES(STRINGLIB_CHAR, unsigned short, in, end, out); +# endif + } else { + const STRINGLIB_CHAR *unrolled_end = in + (len & ~ (Py_ssize_t) 3); + while (in < unrolled_end) { + out[0] = SWAB2(in[0]); + out[1] = SWAB2(in[1]); + out[2] = SWAB2(in[2]); + out[3] = SWAB2(in[3]); + in += 4; out += 4; + } + while (in < end) { + *out++ = SWAB2(*in); + ++in; + } + } +#else + if (native_ordering) { + while (in < end) { + Py_UCS4 ch = *in++; + if (ch < 0x10000) + *out++ = ch; + else { + out[0] = Py_UNICODE_HIGH_SURROGATE(ch); + out[1] = Py_UNICODE_LOW_SURROGATE(ch); + out += 2; + } + } + } else { + while (in < end) { + Py_UCS4 ch = *in++; + if (ch < 0x10000) + *out++ = SWAB2((Py_UCS2)ch); + else { + Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch); + Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch); + out[0] = SWAB2(ch1); + out[1] = SWAB2(ch2); + out += 2; + } + } + } +#endif +#undef SWAB2 +} #endif /* STRINGLIB_IS_UNICODE */ diff -r 36cd1cf5a160 Objects/unicodeobject.c --- a/Objects/unicodeobject.c Wed Jun 06 15:57:18 2012 +0200 +++ b/Objects/unicodeobject.c Thu Jun 07 14:54:22 2012 +0300 @@ -5359,26 +5359,18 @@ const char *errors, int byteorder) { - int kind; - void *data; + enum PyUnicode_Kind kind; + const void *data; Py_ssize_t len; PyObject *v; - unsigned char *p; - Py_ssize_t nsize, bytesize; - Py_ssize_t i, pairs; - /* Offsets from p for storing byte pairs in the right order. */ -#ifdef BYTEORDER_IS_LITTLE_ENDIAN - int ihi = 1, ilo = 0; + unsigned short *out; + Py_ssize_t bytesize; + Py_ssize_t pairs; +#ifdef WORDS_BIGENDIAN + int native_ordering = byteorder >= 0; #else - int ihi = 0, ilo = 1; -#endif - -#define STORECHAR(CH) \ - do { \ - p[ihi] = ((CH) >> 8) & 0xff; \ - p[ilo] = (CH) & 0xff; \ - p += 2; \ - } while(0) + int native_ordering = byteorder <= 0; +#endif if (!PyUnicode_Check(str)) { PyErr_BadArgument(); @@ -5391,53 +5383,47 @@ len = PyUnicode_GET_LENGTH(str); pairs = 0; - if (kind == PyUnicode_4BYTE_KIND) - for (i = 0; i < len; i++) - if (PyUnicode_READ(kind, data, i) >= 0x10000) + if (kind == PyUnicode_4BYTE_KIND) { + const Py_UCS4 *in = (const Py_UCS4 *)data; + const Py_UCS4 *end = in + len; + while (in < end) + if (*in++ >= 0x10000) pairs++; - /* 2 * (len + pairs + (byteorder == 0)) */ - if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0)) + } + if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) return PyErr_NoMemory(); - nsize = len + pairs + (byteorder == 0); - bytesize = nsize * 2; - if (bytesize / 2 != nsize) - return PyErr_NoMemory(); + bytesize = (len + pairs + (byteorder == 0)) * 2; v = PyBytes_FromStringAndSize(NULL, bytesize); if (v == NULL) return NULL; - p = (unsigned char *)PyBytes_AS_STRING(v); + /* output buffer is 2-bytes aligned */ + assert((Py_uintptr_t)PyBytes_AS_STRING(v) & 1 == 0); + out = (unsigned short *)PyBytes_AS_STRING(v); if (byteorder == 0) - STORECHAR(0xFEFF); + *out++ = 0xFEFF; if (len == 0) goto done; - if (byteorder == -1) { - /* force LE */ - ihi = 1; - ilo = 0; - } - else if (byteorder == 1) { - /* force BE */ - ihi = 0; - ilo = 1; - } - - for (i = 0; i < len; i++) { - Py_UCS4 ch = PyUnicode_READ(kind, data, i); - Py_UCS4 ch2 = 0; - if (ch >= 0x10000) { - ch2 = Py_UNICODE_LOW_SURROGATE(ch); - ch = Py_UNICODE_HIGH_SURROGATE(ch); - } - STORECHAR(ch); - if (ch2) - STORECHAR(ch2); + switch (kind) { + case PyUnicode_1BYTE_KIND: { + ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering); + break; + } + case PyUnicode_2BYTE_KIND: { + ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering); + break; + } + case PyUnicode_4BYTE_KIND: { + ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering); + break; + } + default: + assert(0); } done: return v; -#undef STORECHAR } PyObject *