diff -r 0a9143d7b097 Objects/stringlib/asciilib.h
--- a/Objects/stringlib/asciilib.h	Thu May 03 13:43:07 2012 +0200
+++ b/Objects/stringlib/asciilib.h	Thu May 03 15:50:11 2012 +0300
@@ -7,6 +7,7 @@
 #define STRINGLIB(F)             asciilib_##F
 #define STRINGLIB_OBJECT         PyUnicodeObject
 #define STRINGLIB_SIZEOF_CHAR    1
+#define STRINGLIB_MAX_CHAR       0x7Fu
 #define STRINGLIB_CHAR           Py_UCS1
 #define STRINGLIB_TYPE_NAME      "unicode"
 #define STRINGLIB_PARSE_CODE     "U"
diff -r 0a9143d7b097 Objects/stringlib/codecs.h
--- a/Objects/stringlib/codecs.h	Thu May 03 13:43:07 2012 +0200
+++ b/Objects/stringlib/codecs.h	Thu May 03 15:50:11 2012 +0300
@@ -150,7 +150,6 @@
     return ret;
 }
 
-#undef LONG_PTR_MASK
 #undef ASCII_CHAR_MASK
 
 
@@ -350,4 +349,184 @@
 #undef MAX_SHORT_UNICHARS
 }
 
+/* The fast path of the UTF-16 decoder below reads the input one C 'long'
+   at a time and unpacks either 2 or 4 code units from it. */
+#if SIZEOF_LONG != 4 && SIZEOF_LONG != 8
+# error C 'long' size should be either 4 or 8!
+#endif
+
+/* The pattern for constructing UCS2-repeated masks: 0x0001 in every
+   16-bit lane of a C 'long'. */
+#define UCS2_REPEAT_MASK        (~0ul / 0xFFFFul)
+
+/* The mask for fast checking of a whole C 'long' of UTF-16 code units. */
+#if STRINGLIB_SIZEOF_CHAR == 1
+/* Flags every code unit greater than STRINGLIB_MAX_CHAR, i.e. every
+   character that does not fit into the 1-byte output buffer. */
+# define FAST_CHAR_MASK         (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
+#else
+/* Flags code units that may be surrogates.  This is an efficient
+   heuristic, assuming that non-surrogate characters with a code point
+   >= 0x8000 are rare in most input. */
+# define FAST_CHAR_MASK         (UCS2_REPEAT_MASK * 0x8000u)
+#endif
+/* The mask for fast byteswapping. */
+#define STRIPPED_MASK           (UCS2_REPEAT_MASK * 0x00FFu)
+/* Swap the two bytes of every 16-bit lane. */
+#define SWAB(value)             ((((value) >> 8) & STRIPPED_MASK) | \
+                                 (((value) & STRIPPED_MASK) << 8))
+
+/* Decode UTF-16 code units from *inptr into dest, starting at index
+   *outpos, for as long as the characters fit into STRINGLIB_CHAR.
+
+   e must point to the LAST byte of the input, not one past it: a code
+   unit is only read while q < e, i.e. while two bytes are available.
+   native_ordering is non-zero if the input is in the byte order of the
+   host.  No bounds checking is done on dest.
+
+   *inptr and *outpos are updated before returning.  Return value:
+     0      no complete code unit is left in the input
+     1      a surrogate is followed by less than two bytes (unexpected
+            end of data); *inptr points past the surrogate
+     2      a lone low surrogate (illegal encoding); *inptr points past it
+     3      a high surrogate followed by a non-low-surrogate (illegal
+            UTF-16 surrogate); *inptr points past both code units
+     other  a decoded character too wide for STRINGLIB_CHAR; it has been
+            consumed from the input but NOT stored into dest.  It is
+            always greater than STRINGLIB_MAX_CHAR, so it can not be
+            confused with the status codes above.
+*/
+Py_LOCAL_INLINE(Py_UCS4)
+STRINGLIB(utf16_try_decode)(STRINGLIB_CHAR *dest, Py_ssize_t *outpos,
+                        const unsigned char **inptr,
+                        const unsigned char *e,
+                        int native_ordering)
+{
+    const unsigned char *aligned_end =
+            (const unsigned char *) ((size_t) (e + 1) & ~LONG_PTR_MASK);
+    const unsigned char *q = *inptr;
+    STRINGLIB_CHAR *p = dest + *outpos;
+    /* Offsets from q for retrieving byte pairs in the right order. */
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+    int ihi = !!native_ordering, ilo = !native_ordering;
+#else
+    int ihi = !native_ordering, ilo = !!native_ordering;
+#endif
+
+    while (q < e) {
+        Py_UCS4 ch;
+        /* First check for possible aligned read of a C 'long'. Unaligned
+           reads are more expensive, better to defer to another iteration. */
+        if (!((size_t) q & LONG_PTR_MASK)) {
+            /* Fast path for runs of chars that can be stored as is. */
+            register const unsigned char *_q = q;
+            while (_q < aligned_end) {
+                unsigned long block = * (unsigned long *) _q;
+                /* Fast checking of whether the whole C 'long' can be
+                   copied as is: FAST_CHAR_MASK flags characters too
+                   wide for the output buffer (1-byte kinds) or possible
+                   surrogates (2- and 4-byte kinds).
+                */
+                if (native_ordering) {
+                    /* Can use buffer directly */
+                    if (block & FAST_CHAR_MASK)
+                        break;
+                }
+                else {
+                    /* Need to byte-swap */
+                    if (block & SWAB(FAST_CHAR_MASK))
+                        break;
+#if STRINGLIB_SIZEOF_CHAR == 1
+                    /* Only the low byte of each code unit is stored, so
+                       shifting it into place replaces a full swap. */
+                    block >>= 8;
+#else
+                    block = SWAB(block);
+#endif
+                }
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+#if SIZEOF_LONG == 4
+                *(p + 0) = (STRINGLIB_CHAR)(block & 0xFFFFu);
+                *(p + 1) = (STRINGLIB_CHAR)(block >> 16);
+#endif
+#if SIZEOF_LONG == 8
+                *(p + 0) = (STRINGLIB_CHAR)(block & 0xFFFFu);
+                *(p + 1) = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
+                *(p + 2) = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
+                *(p + 3) = (STRINGLIB_CHAR)(block >> 48);
+#endif
+#else
+#if SIZEOF_LONG == 4
+                *(p + 0) = (STRINGLIB_CHAR)(block >> 16);
+                *(p + 1) = (STRINGLIB_CHAR)(block & 0xFFFFu);
+#endif
+#if SIZEOF_LONG == 8
+                *(p + 0) = (STRINGLIB_CHAR)(block >> 48);
+                *(p + 1) = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
+                *(p + 2) = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
+                *(p + 3) = (STRINGLIB_CHAR)(block & 0xFFFFu);
+#endif
+#endif
+                _q += SIZEOF_LONG;
+                p += SIZEOF_LONG / 2;
+            }
+            q = _q;
+            if (q >= e)
+                break;
+        }
+        ch = (q[ihi] << 8) | q[ilo];
+        q += 2;
+#if STRINGLIB_SIZEOF_CHAR == 1
+        if (ch <= STRINGLIB_MAX_CHAR) {
+            *p++ = (STRINGLIB_CHAR)ch;
+            continue;
+        }
+#endif
+        if (!Py_UNICODE_IS_SURROGATE(ch)) {
+#if STRINGLIB_SIZEOF_CHAR >= 2
+            *p++ = (STRINGLIB_CHAR)ch;
+            continue;
+#else
+            *inptr = q;
+            *outpos = p - dest;
+            return ch;
+#endif
+        }
+        /* UTF-16 code pair: */
+        if (q < e) {
+            if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
+                Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
+                q += 2;
+                if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
+                    ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
+#if STRINGLIB_SIZEOF_CHAR == 4
+                    *p++ = (STRINGLIB_CHAR)ch;
+                    continue;
+#else
+                    *inptr = q;
+                    *outpos = p - dest;
+                    return ch;
+#endif
+                }
+                *inptr = q;
+                *outpos = p - dest;
+                return 3; /* illegal UTF-16 surrogate */
+            }
+            *inptr = q;
+            *outpos = p - dest;
+            return 2; /* illegal encoding */
+        }
+        *inptr = q;
+        *outpos = p - dest;
+        return 1; /* unexpected end of data */
+    }
+    *inptr = q;
+    *outpos = p - dest;
+    return 0;
+}
+#undef UCS2_REPEAT_MASK
+#undef FAST_CHAR_MASK
+#undef STRIPPED_MASK
+#undef SWAB
+#undef LONG_PTR_MASK
 #endif /* STRINGLIB_IS_UNICODE */
diff -r 0a9143d7b097 Objects/stringlib/ucs1lib.h
--- a/Objects/stringlib/ucs1lib.h	Thu May 03 13:43:07 2012 +0200
+++ b/Objects/stringlib/ucs1lib.h	Thu May 03 15:50:11 2012 +0300
@@ -7,6 +7,7 @@
 #define STRINGLIB(F)             ucs1lib_##F
 #define STRINGLIB_OBJECT         PyUnicodeObject
 #define STRINGLIB_SIZEOF_CHAR    1
+#define STRINGLIB_MAX_CHAR       0xFFu
 #define STRINGLIB_CHAR           Py_UCS1
 #define STRINGLIB_TYPE_NAME      "unicode"
 #define STRINGLIB_PARSE_CODE     "U"
diff -r 0a9143d7b097 Objects/stringlib/ucs2lib.h
--- a/Objects/stringlib/ucs2lib.h	Thu May 03 13:43:07 2012 +0200
+++ b/Objects/stringlib/ucs2lib.h	Thu May 03 15:50:11 2012 +0300
@@ -7,6 +7,7 @@
 #define STRINGLIB(F)             ucs2lib_##F
 #define STRINGLIB_OBJECT         PyUnicodeObject
 #define STRINGLIB_SIZEOF_CHAR    2
+#define STRINGLIB_MAX_CHAR       0xFFFFu
 #define STRINGLIB_CHAR           Py_UCS2
 #define STRINGLIB_TYPE_NAME      "unicode"
 #define STRINGLIB_PARSE_CODE     "U"
diff -r 0a9143d7b097 Objects/stringlib/ucs4lib.h
--- a/Objects/stringlib/ucs4lib.h	Thu May 03 13:43:07 2012 +0200
+++ b/Objects/stringlib/ucs4lib.h	Thu May 03 15:50:11 2012 +0300
@@ -7,6 +7,7 @@
 #define STRINGLIB(F)             ucs4lib_##F
 #define STRINGLIB_OBJECT         PyUnicodeObject
 #define STRINGLIB_SIZEOF_CHAR    4
+#define STRINGLIB_MAX_CHAR       0x10FFFFu
 #define STRINGLIB_CHAR           Py_UCS4
 #define STRINGLIB_TYPE_NAME      "unicode"
 #define STRINGLIB_PARSE_CODE     "U"
diff -r 0a9143d7b097 Objects/stringlib/undef.h
--- a/Objects/stringlib/undef.h	Thu May 03 13:43:07 2012 +0200
+++ b/Objects/stringlib/undef.h	Thu May 03 15:50:11 2012 +0300
@@ -1,6 +1,7 @@
 #undef FASTSEARCH
 #undef STRINGLIB
 #undef STRINGLIB_SIZEOF_CHAR
+#undef STRINGLIB_MAX_CHAR
 #undef STRINGLIB_CHAR
 #undef STRINGLIB_STR
 #undef STRINGLIB_LEN
diff -r 0a9143d7b097 Objects/unicodeobject.c
--- a/Objects/unicodeobject.c	Thu May 03 13:43:07 2012 +0200
+++ b/Objects/unicodeobject.c	Thu May 03 15:50:11 2012 +0300
@@ -4644,6 +4644,10 @@
     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
 }
 
+#include "stringlib/asciilib.h"
+#include "stringlib/codecs.h"
+#include "stringlib/undef.h"
+
 #include "stringlib/ucs1lib.h"
 #include "stringlib/codecs.h"
 #include "stringlib/undef.h"
@@ -5472,25 +5476,6 @@
     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
 }
 
-/* Two masks for fast checking of whether a C 'long' may contain
-   UTF16-encoded surrogate characters. This is an efficient heuristic,
-   assuming that non-surrogate characters with a code point >= 0x8000 are
-   rare in most input.
-   FAST_CHAR_MASK is used when the input is in native byte ordering,
-   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
-*/
-#if (SIZEOF_LONG == 8)
-# define FAST_CHAR_MASK         0x8000800080008000L
-# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
-# define STRIPPED_MASK          0x00FF00FF00FF00FFL
-#elif (SIZEOF_LONG == 4)
-# define FAST_CHAR_MASK         0x80008000L
-# define SWAPPED_FAST_CHAR_MASK 0x00800080L
-# define STRIPPED_MASK          0x00FF00FFL
-#else
-# error C 'long' size should be either 4 or 8!
-#endif
-
 PyObject *
 PyUnicode_DecodeUTF16Stateful(const char *s,
                               Py_ssize_t size,
@@ -5503,30 +5488,22 @@
     Py_ssize_t endinpos;
     Py_ssize_t outpos;
     PyObject *unicode;
-    const unsigned char *q, *e, *aligned_end;
+    const unsigned char *q, *e;
     int bo = 0;       /* assume native ordering by default */
-    int native_ordering = 0;
+    int native_ordering;
     const char *errmsg = "";
-    /* Offsets from q for retrieving byte pairs in the right order. */
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN
-    int ihi = 1, ilo = 0;
-#else
-    int ihi = 0, ilo = 1;
-#endif
     PyObject *errorHandler = NULL;
     PyObject *exc = NULL;
 
-    /* Note: size will always be longer than the resulting Unicode
-       character count */
-    unicode = PyUnicode_New(size, 127);
-    if (!unicode)
-        return NULL;
-    if (size == 0)
-        return unicode;
-    outpos = 0;
+    if (size == 0) {
+        if (consumed)
+            *consumed = 0;
+        Py_INCREF(unicode_empty);
+        return unicode_empty;
+    }
 
     q = (unsigned char *)s;
-    e = q + size - 1;
+    e = q + size;
 
     if (byteorder)
         bo = *byteorder;
@@ -5537,8 +5514,7 @@
        stream as-is (giving a ZWNBSP character). */
     if (bo == 0) {
         if (size >= 2) {
-            const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+            const Py_UCS4 bom = (q[1] << 8) | q[0];
             if (bom == 0xFEFF) {
                 q += 2;
                 bo = -1;
@@ -5547,143 +5523,80 @@
                 q += 2;
                 bo = 1;
             }
+        }
+    }
+
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+    native_ordering = bo <= 0;
 #else
-            if (bom == 0xFEFF) {
-                q += 2;
-                bo = 1;
-            }
-            else if (bom == 0xFFFE) {
-                q += 2;
-                bo = -1;
-            }
-#endif
-        }
-    }
-
-    if (bo == -1) {
-        /* force LE */
-        ihi = 1;
-        ilo = 0;
-    }
-    else if (bo == 1) {
-        /* force BE */
-        ihi = 0;
-        ilo = 1;
-    }
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN
-    native_ordering = ilo < ihi;
-#else
-    native_ordering = ilo > ihi;
-#endif
-
-    aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
-    while (q < e) {
-        Py_UCS4 ch;
-        /* First check for possible aligned read of a C 'long'. Unaligned
-           reads are more expensive, better to defer to another iteration. */
-        if (!((size_t) q & LONG_PTR_MASK)) {
-            /* Fast path for runs of non-surrogate chars. */
-            register const unsigned char *_q = q;
+    native_ordering = bo >= 0;
+#endif
+
+    /* Note: size will always be longer than the resulting Unicode
+       character count */
+    unicode = PyUnicode_New(size, 127);
+    if (!unicode)
+        return NULL;
+    outpos = 0;
+
+    while (1) {
+        Py_UCS4 ch = 0;
+        if (e - q > 1) {
+            const unsigned char *e2 = e - 1;
             int kind = PyUnicode_KIND(unicode);
-            void *data = PyUnicode_DATA(unicode);
-            while (_q < aligned_end) {
-                unsigned long block = * (unsigned long *) _q;
-                Py_UCS4 maxch;
-                if (native_ordering) {
-                    /* Can use buffer directly */
-                    if (block & FAST_CHAR_MASK)
-                        break;
-                }
-                else {
-                    /* Need to byte-swap */
-                    if (block & SWAPPED_FAST_CHAR_MASK)
-                        break;
-                    block = ((block >> 8) & STRIPPED_MASK) |
-                            ((block & STRIPPED_MASK) << 8);
-                }
-                maxch = (Py_UCS2)(block & 0xFFFF);
-#if SIZEOF_LONG == 8
-                ch = (Py_UCS2)((block >> 16) & 0xFFFF);
-                maxch = MAX_MAXCHAR(maxch, ch);
-                ch = (Py_UCS2)((block >> 32) & 0xFFFF);
-                maxch = MAX_MAXCHAR(maxch, ch);
-                ch = (Py_UCS2)(block >> 48);
-                maxch = MAX_MAXCHAR(maxch, ch);
-#else
-                ch = (Py_UCS2)(block >> 16);
-                maxch = MAX_MAXCHAR(maxch, ch);
-#endif
-                if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
-                    if (unicode_widen(&unicode, outpos, maxch) < 0)
-                        goto onError;
-                    kind = PyUnicode_KIND(unicode);
-                    data = PyUnicode_DATA(unicode);
-                }
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN
-                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
-#if SIZEOF_LONG == 8
-                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
-                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
-                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
-#else
-                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
-#endif
-#else
-#if SIZEOF_LONG == 8
-                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
-                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
-                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
-#else
-                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
-#endif
-                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
-#endif
-                _q += SIZEOF_LONG;
-            }
-            q = _q;
-            if (q >= e)
-                break;
-        }
-        ch = (q[ihi] << 8) | q[ilo];
-
-        q += 2;
-
-        if (!Py_UNICODE_IS_SURROGATE(ch)) {
+            if (kind == PyUnicode_1BYTE_KIND) {
+                if (PyUnicode_IS_ASCII(unicode))
+                    ch = asciilib_utf16_try_decode(
+                            PyUnicode_1BYTE_DATA(unicode), &outpos,
+                            &q, e2, native_ordering);
+                else
+                    ch = ucs1lib_utf16_try_decode(
+                            PyUnicode_1BYTE_DATA(unicode), &outpos,
+                            &q, e2, native_ordering);
+            } else if (kind == PyUnicode_2BYTE_KIND) {
+                ch = ucs2lib_utf16_try_decode(
+                        PyUnicode_2BYTE_DATA(unicode), &outpos,
+                        &q, e2, native_ordering);
+            } else {
+                assert(kind == PyUnicode_4BYTE_KIND);
+                ch = ucs4lib_utf16_try_decode(
+                        PyUnicode_4BYTE_DATA(unicode), &outpos,
+                        &q, e2, native_ordering);
+            }
+        }
+        switch (ch)
+        {
+        case 0:
+            /* remaining byte at the end? (size should be even) */
+            if (q == e || consumed)
+                goto End;
+            errmsg = "truncated data";
+            startinpos = ((const char *)q) - starts;
+            endinpos = ((const char *)e) - starts;
+            break;
+            /* The remaining input chars are ignored if the callback
+               chooses to skip the input */
+        case 1:
+            errmsg = "unexpected end of data";
+            startinpos = ((const char *)q) - 2 - starts;
+            endinpos = ((const char *)e) - starts;
+            break;
+        case 2:
+            errmsg = "illegal encoding";
+            startinpos = ((const char *)q) - 2 - starts;
+            endinpos = startinpos + 2;
+            break;
+        case 3:
+            errmsg = "illegal UTF-16 surrogate";
+            startinpos = ((const char *)q) - 4 - starts;
+            endinpos = startinpos + 2;
+            break;
+        default:
             if (unicode_putchar(&unicode, &outpos, ch) < 0)
                 goto onError;
             continue;
         }
 
-        /* UTF-16 code pair: */
-        if (q > e) {
-            errmsg = "unexpected end of data";
-            startinpos = (((const char *)q) - 2) - starts;
-            endinpos = ((const char *)e) + 1 - starts;
-            goto utf16Error;
-        }
-        if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
-            Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
-            q += 2;
-            if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
-                if (unicode_putchar(&unicode, &outpos,
-                                    Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
-                    goto onError;
-                continue;
-            }
-            else {
-                errmsg = "illegal UTF-16 surrogate";
-                startinpos = (((const char *)q)-4)-starts;
-                endinpos = startinpos+2;
-                goto utf16Error;
-            }
-
-        }
-        errmsg = "illegal encoding";
-        startinpos = (((const char *)q)-2)-starts;
-        endinpos = startinpos+2;
-        /* Fall through to report the error */
-
-      utf16Error:
         if (unicode_decode_call_errorhandler(
                 errors,
                 &errorHandler,
@@ -5698,30 +5611,8 @@
                 &outpos))
             goto onError;
     }
-    /* remaining byte at the end? (size should be even) */
-    if (e == q) {
-        if (!consumed) {
-            errmsg = "truncated data";
-            startinpos = ((const char *)q) - starts;
-            endinpos = ((const char *)e) + 1 - starts;
-            if (unicode_decode_call_errorhandler(
-                    errors,
-                    &errorHandler,
-                    "utf16", errmsg,
-                    &starts,
-                    (const char **)&e,
-                    &startinpos,
-                    &endinpos,
-                    &exc,
-                    (const char **)&q,
-                    &unicode,
-                    &outpos))
-                goto onError;
-            /* The remaining input chars are ignored if the callback
-               chooses to skip the input */
-        }
-    }
-
+
+End:
     if (byteorder)
         *byteorder = bo;
 
@@ -5743,9 +5634,6 @@
     return NULL;
 }
 
-#undef FAST_CHAR_MASK
-#undef SWAPPED_FAST_CHAR_MASK
-
 PyObject *
 _PyUnicode_EncodeUTF16(PyObject *str,
                        const char *errors,