diff -r 293180d199f2 Objects/stringlib/asciilib.h
--- a/Objects/stringlib/asciilib.h Thu Apr 19 18:21:04 2012 +0200
+++ b/Objects/stringlib/asciilib.h Thu Apr 19 23:28:23 2012 +0300
@@ -7,6 +7,7 @@
 #define STRINGLIB(F) asciilib_##F
 #define STRINGLIB_OBJECT PyUnicodeObject
 #define STRINGLIB_SIZEOF_CHAR 1
+#define STRINGLIB_MAX_CHAR 0x7Fu
 #define STRINGLIB_CHAR Py_UCS1
 #define STRINGLIB_TYPE_NAME "unicode"
 #define STRINGLIB_PARSE_CODE "U"
diff -r 293180d199f2 Objects/stringlib/codecs.h
--- a/Objects/stringlib/codecs.h Thu Apr 19 18:21:04 2012 +0200
+++ b/Objects/stringlib/codecs.h Thu Apr 19 23:28:23 2012 +0300
@@ -350,4 +350,53 @@
 #undef MAX_SHORT_UNICHARS
 }
 
+/* Decode consecutive 4-byte UTF-32 units from *inptr into dest, starting
+   at index *outpos, for as long as every decoded value fits into
+   STRINGLIB_CHAR (i.e. is <= STRINGLIB_MAX_CHAR).
+
+   A unit is read only while q < e and four bytes are read each time, so
+   the caller has to pass an e that leaves four readable bytes at every
+   q < e.  A non-zero le selects little-endian byte order, zero selects
+   big-endian.  No bounds check is made on dest.
+
+   Returns 0 when e is reached.  Otherwise returns the first decoded value
+   above STRINGLIB_MAX_CHAR; it is not stored and *inptr is left pointing
+   at its first byte.  *inptr and *outpos are updated in both cases.
+
+   The most significant byte is widened to Py_UCS4 before shifting, since
+   shifting a promoted 32-bit int left by 24 overflows for bytes >= 0x80. */
+Py_LOCAL_INLINE(Py_UCS4)
+STRINGLIB(utf32_try_decode)(STRINGLIB_CHAR *dest, Py_ssize_t *outpos,
+                            const unsigned char **inptr,
+                            const unsigned char *e,
+                            int le)
+{
+    const unsigned char *q = *inptr;
+    STRINGLIB_CHAR *p = dest + *outpos;
+    Py_UCS4 ch;
+
+    if (le)
+        while (q < e) {
+            ch = ((Py_UCS4)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
+            if (ch > STRINGLIB_MAX_CHAR)
+                goto Overflow;
+            *p++ = (STRINGLIB_CHAR)ch;
+            q += 4;
+        }
+    else
+        while (q < e) {
+            ch = ((Py_UCS4)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
+            if (ch > STRINGLIB_MAX_CHAR)
+                goto Overflow;
+            *p++ = (STRINGLIB_CHAR)ch;
+            q += 4;
+        }
+    *inptr = q;
+    *outpos = p - dest;
+    return 0;
+Overflow:
+    *inptr = q;
+    *outpos = p - dest;
+    return ch;
+}
 #endif /* STRINGLIB_IS_UNICODE */
diff -r 293180d199f2 Objects/stringlib/ucs1lib.h
--- a/Objects/stringlib/ucs1lib.h Thu Apr 19 18:21:04 2012 +0200
+++ b/Objects/stringlib/ucs1lib.h Thu Apr 19 23:28:23 2012 +0300
@@ -7,6 +7,7 @@
 #define STRINGLIB(F) ucs1lib_##F
 #define STRINGLIB_OBJECT PyUnicodeObject
 #define STRINGLIB_SIZEOF_CHAR 1
+#define STRINGLIB_MAX_CHAR 0xFFu
 #define STRINGLIB_CHAR Py_UCS1
 #define STRINGLIB_TYPE_NAME "unicode"
 #define STRINGLIB_PARSE_CODE "U"
diff -r 293180d199f2 Objects/stringlib/ucs2lib.h
--- a/Objects/stringlib/ucs2lib.h Thu Apr 19 18:21:04 2012 +0200
+++ b/Objects/stringlib/ucs2lib.h Thu Apr 19 23:28:23 2012 +0300
@@ -7,6 +7,7 @@
 #define STRINGLIB(F) ucs2lib_##F
 #define STRINGLIB_OBJECT PyUnicodeObject
 #define STRINGLIB_SIZEOF_CHAR 2
+#define STRINGLIB_MAX_CHAR 0xFFFFu
 #define STRINGLIB_CHAR Py_UCS2
 #define STRINGLIB_TYPE_NAME "unicode"
 #define STRINGLIB_PARSE_CODE "U"
diff -r 293180d199f2 Objects/stringlib/ucs4lib.h
--- a/Objects/stringlib/ucs4lib.h Thu Apr 19 18:21:04 2012 +0200
+++ b/Objects/stringlib/ucs4lib.h Thu Apr 19 23:28:23 2012 +0300
@@ -7,6 +7,7 @@
 #define STRINGLIB(F) ucs4lib_##F
 #define STRINGLIB_OBJECT PyUnicodeObject
 #define STRINGLIB_SIZEOF_CHAR 4
+#define STRINGLIB_MAX_CHAR 0x10FFFFu
 #define STRINGLIB_CHAR Py_UCS4
 #define STRINGLIB_TYPE_NAME "unicode"
 #define STRINGLIB_PARSE_CODE "U"
diff -r 293180d199f2 Objects/stringlib/undef.h
--- a/Objects/stringlib/undef.h Thu Apr 19 18:21:04 2012 +0200
+++ b/Objects/stringlib/undef.h Thu Apr 19 23:28:23 2012 +0300
@@ -1,6 +1,7 @@
 #undef FASTSEARCH
 #undef STRINGLIB
 #undef STRINGLIB_SIZEOF_CHAR
+#undef STRINGLIB_MAX_CHAR
 #undef STRINGLIB_CHAR
 #undef STRINGLIB_STR
 #undef STRINGLIB_LEN
diff -r 293180d199f2 Objects/unicodeobject.c
--- a/Objects/unicodeobject.c Thu Apr 19 18:21:04 2012 +0200
+++ b/Objects/unicodeobject.c Thu Apr 19 23:28:23 2012 +0300
@@ -4555,6 +4555,10 @@
     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
 }
 
+#include "stringlib/asciilib.h"
+#include "stringlib/codecs.h"
+#include "stringlib/undef.h"
+
 #include "stringlib/ucs1lib.h"
 #include "stringlib/codecs.h"
 #include "stringlib/undef.h"
@@ -5150,14 +5154,8 @@
     Py_ssize_t outpos;
     PyObject *unicode;
     const unsigned char *q, *e;
-    int bo = 0; /* assume native ordering by default */
+    int le, bo = 0; /* assume native ordering by default */
     const char *errmsg = "";
-    /* Offsets from q for retrieving bytes in the right order. */
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN
-    int iorder[] = {0, 1, 2, 3};
-#else
-    int iorder[] = {3, 2, 1, 0};
-#endif
     PyObject *errorHandler = NULL;
     PyObject *exc = NULL;
 
@@ -5173,85 +5171,95 @@
        stream as-is (giving a ZWNBSP character). */
     if (bo == 0) {
         if (size >= 4) {
-            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
-                (q[iorder[1]] << 8) | q[iorder[0]];
+            /* Read the first unit as little-endian; which BOM value shows
+               up then tells the byte order of the stream.  The top byte is
+               widened first because q[3] << 24 overflows a 32-bit int for
+               bytes >= 0x80. */
+            Py_UCS4 bom = ((Py_UCS4)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
+            if (bom == 0x0000FEFF) {
+                bo = -1;
+                q += 4;
+            }
+            else if (bom == 0xFFFE0000) {
+                bo = 1;
+                q += 4;
+            }
+        }
+    }
+
+    /* bo < 0 selects little-endian and bo > 0 big-endian decoding; bo == 0
+       falls back to the native order. */
 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
-            if (bom == 0x0000FEFF) {
+    le = bo <= 0;
+#else
+    le = bo < 0;
+#endif
+
+    /* This might be one too much, because of a BOM */
+    unicode = PyUnicode_New((e - q + 3) / 4, 127);
+    if (!unicode)
+        return NULL;
+    outpos = 0;
+
+    while (1) {
+        Py_UCS4 ch = 0;
+        /* Fast path: decode whole units for as long as they fit the
+           current kind of unicode.  A non-zero ch is the first decoded
+           value that did not fit; q still points at it. */
+        if (e - q > 3) {
+            const unsigned char *e2 = e - 3;
+            int kind = PyUnicode_KIND(unicode);
+            switch (kind) {
+            case PyUnicode_1BYTE_KIND:
+                if (PyUnicode_IS_ASCII(unicode))
+                    ch = asciilib_utf32_try_decode(
+                            PyUnicode_1BYTE_DATA(unicode),
+                            &outpos, &q, e2, le);
+                else
+                    ch = ucs1lib_utf32_try_decode(
+                            PyUnicode_1BYTE_DATA(unicode), &outpos,
+                            &q, e2, le);
+                break;
+            case PyUnicode_2BYTE_KIND:
+                ch = ucs2lib_utf32_try_decode(
+                        PyUnicode_2BYTE_DATA(unicode), &outpos,
+                        &q, e2, le);
+                break;
+            case PyUnicode_4BYTE_KIND:
+                ch = ucs4lib_utf32_try_decode(
+                        PyUnicode_4BYTE_DATA(unicode), &outpos,
+                        &q, e2, le);
+                break;
+            default:
+                assert(0);
+            }
+        }
+        if (ch) {
+            if (ch < 0x110000) {
+                if (unicode_putchar(&unicode, &outpos, ch) < 0)
+                    goto onError;
                 q += 4;
-                bo = -1;
-            }
-            else if (bom == 0xFFFE0000) {
-                q += 4;
-                bo = 1;
-            }
-#else
-            if (bom == 0x0000FEFF) {
-                q += 4;
-                bo = 1;
-            }
-            else if (bom == 0xFFFE0000) {
-                q += 4;
-                bo = -1;
-            }
-#endif
-        }
-    }
-
-    if (bo == -1) {
-        /* force LE */
-        iorder[0] = 0;
-        iorder[1] = 1;
-        iorder[2] = 2;
-        iorder[3] = 3;
-    }
-    else if (bo == 1) {
-        /* force BE */
-        iorder[0] = 3;
-        iorder[1] = 2;
-        iorder[2] = 1;
-        iorder[3] = 0;
-    }
-
-    /* This might be one to much, because of a BOM */
-    unicode = PyUnicode_New((size+3)/4, 127);
-    if (!unicode)
-        return NULL;
-    if (size == 0)
-        return unicode;
-    outpos = 0;
-
-    while (q < e) {
-        Py_UCS4 ch;
-        /* remaining bytes at the end? (size should be divisible by 4) */
-        if (e-q<4) {
-            if (consumed)
+                continue;
+            }
+            errmsg = "codepoint not in range(0x110000)";
+            startinpos = ((const char *)q)-starts;
+            endinpos = startinpos+4;
+        }
+        else {
+            /* remaining bytes at the end? (size should be divisible by 4) */
+            if (q == e || consumed)
                 break;
             errmsg = "truncated data";
             startinpos = ((const char *)q)-starts;
             endinpos = ((const char *)e)-starts;
-            goto utf32Error;
             /* The remaining input chars are ignored if the callback
                chooses to skip the input */
         }
-        ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
-            (q[iorder[1]] << 8) | q[iorder[0]];
-
-        if (ch >= 0x110000)
-        {
-            errmsg = "codepoint not in range(0x110000)";
-            startinpos = ((const char *)q)-starts;
-            endinpos = startinpos+4;
-            goto utf32Error;
-        }
-        if (unicode_putchar(&unicode, &outpos, ch) < 0)
-            goto onError;
-        q += 4;
-        continue;
-      utf32Error:
         if (unicode_decode_call_errorhandler(
                 errors, &errorHandler,
                 "utf32", errmsg,
-                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
+                &starts, (const char **)&e, &startinpos, &endinpos,
+                &exc, (const char **)&q,
                 &unicode, &outpos))
             goto onError;
     }