diff -r 4a02212064ce Doc/library/codecs.rst --- a/Doc/library/codecs.rst Mon Sep 02 03:23:21 2013 -0700 +++ b/Doc/library/codecs.rst Mon Sep 02 19:50:59 2013 +0300 @@ -346,14 +346,16 @@ | | in :pep:`383`. | +-------------------------+-----------------------------------------------+ -In addition, the following error handlers are specific to a single codec: +In addition, the following error handlers are specific to Unicode encoding +schemes: -+-------------------+---------+-------------------------------------------+ -| Value | Codec | Meaning | -+===================+=========+===========================================+ -|``'surrogatepass'``| utf-8 | Allow encoding and decoding of surrogate | -| | | codes in UTF-8. | -+-------------------+---------+-------------------------------------------+ ++-------------------+------------------------+-------------------------------------------+ +| Value | Codec | Meaning | ++===================+========================+===========================================+ +|``'surrogatepass'``| utf-8, utf-16, utf-32, | Allow encoding and decoding of surrogate | +| | utf-16-be, utf-16-le, | codes in all the Unicode encoding schemes.| +| | utf-32-be, utf-32-le | | ++-------------------+------------------------+-------------------------------------------+ .. versionadded:: 3.1 The ``'surrogateescape'`` and ``'surrogatepass'`` error handlers. @@ -1146,6 +1148,14 @@ | utf_8_sig | | all languages | +-----------------+--------------------------------+--------------------------------+ +.. versionchanged:: 3.4 + ``utf_16``, ``utf_16_be``, ``utf_16_le``, ``utf_32``, ``utf_32_be`` and + ``utf_32_le`` encoders no longer allow surrogate code points + (U+D800-U+DFFF) to be encoded. ``utf_32``, ``utf_32_be`` and + ``utf_32_le`` decoders no longer decode byte sequences that correspond to + surrogate code points. 
+ + Python Specific Encodings ------------------------- diff -r 4a02212064ce Lib/test/test_codecs.py --- a/Lib/test/test_codecs.py Mon Sep 02 03:23:21 2013 -0700 +++ b/Lib/test/test_codecs.py Mon Sep 02 19:50:59 2013 +0300 @@ -299,8 +299,46 @@ self.assertEqual(reader.readline(), s5) self.assertEqual(reader.readline(), "") + ill_formed_sequence_replace = "\ufffd" + + def test_lone_surrogates(self): + self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding) + self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"), + "[\\udc80]".encode(self.encoding)) + self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"), + "[&#56448;]".encode(self.encoding)) + self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"), + "[]".encode(self.encoding)) + self.assertEqual("[\uDC80]".encode(self.encoding, "replace"), + "[?]".encode(self.encoding)) + + bom = "".encode(self.encoding) + for before, after in [("\U00010fff", "A"), ("[", "]"), + ("A", "\U00010fff")]: + before_sequence = before.encode(self.encoding)[len(bom):] + after_sequence = after.encode(self.encoding)[len(bom):] + test_string = before + "\uDC80" + after + test_sequence = (bom + before_sequence + + self.ill_formed_sequence + after_sequence) + self.assertRaises(UnicodeDecodeError, test_sequence.decode, + self.encoding) + self.assertEqual(test_string.encode(self.encoding, + "surrogatepass"), + test_sequence) + self.assertEqual(test_sequence.decode(self.encoding, + "surrogatepass"), + test_string) + self.assertEqual(test_sequence.decode(self.encoding, "ignore"), + before + after) + self.assertEqual(test_sequence.decode(self.encoding, "replace"), + before + self.ill_formed_sequence_replace + after) + class UTF32Test(ReadTest, unittest.TestCase): encoding = "utf-32" + if sys.byteorder == 'little': + ill_formed_sequence = b"\x80\xdc\x00\x00" + else: + ill_formed_sequence = b"\x00\x00\xdc\x80" spamle = (b'\xff\xfe\x00\x00' b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00' @@ -392,6 
+430,7 @@ class UTF32LETest(ReadTest, unittest.TestCase): encoding = "utf-32-le" + ill_formed_sequence = b"\x80\xdc\x00\x00" def test_partial(self): self.check_partial( @@ -436,6 +475,7 @@ class UTF32BETest(ReadTest, unittest.TestCase): encoding = "utf-32-be" + ill_formed_sequence = b"\x00\x00\xdc\x80" def test_partial(self): self.check_partial( @@ -481,6 +521,10 @@ class UTF16Test(ReadTest, unittest.TestCase): encoding = "utf-16" + if sys.byteorder == 'little': + ill_formed_sequence = b"\x80\xdc" + else: + ill_formed_sequence = b"\xdc\x80" spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00' spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m' @@ -561,6 +605,7 @@ class UTF16LETest(ReadTest, unittest.TestCase): encoding = "utf-16-le" + ill_formed_sequence = b"\x80\xdc" def test_partial(self): self.check_partial( @@ -604,6 +649,7 @@ class UTF16BETest(ReadTest, unittest.TestCase): encoding = "utf-16-be" + ill_formed_sequence = b"\xdc\x80" def test_partial(self): self.check_partial( @@ -647,6 +693,8 @@ class UTF8Test(ReadTest, unittest.TestCase): encoding = "utf-8" + ill_formed_sequence = b"\xed\xb2\x80" + ill_formed_sequence_replace = "\ufffd" * 3 def test_partial(self): self.check_partial( @@ -676,18 +724,11 @@ u, u.encode(self.encoding)) def test_lone_surrogates(self): - self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8") - self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8") - self.assertEqual("[\uDC80]".encode("utf-8", "backslashreplace"), - b'[\\udc80]') - self.assertEqual("[\uDC80]".encode("utf-8", "xmlcharrefreplace"), - b'[&#56448;]') - self.assertEqual("[\uDC80]".encode("utf-8", "surrogateescape"), + super().test_lone_surrogates() + # surrogateescape is only checked for UTF-8; it is unclear whether it + # makes sense for UTF-16 and UTF-32 + self.assertEqual("[\uDC80]".encode('utf-8', "surrogateescape"), b'[\x80]') - self.assertEqual("[\uDC80]".encode("utf-8", "ignore"), - b'[]') - self.assertEqual("[\uDC80]".encode("utf-8", "replace"), - b'[?]') def 
test_surrogatepass_handler(self): self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"), @@ -820,6 +861,8 @@ ] ) + test_lone_surrogates = None + class UTF16ExTest(unittest.TestCase): def test_errors(self): @@ -846,6 +889,8 @@ class UTF8SigTest(ReadTest, unittest.TestCase): encoding = "utf-8-sig" + ill_formed_sequence = b"\xed\xb2\x80" + ill_formed_sequence_replace = "\ufffd" * 3 def test_partial(self): self.check_partial( diff -r 4a02212064ce Objects/stringlib/codecs.h --- a/Objects/stringlib/codecs.h Mon Sep 02 03:23:21 2013 -0700 +++ b/Objects/stringlib/codecs.h Mon Sep 02 19:50:59 2013 +0300 @@ -596,66 +596,106 @@ #undef SWAB -Py_LOCAL_INLINE(void) -STRINGLIB(utf16_encode)(unsigned short *out, - const STRINGLIB_CHAR *in, +#if STRINGLIB_MAX_CHAR >= 0x80 +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in, Py_ssize_t len, + unsigned short **outptr, int native_ordering) { + unsigned short *out = *outptr; const STRINGLIB_CHAR *end = in + len; #if STRINGLIB_SIZEOF_CHAR == 1 # define SWAB2(CH) ((CH) << 8) #else # define SWAB2(CH) (((CH) << 8) | ((CH) >> 8)) #endif + if (native_ordering) { #if STRINGLIB_MAX_CHAR < 0x10000 - if (native_ordering) { -# if STRINGLIB_SIZEOF_CHAR == 2 - Py_MEMCPY(out, in, 2 * len); -# else - _PyUnicode_CONVERT_BYTES(STRINGLIB_CHAR, unsigned short, in, end, out); -# endif - } else { const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); while (in < unrolled_end) { +# if STRINGLIB_MAX_CHAR >= 0xd800 + if (((in[0] ^ 0xd800) & + (in[1] ^ 0xd800) & + (in[2] ^ 0xd800) & + (in[3] ^ 0xd800) & 0xf800) == 0) + break; +# endif + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + in += 4; out += 4; + } +#endif + while (in < end) { + Py_UCS4 ch; + ch = *in++; +#if STRINGLIB_MAX_CHAR >= 0xd800 + if (ch < 0xd800) + *out++ = ch; + else if (ch < 0xe000) + goto fail; +# if STRINGLIB_MAX_CHAR >= 0x10000 + else if (ch >= 0x10000) { + out[0] = Py_UNICODE_HIGH_SURROGATE(ch); + out[1] = 
Py_UNICODE_LOW_SURROGATE(ch); + out += 2; + } +# endif + else +#endif + *out++ = ch; + } + } else { +#if STRINGLIB_MAX_CHAR < 0x10000 + const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); + while (in < unrolled_end) { +# if STRINGLIB_MAX_CHAR >= 0xd800 + if (((in[0] ^ 0xd800) & + (in[1] ^ 0xd800) & + (in[2] ^ 0xd800) & + (in[3] ^ 0xd800) & 0xf800) == 0) + break; +# endif out[0] = SWAB2(in[0]); out[1] = SWAB2(in[1]); out[2] = SWAB2(in[2]); out[3] = SWAB2(in[3]); in += 4; out += 4; } - while (in < end) { - *out++ = SWAB2(*in); - ++in; - } - } -#else - if (native_ordering) { +#endif while (in < end) { Py_UCS4 ch = *in++; - if (ch < 0x10000) - *out++ = ch; - else { - out[0] = Py_UNICODE_HIGH_SURROGATE(ch); - out[1] = Py_UNICODE_LOW_SURROGATE(ch); - out += 2; - } - } - } else { - while (in < end) { - Py_UCS4 ch = *in++; - if (ch < 0x10000) +#if STRINGLIB_MAX_CHAR >= 0xd800 + if (ch < 0xd800) *out++ = SWAB2((Py_UCS2)ch); - else { + else if (ch < 0xe000) + goto fail; +# if STRINGLIB_MAX_CHAR >= 0x10000 + else if (ch >= 0x10000) { Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch); Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch); out[0] = SWAB2(ch1); out[1] = SWAB2(ch2); out += 2; } +# endif + else +#endif + *out++ = SWAB2((Py_UCS2)ch); } } + *outptr = out; + return len; +#if STRINGLIB_MAX_CHAR >= 0xd800 + fail: #endif + *outptr = out; + return len - (end - in + 1); +} +#endif + #undef SWAB2 -} + #endif /* STRINGLIB_IS_UNICODE */ diff -r 4a02212064ce Objects/unicodeobject.c --- a/Objects/unicodeobject.c Mon Sep 02 03:23:21 2013 -0700 +++ b/Objects/unicodeobject.c Mon Sep 02 19:50:59 2013 +0300 @@ -4943,6 +4943,7 @@ _PyUnicodeWriter writer; const unsigned char *q, *e; int le, bo = 0; /* assume native ordering by default */ + const char *encoding; const char *errmsg = ""; PyObject *errorHandler = NULL; PyObject *exc = NULL; @@ -4982,6 +4983,7 @@ #else le = bo <= 0; #endif + encoding = le ? 
"utf-32-le" : "utf-32-be"; _PyUnicodeWriter_Init(&writer); writer.min_length = (e - q + 3) / 4; @@ -5002,6 +5004,8 @@ ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0]; if (ch > maxch) break; + if (kind != PyUnicode_1BYTE_KIND && Py_UNICODE_IS_SURROGATE(ch)) + break; PyUnicode_WRITE(kind, data, pos++, ch); q += 4; } while (q <= last); @@ -5011,6 +5015,8 @@ ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3]; if (ch > maxch) break; + if (kind != PyUnicode_1BYTE_KIND && Py_UNICODE_IS_SURROGATE(ch)) + break; PyUnicode_WRITE(kind, data, pos++, ch); q += 4; } while (q <= last); @@ -5018,7 +5024,12 @@ writer.pos = pos; } - if (ch <= maxch) { + if (Py_UNICODE_IS_SURROGATE(ch)) { + errmsg = "codepoint in surrogate code point range(0xd800, 0xe000)"; + startinpos = ((const char *)q) - starts; + endinpos = startinpos + 4; + } + else if (ch <= maxch) { if (q == e || consumed) break; /* remaining bytes at the end? (size should be divisible by 4) */ @@ -5042,7 +5053,7 @@ chooses to skip the input */ if (unicode_decode_call_errorhandler_writer( errors, &errorHandler, - "utf32", errmsg, + encoding, errmsg, &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, &writer)) goto onError; @@ -5079,6 +5090,14 @@ #else int iorder[] = {3, 2, 1, 0}; #endif + const char *encoding; + PyObject *errorHandler = NULL; + PyObject *exc = NULL; + PyObject *rep = NULL; + +#define SWAB4(CH, tmp) (tmp = (CH), \ + tmp = ((tmp & 0x00FF00FFu) << 8) + ((tmp >> 8) & 0x00FF00FFu), \ + ((tmp & 0x0000FFFFu) << 16) + ((tmp >> 16) & 0x0000FFFFu)) #define STORECHAR(CH) \ do { \ @@ -5118,6 +5137,7 @@ iorder[1] = 1; iorder[2] = 2; iorder[3] = 3; + encoding = "utf-32-le"; } else if (byteorder == 1) { /* force BE */ @@ -5125,13 +5145,102 @@ iorder[1] = 2; iorder[2] = 1; iorder[3] = 0; - } - - for (i = 0; i < len; i++) - STORECHAR(PyUnicode_READ(kind, data, i)); - + encoding = "utf-32-be"; + } + else + encoding = "utf-32"; + + if (kind == PyUnicode_1BYTE_KIND) { + for (i = 0; i < len; i++) 
+ STORECHAR(PyUnicode_READ(kind, data, i)); + goto done; + } + + for (i = 0; i < len;) { + Py_ssize_t repsize, moreunits; + Py_UCS4 ch = PyUnicode_READ(kind, data, i++); + assert(ch <= MAX_UNICODE); + if (!Py_UNICODE_IS_SURROGATE(ch)) { + STORECHAR(ch); + continue; + } + + rep = unicode_encode_call_errorhandler( + errors, &errorHandler, + encoding, "surrogates not allowed", + str, &exc, i-1, i, &i); + + if (!rep) + goto error; + + if (PyBytes_Check(rep)) { + repsize = PyBytes_GET_SIZE(rep); + if (repsize & 3) { + raise_encode_exception(&exc, encoding, + str, i - 1, i, + "surrogates not allowed"); + goto error; + } + moreunits = repsize / 4; + } + else { + assert(PyUnicode_Check(rep)); + if (PyUnicode_READY(rep) < 0) + goto error; + moreunits = repsize = PyUnicode_GET_LENGTH(rep); + if (!PyUnicode_IS_ASCII(rep)) { + raise_encode_exception(&exc, encoding, + str, i - 1, i, + "surrogates not allowed"); + goto error; + } + } + + /* four bytes are reserved for each surrogate */ + if (moreunits > 1) { + Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v); + if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - 4 * moreunits + 4) { + /* integer overflow */ + PyErr_NoMemory(); + goto error; + } + if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0) + goto error; + p = (unsigned char*) PyBytes_AS_STRING(v) + outpos; + } + + if (PyBytes_Check(rep)) { + Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize); + p += repsize; + } else /* rep is unicode */ { + const Py_UCS1 *repdata; + assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); + repdata = PyUnicode_1BYTE_DATA(rep); + while (repsize--) { + Py_UCS4 ch = *repdata++; + STORECHAR(ch); + } + } + + Py_CLEAR(rep); + } + + /* Cut back to size actually needed. This is necessary for, for example, + encoding of a string containing isolated surrogates and the 'ignore' + handler is used. 
*/ + nsize = p - (unsigned char*) PyBytes_AS_STRING(v); + if (nsize != PyBytes_GET_SIZE(v)) + _PyBytes_Resize(&v, nsize); + Py_XDECREF(errorHandler); + Py_XDECREF(exc); done: return v; + error: + Py_XDECREF(rep); + Py_XDECREF(errorHandler); + Py_XDECREF(exc); + Py_XDECREF(v); + return NULL; #undef STORECHAR } @@ -5184,6 +5293,7 @@ const char *errmsg = ""; PyObject *errorHandler = NULL; PyObject *exc = NULL; + const char *encoding; q = (unsigned char *)s; e = q + size; @@ -5217,8 +5327,10 @@ #if PY_LITTLE_ENDIAN native_ordering = bo <= 0; + encoding = bo <= 0 ? "utf-16-le" : "utf-16-be"; #else native_ordering = bo >= 0; + encoding = bo >= 0 ? "utf-16-be" : "utf-16-le"; #endif /* Note: size will always be longer than the resulting Unicode @@ -5292,7 +5404,7 @@ if (unicode_decode_call_errorhandler_writer( errors, &errorHandler, - "utf16", errmsg, + encoding, errmsg, &starts, (const char **)&e, &startinpos, @@ -5328,13 +5440,17 @@ Py_ssize_t len; PyObject *v; unsigned short *out; - Py_ssize_t bytesize; Py_ssize_t pairs; #if PY_BIG_ENDIAN int native_ordering = byteorder >= 0; #else int native_ordering = byteorder <= 0; #endif + const char *encoding; + Py_ssize_t nsize, pos; + PyObject *errorHandler = NULL; + PyObject *exc = NULL; + PyObject *rep = NULL; if (!PyUnicode_Check(str)) { PyErr_BadArgument(); @@ -5356,8 +5472,8 @@ } if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) return PyErr_NoMemory(); - bytesize = (len + pairs + (byteorder == 0)) * 2; - v = PyBytes_FromStringAndSize(NULL, bytesize); + nsize = len + pairs + (byteorder == 0); + v = PyBytes_FromStringAndSize(NULL, nsize * 2); if (v == NULL) return NULL; @@ -5369,25 +5485,106 @@ if (len == 0) goto done; - switch (kind) { - case PyUnicode_1BYTE_KIND: { - ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering); - break; - } - case PyUnicode_2BYTE_KIND: { - ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering); - break; - } - case PyUnicode_4BYTE_KIND: { - 
ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering); - break; - } - default: - assert(0); - } - + if (kind == PyUnicode_1BYTE_KIND) { + ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering); + goto done; + } + + if (byteorder < 0) + encoding = "utf-16-le"; + else if (byteorder > 0) + encoding = "utf-16-be"; + else + encoding = "utf-16"; + + pos = 0; + while (pos < len) { + Py_ssize_t repsize, moreunits; + + if (kind == PyUnicode_2BYTE_KIND) { + pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos, + &out, native_ordering); + } + else { + assert(kind == PyUnicode_4BYTE_KIND); + pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos, + &out, native_ordering); + } + if (pos == len) + break; + + rep = unicode_encode_call_errorhandler( + errors, &errorHandler, + encoding, "surrogates not allowed", + str, &exc, pos, pos + 1, &pos); + if (!rep) + goto error; + + if (PyBytes_Check(rep)) { + repsize = PyBytes_GET_SIZE(rep); + if (repsize & 1) { + raise_encode_exception(&exc, encoding, + str, pos - 1, pos, + "surrogates not allowed"); + goto error; + } + moreunits = repsize / 2; + } + else { + assert(PyUnicode_Check(rep)); + if (PyUnicode_READY(rep) < 0) + goto error; + moreunits = repsize = PyUnicode_GET_LENGTH(rep); + if (!PyUnicode_IS_ASCII(rep)) { + raise_encode_exception(&exc, encoding, + str, pos - 1, pos, + "surrogates not allowed"); + goto error; + } + } + + /* two bytes are reserved for each surrogate */ + if (moreunits > 1) { + Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v); + if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - 2 * moreunits + 2) { + /* integer overflow */ + PyErr_NoMemory(); + goto error; + } + if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0) + goto error; + out = (unsigned short*) PyBytes_AS_STRING(v) + outpos; + } + + if (PyBytes_Check(rep)) { + Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize); + out += moreunits; + } else /* rep is unicode */ { + 
assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); + ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize, + &out, native_ordering); + } + + Py_CLEAR(rep); + } + + /* Cut back to size actually needed. This is necessary for, for example, + encoding of a string containing isolated surrogates and the 'ignore' handler + is used. */ + nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v); + if (nsize != PyBytes_GET_SIZE(v)) + _PyBytes_Resize(&v, nsize); + Py_XDECREF(errorHandler); + Py_XDECREF(exc); done: return v; + error: + Py_XDECREF(rep); + Py_XDECREF(errorHandler); + Py_XDECREF(exc); + Py_XDECREF(v); + return NULL; +#undef STORECHAR } PyObject * diff -r 4a02212064ce Python/codecs.c --- a/Python/codecs.c Mon Sep 02 03:23:21 2013 -0700 +++ b/Python/codecs.c Mon Sep 02 19:50:59 2013 +0300 @@ -740,24 +740,67 @@ { PyObject *restuple; PyObject *object; + PyObject *encode; + char *encoding; + int bytelength = 3; + int le = 1; Py_ssize_t i; Py_ssize_t start; Py_ssize_t end; PyObject *res; if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { - char *outp; + unsigned char *outp; if (PyUnicodeEncodeError_GetStart(exc, &start)) return NULL; if (PyUnicodeEncodeError_GetEnd(exc, &end)) return NULL; if (!(object = PyUnicodeEncodeError_GetObject(exc))) return NULL; - res = PyBytes_FromStringAndSize(NULL, 3*(end-start)); + if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) { + Py_DECREF(object); + return NULL; + } + if (!(encoding = PyUnicode_AsUTF8(encode))) { + Py_DECREF(object); + Py_DECREF(encode); + return NULL; + } + + if (strcmp(encoding, "utf-8") == 0) { + /*no need to check others*/ + } + else if (strcmp(encoding, "utf-16") == 0) { + bytelength = 2; +#ifdef WORDS_BIGENDIAN + le = 0; +#endif + } + else if (strcmp(encoding, "utf-16-le") == 0) + bytelength = 2; + else if (strcmp(encoding, "utf-16-be") == 0) { + bytelength = 2; + le = 0; + } + else if (strcmp(encoding, "utf-32") == 0) { + bytelength = 4; +#ifdef WORDS_BIGENDIAN + le = 0; +#endif + 
} + else if (strcmp(encoding, "utf-32-le") == 0) + bytelength = 4; + else if (strcmp(encoding, "utf-32-be") == 0) { + bytelength = 4; + le = 0; + } + Py_DECREF(encode); + + res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start)); if (!res) { Py_DECREF(object); return NULL; } - outp = PyBytes_AsString(res); + outp = (unsigned char*)PyBytes_AsString(res); for (i = start; i < end; i++) { /* object is guaranteed to be "ready" */ Py_UCS4 ch = PyUnicode_READ_CHAR(object, i); @@ -768,9 +811,37 @@ Py_DECREF(object); return NULL; } - *outp++ = (char)(0xe0 | (ch >> 12)); - *outp++ = (char)(0x80 | ((ch >> 6) & 0x3f)); - *outp++ = (char)(0x80 | (ch & 0x3f)); + switch (bytelength) { + case 3: + *outp++ = (unsigned char)(0xe0 | (ch >> 12)); + *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f)); + *outp++ = (unsigned char)(0x80 | (ch & 0x3f)); + break; + case 2: + if (le) { + *outp++ = (unsigned char) ch; + *outp++ = (unsigned char)(ch >> 8); + } + else { + *outp++ = (unsigned char)(ch >> 8); + *outp++ = (unsigned char) ch; + } + break; + case 4: + if (le) { + *outp++ = (unsigned char) ch; + *outp++ = (unsigned char)(ch >> 8); + *outp++ = (unsigned char)(ch >> 16); + *outp++ = (unsigned char)(ch >> 24); + } + else { + *outp++ = (unsigned char)(ch >> 24); + *outp++ = (unsigned char)(ch >> 16); + *outp++ = (unsigned char)(ch >> 8); + *outp++ = (unsigned char) ch; + } + break; + } } restuple = Py_BuildValue("(On)", res, end); Py_DECREF(res); @@ -782,34 +853,79 @@ Py_UCS4 ch = 0; if (PyUnicodeDecodeError_GetStart(exc, &start)) return NULL; + if (PyUnicodeDecodeError_GetEnd(exc, &end)) + return NULL; if (!(object = PyUnicodeDecodeError_GetObject(exc))) return NULL; if (!(p = (unsigned char*)PyBytes_AsString(object))) { Py_DECREF(object); return NULL; } + if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) { + Py_DECREF(object); + return NULL; + } + if (!(encoding = PyUnicode_AsUTF8(encode))) { + Py_DECREF(object); + Py_DECREF(encode); + return NULL; + } + + if 
(strcmp(encoding, "utf-8") == 0) { + /*no need to check others*/ + } + else if (strcmp(encoding, "utf-16-le") == 0) + bytelength = 2; + else if (strcmp(encoding, "utf-16-be") == 0) { + bytelength = 2; + le = 0; + } + else if (strcmp(encoding, "utf-32-le") == 0) + bytelength = 4; + else if (strcmp(encoding, "utf-32-be") == 0) { + bytelength = 4; + le = 0; + } + Py_DECREF(encode); + /* Try decoding a single surrogate character. If there are more, let the codec call us again. */ p += start; - if (PyBytes_GET_SIZE(object) - start >= 3 && - (p[0] & 0xf0) == 0xe0 && - (p[1] & 0xc0) == 0x80 && - (p[2] & 0xc0) == 0x80) { - /* it's a three-byte code */ - ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f); - if (!Py_UNICODE_IS_SURROGATE(ch)) - /* it's not a surrogate - fail */ - ch = 0; + if (PyBytes_GET_SIZE(object) - start >= bytelength) { + switch (bytelength) { + case 3: + if ((p[0] & 0xf0) == 0xe0 && + (p[1] & 0xc0) == 0x80 && + (p[2] & 0xc0) == 0x80) { + /* it's a three-byte code */ + ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f); + } + break; + case 2: + if (le) + ch = p[1] << 8 | p[0]; + else + ch = p[0] << 8 | p[1]; + break; + case 4: + if (le) + ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0]; + else + ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]; + break; + } } + Py_DECREF(object); - if (ch == 0) { + if (!Py_UNICODE_IS_SURROGATE(ch)) { + /* it's not a surrogate - fail */ PyErr_SetObject(PyExceptionInstance_Class(exc), exc); return NULL; } res = PyUnicode_FromOrdinal(ch); if (res == NULL) return NULL; - return Py_BuildValue("(Nn)", res, start+3); + return Py_BuildValue("(Nn)", res, start + bytelength); } else { wrong_exception_type(exc);