# HG changeset patch # Parent 0c914a2be764f54217cd93a089fbef862ad4da05 diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst --- a/Doc/library/codecs.rst +++ b/Doc/library/codecs.rst @@ -337,14 +337,15 @@ | | in :pep:`383`. | +-------------------------+-----------------------------------------------+ -In addition, the following error handlers are specific to a single codec: +In addition, the following error handlers are specific to Unicode encoding schemes: -+-------------------+---------+-------------------------------------------+ -| Value | Codec | Meaning | -+===================+=========+===========================================+ -|``'surrogatepass'``| utf-8 | Allow encoding and decoding of surrogate | -| | | codes in UTF-8. | -+-------------------+---------+-------------------------------------------+ ++-------------------+------------------------+-------------------------------------------+ +| Value | Codec | Meaning | ++===================+========================+===========================================+ +|``'surrogatepass'``| utf-8, utf-16, utf-32, | Allow encoding and decoding of surrogate | +| | utf-16-be, utf-16-le, | codes in all the Unicode encoding schemes.| +| | utf-32-be, utf-32-le | | ++-------------------+------------------------+-------------------------------------------+ .. versionadded:: 3.1 The ``'surrogateescape'`` and ``'surrogatepass'`` error handlers. diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -304,13 +304,27 @@ "[]".encode(self.encoding)) self.assertEqual("[\uDC80]".encode(self.encoding, "replace"), "[?]".encode(self.encoding)) - if (hasattr(self,"ill_formed_sequence")): - test_string = "A" - bom = "".encode(self.encoding) - well_formed_sequence = test_string.encode(self.encoding)[len(bom):] - test_sequence = bom + self.ill_formed_sequence + well_formed_sequence + + bom = "".encode(self.encoding) + for before, after in [("\U00010fff", "A"), ("[", "]"), + ("A", "\U00010fff")]: + before_sequence = before.encode(self.encoding)[len(bom):] + after_sequence = after.encode(self.encoding)[len(bom):] + test_string = before + "\uDC80" + after + test_sequence = (bom + before_sequence + + self.ill_formed_sequence + after_sequence) self.assertRaises(UnicodeDecodeError, test_sequence.decode, self.encoding) + self.assertEqual(test_string.encode(self.encoding, + "surrogatepass"), + test_sequence) + self.assertEqual(test_string, + test_sequence.decode(self.encoding, + "surrogatepass")) + self.assertEqual(before + after, + test_sequence.decode(self.encoding, "ignore")) + self.assertEqual(before + "\ufffd" + after, + test_sequence.decode(self.encoding, "replace")) class UTF32Test(CommonUTFTest): encoding = "utf-32" @@ -407,6 +421,8 @@ class UTF32LETest(CommonUTFTest): encoding = "utf-32-le" + ill_formed_sequence = b"\x80\xdc\x00\x00" + def test_partial(self): self.check_partial( "\x00\xff\u0100\uffff", @@ -447,6 +463,8 @@ class UTF32BETest(CommonUTFTest): encoding = "utf-32-be" + ill_formed_sequence = b"\x00\x00\xdc\x80" + def test_partial(self): self.check_partial( "\x00\xff\u0100\uffff", @@ -569,6 +587,8 @@ class UTF16LETest(CommonUTFTest): encoding = "utf-16-le" + ill_formed_sequence = b"\x80\xdc" + def test_partial(self): self.check_partial( "\x00\xff\u0100\uffff", diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -5093,6 +5093,7 @@ const unsigned char *q, *e; int bo = 0; /* assume native ordering by default */ const char *errmsg = ""; + const char *encoding = "utf32le"; /* Offsets from q for retrieving bytes in the right order. */ #ifdef BYTEORDER_IS_LITTLE_ENDIAN int iorder[] = {0, 1, 2, 3}; @@ -5194,9 +5195,11 @@ q += 4; continue; utf32Error: + if (bo == 1) + encoding = "utf32be"; if (unicode_decode_call_errorhandler( errors, &errorHandler, - "utf32", errmsg, + encoding, errmsg, &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, &unicode, &outpos)) goto onError; @@ -5300,9 +5303,22 @@ else { Py_ssize_t newpos; Py_ssize_t repsize, k, morebytes; + const char* encoding; + switch (byteorder){ + case 0: + encoding = "utf-32"; + break; + case -1: + encoding = "utf-32-le"; + break; + case 1: + encoding = "utf-32-be"; + break; + } rep = unicode_encode_call_errorhandler( - errors, &errorHandler, "utf-32", "surrogates not allowed", + errors, &errorHandler, encoding, "surrogates not allowed", str, &exc, i-1, i, &newpos); + if (!rep) goto error; @@ -5437,6 +5453,7 @@ int bo = 0; /* assume native ordering by default */ int native_ordering = 0; const char *errmsg = ""; + const char *encoding = "utf16le";; /* Offsets from q for retrieving byte pairs in the right order. */ #ifdef BYTEORDER_IS_LITTLE_ENDIAN int ihi = 1, ilo = 0; @@ -5578,6 +5595,7 @@ errmsg = "unexpected end of data"; startinpos = (((const char *)q) - 2) - starts; endinpos = ((const char *)e) + 1 - starts; + goto utf16Error; } if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) { @@ -5600,13 +5618,19 @@ errmsg = "illegal encoding"; startinpos = (((const char *)q)-2)-starts; endinpos = startinpos+2; + /* Fall through to report the error */ utf16Error: + if (ilo) + encoding = "utf16be"; + /* e doesn't point to the end ('\0') but the following function expects + that. */ + e += 1; if (unicode_decode_call_errorhandler( errors, &errorHandler, - "utf16", errmsg, + encoding, errmsg, &starts, (const char **)&e, &startinpos, @@ -5616,6 +5640,8 @@ &unicode, &outpos)) goto onError; + /* e in this function always points to the char before the end ('\0') */ + e -= 1; } /* remaining byte at the end? (size should be even) */ if (e == q) { @@ -5753,8 +5779,20 @@ else { Py_ssize_t newpos; Py_ssize_t repsize, k, morebytes; + const char* encoding; + switch (byteorder){ + case 0: + encoding = "utf-16"; + break; + case -1: + encoding = "utf-16-le"; + break; + case 1: + encoding = "utf-16-be"; + break; + } rep = unicode_encode_call_errorhandler( - errors, &errorHandler, "utf-16", "surrogates not allowed", + errors, &errorHandler, encoding, "surrogates not allowed", str, &exc, i-1, i, &newpos); if (!rep) goto error; diff --git a/Python/codecs.c b/Python/codecs.c --- a/Python/codecs.c +++ b/Python/codecs.c @@ -731,6 +731,14 @@ } } +/* Endianness switches; defaults to little endian */ + +#ifdef WORDS_BIGENDIAN +# define BYTEORDER_IS_BIG_ENDIAN +#else +# define BYTEORDER_IS_LITTLE_ENDIAN +#endif + /* This handler is declared static until someone demonstrates a need to call it directly. */ static PyObject * @@ -738,24 +746,67 @@ { PyObject *restuple; PyObject *object; + PyObject *encode; + char *encoding; + int bytelength = 3; + int le = -1; Py_ssize_t i; Py_ssize_t start; Py_ssize_t end; PyObject *res; if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { - char *outp; + unsigned char *outp; if (PyUnicodeEncodeError_GetStart(exc, &start)) return NULL; if (PyUnicodeEncodeError_GetEnd(exc, &end)) return NULL; if (!(object = PyUnicodeEncodeError_GetObject(exc))) return NULL; - res = PyBytes_FromStringAndSize(NULL, 3*(end-start)); + if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) { + Py_DECREF(object); + return NULL; + } + if (!(encoding = PyUnicode_AsUTF8(encode))) { + Py_DECREF(object); + Py_DECREF(encode); + return NULL; + } + Py_DECREF(encode); + + if (strcmp(encoding, "utf-8") == 0){ + /*no need to check others*/ + } + else if (strcmp(encoding, "utf-16") == 0) { + bytelength = 2; +#ifdef BYTEORDER_IS_BIG_ENDIAN + le = 0; +#endif + } + else if (strcmp(encoding, "utf-16-le") == 0) + bytelength = 2; + else if (strcmp(encoding, "utf-16-be") == 0) { + bytelength = 2; + le = 0; + } + else if (strcmp(encoding, "utf-32") == 0) { + bytelength = 4; +#ifdef BYTEORDER_IS_BIG_ENDIAN + le = 0; +#endif + } + else if (strcmp(encoding, "utf-32-le") == 0) + bytelength = 4; + else if (strcmp(encoding, "utf-32-be") == 0) { + bytelength = 4; + le = 0; + } + res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start)); + if (!res) { Py_DECREF(object); return NULL; } - outp = PyBytes_AsString(res); + outp = (unsigned char*)PyBytes_AsString(res); for (i = start; i < end; i++) { /* object is guaranteed to be "ready" */ Py_UCS4 ch = PyUnicode_READ_CHAR(object, i); @@ -766,9 +817,37 @@ Py_DECREF(object); return NULL; } - *outp++ = (char)(0xe0 | (ch >> 12)); - *outp++ = (char)(0x80 | ((ch >> 6) & 0x3f)); - *outp++ = (char)(0x80 | (ch & 0x3f)); + switch (bytelength) { + case 3: + *outp++ = (unsigned char)(0xe0 | (ch >> 12)); + *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f)); + *outp++ = (unsigned char)(0x80 | (ch & 0x3f)); + break; + case 2: + if (le) { + *outp++ = (unsigned char) ch; + *outp++ = (unsigned char)(ch >> 8); + } + else { + *outp++ = (unsigned char)(ch >> 8); + *outp++ = (unsigned char) ch; + } + break; + case 4: + if (le) { + *outp++ = (unsigned char) ch; + *outp++ = (unsigned char)(ch >> 8); + *outp++ = (unsigned char)(ch >> 16); + *outp++ = (unsigned char)(ch >> 24); + } + else { + *outp++ = (unsigned char)(ch >> 24); + *outp++ = (unsigned char)(ch >> 16); + *outp++ = (unsigned char)(ch >> 8); + *outp++ = (unsigned char) ch; + } + break; + } } restuple = Py_BuildValue("(On)", res, end); Py_DECREF(res); @@ -780,24 +859,77 @@ Py_UCS4 ch = 0; if (PyUnicodeDecodeError_GetStart(exc, &start)) return NULL; + if (PyUnicodeDecodeError_GetEnd(exc, &end)) + return NULL; if (!(object = PyUnicodeDecodeError_GetObject(exc))) return NULL; if (!(p = (unsigned char*)PyBytes_AsString(object))) { Py_DECREF(object); return NULL; } + if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) { + Py_DECREF(object); + return NULL; + } + if (!(encoding = PyUnicode_AsUTF8(encode))) { + Py_DECREF(object); + Py_DECREF(encode); + return NULL; + } + Py_DECREF(encode); + + if (strcmp(encoding, "utf8") == 0){ + /*no need to check others*/ + } + else if (strcmp(encoding, "utf16le") == 0) + bytelength = 2; + else if (strcmp(encoding, "utf16be") == 0) { + bytelength = 2; + le = 0; + } + else if (strcmp(encoding, "utf32le") == 0) + bytelength = 4; + else if (strcmp(encoding, "utf32be") == 0) { + bytelength = 4; + le = 0; + } + /* Try decoding a single surrogate character. If there are more, let the codec call us again. */ p += start; - if ((p[0] & 0xf0) == 0xe0 || - (p[1] & 0xc0) == 0x80 || - (p[2] & 0xc0) == 0x80) { - /* it's a three-byte code */ - ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f); - if (ch < 0xd800 || ch > 0xdfff) - /* it's not a surrogate - fail */ + + switch (bytelength) { + case 3: + if ((p[0] & 0xf0) == 0xe0 || + (p[1] & 0xc0) == 0x80 || + (p[2] & 0xc0) == 0x80) { + /* it's a three-byte code */ + ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f); + if (ch < 0xd800 || ch > 0xdfff) + /* it's not a surrogate - fail */ + ch = 0; + } + break; + case 2: + if (end - start != 2) + break; + if (le) + ch = p[1] << 8 | p[0]; + else + ch = p[0] << 8 | p[1]; + break; + case 4: + if (end - start != 4) + break; + if (le) + ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0]; + else + ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]; + if (ch > 0x10ffff) ch = 0; + break; } + Py_DECREF(object); if (ch == 0) { PyErr_SetObject(PyExceptionInstance_Class(exc), exc); @@ -806,7 +938,7 @@ res = PyUnicode_FromOrdinal(ch); if (res == NULL) return NULL; - return Py_BuildValue("(Nn)", res, start+3); + return Py_BuildValue("(Nn)", res, start + bytelength); } else { wrong_exception_type(exc);