diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst --- a/Doc/library/codecs.rst +++ b/Doc/library/codecs.rst @@ -1135,6 +1135,10 @@ | utf_8_sig | | all languages | +-----------------+--------------------------------+--------------------------------+ +.. versionchanged:: 3.3 + ``utf_16``, ``utf_16_be``, ``utf_16_le``, ``utf_32``, ``utf_32_be`` and ``utf_32_le`` encoders no longer allow surrogate code points (U+D800-U+DFFF) to be encoded. ``utf_32``, ``utf_32_be`` and ``utf_32_le`` decoders no longer decode byte sequences that correspond to surrogate code points. + + .. XXX fix here, should be in above table +--------------------+---------+---------------------------+ diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -293,8 +293,26 @@ self.assertEqual(reader.readline(), s4) self.assertEqual(reader.readline(), s5) self.assertEqual(reader.readline(), "") +class CommonUTFTest(ReadTest): + def test_lone_surrogates(self): + self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding) + self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"), + "[\\udc80]".encode(self.encoding)) + self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"), + "[&#56448;]".encode(self.encoding)) + self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"), + "[]".encode(self.encoding)) + self.assertEqual("[\uDC80]".encode(self.encoding, "replace"), + "[?]".encode(self.encoding)) + if (hasattr(self,"ill_formed_sequence")): + test_string = "A" + bom = "".encode(self.encoding) + well_formed_sequence = test_string.encode(self.encoding)[len(bom):] + test_sequence = bom + self.ill_formed_sequence + well_formed_sequence + self.assertRaises(UnicodeDecodeError, test_sequence.decode, + self.encoding) -class UTF32Test(ReadTest): +class UTF32Test(CommonUTFTest): encoding = "utf-32" spamle = (b'\xff\xfe\x00\x00' @@ -304,6 +322,11 @@ 
b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m' b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m') + if sys.byteorder == 'little': + ill_formed_sequence = b"\x80\xdc\x00\x00" + else: + ill_formed_sequence = b"\x00\x00\xdc\x80" + def test_only_one_bom(self): _,_,reader,writer = codecs.lookup(self.encoding) # encode some stream @@ -381,7 +404,7 @@ self.assertEqual('\U00010000' * 1024, codecs.utf_32_decode(encoded_be)[0]) -class UTF32LETest(ReadTest): +class UTF32LETest(CommonUTFTest): encoding = "utf-32-le" def test_partial(self): @@ -421,7 +444,7 @@ self.assertEqual('\U00010000' * 1024, codecs.utf_32_le_decode(encoded)[0]) -class UTF32BETest(ReadTest): +class UTF32BETest(CommonUTFTest): encoding = "utf-32-be" def test_partial(self): @@ -462,12 +485,17 @@ codecs.utf_32_be_decode(encoded)[0]) -class UTF16Test(ReadTest): +class UTF16Test(CommonUTFTest): encoding = "utf-16" spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00' spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m' + if sys.byteorder == 'little': + ill_formed_sequence = b"\x80\xdc" + else: + ill_formed_sequence = b"\xdc\x80" + def test_only_one_bom(self): _,_,reader,writer = codecs.lookup(self.encoding) # encode some stream @@ -538,7 +566,7 @@ with codecs.open(support.TESTFN, 'U', encoding=self.encoding) as reader: self.assertEqual(reader.read(), s1) -class UTF16LETest(ReadTest): +class UTF16LETest(CommonUTFTest): encoding = "utf-16-le" def test_partial(self): @@ -566,9 +594,11 @@ self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding), "\U00010203") -class UTF16BETest(ReadTest): +class UTF16BETest(CommonUTFTest): encoding = "utf-16-be" + ill_formed_sequence = b"\xdc\x80" + def test_partial(self): self.check_partial( "\x00\xff\u0100\uffff", @@ -594,9 +624,11 @@ self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding), "\U00010203") -class UTF8Test(ReadTest): +class UTF8Test(CommonUTFTest): encoding = "utf-8" + ill_formed_sequence = b"\xed\xb2\x80" + def test_partial(self): 
self.check_partial( "\x00\xff\u07ff\u0800\uffff", @@ -621,18 +653,11 @@ u, u.encode(self.encoding)) def test_lone_surrogates(self): - self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8") - self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8") - self.assertEqual("[\uDC80]".encode("utf-8", "backslashreplace"), - b'[\\udc80]') - self.assertEqual("[\uDC80]".encode("utf-8", "xmlcharrefreplace"), - b'[&#56448;]') - self.assertEqual("[\uDC80]".encode("utf-8", "surrogateescape"), + super().test_lone_surrogates() + # It is unclear whether this check makes sense for + # UTF-16 and UTF-32, so it is done for UTF-8 only + self.assertEqual("[\uDC80]".encode('utf-8', "surrogateescape"), b'[\x80]') - self.assertEqual("[\uDC80]".encode("utf-8", "ignore"), - b'[]') - self.assertEqual("[\uDC80]".encode("utf-8", "replace"), - b'[?]') def test_surrogatepass_handler(self): self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"), diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -5183,6 +5183,11 @@ startinpos = ((const char *)q)-starts; endinpos = startinpos+4; goto utf32Error; + } else if (Py_UNICODE_IS_SURROGATE(ch)) { + errmsg = "codepoint in surrogate code point range(0xd800 x <= 0xdfff)"; + startinpos = ((const char *)q)-starts; + endinpos = startinpos+4; + goto utf32Error; } if (unicode_putchar(&unicode, &outpos, ch) < 0) goto onError; @@ -5235,14 +5240,17 @@ #else int iorder[] = {3, 2, 1, 0}; #endif - -#define STORECHAR(CH) \ - do { \ - p[iorder[3]] = ((CH) >> 24) & 0xff; \ - p[iorder[2]] = ((CH) >> 16) & 0xff; \ - p[iorder[1]] = ((CH) >> 8) & 0xff; \ - p[iorder[0]] = (CH) & 0xff; \ - p += 4; \ + PyObject *errorHandler = NULL; + PyObject *exc = NULL; + PyObject *rep = NULL; + +#define STORECHAR(CH) \ + do { \ + p[iorder[3]] = (unsigned char)((CH) >> 24); \ + p[iorder[2]] = (unsigned char)((CH) >> 16); \ + p[iorder[1]] = (unsigned char)((CH) >> 8); \ + p[iorder[0]] = (unsigned char)(CH); \ + p += 4; \ } while(0) if 
(!PyUnicode_Check(str)) { @@ -5267,7 +5275,7 @@ if (byteorder == 0) STORECHAR(0xFEFF); if (len == 0) - goto done; + return v; if (byteorder == -1) { /* force LE */ @@ -5284,11 +5292,83 @@ iorder[3] = 0; } - for (i = 0; i < len; i++) - STORECHAR(PyUnicode_READ(kind, data, i)); - - done: + for (i = 0; i < len;){ + Py_UCS4 ch = PyUnicode_READ(kind, data, i++); + assert(ch <= MAX_UNICODE); + if (!Py_UNICODE_IS_SURROGATE(ch)) + STORECHAR(ch); + else { + Py_ssize_t newpos; + Py_ssize_t repsize, k, morebytes; + rep = unicode_encode_call_errorhandler( + errors, &errorHandler, "utf-32", "surrogates not allowed", + str, &exc, i-1, i, &newpos); + if (!rep) + goto error; + + if (PyBytes_Check(rep)) + morebytes = repsize = PyBytes_GET_SIZE(rep); + else { + repsize = PyUnicode_GET_LENGTH(rep); + if (repsize > PY_SSIZE_T_MAX / 4) { + /* integer overflow */ + PyErr_NoMemory(); + goto error; + } + morebytes = 4 * repsize; + } + + /* four bytes are reserved for each surrogate */ + if (morebytes > 4) { + nsize = p - (unsigned char*) PyBytes_AS_STRING(v); + + if (bytesize > PY_SSIZE_T_MAX - morebytes + 4) { + /* integer overflow */ + PyErr_NoMemory(); + goto error; + } + bytesize += morebytes - 4; + if (_PyBytes_Resize(&v, bytesize) < 0) + goto error; + p = (unsigned char*) PyBytes_AS_STRING(v) + nsize; + } + + if (PyBytes_Check(rep)) { + char *prep = PyBytes_AS_STRING(rep); + for(k = repsize; k > 0; k--) + *p++ = *prep++; + } else /* rep is unicode */ { + enum PyUnicode_Kind repkind; + void *repdata; + + if (PyUnicode_READY(rep) < 0) + goto error; + repkind = PyUnicode_KIND(rep); + repdata = PyUnicode_DATA(rep); + + for(k=0; k> 8) & 0xff; \ - p[ilo] = (CH) & 0xff; \ + p[ihi] = (unsigned char)((CH) >> 8); \ + p[ilo] = (unsigned char)(CH); \ p += 2; \ } while(0) @@ -5641,7 +5724,7 @@ if (byteorder == 0) STORECHAR(0xFEFF); if (len == 0) - goto done; + return v; if (byteorder == -1) { /* force LE */ @@ -5654,20 +5737,92 @@ ilo = 1; } - for (i = 0; i < len; i++) { - Py_UCS4 ch = 
PyUnicode_READ(kind, data, i); - Py_UCS4 ch2 = 0; + for (i = 0; i < len;) { + Py_UCS4 ch = PyUnicode_READ(kind, data, i++); + assert(ch <= MAX_UNICODE); if (ch >= 0x10000) { + Py_UCS4 ch2 = 0; ch2 = Py_UNICODE_LOW_SURROGATE(ch); ch = Py_UNICODE_HIGH_SURROGATE(ch); - } - STORECHAR(ch); - if (ch2) + STORECHAR(ch); STORECHAR(ch2); - } - - done: + } + else if (!Py_UNICODE_IS_SURROGATE(ch)) { + STORECHAR(ch); + } + else { + Py_ssize_t newpos; + Py_ssize_t repsize, k, morebytes; + rep = unicode_encode_call_errorhandler( + errors, &errorHandler, "utf-16", "surrogates not allowed", + str, &exc, i-1, i, &newpos); + if (!rep) + goto error; + + if (PyBytes_Check(rep)) + morebytes = repsize = PyBytes_GET_SIZE(rep); + else { + repsize = PyUnicode_GET_LENGTH(rep); + if (repsize > PY_SSIZE_T_MAX / 2) { + /* integer overflow */ + PyErr_NoMemory(); + goto error; + } + morebytes = 2 * repsize; + } + + /* two bytes are reserved for each surrogate */ + if (morebytes > 2) { + nsize = p - (unsigned char*) PyBytes_AS_STRING(v); + + if (bytesize > PY_SSIZE_T_MAX - morebytes + 2) { + /* integer overflow */ + PyErr_NoMemory(); + goto error; + } + bytesize += morebytes - 2; + if (_PyBytes_Resize(&v, bytesize) < 0) + goto error; + p = (unsigned char*) PyBytes_AS_STRING(v) + nsize; + } + + if (PyBytes_Check(rep)) { + char *prep = PyBytes_AS_STRING(rep); + for(k = repsize; k > 0; k--) + *p++ = *prep++; + } else /* rep is unicode */ { + enum PyUnicode_Kind repkind; + void *repdata; + + if (PyUnicode_READY(rep) < 0) + goto error; + repkind = PyUnicode_KIND(rep); + repdata = PyUnicode_DATA(rep); + + /* The following code strips out bits over 0xFFFF */ + for(k=0; k