diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst --- a/Doc/library/codecs.rst +++ b/Doc/library/codecs.rst @@ -1135,6 +1135,11 @@ particular, the following variants typic | utf_8_sig | | all languages | +-----------------+--------------------------------+--------------------------------+ +.. versionchanged:: 3.3 + ``utf_16``, ``utf_16_be``, ``utf_16_le``, ``utf_32``, ``utf_32_be`` and + ``utf_32_le`` encoders no longer allow surrogate code points + (U+D800-U+DFFF) to be encoded. + .. XXX fix here, should be in above table +--------------------+---------+---------------------------+ diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -294,7 +294,18 @@ class ReadTest(unittest.TestCase, MixInC self.assertEqual(reader.readline(), s5) self.assertEqual(reader.readline(), "") -class UTF32Test(ReadTest): +class CommonUTF32Test(ReadTest): + def test_errors(self): + self.assertRaises(UnicodeDecodeError, + b"\xff".decode, self.encoding, "strict") + + def test_surrogates(self): + self.assertRaises(UnicodeEncodeError, + "\uD800".encode, self.encoding, "strict") + self.assertRaises(UnicodeEncodeError, + "\uDFFF".encode, self.encoding, "strict") + +class UTF32Test(CommonUTF32Test): encoding = "utf-32" spamle = (b'\xff\xfe\x00\x00' @@ -361,10 +372,6 @@ class UTF32Test(ReadTest): self.assertEqual(('', 1), codecs.utf_32_decode(b'\x01', 'ignore', True)) - def test_errors(self): - self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode, - b"\xff", "strict", True) - def test_decoder_state(self): self.check_state_handling_decode(self.encoding, "spamspam", self.spamle) @@ -381,7 +388,7 @@ class UTF32Test(ReadTest): self.assertEqual('\U00010000' * 1024, codecs.utf_32_decode(encoded_be)[0]) -class UTF32LETest(ReadTest): +class UTF32LETest(CommonUTF32Test): encoding = "utf-32-le" def test_partial(self): @@ -410,10 +417,6 @@ class UTF32LETest(ReadTest): def test_simple(self): 
self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00") - def test_errors(self): - self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode, - b"\xff", "strict", True) - def test_issue8941(self): # Issue #8941: insufficient result allocation when decoding into # surrogate pairs on UCS-2 builds. @@ -421,7 +424,7 @@ class UTF32LETest(ReadTest): self.assertEqual('\U00010000' * 1024, codecs.utf_32_le_decode(encoded)[0]) -class UTF32BETest(ReadTest): +class UTF32BETest(CommonUTF32Test): encoding = "utf-32-be" def test_partial(self): @@ -450,10 +453,6 @@ class UTF32BETest(ReadTest): def test_simple(self): self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03") - def test_errors(self): - self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode, - b"\xff", "strict", True) - def test_issue8941(self): # Issue #8941: insufficient result allocation when decoding into # surrogate pairs on UCS-2 builds. @@ -461,8 +460,18 @@ class UTF32BETest(ReadTest): self.assertEqual('\U00010000' * 1024, codecs.utf_32_be_decode(encoded)[0]) +class CommonUTF16Test(ReadTest): + def test_errors(self): + self.assertRaises(UnicodeDecodeError, + b"\xff".decode, self.encoding, "strict") -class UTF16Test(ReadTest): + def test_surrogates(self): + self.assertRaises(UnicodeEncodeError, + "\uD800".encode, self.encoding, "strict") + self.assertRaises(UnicodeEncodeError, + "\uDFFF".encode, self.encoding, "strict") + +class UTF16Test(CommonUTF16Test): encoding = "utf-16" spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00' @@ -515,10 +524,6 @@ class UTF16Test(ReadTest): self.assertEqual(('', 1), codecs.utf_16_decode(b'\x01', 'ignore', True)) - def test_errors(self): - self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, - b"\xff", "strict", True) - def test_decoder_state(self): self.check_state_handling_decode(self.encoding, "spamspam", self.spamle) @@ -538,7 +543,7 @@ class UTF16Test(ReadTest): with codecs.open(support.TESTFN, 'U', 
encoding=self.encoding) as reader: self.assertEqual(reader.read(), s1) -class UTF16LETest(ReadTest): +class UTF16LETest(CommonUTF16Test): encoding = "utf-16-le" def test_partial(self): @@ -556,17 +561,13 @@ class UTF16LETest(ReadTest): ] ) - def test_errors(self): - self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode, - b"\xff", "strict", True) - def test_nonbmp(self): self.assertEqual("\U00010203".encode(self.encoding), b'\x00\xd8\x03\xde') self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding), "\U00010203") -class UTF16BETest(ReadTest): +class UTF16BETest(CommonUTF16Test): encoding = "utf-16-be" def test_partial(self): @@ -584,10 +585,6 @@ class UTF16BETest(ReadTest): ] ) - def test_errors(self): - self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode, - b"\xff", "strict", True) - def test_nonbmp(self): self.assertEqual("\U00010203".encode(self.encoding), b'\xd8\x00\xde\x03') diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -5115,6 +5115,7 @@ _PyUnicode_EncodeUTF32(PyObject *str, #else int iorder[] = {3, 2, 1, 0}; #endif + const char *encoding; #define STORECHAR(CH) \ do { \ @@ -5165,7 +5166,28 @@ _PyUnicode_EncodeUTF32(PyObject *str, } for (i = 0; i < len; i++) - STORECHAR(PyUnicode_READ(kind, data, i)); + { + Py_UCS4 ch = PyUnicode_READ(kind, data, i); + if (!Py_UNICODE_IS_SURROGATE(ch)) { + STORECHAR(ch); + } + else { + PyObject *exc = NULL; + + if (byteorder == -1) + encoding = "utf-32-le"; + else if (byteorder == 1) + encoding = "utf-32-be"; + else + encoding = "utf-32"; + raise_encode_exception(&exc, encoding, + str, i, i+1, + "surrogates not allowed"); + Py_XDECREF(exc); + Py_DECREF(v); + return NULL; + } + } done: return v; @@ -5483,6 +5505,7 @@ _PyUnicode_EncodeUTF16(PyObject *str, #else int ihi = 0, ilo = 1; #endif + const char *encoding; #define STORECHAR(CH) \ do { \ @@ -5536,14 +5559,32 @@ _PyUnicode_EncodeUTF16(PyObject *str, for (i = 0; i < len; i++) 
{ Py_UCS4 ch = PyUnicode_READ(kind, data, i); - Py_UCS4 ch2 = 0; if (ch >= 0x10000) { + Py_UCS4 ch2; ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); ch = 0xD800 | ((ch-0x10000) >> 10); - } - STORECHAR(ch); - if (ch2) + STORECHAR(ch); STORECHAR(ch2); + } + else if (!Py_UNICODE_IS_SURROGATE(ch)) { + STORECHAR(ch); + } + else { + PyObject *exc = NULL; + + if (byteorder == -1) + encoding = "utf-16-le"; + else if (byteorder == 1) + encoding = "utf-16-be"; + else + encoding = "utf-16"; + raise_encode_exception(&exc, encoding, + str, i, i+1, + "surrogates not allowed"); + Py_XDECREF(exc); + Py_DECREF(v); + return NULL; + } } done: