diff -r 10efb1797e7b Lib/test/test_codecs.py --- a/Lib/test/test_codecs.py Thu Oct 01 13:16:43 2015 +0200 +++ b/Lib/test/test_codecs.py Thu Oct 01 14:11:15 2015 +0200 @@ -361,6 +361,12 @@ class ReadTest(MixInCheckStateHandling): self.assertEqual("[\uDC80]".encode(self.encoding, "replace"), "[?]".encode(self.encoding)) + # sequential surrogate characters + self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "ignore"), + "[]".encode(self.encoding)) + self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "replace"), + "[??]".encode(self.encoding)) + bom = "".encode(self.encoding) for before, after in [("\U00010fff", "A"), ("[", "]"), ("A", "\U00010fff")]: @@ -753,6 +759,7 @@ class UTF8Test(ReadTest, unittest.TestCa encoding = "utf-8" ill_formed_sequence = b"\xed\xb2\x80" ill_formed_sequence_replace = "\ufffd" * 3 + BOM = b'' def test_partial(self): self.check_partial( @@ -785,23 +792,32 @@ class UTF8Test(ReadTest, unittest.TestCa super().test_lone_surrogates() # not sure if this is making sense for # UTF-16 and UTF-32 - self.assertEqual("[\uDC80]".encode('utf-8', "surrogateescape"), - b'[\x80]') + self.assertEqual("[\uDC80]".encode(self.encoding, "surrogateescape"), + self.BOM + b'[\x80]') + + with self.assertRaises(UnicodeEncodeError) as cm: + "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape") + exc = cm.exception + self.assertEqual(exc.object[exc.start:exc.end], '\uD800\uDFFF') def test_surrogatepass_handler(self): - self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"), - b"abc\xed\xa0\x80def") - self.assertEqual(b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass"), + self.assertEqual("abc\ud800def".encode(self.encoding, "surrogatepass"), + self.BOM + b"abc\xed\xa0\x80def") + self.assertEqual("\U00010fff\uD800".encode(self.encoding, "surrogatepass"), + self.BOM + b"\xf0\x90\xbf\xbf\xed\xa0\x80") + self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "surrogatepass"), + self.BOM + b'[\xed\xa0\x80\xed\xb2\x80]') + + self.assertEqual(b"abc\xed\xa0\x80def".decode(self.encoding, "surrogatepass"), "abc\ud800def") - self.assertEqual("\U00010fff\uD800".encode("utf-8", "surrogatepass"), - b"\xf0\x90\xbf\xbf\xed\xa0\x80") - self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("utf-8", "surrogatepass"), + self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode(self.encoding, "surrogatepass"), "\U00010fff\uD800") + self.assertTrue(codecs.lookup_error("surrogatepass")) with self.assertRaises(UnicodeDecodeError): - b"abc\xed\xa0".decode("utf-8", "surrogatepass") + b"abc\xed\xa0".decode(self.encoding, "surrogatepass") with self.assertRaises(UnicodeDecodeError): - b"abc\xed\xa0z".decode("utf-8", "surrogatepass") + b"abc\xed\xa0z".decode(self.encoding, "surrogatepass") @unittest.skipUnless(sys.platform == 'win32', @@ -1008,6 +1024,7 @@ class ReadBufferTest(unittest.TestCase): class UTF8SigTest(UTF8Test, unittest.TestCase): encoding = "utf-8-sig" + BOM = codecs.BOM_UTF8 def test_partial(self): self.check_partial( diff -r 10efb1797e7b Objects/stringlib/codecs.h --- a/Objects/stringlib/codecs.h Thu Oct 01 13:16:43 2015 +0200 +++ b/Objects/stringlib/codecs.h Thu Oct 01 14:11:15 2015 +0200 @@ -268,9 +268,10 @@ STRINGLIB(utf8_encoder)(PyObject *unicod Py_ssize_t nallocated; /* number of result bytes allocated */ Py_ssize_t nneeded; /* number of result bytes needed */ #if STRINGLIB_SIZEOF_CHAR > 1 - PyObject *errorHandler = NULL; + PyObject *error_handler_obj = NULL; PyObject *exc = NULL; PyObject *rep = NULL; + _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; #endif #if STRINGLIB_SIZEOF_CHAR == 1 const Py_ssize_t max_char_size = 2; @@ -326,72 +327,116 @@ STRINGLIB(utf8_encoder)(PyObject *unicod } #if STRINGLIB_SIZEOF_CHAR > 1 else if (Py_UNICODE_IS_SURROGATE(ch)) { - Py_ssize_t newpos; - Py_ssize_t repsize, k, startpos; + Py_ssize_t startpos, endpos, newpos; + Py_ssize_t repsize, k; + if (error_handler == _Py_ERROR_UNKNOWN) + error_handler = get_error_handler(errors); + startpos = i-1; - rep = unicode_encode_call_errorhandler( - errors, &errorHandler, "utf-8", "surrogates not allowed", - unicode, &exc, startpos, startpos+1, &newpos); - if (!rep) - goto error; + endpos = startpos+1; - if (PyBytes_Check(rep)) - repsize = PyBytes_GET_SIZE(rep); - else - repsize = PyUnicode_GET_LENGTH(rep); + while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos])) + endpos++; - if (repsize > max_char_size) { - Py_ssize_t offset; + switch (error_handler) + { + case _Py_ERROR_REPLACE: + memset(p, '?', endpos - startpos); + p += (endpos - startpos); + /* fall through the ignore handler */ + case _Py_ERROR_IGNORE: + i += (endpos - startpos - 1); + break; - if (result == NULL) - offset = p - stackbuf; + + case _Py_ERROR_SURROGATEPASS: + for (k=startpos; k> 12)); + *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); + *p++ = (char)(0x80 | (ch & 0x3f)); + } + i += (endpos - startpos - 1); + break; + + case _Py_ERROR_SURROGATEESCAPE: + for (k=startpos; k= endpos) { + i += (endpos - startpos - 1); + break; + } + startpos = k; + assert(startpos < endpos); + /* fall through the default handler */ + + default: + rep = unicode_encode_call_errorhandler( + errors, &error_handler_obj, "utf-8", "surrogates not allowed", + unicode, &exc, startpos, endpos, &newpos); + if (!rep) + goto error; + + if (PyBytes_Check(rep)) + repsize = PyBytes_GET_SIZE(rep); else - offset = p - PyBytes_AS_STRING(result); + repsize = PyUnicode_GET_LENGTH(rep); - if (nallocated > PY_SSIZE_T_MAX - repsize + max_char_size) { - /* integer overflow */ - PyErr_NoMemory(); - goto error; + if (repsize > max_char_size) { + Py_ssize_t offset; + + if (result == NULL) + offset = p - stackbuf; + else + offset = p - PyBytes_AS_STRING(result); + + if (nallocated > PY_SSIZE_T_MAX - repsize + max_char_size) { + /* integer overflow */ + PyErr_NoMemory(); + goto error; + } + nallocated += repsize - max_char_size; + if (result != NULL) { + if (_PyBytes_Resize(&result, nallocated) < 0) + goto error; + } else { + result = PyBytes_FromStringAndSize(NULL, nallocated); + if (result == NULL) + goto error; + Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset); + } + p = PyBytes_AS_STRING(result) + offset; } - nallocated += repsize - max_char_size; - if (result != NULL) { - if (_PyBytes_Resize(&result, nallocated) < 0) + + if (PyBytes_Check(rep)) { + memcpy(p, PyBytes_AS_STRING(rep), repsize); + p += repsize; + } + else { + /* rep is unicode */ + if (PyUnicode_READY(rep) < 0) goto error; - } else { - result = PyBytes_FromStringAndSize(NULL, nallocated); - if (result == NULL) - goto error; - Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset); - } - p = PyBytes_AS_STRING(result) + offset; - } - if (PyBytes_Check(rep)) { - char *prep = PyBytes_AS_STRING(rep); - for(k = repsize; k > 0; k--) - *p++ = *prep++; - } else /* rep is unicode */ { - enum PyUnicode_Kind repkind; - void *repdata; - - if (PyUnicode_READY(rep) < 0) - goto error; - repkind = PyUnicode_KIND(rep); - repdata = PyUnicode_DATA(rep); - - for(k=0; k 2 @@ -430,7 +475,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicod } #if STRINGLIB_SIZEOF_CHAR > 1 - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); #endif return result; @@ -438,7 +483,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicod #if STRINGLIB_SIZEOF_CHAR > 1 error: Py_XDECREF(rep); - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); Py_XDECREF(result); return NULL; diff -r 10efb1797e7b Objects/unicodeobject.c --- a/Objects/unicodeobject.c Thu Oct 01 13:16:43 2015 +0200 +++ b/Objects/unicodeobject.c Thu Oct 01 14:11:15 2015 +0200 @@ -297,6 +297,7 @@ typedef enum { _Py_ERROR_UNKNOWN=0, _Py_ERROR_STRICT, _Py_ERROR_SURROGATEESCAPE, + _Py_ERROR_SURROGATEPASS, _Py_ERROR_REPLACE, _Py_ERROR_IGNORE, _Py_ERROR_XMLCHARREFREPLACE, @@ -312,6 +313,8 @@ get_error_handler(const char *errors) return _Py_ERROR_STRICT; if (strcmp(errors, "surrogateescape") == 0) return _Py_ERROR_SURROGATEESCAPE; + if (strcmp(errors, "surrogatepass") == 0) + return _Py_ERROR_SURROGATEPASS; if (strcmp(errors, "ignore") == 0) return _Py_ERROR_IGNORE; if (strcmp(errors, "replace") == 0) @@ -6479,8 +6482,8 @@ unicode_encode_ucs1(PyObject *unicode, goto onError; case _Py_ERROR_REPLACE: - while (collstart++ < collend) - *str++ = '?'; + memset(str, '?', collend - collstart); + str += (collend - collstart); /* fall through ignore error handler */ case _Py_ERROR_IGNORE: pos = collend; diff -r 10efb1797e7b Python/importlib_external.h --- a/Python/importlib_external.h Thu Oct 01 13:16:43 2015 +0200 +++ b/Python/importlib_external.h Thu Oct 01 14:11:15 2015 +0200 @@ -613,7 +613,7 @@ const unsigned char _Py_M__importlib_ext 95,102,105,110,100,95,109,111,100,117,108,101,95,115,104,105, 109,135,1,0,0,115,10,0,0,0,0,10,21,1,24,1, 6,1,29,1,114,123,0,0,0,99,4,0,0,0,0,0, - 0,0,11,0,0,0,19,0,0,0,67,0,0,0,115,240, + 0,0,11,0,0,0,19,0,0,0,67,0,0,0,115,252, 1,0,0,105,0,0,125,4,0,124,2,0,100,1,0,107, 9,0,114,31,0,124,2,0,124,4,0,100,2,0,60,110, 6,0,100,3,0,125,2,0,124,3,0,100,1,0,107,9, @@ -621,59 +621,60 @@ const unsigned char _Py_M__importlib_ext 0,100,1,0,100,5,0,133,2,0,25,125,5,0,124,0, 0,100,5,0,100,6,0,133,2,0,25,125,6,0,124,0, 0,100,6,0,100,7,0,133,2,0,25,125,7,0,124,5, - 0,116,0,0,107,3,0,114,168,0,100,8,0,106,1,0, + 0,116,0,0,107,3,0,114,171,0,100,8,0,106,1,0, 124,2,0,124,5,0,131,2,0,125,8,0,116,2,0,106, - 3,0,124,8,0,131,1,0,1,116,4,0,124,8,0,124, - 4,0,141,1,0,130,1,0,110,119,0,116,5,0,124,6, - 0,131,1,0,100,5,0,107,3,0,114,229,0,100,9,0, - 106,1,0,124,2,0,131,1,0,125,8,0,116,2,0,106, - 3,0,124,8,0,131,1,0,1,116,6,0,124,8,0,131, - 1,0,130,1,0,110,58,0,116,5,0,124,7,0,131,1, - 0,100,5,0,107,3,0,114,31,1,100,10,0,106,1,0, - 124,2,0,131,1,0,125,8,0,116,2,0,106,3,0,124, - 8,0,131,1,0,1,116,6,0,124,8,0,131,1,0,130, - 1,0,124,1,0,100,1,0,107,9,0,114,226,1,121,20, - 0,116,7,0,124,1,0,100,11,0,25,131,1,0,125,9, - 0,87,110,18,0,4,116,8,0,107,10,0,114,83,1,1, - 1,1,89,110,62,0,88,116,9,0,124,6,0,131,1,0, - 124,9,0,107,3,0,114,145,1,100,12,0,106,1,0,124, - 2,0,131,1,0,125,8,0,116,2,0,106,3,0,124,8, - 0,131,1,0,1,116,4,0,124,8,0,124,4,0,141,1, - 0,130,1,0,121,18,0,124,1,0,100,13,0,25,100,14, - 0,64,125,10,0,87,110,18,0,4,116,8,0,107,10,0, - 114,183,1,1,1,1,89,110,43,0,88,116,9,0,124,7, - 0,131,1,0,124,10,0,107,3,0,114,226,1,116,4,0, - 100,12,0,106,1,0,124,2,0,131,1,0,124,4,0,141, - 1,0,130,1,0,124,0,0,100,7,0,100,1,0,133,2, - 0,25,83,41,15,97,122,1,0,0,86,97,108,105,100,97, - 116,101,32,116,104,101,32,104,101,97,100,101,114,32,111,102, - 32,116,104,101,32,112,97,115,115,101,100,45,105,110,32,98, - 121,116,101,99,111,100,101,32,97,103,97,105,110,115,116,32, - 115,111,117,114,99,101,95,115,116,97,116,115,32,40,105,102, - 10,32,32,32,32,103,105,118,101,110,41,32,97,110,100,32, - 114,101,116,117,114,110,105,110,103,32,116,104,101,32,98,121, - 116,101,99,111,100,101,32,116,104,97,116,32,99,97,110,32, - 98,101,32,99,111,109,112,105,108,101,100,32,98,121,32,99, - 111,109,112,105,108,101,40,41,46,10,10,32,32,32,32,65, - 108,108,32,111,116,104,101,114,32,97,114,103,117,109,101,110, - 116,115,32,97,114,101,32,117,115,101,100,32,116,111,32,101, - 110,104,97,110,99,101,32,101,114,114,111,114,32,114,101,112, - 111,114,116,105,110,103,46,10,10,32,32,32,32,73,109,112, - 111,114,116,69,114,114,111,114,32,105,115,32,114,97,105,115, - 101,100,32,119,104,101,110,32,116,104,101,32,109,97,103,105, - 99,32,110,117,109,98,101,114,32,105,115,32,105,110,99,111, - 114,114,101,99,116,32,111,114,32,116,104,101,32,98,121,116, - 101,99,111,100,101,32,105,115,10,32,32,32,32,102,111,117, - 110,100,32,116,111,32,98,101,32,115,116,97,108,101,46,32, - 69,79,70,69,114,114,111,114,32,105,115,32,114,97,105,115, - 101,100,32,119,104,101,110,32,116,104,101,32,100,97,116,97, - 32,105,115,32,102,111,117,110,100,32,116,111,32,98,101,10, - 32,32,32,32,116,114,117,110,99,97,116,101,100,46,10,10, - 32,32,32,32,78,114,98,0,0,0,122,10,60,98,121,116, - 101,99,111,100,101,62,114,35,0,0,0,114,12,0,0,0, - 233,8,0,0,0,233,12,0,0,0,122,30,98,97,100,32, - 109,97,103,105,99,32,110,117,109,98,101,114,32,105,110,32, - 123,33,114,125,58,32,123,33,114,125,122,43,114,101,97,99, + 3,0,100,9,0,124,8,0,131,2,0,1,116,4,0,124, + 8,0,124,4,0,141,1,0,130,1,0,110,125,0,116,5, + 0,124,6,0,131,1,0,100,5,0,107,3,0,114,235,0, + 100,10,0,106,1,0,124,2,0,131,1,0,125,8,0,116, + 2,0,106,3,0,100,9,0,124,8,0,131,2,0,1,116, + 6,0,124,8,0,131,1,0,130,1,0,110,61,0,116,5, + 0,124,7,0,131,1,0,100,5,0,107,3,0,114,40,1, + 100,11,0,106,1,0,124,2,0,131,1,0,125,8,0,116, + 2,0,106,3,0,100,9,0,124,8,0,131,2,0,1,116, + 6,0,124,8,0,131,1,0,130,1,0,124,1,0,100,1, + 0,107,9,0,114,238,1,121,20,0,116,7,0,124,1,0, + 100,12,0,25,131,1,0,125,9,0,87,110,18,0,4,116, + 8,0,107,10,0,114,92,1,1,1,1,89,110,65,0,88, + 116,9,0,124,6,0,131,1,0,124,9,0,107,3,0,114, + 157,1,100,13,0,106,1,0,124,2,0,131,1,0,125,8, + 0,116,2,0,106,3,0,100,9,0,124,8,0,131,2,0, + 1,116,4,0,124,8,0,124,4,0,141,1,0,130,1,0, + 121,18,0,124,1,0,100,14,0,25,100,15,0,64,125,10, + 0,87,110,18,0,4,116,8,0,107,10,0,114,195,1,1, + 1,1,89,110,43,0,88,116,9,0,124,7,0,131,1,0, + 124,10,0,107,3,0,114,238,1,116,4,0,100,13,0,106, + 1,0,124,2,0,131,1,0,124,4,0,141,1,0,130,1, + 0,124,0,0,100,7,0,100,1,0,133,2,0,25,83,41, + 16,97,122,1,0,0,86,97,108,105,100,97,116,101,32,116, + 104,101,32,104,101,97,100,101,114,32,111,102,32,116,104,101, + 32,112,97,115,115,101,100,45,105,110,32,98,121,116,101,99, + 111,100,101,32,97,103,97,105,110,115,116,32,115,111,117,114, + 99,101,95,115,116,97,116,115,32,40,105,102,10,32,32,32, + 32,103,105,118,101,110,41,32,97,110,100,32,114,101,116,117, + 114,110,105,110,103,32,116,104,101,32,98,121,116,101,99,111, + 100,101,32,116,104,97,116,32,99,97,110,32,98,101,32,99, + 111,109,112,105,108,101,100,32,98,121,32,99,111,109,112,105, + 108,101,40,41,46,10,10,32,32,32,32,65,108,108,32,111, + 116,104,101,114,32,97,114,103,117,109,101,110,116,115,32,97, + 114,101,32,117,115,101,100,32,116,111,32,101,110,104,97,110, + 99,101,32,101,114,114,111,114,32,114,101,112,111,114,116,105, + 110,103,46,10,10,32,32,32,32,73,109,112,111,114,116,69, + 114,114,111,114,32,105,115,32,114,97,105,115,101,100,32,119, + 104,101,110,32,116,104,101,32,109,97,103,105,99,32,110,117, + 109,98,101,114,32,105,115,32,105,110,99,111,114,114,101,99, + 116,32,111,114,32,116,104,101,32,98,121,116,101,99,111,100, + 101,32,105,115,10,32,32,32,32,102,111,117,110,100,32,116, + 111,32,98,101,32,115,116,97,108,101,46,32,69,79,70,69, + 114,114,111,114,32,105,115,32,114,97,105,115,101,100,32,119, + 104,101,110,32,116,104,101,32,100,97,116,97,32,105,115,32, + 102,111,117,110,100,32,116,111,32,98,101,10,32,32,32,32, + 116,114,117,110,99,97,116,101,100,46,10,10,32,32,32,32, + 78,114,98,0,0,0,122,10,60,98,121,116,101,99,111,100, + 101,62,114,35,0,0,0,114,12,0,0,0,233,8,0,0, + 0,233,12,0,0,0,122,30,98,97,100,32,109,97,103,105, + 99,32,110,117,109,98,101,114,32,105,110,32,123,33,114,125, + 58,32,123,33,114,125,122,2,123,125,122,43,114,101,97,99, 104,101,100,32,69,79,70,32,119,104,105,108,101,32,114,101, 97,100,105,110,103,32,116,105,109,101,115,116,97,109,112,32, 105,110,32,123,33,114,125,122,48,114,101,97,99,104,101,100, @@ -699,9 +700,9 @@ const unsigned char _Py_M__importlib_ext 97,108,105,100,97,116,101,95,98,121,116,101,99,111,100,101, 95,104,101,97,100,101,114,152,1,0,0,115,76,0,0,0, 0,11,6,1,12,1,13,3,6,1,12,1,10,1,16,1, - 16,1,16,1,12,1,18,1,13,1,18,1,18,1,15,1, - 13,1,15,1,18,1,15,1,13,1,12,1,12,1,3,1, - 20,1,13,1,5,2,18,1,15,1,13,1,15,1,3,1, + 16,1,16,1,12,1,18,1,16,1,18,1,18,1,15,1, + 16,1,15,1,18,1,15,1,16,1,12,1,12,1,3,1, + 20,1,13,1,5,2,18,1,15,1,16,1,15,1,3,1, 18,1,13,1,5,2,18,1,15,1,9,1,114,135,0,0, 0,99,4,0,0,0,0,0,0,0,5,0,0,0,6,0, 0,0,67,0,0,0,115,115,0,0,0,116,0,0,106,1,