Fix the UTF-8 incremental decoder when it is used with the "surrogatepass"
error handler (issue #24214).

1. Lib/test/test_codecs.py: add test_incremental_surrogatepass, which feeds
   the three-byte encoding of U+D900 to an incremental decoder, split after
   one byte and after two bytes.
2. Objects/stringlib/codecs.h: add a "surrogatepass" argument to
   STRINGLIB(utf8_decode).  When it is non-zero, the three-byte sequences
   starting with 0xED that decode to D800-DFFF are accepted instead of
   jumping to InvalidContinuation1.  The overlong check for a leading 0xE0
   does not depend on the flag, for truncated and complete sequences alike.
3. Objects/unicodeobject.c: PyUnicode_DecodeUTF8Stateful passes the flag to
   the four utf8_decode variants.  The flag starts at 0; the first time an
   error is reported and the error handler resolves to
   _Py_ERROR_SURROGATEPASS, it is set to 1 and the decoding step is retried.

diff -r da9898e7e90d Lib/test/test_codecs.py
--- a/Lib/test/test_codecs.py	Wed Jul 27 16:59:22 2016 +0200
+++ b/Lib/test/test_codecs.py	Wed Jul 27 18:25:54 2016 +0200
@@ -830,6 +830,21 @@ class UTF8Test(ReadTest, unittest.TestCa
         with self.assertRaises(UnicodeDecodeError):
             b"abc\xed\xa0z".decode(self.encoding, "surrogatepass")
 
+    def test_incremental_surrogatepass(self):
+        # Test incremental decoder for UTF-8/surrogatepass handler:
+        # see issue #24214
+        data = b'\xed\xa4\x80'
+
+        # partial decode, 1 byte
+        dec = codecs.getincrementaldecoder('utf8')('surrogatepass')
+        self.assertEqual(dec.decode(data[:1]), '')
+        self.assertEqual(dec.decode(data[1:]), '\uD900')
+
+        # partial decode, 2 bytes
+        dec = codecs.getincrementaldecoder('utf8')('surrogatepass')
+        self.assertEqual(dec.decode(data[:2]), '')
+        self.assertEqual(dec.decode(data[2:]), '\uD900')
+
 
 @unittest.skipUnless(sys.platform == 'win32',
                      'cp65001 is a Windows-only codec')
diff -r da9898e7e90d Objects/stringlib/codecs.h
--- a/Objects/stringlib/codecs.h	Wed Jul 27 16:59:22 2016 +0200
+++ b/Objects/stringlib/codecs.h	Wed Jul 27 18:25:54 2016 +0200
@@ -17,10 +17,15 @@
 /* 10xxxxxx */
 #define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0)
 
+/* By default, the decoder raises an error if it would produce a surrogate
+   character. If surrogatepass is non-zero, produce surrogate characters. This
+   flag is used to implement the UTF-8/surrogatepass decoder. It is required
+   to implement incremental decoders correctly: see issue #24214. */
 Py_LOCAL_INLINE(Py_UCS4)
 STRINGLIB(utf8_decode)(const char **inptr, const char *end,
                        STRINGLIB_CHAR *dest,
-                       Py_ssize_t *outpos)
+                       Py_ssize_t *outpos,
+                       int surrogatepass)
 {
     Py_UCS4 ch;
     const char *s = *inptr;
@@ -132,24 +137,30 @@ STRINGLIB(utf8_decode)(const char **inpt
                 if (end - s < 2)
                     break;
                 ch2 = (unsigned char)s[1];
-                if (!IS_CONTINUATION_BYTE(ch2) ||
-                    (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED))
+                if (!IS_CONTINUATION_BYTE(ch2)) {
                     /* for clarification see comments below */
                     goto InvalidContinuation1;
+                }
+                if (ch2 < 0xA0 ? ch == 0xE0 : (ch == 0xED && !surrogatepass)) {
+                    /* for clarification see comments below */
+                    goto InvalidContinuation1;
+                }
                 break;
             }
+
             ch2 = (unsigned char)s[1];
-            ch3 = (unsigned char)s[2];
             if (!IS_CONTINUATION_BYTE(ch2)) {
                 /* invalid continuation byte */
                 goto InvalidContinuation1;
             }
             if (ch == 0xE0) {
-                if (ch2 < 0xA0)
+                if (ch2 < 0xA0) {
                     /* invalid sequence
                        \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
                     goto InvalidContinuation1;
-            } else if (ch == 0xED && ch2 >= 0xA0) {
+                }
+            }
+            else if (ch == 0xED && ch2 >= 0xA0 && !surrogatepass) {
                 /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
                    will result in surrogates in range D800-DFFF. Surrogates are
                    not valid UTF-8 so they are rejected.
@@ -157,6 +168,8 @@ STRINGLIB(utf8_decode)(const char **inpt
                    (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
                 goto InvalidContinuation1;
             }
+
+            ch3 = (unsigned char)s[2];
             if (!IS_CONTINUATION_BYTE(ch3)) {
                 /* invalid continuation byte */
                 goto InvalidContinuation2;
diff -r da9898e7e90d Objects/unicodeobject.c
--- a/Objects/unicodeobject.c	Wed Jul 27 16:59:22 2016 +0200
+++ b/Objects/unicodeobject.c	Wed Jul 27 18:25:54 2016 +0200
@@ -4870,6 +4870,7 @@ PyUnicode_DecodeUTF8Stateful(const char
     PyObject *error_handler_obj = NULL;
     PyObject *exc = NULL;
     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
+    int surrogatepass = 0;
 
     if (size == 0) {
         if (consumed)
@@ -4897,14 +4898,14 @@ PyUnicode_DecodeUTF8Stateful(const char
 
         if (kind == PyUnicode_1BYTE_KIND) {
             if (PyUnicode_IS_ASCII(writer.buffer))
-                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
+                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos, surrogatepass);
             else
-                ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
+                ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos, surrogatepass);
         } else if (kind == PyUnicode_2BYTE_KIND) {
-            ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
+            ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos, surrogatepass);
         } else {
             assert(kind == PyUnicode_4BYTE_KIND);
-            ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
+            ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos, surrogatepass);
         }
 
         switch (ch) {
@@ -4933,9 +4934,16 @@ PyUnicode_DecodeUTF8Stateful(const char
             continue;
         }
 
-        if (error_handler == _Py_ERROR_UNKNOWN)
+        if (error_handler == _Py_ERROR_UNKNOWN) {
             error_handler = get_error_handler(errors);
 
+            if (error_handler == _Py_ERROR_SURROGATEPASS) {
+                /* call utf8_decode() again with surrogatepass=1 */
+                surrogatepass = 1;
+                continue;
+            }
+        }
+
         switch (error_handler) {
         case _Py_ERROR_IGNORE:
             s += (endinpos - startinpos);