diff -r 8317796ca004 Objects/unicodeobject.c --- a/Objects/unicodeobject.c Tue Sep 22 10:46:52 2015 +0200 +++ b/Objects/unicodeobject.c Tue Sep 22 11:32:48 2015 +0200 @@ -297,6 +297,7 @@ typedef enum { _Py_ERROR_UNKNOWN=0, _Py_ERROR_STRICT, _Py_ERROR_SURROGATEESCAPE, + _Py_ERROR_SURROGATEPASS, _Py_ERROR_REPLACE, _Py_ERROR_IGNORE, _Py_ERROR_XMLCHARREFREPLACE, @@ -312,6 +313,8 @@ get_error_handler(const char *errors) return _Py_ERROR_STRICT; if (strcmp(errors, "surrogateescape") == 0) return _Py_ERROR_SURROGATEESCAPE; + if (strcmp(errors, "surrogatepass") == 0) + return _Py_ERROR_SURROGATEPASS; if (strcmp(errors, "ignore") == 0) return _Py_ERROR_IGNORE; if (strcmp(errors, "replace") == 0) @@ -4709,8 +4712,9 @@ PyUnicode_DecodeUTF8Stateful(const char Py_ssize_t startinpos; Py_ssize_t endinpos; const char *errmsg = ""; - PyObject *errorHandler = NULL; + PyObject *error_handler_obj = NULL; PyObject *exc = NULL; + _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; if (size == 0) { if (consumed) @@ -4773,24 +4777,83 @@ PyUnicode_DecodeUTF8Stateful(const char continue; } - if (unicode_decode_call_errorhandler_writer( - errors, &errorHandler, - "utf-8", errmsg, - &starts, &end, &startinpos, &endinpos, &exc, &s, - &writer)) - goto onError; + /* undecodable byte: call the error handler */ + + if (error_handler == _Py_ERROR_UNKNOWN) + error_handler = get_error_handler(errors); + + switch (error_handler) + { + case _Py_ERROR_REPLACE: + case _Py_ERROR_SURROGATEESCAPE: + ch = (unsigned char)*s; + + /* Fast-path: the error handler only writes one character, + but we may switch to UCS2 at the first write */ + if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0) + goto onError; + kind = writer.kind; + + if (error_handler == _Py_ERROR_REPLACE) + PyUnicode_WRITE(kind, writer.data, writer.pos, 0xfffd); + else + PyUnicode_WRITE(kind, writer.data, writer.pos, ch + 0xdc00); + + writer.pos++; + s = starts + endinpos; + break; + + case _Py_ERROR_IGNORE: + s = starts + endinpos; + break; + + case _Py_ERROR_SURROGATEPASS: + { + const unsigned char *p = (const unsigned char *)s; + + ch = 0; + if (size - startinpos >= 3 + && (p[0] & 0xf0) == 0xe0 + && (p[1] & 0xc0) == 0x80 + && (p[2] & 0xc0) == 0x80) + { + /* it's a three-byte code */ + ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f); + if (!Py_UNICODE_IS_SURROGATE(ch)) + ch = 0; + } + + if (ch != 0) { + /* Fast-path: the error handler only writes one character, + but we may switch to UCS2 at the first write */ + if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) + goto onError; + s += 3; + break; + } + /* fall-through to default */ + } + + default: + if (unicode_decode_call_errorhandler_writer( + errors, &error_handler_obj, + "utf-8", errmsg, + &starts, &end, &startinpos, &endinpos, &exc, &s, + &writer)) + goto onError; + } } End: if (consumed) *consumed = s - starts; - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); return _PyUnicodeWriter_Finish(&writer); onError: - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); _PyUnicodeWriter_Dealloc(&writer); return NULL;