diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index e28bae4..507344e 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -3979,6 +3979,31 @@ PyUnicode_GetDefaultEncoding(void)
     return "utf-8";
 }
 
+/* Error handler classes reported by detect_standard_errorhandler().
+   _Py_CODEC_ERROR_UNKNOWN is never returned; callers use it as the
+   "not classified yet" initial value. */
+#define _Py_CODEC_ERROR_UNKNOWN 0
+#define _Py_CODEC_ERROR_SURROGATEPASS 1
+#define _Py_CODEC_ERROR_SURROGATEESCAPE 2
+#define _Py_CODEC_ERROR_OTHER (-1)
+
+/* Classify the error handler name `errors`.
+
+   Return _Py_CODEC_ERROR_SURROGATEPASS or _Py_CODEC_ERROR_SURROGATEESCAPE
+   when the name is exactly "surrogatepass" or "surrogateescape";
+   return _Py_CODEC_ERROR_OTHER for anything else, including NULL. */
+static int
+detect_standard_errorhandler(const char *errors)
+{
+    if (errors == NULL)
+        return _Py_CODEC_ERROR_OTHER;  /* strict */
+    if (strcmp(errors, "surrogatepass") == 0)
+        return _Py_CODEC_ERROR_SURROGATEPASS;
+    if (strcmp(errors, "surrogateescape") == 0)
+        return _Py_CODEC_ERROR_SURROGATEESCAPE;
+    return _Py_CODEC_ERROR_OTHER;
+}
+
 /* create or adjust a UnicodeDecodeError */
 static void
 make_decode_exception(PyObject **exceptionObject,
@@ -6729,6 +6754,7 @@ PyUnicode_DecodeASCII(const char *s,
     const char *e;
     PyObject *errorHandler = NULL;
     PyObject *exc = NULL;
+    int errorType = _Py_CODEC_ERROR_UNKNOWN;
 
     if (size == 0)
         _Py_RETURN_UNICODE_EMPTY();
@@ -6759,6 +6785,23 @@ PyUnicode_DecodeASCII(const char *s,
             ++s;
         }
         else {
+            if (errorType == _Py_CODEC_ERROR_UNKNOWN) {
+                errorType = detect_standard_errorhandler(errors);
+                if (errorType == _Py_CODEC_ERROR_SURROGATEESCAPE &&
+                    kind < PyUnicode_2BYTE_KIND) {
+                    if (_PyUnicodeWriter_Prepare(&writer, size - writer.pos, 0xffff) < 0)
+                        return NULL;
+                    kind = writer.kind;
+                    data = writer.data;
+                }
+            }
+            if (errorType == _Py_CODEC_ERROR_SURROGATEESCAPE) {
+                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
+                writer.pos++;
+                ++s;
+                continue;
+            }
+
             startinpos = s-starts;
             endinpos = startinpos + 1;
             if (unicode_decode_call_errorhandler_writer(