diff -r f23d0a4278aa Objects/stringlib/codecs.h --- a/Objects/stringlib/codecs.h Fri May 15 12:55:20 2015 -0400 +++ b/Objects/stringlib/codecs.h Sat May 16 15:51:28 2015 +0300 @@ -18,12 +18,15 @@ Py_LOCAL_INLINE(Py_UCS4) STRINGLIB(utf8_decode)(const char **inptr, const char *end, STRINGLIB_CHAR *dest, - Py_ssize_t *outpos) + Py_ssize_t *outpos, + const char *errors) { Py_UCS4 ch; const char *s = *inptr; const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG); STRINGLIB_CHAR *p = dest + *outpos; + int len; /* the length of the invalid byte sequence */ + int errorType = _Py_CODEC_ERROR_UNKNOWN; while (s < end) { ch = (unsigned char)*s; @@ -153,7 +156,10 @@ STRINGLIB(utf8_decode)(const char **inpt not valid UTF-8 so they are rejected. See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ - goto InvalidContinuation1; + if (errorType == _Py_CODEC_ERROR_UNKNOWN) + errorType = detect_standard_errorhandler(errors); + if (errorType != _Py_CODEC_ERROR_SURROGATEPASS) + goto InvalidContinuation1; } if (!IS_CONTINUATION_BYTE(ch3)) { /* invalid continuation byte */ @@ -228,24 +234,49 @@ STRINGLIB(utf8_decode)(const char **inpt continue; } goto InvalidStart; + + InvalidStart: + len = 0; + goto Invalid; + InvalidContinuation1: + len = 1; + goto Invalid; + InvalidContinuation2: + len = 2; + goto Invalid; + InvalidContinuation3: + len = 3; + goto Invalid; + Invalid: + if (errorType == _Py_CODEC_ERROR_UNKNOWN) + errorType = detect_standard_errorhandler(errors); + if (errorType == _Py_CODEC_ERROR_SURROGATEESCAPE) { +#if STRINGLIB_MAX_CHAR < 0xDCFF + assert ((ch >= 0x80) && (ch <= 0xFF)); + ch += 0xDC00; + s++; + /* Out-of-range */ + goto Return; +#else + while (1) { + assert ((ch >= 0x80) && (ch <= 0xFF)); + *p++ = ch + 0xDC00; + s++; + if (--len <= 0) + break; + ch = (unsigned char)*s; + } + continue; +#endif + } + ch = len; + goto Return; } - ch = 0; + ch = (Py_UCS4)-1; Return: *inptr = s; *outpos = p - 
dest; return ch; -InvalidStart: - ch = 1; - goto Return; -InvalidContinuation1: - ch = 2; - goto Return; -InvalidContinuation2: - ch = 3; - goto Return; -InvalidContinuation3: - ch = 4; - goto Return; } #undef ASCII_CHAR_MASK @@ -271,6 +302,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicod PyObject *errorHandler = NULL; PyObject *exc = NULL; PyObject *rep = NULL; + int errorType = _Py_CODEC_ERROR_UNKNOWN; #endif #if STRINGLIB_SIZEOF_CHAR == 1 const Py_ssize_t max_char_size = 2; @@ -328,6 +360,22 @@ STRINGLIB(utf8_encoder)(PyObject *unicod else if (Py_UNICODE_IS_SURROGATE(ch)) { Py_ssize_t newpos; Py_ssize_t repsize, k, startpos; + if (errorType == _Py_CODEC_ERROR_UNKNOWN) + errorType = detect_standard_errorhandler(errors); + if (errorType == _Py_CODEC_ERROR_SURROGATEPASS) + goto surrogatepass; + if (errorType == _Py_CODEC_ERROR_SURROGATEESCAPE) { + while (ch >= 0xdc80 && ch <= 0xdcff) { + *p++ = (char)(ch & 0xff); + if (i >= size) + break; + ch = data[i++]; + } + if (i >= size) + break; + continue; + } + startpos = i-1; rep = unicode_encode_call_errorhandler( errors, &errorHandler, "utf-8", "surrogates not allowed", @@ -398,6 +446,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicod if (ch < 0x10000) #endif { + surrogatepass: *p++ = (char)(0xe0 | (ch >> 12)); *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); *p++ = (char)(0x80 | (ch & 0x3f)); @@ -478,7 +527,8 @@ STRINGLIB(utf8_encoder)(PyObject *unicod Py_LOCAL_INLINE(Py_UCS4) STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e, STRINGLIB_CHAR *dest, Py_ssize_t *outpos, - int native_ordering) + int native_ordering, + const char *errors) { Py_UCS4 ch; const unsigned char *aligned_end = @@ -491,6 +541,8 @@ STRINGLIB(utf16_decode)(const unsigned c #else int ihi = !native_ordering, ilo = !!native_ordering; #endif + int errorType = _Py_CODEC_ERROR_UNKNOWN; + int err; /* error code */ --e; while (q < e) { @@ -573,22 +625,40 @@ STRINGLIB(utf16_decode)(const unsigned c goto Return; #else *p++ = (STRINGLIB_CHAR)ch; + 
continue; #endif + +UnexpectedEnd: + err = 1; + goto Invalid; +IllegalEncoding: + err = 2; + goto Invalid; +IllegalSurrogate: + err = 3; + goto Invalid; +Invalid: + if (errorType == _Py_CODEC_ERROR_UNKNOWN) + errorType = detect_standard_errorhandler(errors); + if (errorType == _Py_CODEC_ERROR_SURROGATEPASS) { + if (err == 3) /* illegal surrogate */ + q -= 2; +#if STRINGLIB_SIZEOF_CHAR < 2 + if (ch > STRINGLIB_MAX_CHAR) + /* Out-of-range */ + goto Return; +#endif + *p++ = (STRINGLIB_CHAR)ch; + continue; + } + ch = err; + goto Return; } ch = 0; Return: *inptr = q; *outpos = p - dest; return ch; -UnexpectedEnd: - ch = 1; - goto Return; -IllegalEncoding: - ch = 2; - goto Return; -IllegalSurrogate: - ch = 3; - goto Return; } #undef UCS2_REPEAT_MASK #undef FAST_CHAR_MASK diff -r f23d0a4278aa Objects/unicodeobject.c --- a/Objects/unicodeobject.c Fri May 15 12:55:20 2015 -0400 +++ b/Objects/unicodeobject.c Sat May 16 15:51:28 2015 +0300 @@ -3928,6 +3928,23 @@ PyUnicode_GetDefaultEncoding(void) return "utf-8"; } +#define _Py_CODEC_ERROR_UNKNOWN 0 +#define _Py_CODEC_ERROR_SURROGATEPASS 1 +#define _Py_CODEC_ERROR_SURROGATEESCAPE 2 +#define _Py_CODEC_ERROR_OTHER -1 + +static int +detect_standard_errorhandler(const char *errors) +{ + if (errors == NULL) + return _Py_CODEC_ERROR_OTHER; /* strict */ + if (strcmp(errors, "surrogatepass") == 0) + return _Py_CODEC_ERROR_SURROGATEPASS; + if (strcmp(errors, "surrogateescape") == 0) + return _Py_CODEC_ERROR_SURROGATEESCAPE; + return _Py_CODEC_ERROR_OTHER; +} + /* create or adjust a UnicodeDecodeError */ static void make_decode_exception(PyObject **exceptionObject, @@ -4684,6 +4701,7 @@ PyUnicode_DecodeUTF8Stateful(const char const char *errmsg = ""; PyObject *errorHandler = NULL; PyObject *exc = NULL; + int errorType = _Py_CODEC_ERROR_UNKNOWN; if (size == 0) { if (consumed) @@ -4710,35 +4728,35 @@ PyUnicode_DecodeUTF8Stateful(const char int kind = writer.kind; if (kind == PyUnicode_1BYTE_KIND) { if (PyUnicode_IS_ASCII(writer.buffer)) - 
ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos); + ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos, errors); else - ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos); + ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos, errors); } else if (kind == PyUnicode_2BYTE_KIND) { - ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos); + ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos, errors); } else { assert(kind == PyUnicode_4BYTE_KIND); - ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos); + ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos, errors); } switch (ch) { - case 0: + case (Py_UCS4)-1: if (s == end || consumed) goto End; errmsg = "unexpected end of data"; startinpos = s - starts; endinpos = end - starts; break; - case 1: + case 0: errmsg = "invalid start byte"; startinpos = s - starts; endinpos = startinpos + 1; break; + case 1: case 2: case 3: - case 4: errmsg = "invalid continuation byte"; startinpos = s - starts; - endinpos = startinpos + ch - 1; + endinpos = startinpos + ch; break; default: if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) @@ -4918,6 +4936,7 @@ PyUnicode_DecodeUTF32Stateful(const char const char *errmsg = ""; PyObject *errorHandler = NULL; PyObject *exc = NULL; + int errorType = _Py_CODEC_ERROR_UNKNOWN; q = (unsigned char *)s; e = q + size; @@ -4998,6 +5017,14 @@ PyUnicode_DecodeUTF32Stateful(const char } if (Py_UNICODE_IS_SURROGATE(ch)) { + if (errorType == _Py_CODEC_ERROR_UNKNOWN) + errorType = detect_standard_errorhandler(errors); + if (errorType == _Py_CODEC_ERROR_SURROGATEPASS) { + if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) + goto onError; + q += 4; + continue; + } errmsg = "code point in surrogate code point range(0xd800, 0xe000)"; startinpos = ((const char *)q) - starts; endinpos = startinpos + 4; @@ -5066,6 +5093,8 @@ PyObject * PyObject *errorHandler = NULL; PyObject *exc = NULL; PyObject *rep = NULL; + int errorType = 
_Py_CODEC_ERROR_UNKNOWN; + int surrogatepass = 0; if (!PyUnicode_Check(str)) { PyErr_BadArgument(); @@ -5298,20 +5327,20 @@ PyUnicode_DecodeUTF16Stateful(const char if (PyUnicode_IS_ASCII(writer.buffer)) ch = asciilib_utf16_decode(&q, e, (Py_UCS1*)writer.data, &writer.pos, - native_ordering); + native_ordering, errors); else ch = ucs1lib_utf16_decode(&q, e, (Py_UCS1*)writer.data, &writer.pos, - native_ordering); + native_ordering, errors); } else if (kind == PyUnicode_2BYTE_KIND) { ch = ucs2lib_utf16_decode(&q, e, (Py_UCS2*)writer.data, &writer.pos, - native_ordering); + native_ordering, errors); } else { assert(kind == PyUnicode_4BYTE_KIND); ch = ucs4lib_utf16_decode(&q, e, (Py_UCS4*)writer.data, &writer.pos, - native_ordering); + native_ordering, errors); } } @@ -6407,6 +6436,7 @@ unicode_encode_ucs1(PyObject *unicode, /* the following variable is used for caching string comparisons * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ int known_errorHandler = -1; + int surrogateescape = detect_standard_errorhandler(errors) == _Py_CODEC_ERROR_SURROGATEESCAPE; if (PyUnicode_READY(unicode) == -1) return NULL; @@ -6437,9 +6467,24 @@ unicode_encode_ucs1(PyObject *unicode, PyObject *repunicode; Py_ssize_t repsize, newpos, respos, i; /* startpos for collecting unencodable chars */ - Py_ssize_t collstart = pos; - Py_ssize_t collend = pos; + Py_ssize_t collstart; + Py_ssize_t collend; + + if (surrogateescape) { + while (c >= 0xdc80 && c <= 0xdcff) { + *str++ = (char)c; + ++pos; + if (pos >= size) + break; + c = PyUnicode_READ(kind, data, pos); + } + if (pos >= size) + break; + /* fallback to general error handling */ + } + /* find all unecodable characters */ + collstart = collend = pos; while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit)) ++collend; /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ @@ -6658,6 +6703,7 @@ PyUnicode_DecodeASCII(const char *s, const char *e; PyObject *errorHandler = NULL; PyObject *exc = NULL; + int errorType = _Py_CODEC_ERROR_UNKNOWN; if (size == 0) _Py_RETURN_UNICODE_EMPTY(); @@ -6688,6 +6734,23 @@ PyUnicode_DecodeASCII(const char *s, ++s; } else { + if (errorType == _Py_CODEC_ERROR_UNKNOWN) { + errorType = detect_standard_errorhandler(errors); + if (errorType == _Py_CODEC_ERROR_SURROGATEESCAPE && + kind < PyUnicode_2BYTE_KIND) { + if (_PyUnicodeWriter_Prepare(&writer, size - writer.pos, 0xffff) < 0) + return NULL; + kind = writer.kind; + data = writer.data; + } + } + if (errorType == _Py_CODEC_ERROR_SURROGATEESCAPE) { + PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00); + writer.pos++; + ++s; + continue; + } + startinpos = s-starts; endinpos = startinpos + 1; if (unicode_decode_call_errorhandler_writer(