diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 07330119dc..e2f7040076 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4190,7 +4190,10 @@ unicode_decode_call_errorhandler_writer(
     Py_ssize_t insize;
     Py_ssize_t newpos;
     Py_ssize_t replen;
+    Py_ssize_t remain;
     PyObject *inputobj = NULL;
+    int need_to_grow = 0;
+    const char *new_inptr;
 
     if (*errorHandler == NULL) {
         *errorHandler = PyCodec_LookupError(errors);
@@ -4221,6 +4224,7 @@ unicode_decode_call_errorhandler_writer(
     inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
     if (!inputobj)
         goto onError;
+    remain = *inend - *inptr;
     *input = PyBytes_AS_STRING(inputobj);
     insize = PyBytes_GET_SIZE(inputobj);
     *inend = *input + insize;
@@ -4238,6 +4242,18 @@ unicode_decode_call_errorhandler_writer(
     replen = PyUnicode_GET_LENGTH(repunicode);
     if (replen > 1) {
         writer->min_length += replen - 1;
+        need_to_grow = 1;
+    }
+    new_inptr = *input + newpos;
+    if (*inend - new_inptr > remain) {
+        /* We don't know the decoding algorithm here, so we make the worst
+           assumption that one byte decodes to one Unicode character.  If one
+           byte could decode to more than one character, the decoder may write
+           out of bounds.  Is that possible for the algorithms using this? */
+        writer->min_length += *inend - new_inptr - remain;
+        need_to_grow = 1;
+    }
+    if (need_to_grow) {
         writer->overallocate = 1;
         if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
                                      PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
@@ -4247,7 +4263,7 @@ unicode_decode_call_errorhandler_writer(
         goto onError;
 
     *endinpos = newpos;
-    *inptr = *input + newpos;
+    *inptr = new_inptr;
 
     /* we made it! */
     Py_DECREF(restuple);
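
For context, the scenario this patch guards against: a codec error handler may
replace the exception's input object (exc.object) with a longer bytes object,
and the decoder re-fetches it via PyUnicodeDecodeError_GetObject after the
handler returns. If the new input leaves more bytes to decode (*inend -
new_inptr) than were left before the call (remain), the writer's buffer was
sized for the shorter input. Below is a minimal Python sketch of such a
handler; the names widen_input and "widen" are illustrative only and not part
of the patch or of CPython.

    import codecs

    def widen_input(exc):
        # Illustrative handler: swap the input for a longer bytes object.
        # After this returns, the decoder re-reads exc.object, so more bytes
        # remain to decode than before the handler ran; that is the case the
        # need_to_grow path above must handle by re-running
        # _PyUnicodeWriter_Prepare.
        if not isinstance(exc, UnicodeDecodeError):
            raise exc
        exc.object = exc.object[:exc.start] + b"\x00" * 32
        return ("\ufffd", exc.start + 1)

    codecs.register_error("widen", widen_input)
    print(ascii(b"\xff".decode("ascii", errors="widen")))

With only the pre-patch replen check, a handler like this grows the remaining
input without growing writer->min_length, so decoding could continue past the
buffer sized for the original input; tracking remain closes that gap.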