diff -r f89beccd470c Objects/unicodeobject.c
--- a/Objects/unicodeobject.c	Tue Nov 05 02:50:49 2013 -0800
+++ b/Objects/unicodeobject.c	Tue Nov 05 23:41:25 2013 +1000
@@ -235,6 +235,7 @@
 static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
 static PyObject* get_latin1_char(unsigned char ch);
 static int unicode_modifiable(PyObject *unicode);
+static void wrap_codec_error(const char *operation, const char *encoding);
 
 
 static PyObject *
@@ -3047,12 +3048,16 @@
     if (buffer == NULL)
         goto onError;
     unicode = PyCodec_Decode(buffer, encoding, errors);
-    if (unicode == NULL)
+    if (unicode == NULL) {
+        wrap_codec_error("decoding", encoding);
         goto onError;
+    }
     if (!PyUnicode_Check(unicode)) {
         PyErr_Format(PyExc_TypeError,
-                     "decoder did not return a str object (type=%.400s)",
-                     Py_TYPE(unicode)->tp_name);
+                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
+                     "use codecs.decode to decode to arbitrary types",
+                     encoding,
+                     Py_TYPE(unicode)->tp_name);
         Py_DECREF(unicode);
         goto onError;
     }
@@ -3081,8 +3086,10 @@
 
     /* Decode via the codec registry */
     v = PyCodec_Decode(unicode, encoding, errors);
-    if (v == NULL)
+    if (v == NULL) {
+        wrap_codec_error("decoding", encoding);
         goto onError;
+    }
     return unicode_result(v);
 
   onError:
@@ -3106,12 +3113,16 @@
 
     /* Decode via the codec registry */
     v = PyCodec_Decode(unicode, encoding, errors);
-    if (v == NULL)
+    if (v == NULL) {
+        wrap_codec_error("decoding", encoding);
         goto onError;
+    }
     if (!PyUnicode_Check(v)) {
         PyErr_Format(PyExc_TypeError,
-                     "decoder did not return a str object (type=%.400s)",
-                     Py_TYPE(v)->tp_name);
+                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
+                     "use codecs.decode to decode to arbitrary types",
+                     encoding,
+                     Py_TYPE(v)->tp_name);
         Py_DECREF(v);
         goto onError;
     }
@@ -3371,6 +3382,66 @@
 #endif
 }
 
+
+/* Helper to ensure the exception chain indicates the codec that was
+ * invoked to trigger the failure.
+ *
+ * We limit this to *exact* matches on a whitelist of types that we
+ * know we can wrap correctly.
+ *
+ * We need to be very careful with what we wrap, since changing types to
+ * a broader exception type would be backwards incompatible for
+ * existing codecs, and subclasses of the known types may either
+ * not support instantiation with PyErr_Format or lose information
+ * when instantiated that way.
+ *
+ * We skip wrapping UnicodeEncodeError and UnicodeDecodeError since
+ * they're annoying to wrap correctly and also already mention the
+ * codec that triggered the error by name.
+ */
+static void
+wrap_codec_error(const char *operation,
+                 const char *encoding)
+{
+    PyObject *exc, *val, *tb;
+    PyObject *new_exc, *new_val, *new_tb;
+    PyErr_Fetch(&exc, &val, &tb);
+    if (exc == PyExc_TypeError ||
+        exc == PyExc_ValueError ||
+        exc == PyExc_AttributeError
+    ) {
+        /* For whitelisted exception types, we chain the original
+         * exception to a new one of the exact same type with an
+         * error message that mentions the current codec and the
+         * original exception.
+         *
+         * It would be nice to wrap OSError as well, but that's a
+         * bit trickier due to the extra state potentially stored
+         * on OSError instances.
+         */
+        PyErr_NormalizeException(&exc, &val, &tb);
+        /* Normalization does not attach the traceback to the exception
+         * instance, so do it explicitly; otherwise the chained cause
+         * would lose the traceback of the original failure. */
+        if (tb != NULL) {
+            PyException_SetTraceback(val, tb);
+        }
+        PyErr_Format(exc,
+                     "%s with '%s' codec failed (%s: %S)",
+                     operation, encoding,
+                     Py_TYPE(val)->tp_name, val);
+        Py_DECREF(exc);
+        Py_XDECREF(tb);
+        PyErr_Fetch(&new_exc, &new_val, &new_tb);
+        PyErr_NormalizeException(&new_exc, &new_val, &new_tb);
+        PyException_SetCause(new_val, val);  /* steals our ref to val */
+        PyErr_Restore(new_exc, new_val, new_tb);
+    }
+    else {
+        PyErr_Restore(exc, val, tb);
+    }
+}
+
 PyObject *
 PyUnicode_AsEncodedString(PyObject *unicode,
                           const char *encoding,
@@ -3409,8 +3480,10 @@
 
     /* Encode via the codec registry */
     v = PyCodec_Encode(unicode, encoding, errors);
-    if (v == NULL)
-        return NULL;
+    if (v == NULL) {
+        wrap_codec_error("encoding", encoding);
+        return NULL;
+    }
 
     /* The normal path */
     if (PyBytes_Check(v))
@@ -3422,7 +3495,8 @@
         PyObject *b;
 
         error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
-            "encoder %s returned bytearray instead of bytes",
+            "encoder %s returned bytearray instead of bytes; "
+            "use codecs.encode to encode to arbitrary types",
             encoding);
         if (error) {
             Py_DECREF(v);
@@ -3435,8 +3509,10 @@
     }
 
     PyErr_Format(PyExc_TypeError,
-                 "encoder did not return a bytes object (type=%.400s)",
-                 Py_TYPE(v)->tp_name);
+                 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
+                 "use codecs.encode to encode to arbitrary types",
+                 encoding,
+                 Py_TYPE(v)->tp_name);
     Py_DECREF(v);
     return NULL;
 }
@@ -3458,12 +3534,16 @@
 
     /* Encode via the codec registry */
     v = PyCodec_Encode(unicode, encoding, errors);
-    if (v == NULL)
+    if (v == NULL) {
+        wrap_codec_error("encoding", encoding);
         goto onError;
+    }
     if (!PyUnicode_Check(v)) {
         PyErr_Format(PyExc_TypeError,
-                     "encoder did not return an str object (type=%.400s)",
-                     Py_TYPE(v)->tp_name);
+                     "'%.400s' encoder returned '%.400s' instead of 'str'; "
+                     "use codecs.encode to encode to arbitrary types",
+                     encoding,
+                     Py_TYPE(v)->tp_name);
         Py_DECREF(v);
         goto onError;
     }