diff -r 1ee45eb6aab9 Doc/whatsnew/3.4.rst --- a/Doc/whatsnew/3.4.rst Sat Nov 09 23:15:52 2013 +0200 +++ b/Doc/whatsnew/3.4.rst Mon Nov 11 00:58:38 2013 +1000 @@ -101,6 +101,7 @@ * :ref:`PEP 446: Make newly created file descriptors non-inheritable `. * command line option for :ref:`isolated mode `, (:issue:`16499`). +* improvements to codec error reporting Significantly Improved Library Modules: @@ -141,6 +142,63 @@ PEP written and implemented by Victor Stinner. +Codec handling improvements +=========================== + +The :meth:`codecs.encode` and :meth:`codecs.decode` convenience functions are +now properly documented in Python 2.7, 3.3 and 3.4. These functions have +existed in the :mod:`codecs` module and have been covered by the regression +test suite since Python 2.4, but were previously only discoverable through +runtime introspection. + +Unlike the convenience methods on :class:`str`, :class:`bytes` and +:class:`bytearray`, these convenience functions support arbitrary codecs +in both Python 2 and Python 3, rather than being limited to Unicode text +encodings (in Python 3) or ``basestring`` <-> ``basestring`` conversions +(in Python 2). 
+
+The errors raised by the convenience methods when a codec produces the
+incorrect output type have been updated to direct users towards these
+general purpose convenience functions::
+
+    >>> import codecs
+
+    >>> codecs.encode(b"hello", "bz2_codec").decode("bz2_codec")
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+    TypeError: 'bz2_codec' decoder returned 'bytes' instead of 'str'; use codecs.decode() to decode to arbitrary types
+
+    >>> "hello".encode("rot_13")
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+    TypeError: 'rot_13' encoder returned 'str' instead of 'bytes'; use codecs.encode() to encode to arbitrary types
+
+In a related change, whenever it is feasible without breaking backwards
+compatibility, exceptions raised during encoding and decoding operations
+will be wrapped in a chained exception of the same type that mentions the
+name of the codec responsible for producing the error::
+
+    >>> b"hello".decode("uu_codec")
+    ValueError: Missing "begin" line in input data
+
+    The above exception was the direct cause of the following exception:
+
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+    ValueError: decoding with 'uu_codec' codec failed (ValueError: Missing "begin" line in input data)
+
+    >>> "hello".encode("bz2_codec")
+    TypeError: 'str' does not support the buffer interface
+
+    The above exception was the direct cause of the following exception:
+
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+    TypeError: encoding with 'bz2_codec' codec failed (TypeError: 'str' does not support the buffer interface)
+
+(Contributed by Nick Coghlan in :issue:`17827` and :issue:`17828`)
+
+
 Other Language Changes
 ======================
 
@@ -233,19 +291,6 @@
 Added support for 24-bit samples (:issue:`12866`).
 
 
-codecs
-------
-
-The :meth:`codecs.encode` and :meth:`codecs.decode` convenience functions are
-now properly documented. 
These functions have existed in the :mod:`codecs` -module since ~2004, but were previously only discoverable through runtime -introspection. - -Unlike the convenience methods on :class:`str`, :class:`bytes` and -:class:`bytearray`, these convenience functions support arbitrary codecs, -rather than being limited to Unicode text encodings. - - colorsys -------- diff -r 1ee45eb6aab9 Include/pyerrors.h --- a/Include/pyerrors.h Sat Nov 09 23:15:52 2013 +0200 +++ b/Include/pyerrors.h Mon Nov 11 00:58:38 2013 +1000 @@ -285,6 +285,28 @@ const char *name, const char *doc, PyObject *base, PyObject *dict); PyAPI_FUNC(void) PyErr_WriteUnraisable(PyObject *); +/* In exceptions.c */ +#ifndef Py_LIMITED_API +/* Helper that attempts to replace the current exception with one of the + * same type but with a prefix added to the exception text. The resulting + * exception description looks like: + * + * prefix (exc_type: original_exc_str) + * + * Only some exceptions can be safely replaced. If the function determines + * it isn't safe to perform the replacement, it will leave the original + * unmodified exception in place. + * + * Returns a borrowed reference to the new exception (if any), NULL if the + * existing exception was left in place. + */ +PyAPI_FUNC(PyObject *) _PyErr_TrySetFromCause( + const char *prefix_format, /* ASCII-encoded string */ + ... 
+ ); +#endif + + /* In sigcheck.c or signalmodule.c */ PyAPI_FUNC(int) PyErr_CheckSignals(void); PyAPI_FUNC(void) PyErr_SetInterrupt(void); diff -r 1ee45eb6aab9 Lib/test/test_codecs.py --- a/Lib/test/test_codecs.py Sat Nov 09 23:15:52 2013 +0200 +++ b/Lib/test/test_codecs.py Mon Nov 11 00:58:38 2013 +1000 @@ -1,5 +1,6 @@ import _testcapi import codecs +import contextlib import io import locale import sys @@ -2292,28 +2293,31 @@ def test_basics(self): binput = bytes(range(256)) for encoding in bytes_transform_encodings: - # generic codecs interface - (o, size) = codecs.getencoder(encoding)(binput) - self.assertEqual(size, len(binput)) - (i, size) = codecs.getdecoder(encoding)(o) - self.assertEqual(size, len(o)) - self.assertEqual(i, binput) + with self.subTest(encoding=encoding): + # generic codecs interface + (o, size) = codecs.getencoder(encoding)(binput) + self.assertEqual(size, len(binput)) + (i, size) = codecs.getdecoder(encoding)(o) + self.assertEqual(size, len(o)) + self.assertEqual(i, binput) def test_read(self): for encoding in bytes_transform_encodings: - sin = codecs.encode(b"\x80", encoding) - reader = codecs.getreader(encoding)(io.BytesIO(sin)) - sout = reader.read() - self.assertEqual(sout, b"\x80") + with self.subTest(encoding=encoding): + sin = codecs.encode(b"\x80", encoding) + reader = codecs.getreader(encoding)(io.BytesIO(sin)) + sout = reader.read() + self.assertEqual(sout, b"\x80") def test_readline(self): for encoding in bytes_transform_encodings: if encoding in ['uu_codec', 'zlib_codec']: continue - sin = codecs.encode(b"\x80", encoding) - reader = codecs.getreader(encoding)(io.BytesIO(sin)) - sout = reader.readline() - self.assertEqual(sout, b"\x80") + with self.subTest(encoding=encoding): + sin = codecs.encode(b"\x80", encoding) + reader = codecs.getreader(encoding)(io.BytesIO(sin)) + sout = reader.readline() + self.assertEqual(sout, b"\x80") def test_buffer_api_usage(self): # We check all the transform codecs accept memoryview input @@ 
-2321,17 +2325,150 @@
         # and also that they roundtrip correctly
         original = b"12345\x80"
         for encoding in bytes_transform_encodings:
-            data = original
-            view = memoryview(data)
-            data = codecs.encode(data, encoding)
-            view_encoded = codecs.encode(view, encoding)
-            self.assertEqual(view_encoded, data)
-            view = memoryview(data)
-            data = codecs.decode(data, encoding)
-            self.assertEqual(data, original)
-            view_decoded = codecs.decode(view, encoding)
-            self.assertEqual(view_decoded, data)
+            with self.subTest(encoding=encoding):
+                data = original
+                view = memoryview(data)
+                data = codecs.encode(data, encoding)
+                view_encoded = codecs.encode(view, encoding)
+                self.assertEqual(view_encoded, data)
+                view = memoryview(data)
+                data = codecs.decode(data, encoding)
+                self.assertEqual(data, original)
+                view_decoded = codecs.decode(view, encoding)
+                self.assertEqual(view_decoded, data)
 
+    def test_type_error_for_text_input(self):
+        # Check binary -> binary codecs give a good error for str input
+        bad_input = "bad input type"
+        for encoding in bytes_transform_encodings:
+            with self.subTest(encoding=encoding):
+                msg = "^encoding with '{}' codec failed".format(encoding)
+                with self.assertRaisesRegex(TypeError, msg) as failure:
+                    bad_input.encode(encoding)
+                self.assertIsInstance(failure.exception.__cause__,
+                                      TypeError)
+
+    def test_type_error_for_binary_input(self):
+        # Check str -> str codec gives a good error for binary input
+        for bad_input in (b"immutable", bytearray(b"mutable")):
+            with self.subTest(bad_input=bad_input):
+                msg = "^decoding with 'rot_13' codec failed"
+                with self.assertRaisesRegex(AttributeError, msg) as failure:
+                    bad_input.decode("rot_13")
+                self.assertIsInstance(failure.exception.__cause__,
+                                      AttributeError)
+
+    def test_bad_decoding_output_type(self):
+        # Check bytes.decode and bytearray.decode give a good error
+        # message for binary -> binary codecs
+        data = b"encode first to ensure we meet any format restrictions"
+        for encoding in bytes_transform_encodings:
+            with self.subTest(encoding=encoding):
+                encoded_data = codecs.encode(data, encoding)
+                fmt = ("'{}' decoder returned 'bytes' instead of 'str'; "
+                       r"use codecs.decode\(\) to decode to arbitrary types")
+                msg = fmt.format(encoding)
+                with self.assertRaisesRegex(TypeError, msg):
+                    encoded_data.decode(encoding)
+                with self.assertRaisesRegex(TypeError, msg):
+                    bytearray(encoded_data).decode(encoding)
+
+    def test_bad_encoding_output_type(self):
+        # Check str.encode gives a good error message for str -> str codecs
+        msg = ("'rot_13' encoder returned 'str' instead of 'bytes'; "
+               r"use codecs.encode\(\) to encode to arbitrary types")
+        with self.assertRaisesRegex(TypeError, msg):
+            "just an example message".encode("rot_13")
+
+
+# The codec system tries to wrap exceptions in order to ensure the error
+# mentions the operation being performed and the codec involved. We
+# currently *only* want this to happen for relatively stateless
+# exceptions, where the only significant information they contain is their
+# type and a single str argument.
+class ExceptionChainingTest(unittest.TestCase):
+
+    def setUp(self):
+        # There's no way to unregister a codec search function, so we just
+        # ensure we render this one fairly harmless after the test
+        # case finishes by using the test case repr as the codec name
+        # The codecs module normalizes codec names, although this doesn't
+        # appear to be formally documented...
+        self.codec_name = repr(self).lower().replace(" ", "-")
+        self.codec_info = None
+        codecs.register(self.get_codec)
+
+    def get_codec(self, codec_name):
+        if codec_name != self.codec_name:
+            return None
+        return self.codec_info
+
+    def set_codec(self, obj_to_raise):
+        def raise_obj(*args, **kwds):
+            raise obj_to_raise
+        self.codec_info = codecs.CodecInfo(raise_obj, raise_obj,
+                                           name=self.codec_name)
+
+    @contextlib.contextmanager
+    def assertWrapped(self, operation, exc_type, msg):
+        full_msg = r"{} with '{}' codec failed \({}: {}\)".format(
+            operation, self.codec_name, exc_type.__name__, msg)
+        with self.assertRaisesRegex(exc_type, full_msg) as caught:
+            yield caught
+
+    def check_wrapped(self, obj_to_raise, msg):
+        self.set_codec(obj_to_raise)
+        with self.assertWrapped("encoding", RuntimeError, msg):
+            "str_input".encode(self.codec_name)
+        with self.assertWrapped("decoding", RuntimeError, msg):
+            b"bytes input".decode(self.codec_name)
+
+    def test_raise_by_type(self):
+        self.check_wrapped(RuntimeError, "")
+
+    def test_raise_by_value(self):
+        msg = "This should be wrapped"
+        self.check_wrapped(RuntimeError(msg), msg)
+
+    @contextlib.contextmanager
+    def assertNotWrapped(self, operation, exc_type, msg):
+        with self.assertRaisesRegex(exc_type, msg) as caught:
+            yield caught
+        actual_msg = str(caught.exception)
+        self.assertNotIn(operation, actual_msg)
+        self.assertNotIn(self.codec_name, actual_msg)
+
+    def check_not_wrapped(self, obj_to_raise, msg):
+        self.set_codec(obj_to_raise)
+        with self.assertNotWrapped("encoding", RuntimeError, msg):
+            "str input".encode(self.codec_name)
+        with self.assertNotWrapped("decoding", RuntimeError, msg):
+            b"bytes input".decode(self.codec_name)
+
+    def test_init_override_is_not_wrapped(self):
+        class CustomInit(RuntimeError):
+            def __init__(self):
+                pass
+        self.check_not_wrapped(CustomInit, "")
+
+    def test_new_override_is_not_wrapped(self):
+        class CustomNew(RuntimeError):
+            def __new__(cls):
+                return super().__new__(cls)
+        
self.check_not_wrapped(CustomNew, "")
+
+    def test_instance_attribute_is_not_wrapped(self):
+        msg = "This should NOT be wrapped"
+        exc = RuntimeError(msg)
+        exc.attr = 1
+        self.check_not_wrapped(exc, msg)
+
+    def test_non_str_arg_is_not_wrapped(self):
+        self.check_not_wrapped(RuntimeError(1), "1")
+
+    def test_multiple_args_is_not_wrapped(self):
+        msg = r"\('a', 'b', 'c'\)"
+        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg)
 
 
 @unittest.skipUnless(sys.platform == 'win32',
diff -r 1ee45eb6aab9 Objects/exceptions.c
--- a/Objects/exceptions.c	Sat Nov 09 23:15:52 2013 +0200
+++ b/Objects/exceptions.c	Mon Nov 11 00:58:38 2013 +1000
@@ -2591,3 +2591,138 @@
     free_preallocated_memerrors();
     Py_CLEAR(errnomap);
 }
+
+/* Helper to do the equivalent of "raise X from Y" in C, but always using
+ * the current exception rather than passing one in.
+ *
+ * We currently limit this to *only* exceptions that use the BaseException
+ * tp_init and tp_new methods, since we can be reasonably sure we can wrap
+ * those correctly without losing data and without losing backwards
+ * compatibility.
+ *
+ * We also aim to rule out *all* exceptions that might be storing additional
+ * state, whether by having a size difference relative to BaseException,
+ * additional arguments passed in during construction or by having a
+ * non-empty instance dict.
+ *
+ * We need to be very careful with what we wrap, since changing types to
+ * a broader exception type would be backwards incompatible for
+ * existing codecs, and with different init or new method implementations
+ * may either not support instantiation with PyErr_Format or lose
+ * information when instantiated that way.
+ *
+ * XXX (ncoghlan): This could be made more comprehensive by exploiting the
+ * fact that exceptions are expected to support pickling. If more builtin
+ * exceptions (e.g. AttributeError) start to be converted to rich
+ * exceptions with additional attributes, that's probably a better approach
+ * to pursue over adding special cases for particular stateful subclasses.
+ *
+ * Returns a borrowed reference to the new exception (if any), NULL if the
+ * existing exception was left in place.  NULL is also returned if building
+ * the message prefix fails; in that case the formatting error replaces the
+ * original exception.
+ */
+PyObject *
+_PyErr_TrySetFromCause(const char *format, ...)
+{
+    PyObject* msg_prefix;
+    PyObject *exc, *val, *tb;
+    PyTypeObject *caught_type;
+    PyObject **dictptr;
+    PyObject *instance_args;
+    Py_ssize_t num_args;
+    PyObject *new_exc, *new_val, *new_tb;
+    va_list vargs;
+
+    /* This helper must only be called while an exception is set */
+    assert(PyErr_Occurred());
+    PyErr_Fetch(&exc, &val, &tb);
+    caught_type = (PyTypeObject *) exc;
+    /* Ensure type info indicates no extra state is stored at the C level */
+    if (caught_type->tp_init != (initproc) BaseException_init ||
+        caught_type->tp_new != BaseException_new ||
+        caught_type->tp_basicsize != _PyExc_BaseException.tp_basicsize ||
+        caught_type->tp_itemsize != _PyExc_BaseException.tp_itemsize
+       ) {
+        /* We can't be sure we can wrap this safely, since it may contain
+         * more state than just the exception type. Accordingly, we just
+         * leave it alone.
+         */
+        PyErr_Restore(exc, val, tb);
+        return NULL;
+    }
+
+    /* Check the args are empty or contain a single string */
+    PyErr_NormalizeException(&exc, &val, &tb);
+    instance_args = ((PyBaseExceptionObject *) val)->args;
+    num_args = PyTuple_GET_SIZE(instance_args);
+    if ((num_args > 1) ||
+        (num_args == 1 &&
+         !PyUnicode_CheckExact(PyTuple_GET_ITEM(instance_args, 0))
+        )
+       ) {
+        /* More than 1 arg, or the one arg we do have isn't a string
+         */
+        PyErr_Restore(exc, val, tb);
+        return NULL;
+    }
+
+    /* Ensure the instance dict is also empty */
+    dictptr = _PyObject_GetDictPtr(val);
+    if (dictptr != NULL && *dictptr != NULL &&
+        PyObject_Length(*dictptr) > 0) {
+        /* While we could potentially copy a non-empty instance dictionary
+         * to the replacement exception, for now we take the more
+         * conservative path of leaving exceptions with attributes set
+         * alone.
+         */
+        PyErr_Restore(exc, val, tb);
+        return NULL;
+    }
+
+    /* For exceptions that we can wrap safely, we chain the original
+     * exception to a new one of the exact same type with an
+     * error message that mentions the additional details and the
+     * original exception.
+     *
+     * It would be nice to wrap OSError and various other exception
+     * types as well, but that's quite a bit trickier due to the extra
+     * state potentially stored on OSError instances.
+     */
+
+    /* Attach the traceback to the original exception so it is not lost
+     * when that exception becomes the __cause__ of the replacement.
+     */
+    if (tb != NULL) {
+        PyException_SetTraceback(val, tb);
+        Py_DECREF(tb);
+    }
+
+    /* Start the va_list only once it will definitely be consumed, so that
+     * va_start is always paired with va_end.
+     */
+#ifdef HAVE_STDARG_PROTOTYPES
+    va_start(vargs, format);
+#else
+    va_start(vargs);
+#endif
+    msg_prefix = PyUnicode_FromFormatV(format, vargs);
+    va_end(vargs);
+    if (msg_prefix == NULL) {
+        /* Don't leak the references obtained from PyErr_Fetch */
+        Py_DECREF(exc);
+        Py_DECREF(val);
+        return NULL;
+    }
+
+    PyErr_Format(exc, "%U (%s: %S)",
+                 msg_prefix, Py_TYPE(val)->tp_name, val);
+    Py_DECREF(exc);
+    Py_DECREF(msg_prefix);
+    PyErr_Fetch(&new_exc, &new_val, &new_tb);
+    PyErr_NormalizeException(&new_exc, &new_val, &new_tb);
+    /* PyException_SetCause steals our reference to val */
+    PyException_SetCause(new_val, val);
+    PyErr_Restore(new_exc, new_val, new_tb);
+    return new_val;
+}
diff -r 1ee45eb6aab9 Objects/unicodeobject.c
--- a/Objects/unicodeobject.c	Sat Nov 09 23:15:52 2013 +0200
+++ b/Objects/unicodeobject.c	Mon Nov 11 00:58:38 2013 +1000
@@ -235,6 +235,7 @@
 static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
 static PyObject* get_latin1_char(unsigned char ch);
 static int unicode_modifiable(PyObject *unicode);
+static void wrap_codec_error(const char* operation, const char *encoding);
 
 
 static PyObject *
@@ -3050,12 +3051,16 @@
     if (buffer == NULL)
         goto onError;
     unicode = PyCodec_Decode(buffer, encoding, errors);
-    if (unicode == NULL)
+    if (unicode == NULL) {
+        wrap_codec_error("decoding", encoding);
         goto onError;
+    }
     if (!PyUnicode_Check(unicode)) {
         PyErr_Format(PyExc_TypeError,
-                     "decoder did not return a str object (type=%.400s)",
-                     Py_TYPE(unicode)->tp_name);
+                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
+                     "use codecs.decode() to decode to arbitrary types",
+                     encoding,
+                     Py_TYPE(unicode)->tp_name);
         Py_DECREF(unicode);
         goto onError;
     }
@@ -3084,8 +3089,10 @@
 
     /* Decode via the codec registry */
     v = PyCodec_Decode(unicode, encoding, errors);
-    if (v == NULL)
+    if (v == NULL) {
+        wrap_codec_error("decoding", encoding);
         goto onError;
+    }
     return unicode_result(v);
 
   onError:
@@ -3109,12 +3116,16 @@
 
     /* Decode via the codec registry */
     v = PyCodec_Decode(unicode, encoding, errors);
-    if (v == NULL)
+    if (v == NULL) {
+        wrap_codec_error("decoding", encoding);
         goto onError;
+    }
     if (!PyUnicode_Check(v)) {
         PyErr_Format(PyExc_TypeError,
-                     "decoder did not return a str object (type=%.400s)",
-                     
Py_TYPE(v)->tp_name);
+                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
+                     "use codecs.decode() to decode to arbitrary types",
+                     encoding,
+                     Py_TYPE(v)->tp_name);
         Py_DECREF(v);
         goto onError;
     }
@@ -3374,6 +3385,23 @@
 #endif
 }
 
+
+/* Helper that tries to ensure the reported exception chain indicates the
+ * codec that was invoked to trigger the failure without changing the type
+ * of the exception raised.
+ */
+static void
+wrap_codec_error(const char *operation,
+                 const char *encoding)
+{
+    /* TrySetFromCause will replace the active exception with a suitably
+     * updated clone if it can, otherwise it will leave the original
+     * exception alone.
+     */
+    _PyErr_TrySetFromCause("%s with '%s' codec failed",
+                           operation, encoding);
+}
+
 PyObject *
 PyUnicode_AsEncodedString(PyObject *unicode,
                           const char *encoding,
@@ -3412,8 +3440,10 @@
 
     /* Encode via the codec registry */
     v = PyCodec_Encode(unicode, encoding, errors);
-    if (v == NULL)
-        return NULL;
+    if (v == NULL) {
+        wrap_codec_error("encoding", encoding);
+        return NULL;
+    }
 
     /* The normal path */
     if (PyBytes_Check(v))
@@ -3425,7 +3455,8 @@
         PyObject *b;
 
         error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
-            "encoder %s returned bytearray instead of bytes",
+            "encoder %s returned bytearray instead of bytes; "
+            "use codecs.encode() to encode to arbitrary types",
             encoding);
         if (error) {
             Py_DECREF(v);
@@ -3438,8 +3469,10 @@
     }
 
     PyErr_Format(PyExc_TypeError,
-                 "encoder did not return a bytes object (type=%.400s)",
-                 Py_TYPE(v)->tp_name);
+                 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
+                 "use codecs.encode() to encode to arbitrary types",
+                 encoding,
+                 Py_TYPE(v)->tp_name);
     Py_DECREF(v);
     return NULL;
 }
@@ -3461,12 +3494,16 @@
 
     /* Encode via the codec registry */
     v = PyCodec_Encode(unicode, encoding, errors);
-    if (v == NULL)
+    if (v == NULL) {
+        wrap_codec_error("encoding", encoding);
         goto onError;
+    }
     if (!PyUnicode_Check(v)) {
         PyErr_Format(PyExc_TypeError,
-                     "encoder did not return an str object (type=%.400s)",
-                     Py_TYPE(v)->tp_name);
+                     "'%.400s' encoder returned '%.400s' instead of 'str'; "
+                     "use codecs.encode() to encode to arbitrary types",
+                     encoding,
+                     Py_TYPE(v)->tp_name);
         Py_DECREF(v);
         goto onError;
     }
diff -r 1ee45eb6aab9 Python/errors.c
--- a/Python/errors.c	Sat Nov 09 23:15:52 2013 +0200
+++ b/Python/errors.c	Mon Nov 11 00:58:38 2013 +1000
@@ -749,7 +749,6 @@
 }
 
 
-
 PyObject *
 PyErr_NewException(const char *name, PyObject *base, PyObject *dict)
 {