diff -r f89beccd470c Doc/whatsnew/3.4.rst --- a/Doc/whatsnew/3.4.rst Tue Nov 05 02:50:49 2013 -0800 +++ b/Doc/whatsnew/3.4.rst Wed Nov 06 00:55:58 2013 +1000 @@ -101,6 +101,7 @@ * :ref:`PEP 446: Make newly created file descriptors non-inheritable `. * command line option for :ref:`isolated mode `, (:issue:`16499`). +* improvements to codec error reporting Significantly Improved Library Modules: @@ -141,6 +142,63 @@ PEP written and implemented by Victor Stinner. +Codec handling improvements +=========================== + +The :func:`codecs.encode` and :func:`codecs.decode` convenience functions are +now properly documented in Python 2.7, 3.3 and 3.4. These functions have +existed in the :mod:`codecs` module and have been covered by the regression +test suite since Python 2.4, but were previously only discoverable through +runtime introspection. + +Unlike the convenience methods on :class:`str`, :class:`bytes` and +:class:`bytearray`, these convenience functions support arbitrary codecs +in both Python 2 and Python 3, rather than being limited to Unicode text +encodings (in Python 3) or ``basestring`` <-> ``basestring`` conversions +(in Python 2). 
+ +The errors raised by the convenience methods when a codec produces the +incorrect output type have been updated to direct users towards these +general purpose convenience functions:: + + >>> import codecs + + >>> codecs.encode(b"hello", "bz2_codec").decode("bz2_codec") + Traceback (most recent call last): + File "<stdin>", line 1, in <module> + TypeError: 'bz2_codec' decoder returned 'bytes' instead of 'str'; use codecs.decode to decode to arbitrary types + + >>> "hello".encode("rot_13") + Traceback (most recent call last): + File "<stdin>", line 1, in <module> + TypeError: 'rot_13' encoder returned 'str' instead of 'bytes'; use codecs.encode to encode to arbitrary types + +In a related change, whenever it is feasible without breaking backwards +compatibility, exceptions raised during encoding and decoding operations +will be wrapped in a chained exception of the same type that mentions the +name of the codec responsible for producing the error:: + + >>> b"hello".decode("uu_codec") + ValueError: Missing "begin" line in input data + + The above exception was the direct cause of the following exception: + + Traceback (most recent call last): + File "<stdin>", line 1, in <module> + ValueError: decoding with 'uu_codec' codec failed (ValueError: Missing "begin" line in input data) + + >>> "hello".encode("bz2_codec") + TypeError: 'str' does not support the buffer interface + + The above exception was the direct cause of the following exception: + + Traceback (most recent call last): + File "<stdin>", line 1, in <module> + TypeError: encoding with 'bz2_codec' codec failed (TypeError: 'str' does not support the buffer interface) + +(Contributed by Nick Coghlan in :issue:`17827` and :issue:`17828`) + + Other Language Changes ====================== @@ -233,19 +291,6 @@ Added support for 24-bit samples (:issue:`12866`). -codecs ------- - -The :meth:`codecs.encode` and :meth:`codecs.decode` convenience functions are -now properly documented. 
These functions have existed in the :mod:`codecs` -module since ~2004, but were previously only discoverable through runtime -introspection. - -Unlike the convenience methods on :class:`str`, :class:`bytes` and -:class:`bytearray`, these convenience functions support arbitrary codecs, -rather than being limited to Unicode text encodings. - - colorsys -------- diff -r f89beccd470c Lib/test/test_codecs.py --- a/Lib/test/test_codecs.py Tue Nov 05 02:50:49 2013 -0800 +++ b/Lib/test/test_codecs.py Wed Nov 06 00:55:58 2013 +1000 @@ -2292,28 +2292,31 @@ def test_basics(self): binput = bytes(range(256)) for encoding in bytes_transform_encodings: - # generic codecs interface - (o, size) = codecs.getencoder(encoding)(binput) - self.assertEqual(size, len(binput)) - (i, size) = codecs.getdecoder(encoding)(o) - self.assertEqual(size, len(o)) - self.assertEqual(i, binput) + with self.subTest(encoding=encoding): + # generic codecs interface + (o, size) = codecs.getencoder(encoding)(binput) + self.assertEqual(size, len(binput)) + (i, size) = codecs.getdecoder(encoding)(o) + self.assertEqual(size, len(o)) + self.assertEqual(i, binput) def test_read(self): for encoding in bytes_transform_encodings: - sin = codecs.encode(b"\x80", encoding) - reader = codecs.getreader(encoding)(io.BytesIO(sin)) - sout = reader.read() - self.assertEqual(sout, b"\x80") + with self.subTest(encoding=encoding): + sin = codecs.encode(b"\x80", encoding) + reader = codecs.getreader(encoding)(io.BytesIO(sin)) + sout = reader.read() + self.assertEqual(sout, b"\x80") def test_readline(self): for encoding in bytes_transform_encodings: if encoding in ['uu_codec', 'zlib_codec']: continue - sin = codecs.encode(b"\x80", encoding) - reader = codecs.getreader(encoding)(io.BytesIO(sin)) - sout = reader.readline() - self.assertEqual(sout, b"\x80") + with self.subTest(encoding=encoding): + sin = codecs.encode(b"\x80", encoding) + reader = codecs.getreader(encoding)(io.BytesIO(sin)) + sout = reader.readline() + 
self.assertEqual(sout, b"\x80") def test_buffer_api_usage(self): # We check all the transform codecs accept memoryview input @@ -2321,16 +2324,60 @@ # and also that they roundtrip correctly original = b"12345\x80" for encoding in bytes_transform_encodings: - data = original - view = memoryview(data) - data = codecs.encode(data, encoding) - view_encoded = codecs.encode(view, encoding) - self.assertEqual(view_encoded, data) - view = memoryview(data) - data = codecs.decode(data, encoding) - self.assertEqual(data, original) - view_decoded = codecs.decode(view, encoding) - self.assertEqual(view_decoded, data) + with self.subTest(encoding=encoding): + data = original + view = memoryview(data) + data = codecs.encode(data, encoding) + view_encoded = codecs.encode(view, encoding) + self.assertEqual(view_encoded, data) + view = memoryview(data) + data = codecs.decode(data, encoding) + self.assertEqual(data, original) + view_decoded = codecs.decode(view, encoding) + self.assertEqual(view_decoded, data) + + def test_type_error_for_text_input(self): + # Check binary -> binary codecs give a good error for str input + bad_input = "bad input type" + for encoding in bytes_transform_encodings: + with self.subTest(encoding=encoding): + msg = "^encoding with '{}' codec failed".format(encoding) + with self.assertRaisesRegex(TypeError, msg) as failure: + bad_input.encode(encoding) + self.assertIsInstance(failure.exception.__cause__, + TypeError) + + def test_type_error_for_binary_input(self): + # Check str -> str codec gives a good error for binary input + for bad_input in (b"immutable", bytearray(b"mutable")): + with self.subTest(bad_input=bad_input): + msg = "^decoding with 'rot_13' codec failed" + with self.assertRaisesRegex(AttributeError, msg) as failure: + bad_input.decode("rot_13") + self.assertIsInstance(failure.exception.__cause__, + AttributeError) + + def test_bad_decoding_output_type(self): + # Check bytes.decode and bytearray.decode give a good error + # message 
for binary -> binary codecs + data = b"encode first to ensure we meet any format restrictions" + for encoding in bytes_transform_encodings: + with self.subTest(encoding=encoding): + encoded_data = codecs.encode(data, encoding) + fmt = ("'{}' decoder returned 'bytes' instead of 'str'; " + "use codecs.decode to decode to arbitrary types") + msg = fmt.format(encoding) + with self.assertRaisesRegex(TypeError, msg): + encoded_data.decode(encoding) + with self.assertRaisesRegex(TypeError, msg): + bytearray(encoded_data).decode(encoding) + + def test_bad_encoding_output_type(self): + # Check str.encode gives a good error message for str -> str codecs + msg = ("'rot_13' encoder returned 'str' instead of 'bytes'; " + "use codecs.encode to encode to arbitrary types") + with self.assertRaisesRegex(TypeError, msg): + "just an example message".encode("rot_13") diff -r f89beccd470c Objects/unicodeobject.c --- a/Objects/unicodeobject.c Tue Nov 05 02:50:49 2013 -0800 +++ b/Objects/unicodeobject.c Wed Nov 06 00:55:58 2013 +1000 @@ -235,6 +235,7 @@ static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length); static PyObject* get_latin1_char(unsigned char ch); static int unicode_modifiable(PyObject *unicode); +static void wrap_codec_error(const char *operation, const char *encoding); static PyObject * @@ -3047,12 +3048,16 @@ if (buffer == NULL) goto onError; unicode = PyCodec_Decode(buffer, encoding, errors); - if (unicode == NULL) + if (unicode == NULL) { + wrap_codec_error("decoding", encoding); goto onError; + } if (!PyUnicode_Check(unicode)) { PyErr_Format(PyExc_TypeError, - "decoder did not return a str object (type=%.400s)", - Py_TYPE(unicode)->tp_name); + "'%.400s' decoder returned '%.400s' instead of 'str'; " + "use codecs.decode to decode to arbitrary types", + encoding, + Py_TYPE(unicode)->tp_name); Py_DECREF(unicode); goto onError; } @@ -3081,8 +3086,10 @@ /* Decode via the codec registry */ v = PyCodec_Decode(unicode, encoding, errors); - if (v == 
NULL) + if (v == NULL) { + wrap_codec_error("decoding", encoding); goto onError; + } return unicode_result(v); onError: @@ -3106,12 +3113,16 @@ /* Decode via the codec registry */ v = PyCodec_Decode(unicode, encoding, errors); - if (v == NULL) + if (v == NULL) { + wrap_codec_error("decoding", encoding); goto onError; + } if (!PyUnicode_Check(v)) { PyErr_Format(PyExc_TypeError, - "decoder did not return a str object (type=%.400s)", - Py_TYPE(v)->tp_name); + "'%.400s' decoder returned '%.400s' instead of 'str'; " + "use codecs.decode to decode to arbitrary types", + encoding, + Py_TYPE(v)->tp_name); Py_DECREF(v); goto onError; } @@ -3371,6 +3382,60 @@ #endif } + +/* Helper to ensure the exception chain indicates the codec that was + * invoked to trigger the failure. + * + * We limit this to *exact* matches on a whitelist of types that we + * know we can wrap correctly. + * + * We need to be very careful with what we wrap, since changing types to + * a broader exception type would be backwards incompatible for + * existing codecs, and subclasses of the known types may either + * not support instantiation with PyErr_Format or lose information + * when instantiated that way. + * + * We skip wrapping UnicodeEncodeError and UnicodeDecodeError since + * they're annoying to wrap correctly and also already mention the + * codec that triggered the error by name. + */ +static void +wrap_codec_error(const char *operation, + const char *encoding) +{ + PyObject *exc, *val, *tb; + PyObject *new_exc, *new_val, *new_tb; + PyErr_Fetch(&exc, &val, &tb); + if (exc == PyExc_TypeError || + exc == PyExc_ValueError || + exc == PyExc_AttributeError + ) { + /* For whitelisted exception types, we chain the original + * exception to a new one of the exact same type with an + * error message that mentions the current codec and the + * original exception. 
+ * + * It would be nice to wrap OSError as well, but that's a + * bit trickier due to the extra state potentially stored + * on OSError instances. + */ + PyErr_NormalizeException(&exc, &val, &tb); + PyErr_Format(exc, + "%s with '%s' codec failed (%s: %S)", + operation, encoding, + Py_TYPE(val)->tp_name, val); + Py_DECREF(exc); + Py_XDECREF(tb); + PyErr_Fetch(&new_exc, &new_val, &new_tb); + PyErr_NormalizeException(&new_exc, &new_val, &new_tb); + PyException_SetCause(new_val, val); + PyErr_Restore(new_exc, new_val, new_tb); + } + else { + PyErr_Restore(exc, val, tb); + } +} + PyObject * PyUnicode_AsEncodedString(PyObject *unicode, const char *encoding, @@ -3409,8 +3474,10 @@ /* Encode via the codec registry */ v = PyCodec_Encode(unicode, encoding, errors); - if (v == NULL) - return NULL; + if (v == NULL) { + wrap_codec_error("encoding", encoding); + return NULL; + } /* The normal path */ if (PyBytes_Check(v)) @@ -3422,7 +3489,8 @@ PyObject *b; error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, - "encoder %s returned bytearray instead of bytes", + "encoder %s returned bytearray instead of bytes; " + "use codecs.encode to encode to arbitrary types", encoding); if (error) { Py_DECREF(v); @@ -3435,8 +3503,10 @@ } PyErr_Format(PyExc_TypeError, - "encoder did not return a bytes object (type=%.400s)", - Py_TYPE(v)->tp_name); + "'%.400s' encoder returned '%.400s' instead of 'bytes'; " + "use codecs.encode to encode to arbitrary types", + encoding, + Py_TYPE(v)->tp_name); Py_DECREF(v); return NULL; } @@ -3458,12 +3528,16 @@ /* Encode via the codec registry */ v = PyCodec_Encode(unicode, encoding, errors); - if (v == NULL) + if (v == NULL) { + wrap_codec_error("encoding", encoding); goto onError; + } if (!PyUnicode_Check(v)) { PyErr_Format(PyExc_TypeError, - "encoder did not return an str object (type=%.400s)", - Py_TYPE(v)->tp_name); + "'%.400s' encoder returned '%.400s' instead of 'str'; " + "use codecs.encode to encode to arbitrary types", + encoding, 
+ Py_TYPE(v)->tp_name); Py_DECREF(v); goto onError; }