diff -r 13203ea0ac5b Doc/whatsnew/3.4.rst --- a/Doc/whatsnew/3.4.rst Wed Nov 13 13:29:37 2013 +0100 +++ b/Doc/whatsnew/3.4.rst Wed Nov 13 23:28:37 2013 +1000 @@ -102,6 +102,7 @@ * :ref:`PEP 446: Make newly created file descriptors non-inheritable `. * command line option for :ref:`isolated mode `, (:issue:`16499`). +* improvements to handling of non-Unicode codecs Significantly Improved Library Modules: @@ -170,6 +171,70 @@ PEP written and implemented by Victor Stinner. +Improvements to handling of non-Unicode codecs +============================================== + +Since it was first introduced, the :mod:`codecs` module has always been +intended to operate as a type-neutral dynamic encoding and decoding +system. However, its close coupling with the Python text model, especially +the type restricted convenience methods on the builtin :class:`str`, +:class:`bytes` and :class:`bytearray` types, has historically obscured that +fact. + +As a key step in clarifying the situation, the :meth:`codecs.encode` and +:meth:`codecs.decode` convenience functions are now properly documented in +Python 2.7, 3.3 and 3.4. These functions have existed in the :mod:`codecs` +module and have been covered by the regression test suite since Python 2.4, +but were previously only discoverable through runtime introspection. + +Unlike the convenience methods on :class:`str`, :class:`bytes` and +:class:`bytearray`, these convenience functions support arbitrary codecs +in both Python 2 and Python 3, rather than being limited to Unicode text +encodings (in Python 3) or ``basestring`` <-> ``basestring`` conversions +(in Python 2). + +In Python 3.4, the errors raised by the convenience methods when a codec +produces the incorrect output type have also been updated to direct users +towards these general purpose convenience functions:: + + >>> import codecs + + >>> codecs.encode(b"hello", "bz2_codec").decode("bz2_codec") + Traceback (most recent call last): + File "", line 1, in + TypeError: 'bz2_codec' decoder returned 'bytes' instead of 'str'; use codecs.decode() to decode to arbitrary types + + >>> "hello".encode("rot_13") + Traceback (most recent call last): + File "", line 1, in + TypeError: 'rot_13' encoder returned 'str' instead of 'bytes'; use codecs.encode() to encode to arbitrary types + +In a related change, whenever it is feasible without breaking backwards +compatibility, exceptions raised during encoding and decoding operations +will be wrapped in a chained exception of the same type that mentions the +name of the codec responsible for producing the error:: + + >>> b"hello".decode("uu_codec") + ValueError: Missing "begin" line in input data + + The above exception was the direct cause of the following exception: + + Traceback (most recent call last): + File "", line 1, in + ValueError: decoding with 'uu_codec' codec failed (ValueError: Missing "begin" line in input data) + + >>> "hello".encode("bz2_codec") + TypeError: 'str' does not support the buffer interface + + The above exception was the direct cause of the following exception: + + Traceback (most recent call last): + File "", line 1, in + TypeError: encoding with 'bz2_codec' codec failed (TypeError: 'str' does not support the buffer interface) + +(Contributed by Nick Coghlan in :issue:`17827` and :issue:`17828`) + + Other Language Changes ====================== @@ -262,19 +327,6 @@ Added support for 24-bit samples (:issue:`12866`). -codecs ------- - -The :meth:`codecs.encode` and :meth:`codecs.decode` convenience functions are -now properly documented. These functions have existed in the :mod:`codecs` -module since ~2004, but were previously only discoverable through runtime -introspection. - -Unlike the convenience methods on :class:`str`, :class:`bytes` and -:class:`bytearray`, these convenience functions support arbitrary codecs, -rather than being limited to Unicode text encodings. - - colorsys -------- diff -r 13203ea0ac5b Include/pyerrors.h --- a/Include/pyerrors.h Wed Nov 13 13:29:37 2013 +0100 +++ b/Include/pyerrors.h Wed Nov 13 23:28:37 2013 +1000 @@ -285,6 +285,28 @@ const char *name, const char *doc, PyObject *base, PyObject *dict); PyAPI_FUNC(void) PyErr_WriteUnraisable(PyObject *); +/* In exceptions.c */ +#ifndef Py_LIMITED_API +/* Helper that attempts to replace the current exception with one of the + * same type but with a prefix added to the exception text. The resulting + * exception description looks like: + * + * prefix (exc_type: original_exc_str) + * + * Only some exceptions can be safely replaced. If the function determines + * it isn't safe to perform the replacement, it will leave the original + * unmodified exception in place. + * + * Returns a borrowed reference to the new exception (if any), NULL if the + * existing exception was left in place. + */ +PyAPI_FUNC(PyObject *) _PyErr_TrySetFromCause( + const char *prefix_format, /* ASCII-encoded string */ + ... + ); +#endif + + /* In sigcheck.c or signalmodule.c */ PyAPI_FUNC(int) PyErr_CheckSignals(void); PyAPI_FUNC(void) PyErr_SetInterrupt(void); diff -r 13203ea0ac5b Lib/test/test_codecs.py --- a/Lib/test/test_codecs.py Wed Nov 13 13:29:37 2013 +0100 +++ b/Lib/test/test_codecs.py Wed Nov 13 23:28:37 2013 +1000 @@ -1,5 +1,6 @@ import _testcapi import codecs +import contextlib import io import locale import sys @@ -2292,28 +2293,31 @@ def test_basics(self): binput = bytes(range(256)) for encoding in bytes_transform_encodings: - # generic codecs interface - (o, size) = codecs.getencoder(encoding)(binput) - self.assertEqual(size, len(binput)) - (i, size) = codecs.getdecoder(encoding)(o) - self.assertEqual(size, len(o)) - self.assertEqual(i, binput) + with self.subTest(encoding=encoding): + # generic codecs interface + (o, size) = codecs.getencoder(encoding)(binput) + self.assertEqual(size, len(binput)) + (i, size) = codecs.getdecoder(encoding)(o) + self.assertEqual(size, len(o)) + self.assertEqual(i, binput) def test_read(self): for encoding in bytes_transform_encodings: - sin = codecs.encode(b"\x80", encoding) - reader = codecs.getreader(encoding)(io.BytesIO(sin)) - sout = reader.read() - self.assertEqual(sout, b"\x80") + with self.subTest(encoding=encoding): + sin = codecs.encode(b"\x80", encoding) + reader = codecs.getreader(encoding)(io.BytesIO(sin)) + sout = reader.read() + self.assertEqual(sout, b"\x80") def test_readline(self): for encoding in bytes_transform_encodings: if encoding in ['uu_codec', 'zlib_codec']: continue - sin = codecs.encode(b"\x80", encoding) - reader = codecs.getreader(encoding)(io.BytesIO(sin)) - sout = reader.readline() - self.assertEqual(sout, b"\x80") + with self.subTest(encoding=encoding): + sin = codecs.encode(b"\x80", encoding) + reader = codecs.getreader(encoding)(io.BytesIO(sin)) + sout = reader.readline() + self.assertEqual(sout, b"\x80") def test_buffer_api_usage(self): # We check all the transform codecs accept memoryview input @@ -2321,17 +2325,158 @@ # and also that they roundtrip correctly original = b"12345\x80" for encoding in bytes_transform_encodings: - data = original - view = memoryview(data) - data = codecs.encode(data, encoding) - view_encoded = codecs.encode(view, encoding) - self.assertEqual(view_encoded, data) - view = memoryview(data) - data = codecs.decode(data, encoding) - self.assertEqual(data, original) - view_decoded = codecs.decode(view, encoding) - self.assertEqual(view_decoded, data) + with self.subTest(encoding=encoding): + data = original + view = memoryview(data) + data = codecs.encode(data, encoding) + view_encoded = codecs.encode(view, encoding) + self.assertEqual(view_encoded, data) + view = memoryview(data) + data = codecs.decode(data, encoding) + self.assertEqual(data, original) + view_decoded = codecs.decode(view, encoding) + self.assertEqual(view_decoded, data) + def test_type_error_for_text_input(self): + # Check binary -> binary codecs give a good error for str input + bad_input = "bad input type" + for encoding in bytes_transform_encodings: + with self.subTest(encoding=encoding): + msg = "^encoding with '{}' codec failed".format(encoding) + with self.assertRaisesRegex(TypeError, msg) as failure: + bad_input.encode(encoding) + self.assertTrue(isinstance(failure.exception.__cause__, + TypeError)) + + def test_type_error_for_binary_input(self): + # Check str -> str codec gives a good error for binary input + for bad_input in (b"immutable", bytearray(b"mutable")): + with self.subTest(bad_input=bad_input): + msg = "^decoding with 'rot_13' codec failed" + with self.assertRaisesRegex(AttributeError, msg) as failure: + bad_input.decode("rot_13") + self.assertTrue(isinstance(failure.exception.__cause__, + AttributeError)) + + def test_bad_decoding_output_type(self): + # Check bytes.decode and bytearray.decode give a good error + # message for binary -> binary codecs + data = b"encode first to ensure we meet any format restrictions" + for encoding in bytes_transform_encodings: + with self.subTest(encoding=encoding): + encoded_data = codecs.encode(data, encoding) + fmt = ("'{}' decoder returned 'bytes' instead of 'str'; " + "use codecs.decode\(\) to decode to arbitrary types") + msg = fmt.format(encoding) + with self.assertRaisesRegex(TypeError, msg): + encoded_data.decode(encoding) + with self.assertRaisesRegex(TypeError, msg): + bytearray(encoded_data).decode(encoding) + + def test_bad_encoding_output_type(self): + # Check str.encode gives a good error message for str -> str codecs + msg = ("'rot_13' encoder returned 'str' instead of 'bytes'; " + "use codecs.encode\(\) to encode to arbitrary types") + with self.assertRaisesRegex(TypeError, msg): + "just an example message".encode("rot_13") + + +# The codec system tries to wrap exceptions in order to ensure the error +# mentions the operation being performed and the codec involved. We +# currently *only* want this to happen for relatively stateless +# exceptions, where the only significant information they contain is their +# type and a single str argument. +class ExceptionChainingTest(unittest.TestCase): + + def setUp(self): + # There's no way to unregister a codec search function, so we just + # ensure we render this one fairly harmless after the test + # case finishes by using the test case repr as the codec name + # The codecs module normalizes codec names, although this doesn't + # appear to be formally documented... + self.codec_name = repr(self).lower().replace(" ", "-") + self.codec_info = None + codecs.register(self.get_codec) + + def get_codec(self, codec_name): + if codec_name != self.codec_name: + return None + return self.codec_info + + def set_codec(self, obj_to_raise): + def raise_obj(*args, **kwds): + raise obj_to_raise + self.codec_info = codecs.CodecInfo(raise_obj, raise_obj, + name=self.codec_name) + + @contextlib.contextmanager + def assertWrapped(self, operation, exc_type, msg): + full_msg = "{} with '{}' codec failed \({}: {}\)".format( + operation, self.codec_name, exc_type.__name__, msg) + with self.assertRaisesRegex(exc_type, full_msg) as caught: + yield caught + + def check_wrapped(self, obj_to_raise, msg): + self.set_codec(obj_to_raise) + with self.assertWrapped("encoding", RuntimeError, msg): + "str_input".encode(self.codec_name) + with self.assertWrapped("encoding", RuntimeError, msg): + codecs.encode("str_input", self.codec_name) + with self.assertWrapped("decoding", RuntimeError, msg): + b"bytes input".decode(self.codec_name) + with self.assertWrapped("decoding", RuntimeError, msg): + codecs.decode(b"bytes input", self.codec_name) + + def test_raise_by_type(self): + self.check_wrapped(RuntimeError, "") + + def test_raise_by_value(self): + msg = "This should be wrapped" + self.check_wrapped(RuntimeError(msg), msg) + + @contextlib.contextmanager + def assertNotWrapped(self, operation, exc_type, msg): + with self.assertRaisesRegex(exc_type, msg) as caught: + yield caught + actual_msg = str(caught.exception) + self.assertNotIn(operation, actual_msg) + self.assertNotIn(self.codec_name, actual_msg) + + def check_not_wrapped(self, obj_to_raise, msg): + self.set_codec(obj_to_raise) + with self.assertNotWrapped("encoding", RuntimeError, msg): + "str input".encode(self.codec_name) + with self.assertNotWrapped("encoding", RuntimeError, msg): + codecs.encode("str input", self.codec_name) + with self.assertNotWrapped("decoding", RuntimeError, msg): + b"bytes input".decode(self.codec_name) + with self.assertNotWrapped("decoding", RuntimeError, msg): + codecs.decode(b"bytes input", self.codec_name) + + def test_init_override_is_not_wrapped(self): + class CustomInit(RuntimeError): + def __init__(self): + pass + self.check_not_wrapped(CustomInit, "") + + def test_new_override_is_not_wrapped(self): + class CustomNew(RuntimeError): + def __new__(cls): + return super().__new__(cls) + self.check_not_wrapped(CustomNew, "") + + def test_instance_attribute_is_not_wrapped(self): + msg = "This should NOT be wrapped" + exc = RuntimeError(msg) + exc.attr = 1 + self.check_not_wrapped(exc, msg) + + def test_non_str_arg_is_not_wrapped(self): + self.check_not_wrapped(RuntimeError(1), "1") + + def test_multiple_args_is_not_wrapped(self): + msg = "\('a', 'b', 'c'\)" + self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg) @unittest.skipUnless(sys.platform == 'win32', diff -r 13203ea0ac5b Objects/exceptions.c --- a/Objects/exceptions.c Wed Nov 13 13:29:37 2013 +0100 +++ b/Objects/exceptions.c Wed Nov 13 23:28:37 2013 +1000 @@ -2591,3 +2591,116 @@ free_preallocated_memerrors(); Py_CLEAR(errnomap); } + +/* Helper to do the equivalent of "raise X from Y" in C, but always using + * the current exception rather than passing one in. + * + * We currently limit this to *only* exceptions that use the BaseException + * tp_init and tp_new methods, since we can be reasonably sure we can wrap + * those correctly without losing data and without losing backwards + * compatibility. + * + * We also aim to rule out *all* exceptions that might be storing additional + * state, whether by having a size difference relative to BaseException, + * additional arguments passed in during construction or by having a + * non-empty instance dict. + * + * We need to be very careful with what we wrap, since changing types to + * a broader exception type would be backwards incompatible for + * existing codecs, and with different init or new method implementations + * may either not support instantiation with PyErr_Format or lose + * information when instantiated that way. + * + * XXX (ncoghlan): This could be made more comprehensive by exploiting the + * fact that exceptions are expected to support pickling. If more builtin + * exceptions (e.g. AttributeError) start to be converted to rich + * exceptions with additional attributes, that's probably a better approach + * to pursue over adding special cases for particular stateful subclasses. + * + * Returns a borrowed reference to the new exception (if any), NULL if the + * existing exception was left in place. + */ +PyObject * +_PyErr_TrySetFromCause(const char *format, ...) +{ + PyObject* msg_prefix; + PyObject *exc, *val, *tb; + PyTypeObject *caught_type; + PyObject *instance_dict; + PyObject *instance_args; + Py_ssize_t num_args; + PyObject *new_exc, *new_val, *new_tb; + va_list vargs; + +#ifdef HAVE_STDARG_PROTOTYPES + va_start(vargs, format); +#else + va_start(vargs); +#endif + + PyErr_Fetch(&exc, &val, &tb); + caught_type = (PyTypeObject *) exc; + /* Ensure type info indicates no extra state is stored at the C level */ + if (caught_type->tp_init != (initproc) BaseException_init || + caught_type->tp_new != BaseException_new || + caught_type->tp_basicsize != _PyExc_BaseException.tp_basicsize || + caught_type->tp_itemsize != _PyExc_BaseException.tp_itemsize + ) { + /* We can't be sure we can wrap this safely, since it may contain + * more state than just the exception type. Accordingly, we just + * leave it alone. + */ + PyErr_Restore(exc, val, tb); + return NULL; + } + + /* Check the args are empty or contain a single string */ + PyErr_NormalizeException(&exc, &val, &tb); + instance_args = ((PyBaseExceptionObject *) val)->args; + num_args = PyTuple_GET_SIZE(instance_args); + if ((num_args > 1) || + (num_args == 1 && + !PyUnicode_CheckExact(PyTuple_GET_ITEM(instance_args, 0)) + ) + ) { + /* More than 1 arg, or the one arg we do have isn't a string + */ + PyErr_Restore(exc, val, tb); + return NULL; + } + + /* Ensure the instance dict is also empty */ + instance_dict = *_PyObject_GetDictPtr(val); + if (instance_dict != NULL && PyObject_Length(instance_dict) > 0) { + /* While we could potentially copy a non-empty instance dictionary + * to the replacement exception, for now we take the more + * conservative path of leaving exceptions with attributes set + * alone. + */ + PyErr_Restore(exc, val, tb); + return NULL; + } + + /* For exceptions that we can wrap safely, we chain the original + * exception to a new one of the exact same type with an + * error message that mentions the additional details and the + * original exception. + * + * It would be nice to wrap OSError and various other exception + * types as well, but that's quite a bit trickier due to the extra + * state potentially stored on OSError instances. + */ + msg_prefix = PyUnicode_FromFormatV(format, vargs); + if (msg_prefix == NULL) + return NULL; + + PyErr_Format(exc, "%U (%s: %S)", + msg_prefix, Py_TYPE(val)->tp_name, val); + Py_DECREF(exc); + Py_XDECREF(tb); + PyErr_Fetch(&new_exc, &new_val, &new_tb); + PyErr_NormalizeException(&new_exc, &new_val, &new_tb); + PyException_SetCause(new_val, val); + PyErr_Restore(new_exc, new_val, new_tb); + return new_val; +} diff -r 13203ea0ac5b Objects/unicodeobject.c --- a/Objects/unicodeobject.c Wed Nov 13 13:29:37 2013 +0100 +++ b/Objects/unicodeobject.c Wed Nov 13 23:28:37 2013 +1000 @@ -3054,8 +3054,10 @@ goto onError; if (!PyUnicode_Check(unicode)) { PyErr_Format(PyExc_TypeError, - "decoder did not return a str object (type=%.400s)", - Py_TYPE(unicode)->tp_name); + "'%.400s' decoder returned '%.400s' instead of 'str'; " + "use codecs.decode() to decode to arbitrary types", + encoding, + Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name); Py_DECREF(unicode); goto onError; } @@ -3113,8 +3115,10 @@ goto onError; if (!PyUnicode_Check(v)) { PyErr_Format(PyExc_TypeError, - "decoder did not return a str object (type=%.400s)", - Py_TYPE(v)->tp_name); + "'%.400s' decoder returned '%.400s' instead of 'str'; " + "use codecs.decode() to decode to arbitrary types", + encoding, + Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name); Py_DECREF(v); goto onError; } @@ -3425,7 +3429,8 @@ PyObject *b; error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, - "encoder %s returned bytearray instead of bytes", + "encoder %s returned bytearray instead of bytes; " + "use codecs.encode() to encode to arbitrary types", encoding); if (error) { Py_DECREF(v); @@ -3438,8 +3443,10 @@ } PyErr_Format(PyExc_TypeError, - "encoder did not return a bytes object (type=%.400s)", - Py_TYPE(v)->tp_name); + "'%.400s' encoder returned '%.400s' instead of 'bytes'; " + "use codecs.encode() to encode to arbitrary types", + encoding, + Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name); Py_DECREF(v); return NULL; } @@ -3465,8 +3472,10 @@ goto onError; if (!PyUnicode_Check(v)) { PyErr_Format(PyExc_TypeError, - "encoder did not return an str object (type=%.400s)", - Py_TYPE(v)->tp_name); + "'%.400s' encoder returned '%.400s' instead of 'str'; " + "use codecs.encode() to encode to arbitrary types", + encoding, + Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name); Py_DECREF(v); goto onError; } diff -r 13203ea0ac5b Python/codecs.c --- a/Python/codecs.c Wed Nov 13 13:29:37 2013 +0100 +++ b/Python/codecs.c Wed Nov 13 23:28:37 2013 +1000 @@ -332,6 +332,22 @@ return codec_getstreamcodec(encoding, stream, errors, 3); } +/* Helper that tries to ensure the reported exception chain indicates the + * codec that was invoked to trigger the failure without changing the type + * of the exception raised. + */ +static void +wrap_codec_error(const char *operation, + const char *encoding) +{ + /* TrySetFromCause will replace the active exception with a suitably + * updated clone if it can, otherwise it will leave the original + * exception alone. + */ + _PyErr_TrySetFromCause("%s with '%s' codec failed", + operation, encoding); +} + /* Encode an object (e.g. an Unicode object) using the given encoding and return the resulting encoded object (usually a Python string). @@ -376,6 +392,7 @@ Py_XDECREF(result); Py_XDECREF(args); Py_XDECREF(encoder); + wrap_codec_error("encoding", encoding); return NULL; } @@ -422,6 +439,7 @@ Py_XDECREF(args); Py_XDECREF(decoder); Py_XDECREF(result); + wrap_codec_error("decoding", encoding); return NULL; }