diff -r 478523c1aafc Include/codecs.h --- a/Include/codecs.h Sun Feb 16 14:52:01 2014 -0500 +++ b/Include/codecs.h Mon Feb 17 19:39:07 2014 +0200 @@ -94,6 +94,33 @@ const char *errors ); +#ifndef PY_LIMITED_API +/* Text codec specific encoding and decoding API. + + Checks the encoding against a list of codecs which do not + implement a str<->bytes encoding before attempting the + operation. + + Please note that these APIs are internal and should not + be used in Python C extensions. + + */ + +PyAPI_FUNC(PyObject *) _PyCodec_EncodeText( + PyObject *object, + const char *encoding, + const char *errors + ); + +PyAPI_FUNC(PyObject *) _PyCodec_DecodeText( + PyObject *object, + const char *encoding, + const char *errors + ); +#endif + + + /* --- Codec Lookup APIs -------------------------------------------------- All APIs return a codec object with incremented refcount and are diff -r 478523c1aafc Lib/codecs.py --- a/Lib/codecs.py Sun Feb 16 14:52:01 2014 -0500 +++ b/Lib/codecs.py Mon Feb 17 19:39:07 2014 +0200 @@ -73,9 +73,19 @@ ### Codec base classes (defining the API) class CodecInfo(tuple): + """Codec details when looking up the codec registry""" + + # Private API to allow Python 3.4 to blacklist the known non-Unicode + # codecs in the standard library. 
A more general mechanism to + # reliably distinguish text encodings from other codecs will hopefully + # be defined for Python 3.5 + # + # See http://bugs.python.org/issue19619 + _is_text_encoding = True # Assume codecs are text encodings by default def __new__(cls, encode, decode, streamreader=None, streamwriter=None, - incrementalencoder=None, incrementaldecoder=None, name=None): + incrementalencoder=None, incrementaldecoder=None, name=None, + *, _is_text_encoding=None): self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter)) self.name = name self.encode = encode @@ -84,6 +94,8 @@ self.incrementaldecoder = incrementaldecoder self.streamwriter = streamwriter self.streamreader = streamreader + if _is_text_encoding is not None: + self._is_text_encoding = _is_text_encoding return self def __repr__(self): diff -r 478523c1aafc Lib/encodings/base64_codec.py --- a/Lib/encodings/base64_codec.py Sun Feb 16 14:52:01 2014 -0500 +++ b/Lib/encodings/base64_codec.py Mon Feb 17 19:39:07 2014 +0200 @@ -52,4 +52,5 @@ incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader, + _is_text_encoding=False, ) diff -r 478523c1aafc Lib/encodings/bz2_codec.py --- a/Lib/encodings/bz2_codec.py Sun Feb 16 14:52:01 2014 -0500 +++ b/Lib/encodings/bz2_codec.py Mon Feb 17 19:39:07 2014 +0200 @@ -74,4 +74,5 @@ incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader, + _is_text_encoding=False, ) diff -r 478523c1aafc Lib/encodings/hex_codec.py --- a/Lib/encodings/hex_codec.py Sun Feb 16 14:52:01 2014 -0500 +++ b/Lib/encodings/hex_codec.py Mon Feb 17 19:39:07 2014 +0200 @@ -52,4 +52,5 @@ incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader, + _is_text_encoding=False, ) diff -r 478523c1aafc Lib/encodings/quopri_codec.py --- a/Lib/encodings/quopri_codec.py Sun Feb 16 14:52:01 2014 -0500 +++ b/Lib/encodings/quopri_codec.py Mon Feb 17 19:39:07 2014 +0200 @@ -53,4 +53,5 @@ 
incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader, + _is_text_encoding=False, ) diff -r 478523c1aafc Lib/encodings/rot_13.py --- a/Lib/encodings/rot_13.py Sun Feb 16 14:52:01 2014 -0500 +++ b/Lib/encodings/rot_13.py Mon Feb 17 19:39:07 2014 +0200 @@ -43,6 +43,7 @@ incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader, + _is_text_encoding=False, ) ### Map diff -r 478523c1aafc Lib/encodings/uu_codec.py --- a/Lib/encodings/uu_codec.py Sun Feb 16 14:52:01 2014 -0500 +++ b/Lib/encodings/uu_codec.py Mon Feb 17 19:39:07 2014 +0200 @@ -96,4 +96,5 @@ incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_text_encoding=False, ) diff -r 478523c1aafc Lib/encodings/zlib_codec.py --- a/Lib/encodings/zlib_codec.py Sun Feb 16 14:52:01 2014 -0500 +++ b/Lib/encodings/zlib_codec.py Mon Feb 17 19:39:07 2014 +0200 @@ -74,4 +74,5 @@ incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_text_encoding=False, ) diff -r 478523c1aafc Lib/test/test_codecs.py --- a/Lib/test/test_codecs.py Sun Feb 16 14:52:01 2014 -0500 +++ b/Lib/test/test_codecs.py Mon Feb 17 19:39:07 2014 +0200 @@ -4,6 +4,7 @@ import sys import unittest import warnings +import encodings from test import support @@ -2405,6 +2406,47 @@ sout = reader.readline() self.assertEqual(sout, b"\x80") + def test_text_to_binary_blacklists_binary_transforms(self): + # Check binary -> binary codecs give a good error for str input + bad_input = "bad input type" + for encoding in bytes_transform_encodings: + fmt = (r"{!r} is not a text encoding; " + r"use codecs.encode\(\) to handle arbitrary codecs") + msg = fmt.format(encoding) + with self.assertRaisesRegex(LookupError, msg) as failure: + bad_input.encode(encoding) + self.assertIsNone(failure.exception.__cause__) + + def test_text_to_binary_blacklists_text_transforms(self): + # Check str.encode gives a good error message for 
str -> str codecs + msg = (r"^'rot_13' is not a text encoding; " + r"use codecs.encode\(\) to handle arbitrary codecs") + with self.assertRaisesRegex(LookupError, msg): + "just an example message".encode("rot_13") + + def test_binary_to_text_blacklists_binary_transforms(self): + # Check bytes.decode and bytearray.decode give a good error + # message for binary -> binary codecs + data = b"encode first to ensure we meet any format restrictions" + for encoding in bytes_transform_encodings: + encoded_data = codecs.encode(data, encoding) + fmt = (r"{!r} is not a text encoding; " + r"use codecs.decode\(\) to handle arbitrary codecs") + msg = fmt.format(encoding) + with self.assertRaisesRegex(LookupError, msg): + encoded_data.decode(encoding) + with self.assertRaisesRegex(LookupError, msg): + bytearray(encoded_data).decode(encoding) + + def test_binary_to_text_blacklists_text_transforms(self): + # Check str -> str codec gives a good error for binary input + for bad_input in (b"immutable", bytearray(b"mutable")): + msg = (r"^'rot_13' is not a text encoding; " + r"use codecs.decode\(\) to handle arbitrary codecs") + with self.assertRaisesRegex(LookupError, msg) as failure: + bad_input.decode("rot_13") + self.assertIsNone(failure.exception.__cause__) + @unittest.skipUnless(sys.platform == 'win32', 'code pages are specific to Windows') diff -r 478523c1aafc Misc/NEWS --- a/Misc/NEWS Sun Feb 16 14:52:01 2014 -0500 +++ b/Misc/NEWS Mon Feb 17 19:39:07 2014 +0200 @@ -10,6 +10,12 @@ Core and Builtins ----------------- +- Issue #19619: str.encode, bytes.decode and bytearray.decode now use an + internal API to throw LookupError for known non-text encodings, rather + than attempting the encoding or decoding operation and then throwing a + TypeError for an unexpected output type. (The latter mechanism remains + in place for third party non-text encodings) + - Issue #20588: Make Python-ast.c C89 compliant. - Issue #20437: Fixed 21 potential bugs when deleting objects references. 
diff -r 478523c1aafc Objects/unicodeobject.c --- a/Objects/unicodeobject.c Sun Feb 16 14:52:01 2014 -0500 +++ b/Objects/unicodeobject.c Mon Feb 17 19:39:07 2014 +0200 @@ -3129,7 +3129,7 @@ buffer = PyMemoryView_FromBuffer(&info); if (buffer == NULL) goto onError; - unicode = PyCodec_Decode(buffer, encoding, errors); + unicode = _PyCodec_DecodeText(buffer, encoding, errors); if (unicode == NULL) goto onError; if (!PyUnicode_Check(unicode)) { @@ -3489,7 +3489,7 @@ } /* Encode via the codec registry */ - v = PyCodec_Encode(unicode, encoding, errors); + v = _PyCodec_EncodeText(unicode, encoding, errors); if (v == NULL) return NULL; diff -r 478523c1aafc Python/codecs.c --- a/Python/codecs.c Sun Feb 16 14:52:01 2014 -0500 +++ b/Python/codecs.c Mon Feb 17 19:39:07 2014 +0200 @@ -337,18 +337,15 @@ errors is passed to the encoder factory as argument if non-NULL. */ -PyObject *PyCodec_Encode(PyObject *object, - const char *encoding, - const char *errors) +static PyObject * +_PyCodec_EncodeInternal(PyObject *object, + PyObject *encoder, + const char *encoding, + const char *errors) { - PyObject *encoder = NULL; PyObject *args = NULL, *result = NULL; PyObject *v = NULL; - encoder = PyCodec_Encoder(encoding); - if (encoder == NULL) - goto onError; - args = args_tuple(object, errors); if (args == NULL) goto onError; @@ -384,18 +381,15 @@ errors is passed to the decoder factory as argument if non-NULL. 
*/ -PyObject *PyCodec_Decode(PyObject *object, - const char *encoding, - const char *errors) +static PyObject * +_PyCodec_DecodeInternal(PyObject *object, + PyObject *decoder, + const char *encoding, + const char *errors) { - PyObject *decoder = NULL; PyObject *args = NULL, *result = NULL; PyObject *v; - decoder = PyCodec_Decoder(encoding); - if (decoder == NULL) - goto onError; - args = args_tuple(object, errors); if (args == NULL) goto onError; @@ -425,6 +419,126 @@ return NULL; } +/* Generic encoding/decoding API */ +PyObject *PyCodec_Encode(PyObject *object, + const char *encoding, + const char *errors) +{ + PyObject *encoder; + + encoder = PyCodec_Encoder(encoding); + if (encoder == NULL) + return NULL; + + return _PyCodec_EncodeInternal(object, encoder, encoding, errors); +} + +PyObject *PyCodec_Decode(PyObject *object, + const char *encoding, + const char *errors) +{ + PyObject *decoder; + + decoder = PyCodec_Decoder(encoding); + if (decoder == NULL) + return NULL; + + return _PyCodec_DecodeInternal(object, decoder, encoding, errors); +} + +/* Text encoding/decoding API */ +/* Look up `encoding` and return a new reference to item `index` of its + * codec tuple (0 = encoder, 1 = decoder). Fails with LookupError when the + * codec's _is_text_encoding attribute is false; `operation_name` names the + * codecs function suggested in that message. Returns NULL on error. */ +static +PyObject *codec_getitem_checked(const char *encoding, + const char *operation_name, + int index) +{ + _Py_IDENTIFIER(_is_text_encoding); + PyObject *codec; + PyObject *attr; + PyObject *v; + int is_text_codec; + + codec = _PyCodec_Lookup(encoding); + if (codec == NULL) + return NULL; + + /* Backwards compatibility: assume any raw tuple describes a text + * encoding, and the same for anything lacking the private + * attribute. 
+ */ + if (!PyTuple_CheckExact(codec)) { + attr = _PyObject_GetAttrId(codec, &PyId__is_text_encoding); + if (attr == NULL) { + if (PyErr_ExceptionMatches(PyExc_AttributeError)) { + PyErr_Clear(); + } else { + Py_DECREF(codec); + return NULL; + } + } else { + is_text_codec = PyObject_IsTrue(attr); + Py_DECREF(attr); + /* PyObject_IsTrue() returns -1 with an exception set on + * failure; propagate that error rather than masking it. */ + if (is_text_codec <= 0) { + Py_DECREF(codec); + if (!is_text_codec) { + PyErr_Format(PyExc_LookupError, + "'%.400s' is not a text encoding; " + "use codecs.%s() to handle arbitrary codecs", + encoding, operation_name); + } + return NULL; + } + } + } + + v = PyTuple_GET_ITEM(codec, index); + Py_DECREF(codec); + Py_INCREF(v); + return v; +} + +static PyObject * _PyCodec_TextEncoder(const char *encoding) +{ + return codec_getitem_checked(encoding, "encode", 0); +} + +static PyObject * _PyCodec_TextDecoder(const char *encoding) +{ + return codec_getitem_checked(encoding, "decode", 1); +} + +PyObject *_PyCodec_EncodeText(PyObject *object, + const char *encoding, + const char *errors) +{ + PyObject *encoder; + + encoder = _PyCodec_TextEncoder(encoding); + if (encoder == NULL) + return NULL; + + return _PyCodec_EncodeInternal(object, encoder, encoding, errors); +} + +PyObject *_PyCodec_DecodeText(PyObject *object, + const char *encoding, + const char *errors) +{ + PyObject *decoder; + + decoder = _PyCodec_TextDecoder(encoding); + if (decoder == NULL) + return NULL; + + return _PyCodec_DecodeInternal(object, decoder, encoding, errors); +} + /* Register the error handling callback function error under the name name. This function will be called by the codec when it encounters an unencodable characters/undecodable bytes and doesn't know the