diff -r ab73b7fd7523 Include/codecs.h
--- a/Include/codecs.h	Thu Nov 21 12:29:51 2013 +0100
+++ b/Include/codecs.h	Thu Nov 21 23:38:04 2013 +1000
@@ -94,6 +94,27 @@
        const char *errors
        );
 
+/* Text codec specific encoding and decoding API.
+
+   Rejects (with TypeError) any codec whose registry entry has a false
+   _is_text_encoding attribute before attempting the operation.
+
+ */
+
+PyAPI_FUNC(PyObject *) _PyCodec_EncodeText(
+       PyObject *object,
+       const char *encoding,
+       const char *errors
+       );
+
+PyAPI_FUNC(PyObject *) _PyCodec_DecodeText(
+       PyObject *object,
+       const char *encoding,
+       const char *errors
+       );
+
+
+
 /* --- Codec Lookup APIs --------------------------------------------------
 
    All APIs return a codec object with incremented refcount and are
diff -r ab73b7fd7523 Lib/codecs.py
--- a/Lib/codecs.py	Thu Nov 21 12:29:51 2013 +0100
+++ b/Lib/codecs.py	Thu Nov 21 23:38:04 2013 +1000
@@ -84,6 +84,15 @@
         self.incrementaldecoder = incrementaldecoder
         self.streamwriter = streamwriter
         self.streamreader = streamreader
+        self._is_text_encoding = True  # Default: treat every codec as a text encoding
+        return self
+
+    @classmethod
+    def _declare_transform(cls, *args, **kwds):
+        # Private helper: create a CodecInfo for a binary or text transform,
+        # i.e. with _is_text_encoding False (http://bugs.python.org/issue19619)
+        self = cls(*args, **kwds)
+        self._is_text_encoding = False
         return self
 
     def __repr__(self):
diff -r ab73b7fd7523 Lib/encodings/base64_codec.py
--- a/Lib/encodings/base64_codec.py	Thu Nov 21 12:29:51 2013 +0100
+++ b/Lib/encodings/base64_codec.py	Thu Nov 21 23:38:04 2013 +1000
@@ -44,7 +44,7 @@
 ### encodings module API
 
 def getregentry():
-    return codecs.CodecInfo(
+    return codecs.CodecInfo._declare_transform(
         name='base64',
         encode=base64_encode,
         decode=base64_decode,
diff -r ab73b7fd7523 Lib/encodings/bz2_codec.py
--- a/Lib/encodings/bz2_codec.py	Thu Nov 21 12:29:51 2013 +0100
+++ b/Lib/encodings/bz2_codec.py	Thu Nov 21 23:38:04 2013 +1000
@@ -66,7 +66,7 @@
 ### encodings module API
 
 def getregentry():
-    return codecs.CodecInfo(
+    return codecs.CodecInfo._declare_transform(
         name="bz2",
         encode=bz2_encode,
         decode=bz2_decode,
diff -r ab73b7fd7523 Lib/encodings/hex_codec.py
--- a/Lib/encodings/hex_codec.py	Thu Nov 21 12:29:51 2013 +0100
+++ b/Lib/encodings/hex_codec.py	Thu Nov 21 23:38:04 2013 +1000
@@ -44,7 +44,7 @@
 ### encodings module API
 
 def getregentry():
-    return codecs.CodecInfo(
+    return codecs.CodecInfo._declare_transform(
         name='hex',
         encode=hex_encode,
         decode=hex_decode,
diff -r ab73b7fd7523 Lib/encodings/quopri_codec.py
--- a/Lib/encodings/quopri_codec.py	Thu Nov 21 12:29:51 2013 +0100
+++ b/Lib/encodings/quopri_codec.py	Thu Nov 21 23:38:04 2013 +1000
@@ -45,7 +45,7 @@
 # encodings module API
 
 def getregentry():
-    return codecs.CodecInfo(
+    return codecs.CodecInfo._declare_transform(
         name='quopri',
         encode=quopri_encode,
         decode=quopri_decode,
diff -r ab73b7fd7523 Lib/encodings/rot_13.py
--- a/Lib/encodings/rot_13.py	Thu Nov 21 12:29:51 2013 +0100
+++ b/Lib/encodings/rot_13.py	Thu Nov 21 23:38:04 2013 +1000
@@ -35,7 +35,7 @@
 ### encodings module API
 
 def getregentry():
-    return codecs.CodecInfo(
+    return codecs.CodecInfo._declare_transform(
         name='rot-13',
         encode=Codec().encode,
         decode=Codec().decode,
diff -r ab73b7fd7523 Lib/encodings/uu_codec.py
--- a/Lib/encodings/uu_codec.py	Thu Nov 21 12:29:51 2013 +0100
+++ b/Lib/encodings/uu_codec.py	Thu Nov 21 23:38:04 2013 +1000
@@ -88,7 +88,7 @@
 ### encodings module API
 
 def getregentry():
-    return codecs.CodecInfo(
+    return codecs.CodecInfo._declare_transform(
         name='uu',
         encode=uu_encode,
         decode=uu_decode,
diff -r ab73b7fd7523 Lib/encodings/zlib_codec.py
--- a/Lib/encodings/zlib_codec.py	Thu Nov 21 12:29:51 2013 +0100
+++ b/Lib/encodings/zlib_codec.py	Thu Nov 21 23:38:04 2013 +1000
@@ -66,7 +66,7 @@
 ### encodings module API
 
 def getregentry():
-    return codecs.CodecInfo(
+    return codecs.CodecInfo._declare_transform(
         name='zlib',
         encode=zlib_encode,
         decode=zlib_decode,
diff -r ab73b7fd7523 Lib/test/test_codecs.py
--- a/Lib/test/test_codecs.py	Thu Nov 21
12:29:51 2013 +0100
+++ b/Lib/test/test_codecs.py	Thu Nov 21 23:38:04 2013 +1000
@@ -2381,67 +2381,68 @@
                 view_decoded = codecs.decode(view, encoding)
                 self.assertEqual(view_decoded, data)
 
-    def test_type_error_for_text_input(self):
+    def test_text_to_binary_blacklists_binary_transforms(self):
         # Check binary -> binary codecs give a good error for str input
         bad_input = "bad input type"
         for encoding in bytes_transform_encodings:
             with self.subTest(encoding=encoding):
-                msg = "^encoding with '{}' codec failed".format(encoding)
+                fmt = ("'{}' is not a text encoding; "
+                       r"use codecs.encode\(\) to handle arbitrary codecs")
+                msg = fmt.format(encoding)
                 with self.assertRaisesRegex(TypeError, msg) as failure:
                     bad_input.encode(encoding)
-                self.assertTrue(isinstance(failure.exception.__cause__,
-                                           TypeError))
+                self.assertIsNone(failure.exception.__cause__)
 
-    def test_type_error_for_binary_input(self):
-        # Check str -> str codec gives a good error for binary input
-        for bad_input in (b"immutable", bytearray(b"mutable")):
-            with self.subTest(bad_input=bad_input):
-                msg = "^decoding with 'rot_13' codec failed"
-                with self.assertRaisesRegex(AttributeError, msg) as failure:
-                    bad_input.decode("rot_13")
-                self.assertTrue(isinstance(failure.exception.__cause__,
-                                           AttributeError))
+    def test_text_to_binary_blacklists_text_transforms(self):
+        # Check str.encode gives a good error message for str -> str codecs
+        msg = ("^'rot_13' is not a text encoding; "
+               r"use codecs.encode\(\) to handle arbitrary codecs")
+        with self.assertRaisesRegex(TypeError, msg):
+            "just an example message".encode("rot_13")
 
-    def test_custom_zlib_error_is_wrapped(self):
-        # Check zlib codec gives a good error for malformed input
-        msg = "^decoding with 'zlib_codec' codec failed"
-        with self.assertRaisesRegex(Exception, msg) as failure:
-            b"hello".decode("zlib_codec")
-        self.assertTrue(isinstance(failure.exception.__cause__,
-                                   type(failure.exception)))
-
-    def test_custom_hex_error_is_wrapped(self):
-        # Check hex codec gives a good error for malformed input
-        msg = "^decoding with 'hex_codec' codec failed"
-        with self.assertRaisesRegex(Exception, msg) as failure:
-            b"hello".decode("hex_codec")
-        self.assertTrue(isinstance(failure.exception.__cause__,
-                                   type(failure.exception)))
-
-    # Unfortunately, the bz2 module throws OSError, which the codec
-    # machinery currently can't wrap :(
-
-    def test_bad_decoding_output_type(self):
+    def test_binary_to_text_blacklists_binary_transforms(self):
         # Check bytes.decode and bytearray.decode give a good error
         # message for binary -> binary codecs
         data = b"encode first to ensure we meet any format restrictions"
         for encoding in bytes_transform_encodings:
             with self.subTest(encoding=encoding):
                 encoded_data = codecs.encode(data, encoding)
-                fmt = ("'{}' decoder returned 'bytes' instead of 'str'; "
-                       "use codecs.decode\(\) to decode to arbitrary types")
+                fmt = ("'{}' is not a text encoding; "
+                       r"use codecs.decode\(\) to handle arbitrary codecs")
                 msg = fmt.format(encoding)
                 with self.assertRaisesRegex(TypeError, msg):
                     encoded_data.decode(encoding)
                 with self.assertRaisesRegex(TypeError, msg):
                     bytearray(encoded_data).decode(encoding)
 
-    def test_bad_encoding_output_type(self):
-        # Check str.encode gives a good error message for str -> str codecs
-        msg = ("'rot_13' encoder returned 'str' instead of 'bytes'; "
-            "use codecs.encode\(\) to encode to arbitrary types")
-        with self.assertRaisesRegex(TypeError, msg):
-            "just an example message".encode("rot_13")
+    def test_binary_to_text_blacklists_text_transforms(self):
+        # Check str -> str codec gives a good error for binary input
+        for bad_input in (b"immutable", bytearray(b"mutable")):
+            with self.subTest(bad_input=bad_input):
+                msg = ("^'rot_13' is not a text encoding; "
+                       r"use codecs.decode\(\) to handle arbitrary codecs")
+                with self.assertRaisesRegex(TypeError, msg) as failure:
+                    bad_input.decode("rot_13")
+                self.assertIsNone(failure.exception.__cause__)
+
+    def test_custom_zlib_error_is_wrapped(self):
+        # Check zlib codec gives a good error for malformed input
+        msg = "^decoding with 'zlib_codec' codec failed"
+        with self.assertRaisesRegex(Exception, msg) as failure:
+            codecs.decode(b"hello", "zlib_codec")
+        self.assertIsInstance(failure.exception.__cause__,
+                              type(failure.exception))
+
+    def test_custom_hex_error_is_wrapped(self):
+        # Check hex codec gives a good error for malformed input
+        msg = "^decoding with 'hex_codec' codec failed"
+        with self.assertRaisesRegex(Exception, msg) as failure:
+            codecs.decode(b"hello", "hex_codec")
+        self.assertIsInstance(failure.exception.__cause__,
+                              type(failure.exception))
+
+    # Unfortunately, the bz2 module throws OSError, which the codec
+    # machinery currently can't wrap :(
 
 
 # The codec system tries to wrap exceptions in order to ensure the error
diff -r ab73b7fd7523 Objects/unicodeobject.c
--- a/Objects/unicodeobject.c	Thu Nov 21 12:29:51 2013 +0100
+++ b/Objects/unicodeobject.c	Thu Nov 21 23:38:04 2013 +1000
@@ -3044,7 +3044,7 @@
     buffer = PyMemoryView_FromBuffer(&info);
     if (buffer == NULL)
         goto onError;
-    unicode = PyCodec_Decode(buffer, encoding, errors);
+    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
     if (unicode == NULL)
         goto onError;
     if (!PyUnicode_Check(unicode)) {
@@ -3410,7 +3410,7 @@
     }
 
     /* Encode via the codec registry */
-    v = PyCodec_Encode(unicode, encoding, errors);
+    v = _PyCodec_EncodeText(unicode, encoding, errors);
     if (v == NULL)
         return NULL;
 
diff -r ab73b7fd7523 Python/codecs.c
--- a/Python/codecs.c	Thu Nov 21 12:29:51 2013 +0100
+++ b/Python/codecs.c	Thu Nov 21 23:38:04 2013 +1000
@@ -353,18 +353,15 @@
 
    errors is passed to the encoder factory as argument if non-NULL.
*/
 
-PyObject *PyCodec_Encode(PyObject *object,
-                         const char *encoding,
-                         const char *errors)
+static PyObject *
+_PyCodec_EncodeInternal(PyObject *object,
+                        PyObject *encoder,
+                        const char *encoding,
+                        const char *errors)
 {
-    PyObject *encoder = NULL;
     PyObject *args = NULL, *result = NULL;
     PyObject *v = NULL;
 
-    encoder = PyCodec_Encoder(encoding);
-    if (encoder == NULL)
-        goto onError;
-
     args = args_tuple(object, errors);
     if (args == NULL)
         goto onError;
@@ -402,18 +399,15 @@
 
    errors is passed to the decoder factory as argument if non-NULL. */
 
-PyObject *PyCodec_Decode(PyObject *object,
-                         const char *encoding,
-                         const char *errors)
+static PyObject *
+_PyCodec_DecodeInternal(PyObject *object,
+                        PyObject *decoder,
+                        const char *encoding,
+                        const char *errors)
 {
-    PyObject *decoder = NULL;
     PyObject *args = NULL, *result = NULL;
     PyObject *v;
 
-    decoder = PyCodec_Decoder(encoding);
-    if (decoder == NULL)
-        goto onError;
-
     args = args_tuple(object, errors);
     if (args == NULL)
         goto onError;
@@ -445,6 +439,130 @@
     return NULL;
 }
 
+/* Generic encoding/decoding API */
+PyObject *PyCodec_Encode(PyObject *object,
+                         const char *encoding,
+                         const char *errors)
+{
+    PyObject *encoder;
+
+    encoder = PyCodec_Encoder(encoding);
+    if (encoder == NULL)
+        return NULL;
+
+    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
+}
+
+PyObject *PyCodec_Decode(PyObject *object,
+                         const char *encoding,
+                         const char *errors)
+{
+    PyObject *decoder;
+
+    decoder = PyCodec_Decoder(encoding);
+    if (decoder == NULL)
+        return NULL;
+
+    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
+}
+
+/* Text encoding/decoding API */
+
+/* Look up `encoding` in the codec registry and return a new reference to
+   item `index` of the registry entry (0 selects the encoder, 1 the
+   decoder, as used by the two wrappers below).
+
+   An entry whose private _is_text_encoding attribute is false is rejected
+   with TypeError; `operation_name` is the codecs module function named in
+   that error message.  An entry without the attribute is treated as a
+   text encoding.  Returns NULL with an exception set on failure. */
+static
+PyObject *codec_getitem_checked(const char *encoding,
+                                const char *operation_name,
+                                int index)
+{
+    _Py_IDENTIFIER(_is_text_encoding);
+    PyObject *codec;
+    PyObject *attr;
+    PyObject *v;
+    int is_text_codec;
+
+    codec = _PyCodec_Lookup(encoding);
+    if (codec == NULL)
+        return NULL;
+
+    attr = _PyObject_GetAttrId(codec, &PyId__is_text_encoding);
+    if (attr == NULL) {
+        if (!PyErr_ExceptionMatches(PyExc_AttributeError)) {
+            Py_DECREF(codec);
+            return NULL;
+        }
+        /* Backwards compatibility: an entry without the private flag
+           is assumed to describe a text encoding. */
+        PyErr_Clear();
+    }
+    else {
+        is_text_codec = PyObject_IsTrue(attr);
+        Py_DECREF(attr);
+        if (is_text_codec <= 0) {
+            /* Release the registry entry on every failure path. */
+            Py_DECREF(codec);
+            /* A negative result means PyObject_IsTrue() already set an
+               exception; only a false flag gets our own TypeError. */
+            if (is_text_codec == 0) {
+                PyErr_Format(PyExc_TypeError,
+                             "'%.400s' is not a text encoding; "
+                             "use codecs.%s() to handle arbitrary codecs",
+                             encoding, operation_name);
+            }
+            return NULL;
+        }
+    }
+
+    /* Take our own reference to the item before releasing the tuple
+       that owns it. */
+    v = PyTuple_GET_ITEM(codec, index);
+    Py_INCREF(v);
+    Py_DECREF(codec);
+    return v;
+}
+
+static PyObject * _PyCodec_TextEncoder(const char *encoding)
+{
+    return codec_getitem_checked(encoding, "encode", 0);
+}
+
+static PyObject * _PyCodec_TextDecoder(const char *encoding)
+{
+    return codec_getitem_checked(encoding, "decode", 1);
+}
+
+PyObject *_PyCodec_EncodeText(PyObject *object,
+                              const char *encoding,
+                              const char *errors)
+{
+    PyObject *encoder;
+
+    encoder = _PyCodec_TextEncoder(encoding);
+    if (encoder == NULL)
+        return NULL;
+
+    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
+}
+
+PyObject *_PyCodec_DecodeText(PyObject *object,
+                              const char *encoding,
+                              const char *errors)
+{
+    PyObject *decoder;
+
+    decoder = _PyCodec_TextDecoder(encoding);
+    if (decoder == NULL)
+        return NULL;
+
+    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
+}
+
 /* Register the error handling callback function error under the name
    name. This function will be called by the codec when it encounters
    an unencodable characters/undecodable bytes and doesn't know the