diff -r 9bff2458b7a0 Include/codecs.h --- a/Include/codecs.h Mon Jan 27 11:58:49 2014 +0100 +++ b/Include/codecs.h Mon Jan 27 22:23:17 2014 +1000 @@ -105,6 +105,10 @@ be used in Python C extensions. */ +PyAPI_FUNC(PyObject *) _PyCodec_LookupTextEncoding( + const char *encoding, + const char *alternate_command + ); PyAPI_FUNC(PyObject *) _PyCodec_EncodeText( PyObject *object, diff -r 9bff2458b7a0 Lib/_pyio.py --- a/Lib/_pyio.py Mon Jan 27 11:58:49 2014 +0100 +++ b/Lib/_pyio.py Mon Jan 27 22:23:17 2014 +1000 @@ -1503,6 +1503,11 @@ if not isinstance(encoding, str): raise ValueError("invalid encoding: %r" % encoding) + if not codecs.lookup(encoding)._is_text_encoding: + msg = ("%r is not a text encoding; " + "use codecs.open() to handle arbitrary codecs") + raise LookupError(msg % encoding) + if errors is None: errors = "strict" else: diff -r 9bff2458b7a0 Lib/test/test_io.py --- a/Lib/test/test_io.py Mon Jan 27 11:58:49 2014 +0100 +++ b/Lib/test/test_io.py Mon Jan 27 22:23:17 2014 +1000 @@ -1929,6 +1929,16 @@ self.assertRaises(TypeError, t.__init__, b, newline=42) self.assertRaises(ValueError, t.__init__, b, newline='xyzzy') + def test_binary_transforms_are_rejected(self): + # Ensure the constructor complains if passed a binary codec + # http://bugs.python.org/issue20404 + r = self.BytesIO() + b = self.BufferedWriter(r) + error_msg = (r"'hex' is not a text encoding; " + r"use codecs.open\(\) to handle arbitrary codecs") + with self.assertRaisesRegex(LookupError, error_msg): + self.TextIOWrapper(b, encoding="hex") + def test_detach(self): r = self.BytesIO() b = self.BufferedWriter(r) @@ -2579,15 +2589,22 @@ def test_illegal_decoder(self): # Issue #17106 + # Bypass the early encoding check added in issue 20404 + def _make_illegal_wrapper(): + quopri = codecs.lookup("quopri") + quopri._is_text_encoding = True + try: + t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), + newline='\n', encoding="quopri") + finally: + quopri._is_text_encoding = False + return t # Crash when decoder returns non-string - t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n', - encoding='quopri_codec') + t = _make_illegal_wrapper() self.assertRaises(TypeError, t.read, 1) - t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n', - encoding='quopri_codec') + t = _make_illegal_wrapper() self.assertRaises(TypeError, t.readline) - t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n', - encoding='quopri_codec') + t = _make_illegal_wrapper() self.assertRaises(TypeError, t.read) def _check_create_at_shutdown(self, **kwargs): @@ -2616,8 +2633,7 @@ if err: # Can error out with a RuntimeError if the module state # isn't found. - self.assertIn("RuntimeError: could not find io module state", - err.decode()) + self.assertIn(self.shutdown_error, err.decode()) else: self.assertEqual("ok", out.decode().strip()) @@ -2630,6 +2646,7 @@ class CTextIOWrapperTest(TextIOWrapperTest): io = io + shutdown_error = "RuntimeError: could not find io module state" def test_initialization(self): r = self.BytesIO(b"\xc3\xa9\n\n") @@ -2674,6 +2691,7 @@ class PyTextIOWrapperTest(TextIOWrapperTest): io = pyio + shutdown_error = "LookupError: unknown encoding: ascii" class IncrementalNewlineDecoderTest(unittest.TestCase): diff -r 9bff2458b7a0 Modules/_io/textio.c --- a/Modules/_io/textio.c Mon Jan 27 11:58:49 2014 +0100 +++ b/Modules/_io/textio.c Mon Jan 27 22:23:17 2014 +1000 @@ -849,7 +849,7 @@ char *kwlist[] = {"buffer", "encoding", "errors", "newline", "line_buffering", "write_through", NULL}; - PyObject *buffer, *raw; + PyObject *buffer, *raw, *codec_info; char *encoding = NULL; char *errors = NULL; char *newline = NULL; @@ -961,6 +961,19 @@ "could not determine default encoding"); } + /* Check we have been asked for a real text encoding */ + codec_info = _PyCodec_LookupTextEncoding(encoding, "codecs.open()"); + if (codec_info == NULL) { + Py_CLEAR(self->encoding); + goto error; + } + /* XXX (ncoghlan): for now, we just throw this away and use the + * normal string based APIs to look up the incremental encoder and + * decoder below. However, it would be a micro-optimisation to retrieve + * them from this codec info tuple instead. + */ + Py_DECREF(codec_info); + if (errors == NULL) errors = "strict"; self->errors = PyBytes_FromString(errors); diff -r 9bff2458b7a0 Python/codecs.c --- a/Python/codecs.c Mon Jan 27 11:58:49 2014 +0100 +++ b/Python/codecs.c Mon Jan 27 22:23:17 2014 +1000 @@ -467,15 +467,12 @@ } /* Text encoding/decoding API */ -static -PyObject *codec_getitem_checked(const char *encoding, - const char *operation_name, - int index) +PyObject * _PyCodec_LookupTextEncoding(const char *encoding, + const char *alternate_command) { _Py_IDENTIFIER(_is_text_encoding); PyObject *codec; PyObject *attr; - PyObject *v; int is_text_codec; codec = _PyCodec_Lookup(encoding); @@ -502,27 +499,44 @@ Py_DECREF(codec); PyErr_Format(PyExc_LookupError, "'%.400s' is not a text encoding; " - "use codecs.%s() to handle arbitrary codecs", - encoding, operation_name); + "use %s to handle arbitrary codecs", + encoding, alternate_command); return NULL; } } } + /* This appears to be a valid text encoding */ + return codec; +} + + +static +PyObject *codec_getitem_checked(const char *encoding, + const char *alternate_command, + int index) +{ + PyObject *codec; + PyObject *v; + + codec = _PyCodec_LookupTextEncoding(encoding, alternate_command); + if (codec == NULL) + return NULL; + v = PyTuple_GET_ITEM(codec, index); + Py_INCREF(v); Py_DECREF(codec); - Py_INCREF(v); return v; } static PyObject * _PyCodec_TextEncoder(const char *encoding) { - return codec_getitem_checked(encoding, "encode", 0); + return codec_getitem_checked(encoding, "codecs.encode()", 0); } static PyObject * _PyCodec_TextDecoder(const char *encoding) { - return codec_getitem_checked(encoding, "decode", 1); + return codec_getitem_checked(encoding, "codecs.decode()", 1); } PyObject *_PyCodec_EncodeText(PyObject *object,