diff -r 04e1f701aeaa Include/codecs.h
--- a/Include/codecs.h Tue Nov 19 22:33:10 2013 +1000
+++ b/Include/codecs.h Wed Nov 20 00:04:25 2013 +1000
@@ -94,6 +94,30 @@
        const char *errors
        );
 
+#ifndef Py_LIMITED_API
+/* Text codec specific encoding and decoding API.
+
+   Checks the encoding against a blacklist of known non-Unicode codecs
+   before attempting the operation.
+
+   These are private interpreter APIs and are therefore kept out of the
+   limited API.
+
+ */
+
+PyAPI_FUNC(PyObject *) _PyCodec_EncodeText(
+       PyObject *object,
+       const char *encoding,
+       const char *errors
+       );
+
+PyAPI_FUNC(PyObject *) _PyCodec_DecodeText(
+       PyObject *object,
+       const char *encoding,
+       const char *errors
+       );
+#endif
+
 /* --- Codec Lookup APIs --------------------------------------------------
 
    All APIs return a codec object with incremented refcount and are
diff -r 04e1f701aeaa Objects/unicodeobject.c
--- a/Objects/unicodeobject.c Tue Nov 19 22:33:10 2013 +1000
+++ b/Objects/unicodeobject.c Wed Nov 20 00:04:25 2013 +1000
@@ -3044,7 +3044,7 @@
     buffer = PyMemoryView_FromBuffer(&info);
     if (buffer == NULL)
         goto onError;
-    unicode = PyCodec_Decode(buffer, encoding, errors);
+    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
     if (unicode == NULL)
         goto onError;
     if (!PyUnicode_Check(unicode)) {
@@ -3410,7 +3410,7 @@
     }
 
     /* Encode via the codec registry */
-    v = PyCodec_Encode(unicode, encoding, errors);
+    v = _PyCodec_EncodeText(unicode, encoding, errors);
     if (v == NULL)
         return NULL;
 
diff -r 04e1f701aeaa Python/codecs.c
--- a/Python/codecs.c Tue Nov 19 22:33:10 2013 +1000
+++ b/Python/codecs.c Wed Nov 20 00:04:25 2013 +1000
@@ -353,18 +353,15 @@
 
    errors is passed to the encoder factory as argument if non-NULL. */
 
-PyObject *PyCodec_Encode(PyObject *object,
-                         const char *encoding,
-                         const char *errors)
+static PyObject *
+_PyCodec_EncodeInternal(PyObject *object,
+                        PyObject *encoder,
+                        const char *encoding,
+                        const char *errors)
 {
-    PyObject *encoder = NULL;
     PyObject *args = NULL, *result = NULL;
     PyObject *v = NULL;
 
-    encoder = PyCodec_Encoder(encoding);
-    if (encoder == NULL)
-        goto onError;
-
     args = args_tuple(object, errors);
     if (args == NULL)
         goto onError;
@@ -402,18 +399,15 @@
 
    errors is passed to the decoder factory as argument if non-NULL. */
 
-PyObject *PyCodec_Decode(PyObject *object,
-                         const char *encoding,
-                         const char *errors)
+static PyObject *
+_PyCodec_DecodeInternal(PyObject *object,
+                        PyObject *decoder,
+                        const char *encoding,
+                        const char *errors)
 {
-    PyObject *decoder = NULL;
     PyObject *args = NULL, *result = NULL;
     PyObject *v;
 
-    decoder = PyCodec_Decoder(encoding);
-    if (decoder == NULL)
-        goto onError;
-
     args = args_tuple(object, errors);
     if (args == NULL)
         goto onError;
@@ -445,6 +439,152 @@
     return NULL;
 }
 
+/* Generic encoding/decoding API */
+
+/* Encode object with the encoder registered for encoding.  The encoder
+   reference obtained here is passed on to _PyCodec_EncodeInternal().
+   NOTE(review): that helper is assumed to release it on every path (its
+   tail lies outside this hunk) -- confirm. */
+PyObject *PyCodec_Encode(PyObject *object,
+                         const char *encoding,
+                         const char *errors)
+{
+    PyObject *encoder;
+
+    encoder = PyCodec_Encoder(encoding);
+    if (encoder == NULL)
+        return NULL;
+
+    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
+}
+
+/* Decode object with the decoder registered for encoding; the decoder
+   reference is passed on to _PyCodec_DecodeInternal() in the same way. */
+PyObject *PyCodec_Decode(PyObject *object,
+                         const char *encoding,
+                         const char *errors)
+{
+    PyObject *decoder;
+
+    decoder = PyCodec_Decoder(encoding);
+    if (decoder == NULL)
+        return NULL;
+
+    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
+}
+
+/* Text encoding/decoding API */
+
+/* Look up the codec for encoding and return a new reference to item
+   index of the codec tuple (0 is used for the encoder, 1 for the decoder).
+
+   Codecs whose name is one of the known non-Unicode codecs are rejected
+   with a TypeError that directs the caller to codecs.<generic_name>().
+   Returns NULL with an exception set on failure. */
+static
+PyObject *codec_getitem_checked(const char *encoding,
+                                const char *generic_name,
+                                int index)
+{
+    _Py_IDENTIFIER(name);
+    _Py_IDENTIFIER(base64);
+    _Py_IDENTIFIER(uu);
+    _Py_IDENTIFIER(quopri);
+    _Py_IDENTIFIER(hex);
+    _Py_IDENTIFIER(bz2);
+    _Py_IDENTIFIER(zlib);
+    PyObject *codec;
+    PyObject *v;
+    PyObject *codec_name;
+    int is_text_codec = 1;
+
+    codec = _PyCodec_Lookup(encoding);
+    if (codec == NULL)
+        return NULL;
+
+    codec_name = _PyObject_GetAttrId(codec, &PyId_name);
+    if (codec_name == NULL) {
+        /* NOTE(review): a codec returned as a plain tuple has no "name"
+           attribute; such a codec cannot be blacklisted, so it is treated
+           as a text codec instead of failing.  Any other error is real. */
+        if (!PyErr_ExceptionMatches(PyExc_AttributeError)) {
+            Py_DECREF(codec);
+            return NULL;
+        }
+        PyErr_Clear();
+    }
+    else {
+        /* Only a str name can match; the comparison helpers below expect
+           a Unicode object.
+           A set would be faster, but when to build it, where to store it? */
+        if (PyUnicode_Check(codec_name) &&
+            (_PyUnicode_CompareWithId(codec_name, &PyId_base64) == 0 ||
+             _PyUnicode_CompareWithId(codec_name, &PyId_uu) == 0 ||
+             _PyUnicode_CompareWithId(codec_name, &PyId_quopri) == 0 ||
+             _PyUnicode_CompareWithId(codec_name, &PyId_hex) == 0 ||
+             _PyUnicode_CompareWithId(codec_name, &PyId_bz2) == 0 ||
+             _PyUnicode_CompareWithId(codec_name, &PyId_zlib) == 0 ||
+             PyUnicode_CompareWithASCIIString(codec_name, "rot-13") == 0)) {
+            is_text_codec = 0;
+        }
+        Py_DECREF(codec_name);
+    }
+
+    if (!is_text_codec) {
+        /* The codec tuple must be released on the rejection path too. */
+        Py_DECREF(codec);
+        PyErr_Format(PyExc_TypeError,
+                     "'%.400s' is not a Unicode encoding; "
+                     "use codecs.%s() to handle arbitrary codecs",
+                     encoding, generic_name);
+        return NULL;
+    }
+
+    /* v is borrowed from the tuple: own it before the tuple is released. */
+    v = PyTuple_GET_ITEM(codec, index);
+    Py_INCREF(v);
+    Py_DECREF(codec);
+    return v;
+}
+
+static PyObject * _PyCodec_TextEncoder(const char *encoding)
+{
+    return codec_getitem_checked(encoding, "encode", 0);
+}
+
+static PyObject * _PyCodec_TextDecoder(const char *encoding)
+{
+    return codec_getitem_checked(encoding, "decode", 1);
+}
+
+/* Text-only counterpart of PyCodec_Encode(): rejects blacklisted codecs. */
+PyObject *_PyCodec_EncodeText(PyObject *object,
+                              const char *encoding,
+                              const char *errors)
+{
+    PyObject *encoder;
+
+    encoder = _PyCodec_TextEncoder(encoding);
+    if (encoder == NULL)
+        return NULL;
+
+    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
+}
+
+/* Text-only counterpart of PyCodec_Decode(): rejects blacklisted codecs. */
+PyObject *_PyCodec_DecodeText(PyObject *object,
+                              const char *encoding,
+                              const char *errors)
+{
+    PyObject *decoder;
+
+    decoder = _PyCodec_TextDecoder(encoding);
+    if (decoder == NULL)
+        return NULL;
+
+    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
+}
+
 /* Register the error handling callback function error under the name
    name. This function will be called by the codec when it encounters
    an unencodable characters/undecodable bytes and doesn't know the