diff -r bee7943d38c6 Doc/library/codecs.rst --- a/Doc/library/codecs.rst Tue Feb 07 23:41:01 2012 +0100 +++ b/Doc/library/codecs.rst Wed Feb 08 00:33:45 2012 +0100 @@ -1144,6 +1144,13 @@ particular, the following variants typic | | | see also | | | | :mod:`encodings.idna` | +--------------------+---------+---------------------------+ +| locale | | Current locale encoding. | +| | | Only support ``'strict'`` | +| | | and ``'surrogateescape'`` | +| | | error handlers. | +| | | | +| | | .. versionadded:: 3.3 | ++--------------------+---------+---------------------------+ | mbcs | dbcs | Windows only: Encode | | | | operand according to the | | | | ANSI codepage (CP_ACP) | diff -r bee7943d38c6 Doc/whatsnew/3.3.rst --- a/Doc/whatsnew/3.3.rst Tue Feb 07 23:41:01 2012 +0100 +++ b/Doc/whatsnew/3.3.rst Wed Feb 08 00:33:45 2012 +0100 @@ -317,6 +317,12 @@ The :mod:`~encodings.mbcs` codec has be :mod:`~encodings.mbcs` codec now supports all error handlers, instead of only ``replace`` to encode and ``ignore`` to decode. +A new codec has been added: ``locale`` (:issue:`13619`), the current locale +encoding. In most cases, it should be the same encoding as +:func:`sys.getfilesystemencoding`. The two encodings can differ if the +:data:`~locale.LC_ALL` or :data:`~locale.LC_CTYPE` locale is changed with +:func:`locale.setlocale` to use another locale encoding. + A new Windows-only codec has been added: ``cp65001`` (:issue:`13216`). It is the Windows code page 65001 (Windows UTF-8, ``CP_UTF8``). 
For example, it is used by ``sys.stdout`` if the console output code page is set to cp65001 (e.g., using diff -r bee7943d38c6 Lib/test/test_codecs.py --- a/Lib/test/test_codecs.py Tue Feb 07 23:41:01 2012 +0100 +++ b/Lib/test/test_codecs.py Wed Feb 08 00:33:45 2012 +0100 @@ -2,6 +2,7 @@ import _testcapi import codecs import io import locale +import re import sys import unittest import warnings @@ -2033,6 +2034,71 @@ class CodePageTest(unittest.TestCase): self.assertEqual(decoded, ('abc', 3)) +class LocaleEncodingTest(unittest.TestCase): + def test_ascii(self): + self.assertEqual('abc'.encode('locale'), b'abc') + self.assertEqual(b'abc'.decode('locale'), 'abc') + + def test_surrogateescape(self): + self.assertEqual( + '\udc80\udce9\udcff'.encode('locale', 'surrogateescape'), + b'\x80\xe9\xff') + + def test_encode_errors(self): + old_locale = locale.getlocale(locale.LC_CTYPE) + try: + locale.setlocale(locale.LC_CTYPE, 'C') + + for ch in '\u20AC\uD800\U0010FFFF': + text = '[%s]' % ch + errmsg = ("'locale' codec can't encode character %a " + "in position 1: " % ch) + errmsg = '^' + re.escape(errmsg) + self.assertRaisesRegex( + UnicodeEncodeError, + errmsg, + text.encode, 'locale') + finally: + locale.setlocale(locale.LC_CTYPE, old_locale) + + def test_decode_errors(self): + old_locale = locale.getlocale(locale.LC_CTYPE) + try: + locale.setlocale(locale.LC_CTYPE, 'C') + + data = b'[\x80\xe9\xff]' + try: + data.decode('locale') + except UnicodeDecodeError: + pass + else: + # On FreeBSD, Solaris and Mac OS X, the C locale uses + # ISO-8859-1 encoding (and not the 7-bit encoding). + + # Retry with the user locale encoding. If the user locale + # encoding is UTF-8, we can test the error handling. 
+ locale.setlocale(locale.LC_CTYPE, '') + try: + data.decode('locale') + except UnicodeDecodeError: + pass + else: + encoding = locale.getpreferredencoding(False) + self.skipTest('C locale and %s can decode %a' + % (encoding, data)) + + self.assertRaisesRegex( + UnicodeDecodeError, + "'locale' codec can't decode byte 0x80 in position 1:", + data.decode, 'locale') + + self.assertEqual( + data.decode('locale', 'surrogateescape'), + '[\udc80\udce9\udcff]') + finally: + locale.setlocale(locale.LC_CTYPE, old_locale) + + def test_main(): support.run_unittest( UTF32Test, @@ -2063,6 +2129,7 @@ def test_main(): BomTest, TransformCodecTest, CodePageTest, + LocaleEncodingTest, ) diff -r bee7943d38c6 Modules/_codecsmodule.c --- a/Modules/_codecsmodule.c Tue Feb 07 23:41:01 2012 +0100 +++ b/Modules/_codecsmodule.c Wed Feb 08 00:33:45 2012 +0100 @@ -645,6 +645,28 @@ code_page_decode(PyObject *self, #endif /* HAVE_MBCS */ +static PyObject * +locale_decode(PyObject *self, PyObject *args) +{ + Py_buffer pbuf; + const char *errors = NULL; + Py_ssize_t consumed; + PyObject *decoded = NULL; + + if (!PyArg_ParseTuple(args, "y*|z:locale_decode", + &pbuf, &errors)) + return NULL; + consumed = pbuf.len; + + decoded = PyUnicode_DecodeLocaleAndSize(pbuf.buf, pbuf.len, + errors); + PyBuffer_Release(&pbuf); + if (decoded == NULL) + return NULL; + return codec_tuple(decoded, consumed); +} + + /* --- Encoder ------------------------------------------------------------ */ static PyObject * @@ -1073,6 +1095,27 @@ code_page_encode(PyObject *self, #endif /* HAVE_MBCS */ +static PyObject * +locale_encode(PyObject *self, PyObject *args) +{ + PyObject *str, *v; + const char *errors = NULL; + + if (!PyArg_ParseTuple(args, "O|z:locale_encode", + &str, &errors)) + return NULL; + + str = PyUnicode_FromObject(str); + if (str == NULL || PyUnicode_READY(str) < 0) { + Py_XDECREF(str); + return NULL; + } + v = codec_tuple(PyUnicode_EncodeLocale(str, errors), + PyUnicode_GET_LENGTH(str)); + Py_DECREF(str); + 
return v; +} + /* --- Error handler registry --------------------------------------------- */ PyDoc_STRVAR(register_error__doc__, @@ -1164,6 +1207,8 @@ static PyMethodDef _codecs_functions[] = {"code_page_encode", code_page_encode, METH_VARARGS}, {"code_page_decode", code_page_decode, METH_VARARGS}, #endif + {"locale_encode", locale_encode, METH_VARARGS}, + {"locale_decode", locale_decode, METH_VARARGS}, {"register_error", register_error, METH_VARARGS, register_error__doc__}, {"lookup_error", lookup_error, METH_VARARGS,