# HG changeset patch # User Steve Dower # Date 1473125403 25200 # Mon Sep 05 18:30:03 2016 -0700 # Node ID cb615d80dd66eab8258e1f0ecdb9571a331a8681 # Parent d52f10a0f10d157e74205922c2f0a90b7004022f Issue #27959: Adds oem encoding, alias ansi to mbcs, move aliasmbcs to codec lookup diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -1663,7 +1663,7 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS( const char *string, /* MBCS encoded string */ - Py_ssize_t length, /* size of string */ + Py_ssize_t length, /* size of string */ const char *errors /* error handling */ ); diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py --- a/Lib/encodings/__init__.py +++ b/Lib/encodings/__init__.py @@ -29,6 +29,7 @@ """#" import codecs +import sys from . import aliases _cache = {} @@ -151,3 +152,12 @@ # Register the search_function in the Python codec registry codecs.register(search_function) + +if sys.platform == 'win32': + def _alias_mbcs(encoding): + import _bootlocale + if encoding == _bootlocale.getpreferredencoding(False): + import encodings.mbcs + return encodings.mbcs.getregentry() + + codecs.register(_alias_mbcs) diff --git a/Lib/encodings/aliases.py b/Lib/encodings/aliases.py --- a/Lib/encodings/aliases.py +++ b/Lib/encodings/aliases.py @@ -458,6 +458,7 @@ 'macturkish' : 'mac_turkish', # mbcs codec + 'ansi' : 'mbcs', 'dbcs' : 'mbcs', # ptcp154 codec diff --git a/Lib/encodings/oem.py b/Lib/encodings/oem.py new file mode 100644 --- /dev/null +++ b/Lib/encodings/oem.py @@ -0,0 +1,41 @@ +""" Python 'oem' Codec for Windows + +""" +# Import them explicitly to cause an ImportError +# on non-Windows systems +from codecs import oem_encode, oem_decode +# for IncrementalDecoder, IncrementalEncoder, ... +import codecs + +### Codec APIs + +encode = oem_encode + +def decode(input, errors='strict'): + return oem_decode(input, errors, True) + +class IncrementalEncoder(codecs.IncrementalEncoder): + def encode(self, input, final=False): + return oem_encode(input, self.errors)[0] + +class IncrementalDecoder(codecs.BufferedIncrementalDecoder): + _buffer_decode = oem_decode + +class StreamWriter(codecs.StreamWriter): + encode = oem_encode + +class StreamReader(codecs.StreamReader): + decode = oem_decode + +### encodings module API + +def getregentry(): + return codecs.CodecInfo( + name='oem', + encode=encode, + decode=decode, + incrementalencoder=IncrementalEncoder, + incrementaldecoder=IncrementalDecoder, + streamreader=StreamReader, + streamwriter=StreamWriter, + ) diff --git a/Lib/site.py b/Lib/site.py --- a/Lib/site.py +++ b/Lib/site.py @@ -423,21 +423,6 @@ sys.__interactivehook__ = register_readline -def aliasmbcs(): - """On Windows, some default encodings are not provided by Python, - while they are always available as "mbcs" in each locale. Make - them usable by aliasing to "mbcs" in such a case.""" - if sys.platform == 'win32': - import _bootlocale, codecs - enc = _bootlocale.getpreferredencoding(False) - if enc.startswith('cp'): # "cp***" ? - try: - codecs.lookup(enc) - except LookupError: - import encodings - encodings._cache[enc] = encodings._unknown - encodings.aliases.aliases[enc] = 'mbcs' - CONFIG_LINE = r'^(?P(\w|[-_])+)\s*=\s*(?P.*)\s*$' def venv(known_paths): @@ -560,7 +545,6 @@ setcopyright() sethelper() enablerlcompleter() - aliasmbcs() execsitecustomize() if ENABLE_USER_SITE: execusercustomize() diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -8,11 +8,6 @@ from test import support -if sys.platform == 'win32': - VISTA_OR_LATER = (sys.getwindowsversion().major >= 6) -else: - VISTA_OR_LATER = False - try: import ctypes except ImportError: @@ -841,18 +836,13 @@ ('abc', 'strict', b'abc'), ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'), ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'), + ('\udc80', 'strict', None), + ('\udc80', 'ignore', b''), + ('\udc80', 'replace', b'?'), + ('\udc80', 'backslashreplace', b'\\udc80'), + ('\udc80', 'namereplace', b'\\udc80'), + ('\udc80', 'surrogatepass', b'\xed\xb2\x80'), ] - if VISTA_OR_LATER: - tests.extend(( - ('\udc80', 'strict', None), - ('\udc80', 'ignore', b''), - ('\udc80', 'replace', b'?'), - ('\udc80', 'backslashreplace', b'\\udc80'), - ('\udc80', 'namereplace', b'\\udc80'), - ('\udc80', 'surrogatepass', b'\xed\xb2\x80'), - )) - else: - tests.append(('\udc80', 'strict', b'\xed\xb2\x80')) for text, errors, expected in tests: if expected is not None: try: @@ -879,17 +869,10 @@ (b'[\xff]', 'ignore', '[]'), (b'[\xff]', 'replace', '[\ufffd]'), (b'[\xff]', 'surrogateescape', '[\udcff]'), + (b'[\xed\xb2\x80]', 'strict', None), + (b'[\xed\xb2\x80]', 'ignore', '[]'), + (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'), ] - if VISTA_OR_LATER: - tests.extend(( - (b'[\xed\xb2\x80]', 'strict', None), - (b'[\xed\xb2\x80]', 'ignore', '[]'), - (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'), - )) - else: - tests.extend(( - (b'[\xed\xb2\x80]', 'strict', '[\udc80]'), - )) for raw, errors, expected in tests: if expected is not None: try: @@ -904,7 +887,6 @@ self.assertRaises(UnicodeDecodeError, raw.decode, 'cp65001', errors) - @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later') def test_lone_surrogates(self): self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001") self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001") @@ -921,7 +903,6 @@ self.assertEqual("[\uDC80]".encode("cp65001", "replace"), b'[?]') - @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later') def test_surrogatepass_handler(self): self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"), b"abc\xed\xa0\x80def") @@ -1951,6 +1932,8 @@ if hasattr(codecs, "mbcs_encode"): all_unicode_encodings.append("mbcs") +if hasattr(codecs, "oem_encode"): + all_unicode_encodings.append("oem") # The following encoding is not tested, because it's not supposed # to work: @@ -2667,6 +2650,8 @@ bytes_transform_encodings.append("bz2_codec") transform_aliases["bz2_codec"] = ["bz2"] +if hasattr(codecs, "mbcs_encode"): + transform_aliases["mbcs"] = ["dbcs", "ansi"] class TransformCodecTest(unittest.TestCase): @@ -3119,11 +3104,10 @@ (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'), (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'), )) - if VISTA_OR_LATER: - self.check_encode(self.CP_UTF8, ( - ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'), - ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'), - )) + self.check_encode(self.CP_UTF8, ( + ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'), + ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'), + )) def test_incremental(self): decoded = codecs.code_page_decode(932, b'\x82', 'strict', False) @@ -3144,6 +3128,20 @@ False) self.assertEqual(decoded, ('abc', 3)) + def test_mbcs_alias(self): + # Check that looking up our 'default' codepage will return + # mbcs when we don't have a more specific one available + import _bootlocale + def _get_fake_codepage(*a): + return 'cp123' + old_getpreferredencoding = _bootlocale.getpreferredencoding + _bootlocale.getpreferredencoding = _get_fake_codepage + try: + codec = codecs.lookup('cp123') + self.assertEqual(codec.name, 'mbcs') + finally: + _bootlocale.getpreferredencoding = old_getpreferredencoding + class ASCIITest(unittest.TestCase): def test_encode(self): diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -626,6 +626,25 @@ } /*[clinic input] +_codecs.oem_decode + data: Py_buffer + errors: str(accept={str, NoneType}) = NULL + final: int(c_default="0") = False + / +[clinic start generated code]*/ + +static PyObject * +_codecs_oem_decode_impl(PyObject *module, Py_buffer *data, + const char *errors, int final) +/*[clinic end generated code: output=da1617612f3fcad8 input=95b8a92c446b03cd]*/ +{ + Py_ssize_t consumed = data->len; + PyObject *decoded = PyUnicode_DecodeCodePageStateful(CP_OEMCP, + data->buf, data->len, errors, final ? NULL : &consumed); + return codec_tuple(decoded, consumed); +} + +/*[clinic input] _codecs.code_page_decode codepage: int data: Py_buffer @@ -971,6 +990,21 @@ } /*[clinic input] +_codecs.oem_encode + str: unicode + errors: str(accept={str, NoneType}) = NULL + / +[clinic start generated code]*/ + +static PyObject * +_codecs_oem_encode_impl(PyObject *module, PyObject *str, const char *errors) +/*[clinic end generated code: output=65d5982c737de649 input=3fc5f0028aad3cda]*/ +{ + return codec_tuple(PyUnicode_EncodeCodePage(CP_OEMCP, str, errors), + PyUnicode_GET_LENGTH(str)); +} + +/*[clinic input] _codecs.code_page_encode code_page: int str: unicode @@ -1075,6 +1109,8 @@ _CODECS_READBUFFER_ENCODE_METHODDEF _CODECS_MBCS_ENCODE_METHODDEF _CODECS_MBCS_DECODE_METHODDEF + _CODECS_OEM_ENCODE_METHODDEF + _CODECS_OEM_DECODE_METHODDEF _CODECS_CODE_PAGE_ENCODE_METHODDEF _CODECS_CODE_PAGE_DECODE_METHODDEF _CODECS_REGISTER_ERROR_METHODDEF diff --git a/Modules/clinic/_codecsmodule.c.h b/Modules/clinic/_codecsmodule.c.h --- a/Modules/clinic/_codecsmodule.c.h +++ b/Modules/clinic/_codecsmodule.c.h @@ -805,6 +805,45 @@ #if defined(HAVE_MBCS) +PyDoc_STRVAR(_codecs_oem_decode__doc__, +"oem_decode($module, data, errors=None, final=False, /)\n" +"--\n" +"\n"); + +#define _CODECS_OEM_DECODE_METHODDEF \ + {"oem_decode", (PyCFunction)_codecs_oem_decode, METH_VARARGS, _codecs_oem_decode__doc__}, + +static PyObject * +_codecs_oem_decode_impl(PyObject *module, Py_buffer *data, + const char *errors, int final); + +static PyObject * +_codecs_oem_decode(PyObject *module, PyObject *args) +{ + PyObject *return_value = NULL; + Py_buffer data = {NULL, NULL}; + const char *errors = NULL; + int final = 0; + + if (!PyArg_ParseTuple(args, "y*|zi:oem_decode", + &data, &errors, &final)) { + goto exit; + } + return_value = _codecs_oem_decode_impl(module, &data, errors, final); + +exit: + /* Cleanup for data */ + if (data.obj) { + PyBuffer_Release(&data); + } + + return return_value; +} + +#endif /* defined(HAVE_MBCS) */ + +#if defined(HAVE_MBCS) + PyDoc_STRVAR(_codecs_code_page_decode__doc__, "code_page_decode($module, codepage, data, errors=None, final=False, /)\n" "--\n" @@ -1346,6 +1385,38 @@ #if defined(HAVE_MBCS) +PyDoc_STRVAR(_codecs_oem_encode__doc__, +"oem_encode($module, str, errors=None, /)\n" +"--\n" +"\n"); + +#define _CODECS_OEM_ENCODE_METHODDEF \ + {"oem_encode", (PyCFunction)_codecs_oem_encode, METH_VARARGS, _codecs_oem_encode__doc__}, + +static PyObject * +_codecs_oem_encode_impl(PyObject *module, PyObject *str, const char *errors); + +static PyObject * +_codecs_oem_encode(PyObject *module, PyObject *args) +{ + PyObject *return_value = NULL; + PyObject *str; + const char *errors = NULL; + + if (!PyArg_ParseTuple(args, "U|z:oem_encode", + &str, &errors)) { + goto exit; + } + return_value = _codecs_oem_encode_impl(module, str, errors); + +exit: + return return_value; +} + +#endif /* defined(HAVE_MBCS) */ + +#if defined(HAVE_MBCS) + PyDoc_STRVAR(_codecs_code_page_encode__doc__, "code_page_encode($module, code_page, str, errors=None, /)\n" "--\n" @@ -1446,6 +1517,10 @@ #define _CODECS_MBCS_DECODE_METHODDEF #endif /* !defined(_CODECS_MBCS_DECODE_METHODDEF) */ +#ifndef _CODECS_OEM_DECODE_METHODDEF + #define _CODECS_OEM_DECODE_METHODDEF +#endif /* !defined(_CODECS_OEM_DECODE_METHODDEF) */ + #ifndef _CODECS_CODE_PAGE_DECODE_METHODDEF #define _CODECS_CODE_PAGE_DECODE_METHODDEF #endif /* !defined(_CODECS_CODE_PAGE_DECODE_METHODDEF) */ @@ -1454,7 +1529,11 @@ #define _CODECS_MBCS_ENCODE_METHODDEF #endif /* !defined(_CODECS_MBCS_ENCODE_METHODDEF) */ +#ifndef _CODECS_OEM_ENCODE_METHODDEF + #define _CODECS_OEM_ENCODE_METHODDEF +#endif /* !defined(_CODECS_OEM_ENCODE_METHODDEF) */ + #ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF #define _CODECS_CODE_PAGE_ENCODE_METHODDEF #endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */ -/*[clinic end generated code: output=0221e4eece62c905 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=7874e2d559d49368 input=a9049054013a1b77]*/ diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -7780,6 +7780,12 @@ return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL); } +PyObject * +_PyUnicode_AsOEMString(PyObject *unicode) +{ + return PyUnicode_EncodeCodePage(CP_OEMCP, unicode, NULL); +} + #undef NEED_RETRY #endif /* HAVE_MBCS */