diff -r 2b7b203e3909 Doc/c-api/sys.rst --- a/Doc/c-api/sys.rst Wed Jan 11 20:18:03 2017 +0200 +++ b/Doc/c-api/sys.rst Wed Jan 11 23:11:17 2017 +0100 @@ -87,6 +87,9 @@ Operating System Utilities .. versionadded:: 3.5 + .. versionchanged:: 3.7 + The function now supports the UTF-8 mode. + .. c:function:: char* Py_EncodeLocale(const wchar_t *text, size_t *error_pos) @@ -98,12 +101,15 @@ Operating System Utilities to free the memory. Return ``NULL`` on encoding error or memory allocation error - If error_pos is not ``NULL``, ``*error_pos`` is set to the index of the - invalid character on encoding error, or set to ``(size_t)-1`` otherwise. + If error_pos is not ``NULL``, ``*error_pos`` is set to ``(size_t)-1`` on + success, or set to the index of the invalid character on encoding error. Use the :c:func:`Py_DecodeLocale` function to decode the bytes string back to a wide character string. + .. versionchanged:: 3.7 + The function now supports the UTF-8 mode. + .. seealso:: The :c:func:`PyUnicode_EncodeFSDefault` and @@ -111,6 +117,9 @@ Operating System Utilities .. versionadded:: 3.5 + .. versionchanged:: 3.7 + The function now supports the UTF-8 mode. + .. _systemfunctions: diff -r 2b7b203e3909 Doc/library/os.rst --- a/Doc/library/os.rst Wed Jan 11 20:18:03 2017 +0200 +++ b/Doc/library/os.rst Wed Jan 11 23:11:17 2017 +0100 @@ -183,6 +183,9 @@ process and user. Support added to accept objects implementing the :class:`os.PathLike` interface. + .. versionchanged:: 3.7 + The UTF-8 mode can now change the encoding. + .. function:: fsdecode(filename) @@ -198,6 +201,9 @@ process and user. Support added to accept objects implementing the :class:`os.PathLike` interface. + .. versionchanged:: 3.7 + The UTF-8 mode can now change the encoding. + .. function:: fspath(path) diff -r 2b7b203e3909 Doc/library/sys.rst --- a/Doc/library/sys.rst Wed Jan 11 20:18:03 2017 +0200 +++ b/Doc/library/sys.rst Wed Jan 11 23:11:17 2017 +0100 @@ -295,6 +295,7 @@ always available. 
:const:`bytes_warning` :option:`-b` :const:`quiet` :option:`-q` :const:`hash_randomization` :option:`-R` + :const:`utf8mode` :option:`-X utf8` ============================= ============================= .. versionchanged:: 3.2 @@ -306,6 +307,9 @@ always available. .. versionchanged:: 3.3 Removed obsolete ``division_warning`` attribute. + .. versionchanged:: 3.7 + Added ``utf8mode`` attribute for the new :option:`-X utf8` flag. + .. data:: float_info @@ -451,7 +455,8 @@ always available. * On Mac OS X, the encoding is ``'utf-8'``. - * On Unix, the encoding is the locale encoding. + * On Unix, the encoding is ``'utf-8'`` in the UTF-8 mode, or the locale + encoding. * On Windows, the encoding may be ``'utf-8'`` or ``'mbcs'``, depending on user configuration. @@ -463,6 +468,10 @@ always available. Windows is no longer guaranteed to return ``'mbcs'``. See :pep:`529` and :func:`_enablelegacywindowsfsencoding` for more information. + .. versionchanged:: 3.7 + The UTF-8 mode can now change the encoding. + + .. function:: getfilesystemencodeerrors() Return the name of the error mode used to convert between Unicode filenames diff -r 2b7b203e3909 Doc/using/cmdline.rst --- a/Doc/using/cmdline.rst Wed Jan 11 20:18:03 2017 +0200 +++ b/Doc/using/cmdline.rst Wed Jan 11 23:11:17 2017 +0100 @@ -405,6 +405,7 @@ Miscellaneous options :func:`tracemalloc.start` for more information. * ``-X showalloccount`` to enable the output of the total count of allocated objects for each type (only works when built with ``COUNT_ALLOCS`` defined); + * ``-X utf8`` to enable the UTF-8 mode. It also allows passing arbitrary values and retrieving them through the :data:`sys._xoptions` dictionary. @@ -421,6 +422,9 @@ Miscellaneous options .. versionadded:: 3.6 The ``-X showalloccount`` option. + .. versionadded:: 3.7 + The ``-X utf8`` option. + Options you shouldn't use ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -711,6 +715,17 @@ conflict. .. versionadded:: 3.6 +.. 
envvar:: PYTHONUTF8 + + If set to ``1``, enable the UTF-8 mode. + + If set to ``strict``, enable the UTF-8 mode in strict mode. + + If set to ``0``, disable the UTF-8 mode. Any other value causes an error. + + .. versionadded:: 3.7 + + Debug-mode variables ~~~~~~~~~~~~~~~~~~~~ diff -r 2b7b203e3909 Include/fileobject.h --- a/Include/fileobject.h Wed Jan 11 20:18:03 2017 +0200 +++ b/Include/fileobject.h Wed Jan 11 23:11:17 2017 +0100 @@ -28,6 +28,10 @@ PyAPI_DATA(const char *) Py_FileSystemDe #endif PyAPI_DATA(int) Py_HasFileSystemDefaultEncoding; +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03070000 +PyAPI_DATA(int) Py_UTF8Mode; +#endif + /* Internal API The std printer acts as a preliminary sys.stderr until the new io diff -r 2b7b203e3909 Lib/_pyio.py --- a/Lib/_pyio.py Wed Jan 11 20:18:03 2017 +0200 +++ b/Lib/_pyio.py Wed Jan 11 23:11:17 2017 +0100 @@ -1914,19 +1914,34 @@ class TextIOWrapper(TextIOBase): raise TypeError("illegal newline type: %r" % (type(newline),)) if newline not in (None, "", "\n", "\r", "\r\n"): raise ValueError("illegal newline value: %r" % (newline,)) + + if encoding and not errors: + errors = "strict" + if encoding is None: try: - encoding = os.device_encoding(buffer.fileno()) - except (AttributeError, UnsupportedOperation): - pass - if encoding is None: + utf8mode = sys.flags.utf8mode + except AttributeError: + # TextIOWrapper created during Python shutdown, sys.flags + # was already set to None. Consider that the UTF-8 mode is + # disabled. 
+ utf8mode = False + + if utf8mode: + encoding = "utf-8" + else: try: - import locale - except ImportError: - # Importing locale may fail if Python is being built - encoding = "ascii" - else: - encoding = locale.getpreferredencoding(False) + encoding = os.device_encoding(buffer.fileno()) + except (AttributeError, UnsupportedOperation): + pass + if encoding is None: + try: + import locale + except ImportError: + # Importing locale may fail if Python is being built + encoding = "ascii" + else: + encoding = locale.getpreferredencoding(False) if not isinstance(encoding, str): raise ValueError("invalid encoding: %r" % encoding) @@ -1937,7 +1952,12 @@ class TextIOWrapper(TextIOBase): raise LookupError(msg % encoding) if errors is None: - errors = "strict" + if sys.flags.utf8mode == 2: + errors = "strict" + elif sys.flags.utf8mode: + errors = "surrogateescape" + else: + errors = "strict" else: if not isinstance(errors, str): raise ValueError("invalid errors: %r" % errors) diff -r 2b7b203e3909 Lib/subprocess.py --- a/Lib/subprocess.py Wed Jan 11 20:18:03 2017 +0200 +++ b/Lib/subprocess.py Wed Jan 11 23:11:17 2017 +0100 @@ -251,6 +251,10 @@ def _args_from_interpreter_flags(): v = getattr(sys.flags, flag) if v > 0: args.append('-' + opt * v) + if sys.flags.utf8mode == 2: + args.extend(('-X', 'utf8=strict')) + elif sys.flags.utf8mode: + args.extend(('-X', 'utf8')) for opt in sys.warnoptions: args.append('-W' + opt) return args diff -r 2b7b203e3909 Lib/test/support/__init__.py --- a/Lib/test/support/__init__.py Wed Jan 11 20:18:03 2017 +0200 +++ b/Lib/test/support/__init__.py Wed Jan 11 23:11:17 2017 +0100 @@ -919,6 +919,7 @@ for name in ( TESTFN_UNDECODABLE = os.fsencode(TESTFN) + name break + if FS_NONASCII: TESTFN_NONASCII = TESTFN + '-' + FS_NONASCII else: diff -r 2b7b203e3909 Lib/test/test_builtin.py --- a/Lib/test/test_builtin.py Wed Jan 11 20:18:03 2017 +0200 +++ b/Lib/test/test_builtin.py Wed Jan 11 23:11:17 2017 +0100 @@ -1002,6 +1002,7 @@ class 
BuiltinTest(unittest.TestCase): self.assertEqual(fp.read(300), 'XXX'*100) self.assertEqual(fp.read(1000), 'YYY'*100) + @unittest.skipIf(sys.flags.utf8mode, "utf-8 mode is enabled") def test_open_default_encoding(self): old_environ = dict(os.environ) try: diff -r 2b7b203e3909 Lib/test/test_capi.py --- a/Lib/test/test_capi.py Wed Jan 11 20:18:03 2017 +0200 +++ b/Lib/test/test_capi.py Wed Jan 11 23:11:17 2017 +0100 @@ -401,6 +401,7 @@ class EmbeddingTests(unittest.TestCase): os.close(rp) return default_pipe_encoding + @unittest.skipIf(sys.flags.utf8mode, "utf-8 mode is enabled") def test_forced_io_encoding(self): # Checks forced configuration of embedded interpreter IO streams out, err = self.run_embedded_interpreter("forced_io_encoding") diff -r 2b7b203e3909 Lib/test/test_io.py --- a/Lib/test/test_io.py Wed Jan 11 20:18:03 2017 +0200 +++ b/Lib/test/test_io.py Wed Jan 11 23:11:17 2017 +0100 @@ -2446,6 +2446,7 @@ class TextIOWrapperTest(unittest.TestCas t.write("A\rB") self.assertEqual(r.getvalue(), b"XY\nZA\rB") + @unittest.skipIf(sys.flags.utf8mode, "utf-8 mode is enabled") def test_default_encoding(self): old_environ = dict(os.environ) try: @@ -2465,6 +2466,7 @@ class TextIOWrapperTest(unittest.TestCas os.environ.update(old_environ) @support.cpython_only + @unittest.skipIf(sys.flags.utf8mode, "utf-8 mode is enabled") def test_device_encoding(self): # Issue 15989 import _testcapi @@ -2982,6 +2984,7 @@ class TextIOWrapperTest(unittest.TestCas with self.open(filename, 'rb') as f: self.assertEqual(f.read(), 'aaaxxx'.encode(charset)) + @unittest.skipIf(sys.flags.utf8mode, "utf-8 mode is enabled") def test_errors_property(self): with self.open(support.TESTFN, "w") as f: self.assertEqual(f.errors, "strict") diff -r 2b7b203e3909 Lib/test/test_sys.py --- a/Lib/test/test_sys.py Wed Jan 11 20:18:03 2017 +0200 +++ b/Lib/test/test_sys.py Wed Jan 11 23:11:17 2017 +0100 @@ -559,6 +559,9 @@ class SysModuleTest(unittest.TestCase): self.assertTrue(repr(sys.flags)) 
self.assertEqual(len(sys.flags), len(attrs)) + def test_sys_flags_utf8mode(self): + self.assertIn(sys.flags.utf8mode, {0, 1, 2}) + def assert_raise_on_new_sys_type(self, sys_attr): # Users are intentionally prevented from creating new instances of # sys.flags, sys.version_info, and sys.getwindowsversion. diff -r 2b7b203e3909 Lib/test/test_utf8mode.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Lib/test/test_utf8mode.py Wed Jan 11 23:11:17 2017 +0100 @@ -0,0 +1,147 @@ +import os +import textwrap +import unittest +from test.support.script_helper import assert_python_ok, assert_python_failure + + +class UTF8ModeTests(unittest.TestCase): + def test_xoption(self): + code = 'import sys; print(sys.flags.utf8mode)' + + # Use __cleanenv to ignore PYTHONUTF8 + out = assert_python_ok('-X', 'utf8', '-c', code, __cleanenv=True) + self.assertEqual(out[1].rstrip(), b'1') + + out = assert_python_ok('-X', 'utf8=strict', '-c', code, __cleanenv=True) + self.assertEqual(out[1].rstrip(), b'2') + + out = assert_python_ok('-X', 'utf8=0', '-c', code, __cleanenv=True) + self.assertEqual(out[1].rstrip(), b'0') + + def test_env_var(self): + code = 'import sys; print(sys.flags.utf8mode)' + + out = assert_python_ok('-c', code, PYTHONUTF8='1') + self.assertEqual(out[1].rstrip(), b'1') + + out = assert_python_ok('-c', code, PYTHONUTF8='strict') + self.assertEqual(out[1].rstrip(), b'2') + + out = assert_python_ok('-c', code, PYTHONUTF8='0') + self.assertEqual(out[1].rstrip(), b'0') + + # -X utf8 overrides env var + out = assert_python_ok('-X', 'utf8=strict', '-c', code, PYTHONUTF8='1') + self.assertEqual(out[1].rstrip(), b'2') + + # invalid mode + out = assert_python_failure('-c', code, PYTHONUTF8='xxx') + self.assertIn(b'Error in PYTHONUTF8: invalid UTF-8 mode "xxx"!', + out[2].rstrip()) + + def test_filesystemencoding(self): + code = 'import sys; print(sys.getfilesystemencoding(), sys.getfilesystemencodeerrors())' + + out = assert_python_ok('-X', 'utf8', '-c', code) + 
self.assertEqual(out[1].rstrip(), b'utf-8 surrogateescape') + + out = assert_python_ok('-X', 'utf8=strict', '-c', code) + self.assertEqual(out[1].rstrip(), b'utf-8 surrogateescape') + + def test_stdio(self): + code = textwrap.dedent(''' + import sys + print(f"stdin: {sys.stdin.encoding}/{sys.stdin.errors}") + print(f"stdout: {sys.stdout.encoding}/{sys.stdout.errors}") + print(f"stderr: {sys.stderr.encoding}/{sys.stderr.errors}") + ''') + + # Use __cleanenv to ignore PYTHONIOENCODING + out = assert_python_ok('-c', code, + PYTHONUTF8='1', __cleanenv=True) + self.assertEqual(out[1].splitlines(), + [b'stdin: utf-8/surrogateescape', + b'stdout: utf-8/surrogateescape', + b'stderr: utf-8/backslashreplace']) + + out = assert_python_ok('-c', code, + PYTHONUTF8='strict', __cleanenv=True) + self.assertEqual(out[1].splitlines(), + [b'stdin: utf-8/strict', + b'stdout: utf-8/strict', + b'stderr: utf-8/backslashreplace']) + + # PYTHONIOENCODING has the priority over PYTHONUTF8 + out = assert_python_ok('-c', code, + PYTHONUTF8='1', PYTHONIOENCODING="latin1") + self.assertEqual(out[1].splitlines(), + [b'stdin: latin1/strict', + b'stdout: latin1/strict', + b'stderr: latin1/backslashreplace']) + + out = assert_python_ok('-c', code, + PYTHONUTF8='1', PYTHONIOENCODING=":namereplace") + self.assertEqual(out[1].splitlines(), + [b'stdin: utf-8/namereplace', + b'stdout: utf-8/namereplace', + b'stderr: utf-8/backslashreplace']) + + def test_io(self): + code = textwrap.dedent(''' + import sys + filename = sys.argv[1] + with open(filename) as fp: + print(f"{fp.encoding}/{fp.errors}") + ''') + filename = __file__ + + out = assert_python_ok('-c', code, filename, PYTHONUTF8='1') + self.assertEqual(out[1].rstrip(), b'utf-8/surrogateescape') + + out = assert_python_ok('-c', code, filename, PYTHONUTF8='strict') + self.assertEqual(out[1].rstrip(), b'utf-8/strict') + + def _check_io_encoding(self, module, encoding=None, errors=None): + filename = __file__ + + # Encoding explicitly set + args = [] + 
if encoding: + args.append(f'encoding={encoding!r}') + if errors: + args.append(f'errors={errors!r}') + code = textwrap.dedent(''' + import sys + from %s import open + filename = sys.argv[1] + with open(filename, %s) as fp: + print(f"{fp.encoding}/{fp.errors}") + ''') % (module, ', '.join(args)) + out = assert_python_ok('-c', code, filename, + PYTHONUTF8='1') + + if not errors: + if encoding: + errors = 'strict' + else: + errors = 'surrogateescape' + if not encoding: + encoding = 'utf-8' + self.assertEqual(out[1].rstrip().decode(), + f'{encoding}/{errors}') + + def check_io_encoding(self, module): + self._check_io_encoding(module, encoding="latin1") + self._check_io_encoding(module, errors="namereplace") + self._check_io_encoding(module, + encoding="latin1", errors="namereplace") + + def test_io_encoding(self): + self.check_io_encoding('io') + + def test_pyio_encoding(self): + self.check_io_encoding('_pyio') + + +if __name__ == "__main__": + unittest.main() diff -r 2b7b203e3909 Modules/_io/textio.c --- a/Modules/_io/textio.c Wed Jan 11 20:18:03 2017 +0200 +++ b/Modules/_io/textio.c Wed Jan 11 23:11:17 2017 +0100 @@ -864,7 +864,18 @@ static int self->encodefunc = NULL; self->b2cratio = 0.0; - if (encoding == NULL) { + if (encoding && !errors) { + errors = "strict"; + } + + if (encoding == NULL && Py_UTF8Mode) { + self->encoding = PyUnicode_FromString("utf-8"); + if (self->encoding == NULL) { + goto error; + } + } + + if (encoding == NULL && self->encoding == NULL) { /* Try os.device_encoding(fileno) */ PyObject *fileno; state = IO_STATE(); @@ -895,6 +906,7 @@ static int Py_CLEAR(self->encoding); } } + if (encoding == NULL && self->encoding == NULL) { PyObject *locale_module = _PyIO_get_locale_module(state); if (locale_module == NULL) @@ -946,8 +958,14 @@ static int * of the partially constructed object (like self->encoding) */ - if (errors == NULL) - errors = "strict"; + if (errors == NULL) { + if (Py_UTF8Mode == 2) + errors = "strict"; + else if (Py_UTF8Mode) + 
errors = "surrogateescape"; + else + errors = "strict"; + } self->errors = PyBytes_FromString(errors); if (self->errors == NULL) goto error; diff -r 2b7b203e3909 Modules/main.c --- a/Modules/main.c Wed Jan 11 20:18:03 2017 +0200 +++ b/Modules/main.c Wed Jan 11 23:11:17 2017 +0100 @@ -350,14 +350,14 @@ Py_Main(int argc, wchar_t **argv) PyCompilerFlags cf; PyObject *warning_option = NULL; PyObject *warning_options = NULL; + int utf8mode = -1; cf.cf_flags = 0; orig_argc = argc; /* For Py_GetArgcArgv() */ orig_argv = argv; - /* Hash randomization needed early for all string operations - (including -W and -X options). */ + /* Hash randomization and -X utf8 needed early */ _PyOS_opterr = 0; /* prevent printing the error in 1st pass */ while ((c = _PyOS_GetOpt(argc, argv, PROGRAM_OPTS)) != EOF) { if (c == 'm' || c == 'c') { @@ -367,7 +367,19 @@ Py_Main(int argc, wchar_t **argv) } if (c == 'E') { Py_IgnoreEnvironmentFlag++; - break; + } + else if (c == 'X') { + if (wcscmp(_PyOS_optarg, L"utf8") == 0) { + utf8mode = 1; + } + else if (wcscmp(_PyOS_optarg, L"utf8=strict") == 0) { + utf8mode = 2; + } + else if (wcscmp(_PyOS_optarg, L"utf8=0") == 0) { + utf8mode = 0; + } + /* other invalid values of "-X utf8" are rejected + in _PyUTF8Mode_Init() */ } } @@ -378,6 +390,35 @@ Py_Main(int argc, wchar_t **argv) exit(1); } + opt = Py_GETENV("PYTHONUTF8"); + if (opt) { + int env_utf8mode = -1; + + if (strcmp(opt, "1") == 0) { + env_utf8mode = 1; + } + else if (strcmp(opt, "strict") == 0) { + env_utf8mode = 2; + } + else if (strcmp(opt, "0") == 0) { + env_utf8mode = 0; + } + else { + fprintf(stderr, + "Error in PYTHONUTF8: invalid UTF-8 mode \"%s\"!\n", opt); + exit(1); + } + + /* -X utf8 has the priority over the PYTHONUTF8 environment variable */ + if (utf8mode == -1) { + utf8mode = env_utf8mode; + } + } + + if (utf8mode != -1) { + Py_UTF8Mode = utf8mode; + } + Py_HashRandomizationFlag = 1; _PyRandom_Init(); diff -r 2b7b203e3909 Objects/unicodeobject.c --- a/Objects/unicodeobject.c 
Wed Jan 11 20:18:03 2017 +0200 +++ b/Objects/unicodeobject.c Wed Jan 11 23:11:17 2017 +0100 @@ -5067,16 +5067,17 @@ onError: return NULL; } -#if defined(__APPLE__) || defined(__ANDROID__) - -/* Simplified UTF-8 decoder using surrogateescape error handler, - used to decode the command line arguments on Mac OS X and Android. - - Return a pointer to a newly allocated wide character string (use - PyMem_RawFree() to free the memory), or NULL on memory allocation error. */ - + +/* UTF-8 decoder using the surrogateescape error handler. + + On success, return a pointer to a newly allocated wide character string (use + PyMem_RawFree() to free the memory) and write the output length (in number + of wchar_t units) into *p_wlen (if p_wlen is set). + + On memory allocation failure, return NULL and write (size_t)-1 into *p_wlen + (if p_wlen is set). */ wchar_t* -_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) +_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen) { const char *e; wchar_t *unicode; @@ -5084,11 +5085,20 @@ wchar_t* /* Note: size will always be longer than the resulting Unicode character count */ - if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) - return NULL; + if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) { + if (p_wlen) { + *p_wlen = (size_t)-1; + } + return NULL; + } + unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t)); - if (!unicode) - return NULL; + if (!unicode) { + if (p_wlen) { + *p_wlen = (size_t)-1; + } + return NULL; + } /* Unpack UTF-8 encoded data */ e = s + size; @@ -5118,10 +5128,12 @@ wchar_t* } } unicode[outpos] = L'\0'; + if (p_wlen) { + *p_wlen = outpos; + } return unicode; } -#endif /* __APPLE__ or __ANDROID__ */ /* Primary internal function which creates utf8 encoded bytes objects. 
diff -r 2b7b203e3909 Programs/python.c --- a/Programs/python.c Wed Jan 11 20:18:03 2017 +0200 +++ b/Programs/python.c Wed Jan 11 23:11:17 2017 +0100 @@ -15,14 +15,22 @@ wmain(int argc, wchar_t **argv) } #else +static void _Py_NO_RETURN +fatal_error(const char *msg) +{ + fprintf(stderr, "Fatal Python error: %s\n", msg); + exit(1); +} + int main(int argc, char **argv) { wchar_t **argv_copy; - /* We need a second copy, as Python might modify the first one. */ + /* We need a second copy to release the memory: + Py_Main() modifies argv_copy */ wchar_t **argv_copy2; int i, res; - char *oldloc; + char *loc, *oldloc; /* Force malloc() allocator to bootstrap Python */ (void)_PyMem_SetupAllocators("malloc"); @@ -30,7 +38,7 @@ main(int argc, char **argv) argv_copy = (wchar_t **)PyMem_RawMalloc(sizeof(wchar_t*) * (argc+1)); argv_copy2 = (wchar_t **)PyMem_RawMalloc(sizeof(wchar_t*) * (argc+1)); if (!argv_copy || !argv_copy2) { - fprintf(stderr, "out of memory\n"); + fatal_error("out of memory"); return 1; } @@ -45,19 +53,25 @@ main(int argc, char **argv) oldloc = _PyMem_RawStrdup(setlocale(LC_ALL, NULL)); if (!oldloc) { - fprintf(stderr, "out of memory\n"); - return 1; + fatal_error("out of memory (failed to copy the LC_ALL locale)"); } setlocale(LC_ALL, ""); + + loc = setlocale(LC_CTYPE, NULL); + if (!loc) { + fatal_error("failed to get the LC_CTYPE locale"); + } + if (strcmp(loc, "C") == 0) { + /* The POSIX locale enables the UTF-8 mode */ + Py_UTF8Mode = 1; + } + for (i = 0; i < argc; i++) { argv_copy[i] = Py_DecodeLocale(argv[i], NULL); if (!argv_copy[i]) { PyMem_RawFree(oldloc); - fprintf(stderr, "Fatal Python error: " - "unable to decode the command line argument #%i\n", - i + 1); - return 1; + fatal_error("unable to decode the command line arguments"); } argv_copy2[i] = argv_copy[i]; } diff -r 2b7b203e3909 Python/bltinmodule.c --- a/Python/bltinmodule.c Wed Jan 11 20:18:03 2017 +0200 +++ b/Python/bltinmodule.c Wed Jan 11 23:11:17 2017 +0100 @@ -33,6 +33,14 @@ const 
char *Py_FileSystemDefaultEncoding int Py_HasFileSystemDefaultEncoding = 0; #endif const char *Py_FileSystemDefaultEncodeErrors = "surrogateescape"; +/* UTF-8 mode (PEP 540): + 0: UTF-8 mode disabled, use the locale encoding with strict or + surrogateescape error handler depending on the case + 1: UTF-8 mode enabled, use UTF-8 with surrogateescape error handler by + default and ignore the locale + 2: UTF-8 mode in strict mode, use UTF-8 with strict error handler by default + and ignore the locale */ +int Py_UTF8Mode = 0; _Py_IDENTIFIER(__builtins__); _Py_IDENTIFIER(__dict__); diff -r 2b7b203e3909 Python/fileutils.c --- a/Python/fileutils.c Wed Jan 11 20:18:03 2017 +0200 +++ b/Python/fileutils.c Wed Jan 11 23:11:17 2017 +0100 @@ -20,9 +20,8 @@ extern int winerror_to_errno(int); #include #endif /* HAVE_FCNTL_H */ -#if defined(__APPLE__) || defined(__ANDROID__) -extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size); -#endif +extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, + size_t *p_wlen); #ifdef O_CLOEXEC /* Does open() support the O_CLOEXEC flag? Possible values: @@ -250,40 +249,9 @@ decode_ascii_surrogateescape(const char } #endif - -/* Decode a byte string from the locale encoding with the - surrogateescape error handler: undecodable bytes are decoded as characters - in range U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate - character, escape the bytes using the surrogateescape error handler instead - of decoding them. - - Return a pointer to a newly allocated wide character string, use - PyMem_RawFree() to free the memory. If size is not NULL, write the number of - wide characters excluding the null character into *size - - Return NULL on decoding error or memory allocation error. If *size* is not - NULL, *size is set to (size_t)-1 on memory error or set to (size_t)-2 on - decoding error. - - Decoding errors should never happen, unless there is a bug in the C - library. 
- - Use the Py_EncodeLocale() function to encode the character string back to a - byte string. */ -wchar_t* -Py_DecodeLocale(const char* arg, size_t *size) +static wchar_t* +decode_locale(const char* arg, size_t *size) { -#if defined(__APPLE__) || defined(__ANDROID__) - wchar_t *wstr; - wstr = _Py_DecodeUTF8_surrogateescape(arg, strlen(arg)); - if (size != NULL) { - if (wstr != NULL) - *size = wcslen(wstr); - else - *size = (size_t)-1; - } - return wstr; -#else wchar_t *res; size_t argsize; size_t count; @@ -293,19 +261,6 @@ Py_DecodeLocale(const char* arg, size_t mbstate_t mbs; #endif -#ifndef MS_WINDOWS - if (force_ascii == -1) - force_ascii = check_force_ascii(); - - if (force_ascii) { - /* force ASCII encoding to workaround mbstowcs() issue */ - res = decode_ascii_surrogateescape(arg, size); - if (res == NULL) - goto oom; - return res; - } -#endif - #ifdef HAVE_BROKEN_MBSTOWCS /* Some platforms have a broken implementation of * mbstowcs which does not count the characters that @@ -402,43 +357,84 @@ Py_DecodeLocale(const char* arg, size_t goto oom; #endif /* HAVE_MBRTOWC */ return res; + oom: - if (size != NULL) + if (size != NULL) { *size = (size_t)-1; + } return NULL; +} + + +/* Decode a byte string from the locale encoding with the + surrogateescape error handler: undecodable bytes are decoded as characters + in range U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate + character, escape the bytes using the surrogateescape error handler instead + of decoding them. + + Return a pointer to a newly allocated wide character string, use + PyMem_RawFree() to free the memory. If size is not NULL, write the number of + wide characters excluding the null character into *size + + Return NULL on decoding error or memory allocation error. If *size* is not + NULL, *size is set to (size_t)-1 on memory error or set to (size_t)-2 on + decoding error. + + Decoding errors should never happen, unless there is a bug in the C + library. 
+ + Use the Py_EncodeLocale() function to encode the character string back to a + byte string. */ +wchar_t* +Py_DecodeLocale(const char* arg, size_t *size) +{ +#if defined(__APPLE__) || defined(__ANDROID__) + return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size); +#else + if (Py_UTF8Mode) { + return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size); + } + +#ifndef MS_WINDOWS + if (force_ascii == -1) + force_ascii = check_force_ascii(); + + if (force_ascii) { + /* force ASCII encoding to workaround mbstowcs() issue */ + wchar_t *wstr = decode_ascii_surrogateescape(arg, size); + if (wstr == NULL) { + if (size != NULL) { + *size = (size_t)-1; + } + return NULL; + } + return wstr; + } +#endif + + return decode_locale(arg, size); #endif /* __APPLE__ or __ANDROID__ */ } -/* Encode a wide character string to the locale encoding with the - surrogateescape error handler: surrogate characters in the range - U+DC80..U+DCFF are converted to bytes 0x80..0xFF. - - Return a pointer to a newly allocated byte string, use PyMem_Free() to free - the memory. Return NULL on encoding or memory allocation error. - - If error_pos is not NULL, *error_pos is set to the index of the invalid - character on encoding error, or set to (size_t)-1 otherwise. - - Use the Py_DecodeLocale() function to decode the bytes string back to a wide - character string. 
*/ -char* -Py_EncodeLocale(const wchar_t *text, size_t *error_pos) +static char* +_Py_EncodeLocaleUTF8(const wchar_t *text, size_t *error_pos) { -#if defined(__APPLE__) || defined(__ANDROID__) Py_ssize_t len; PyObject *unicode, *bytes = NULL; char *cpath; unicode = PyUnicode_FromWideChar(text, wcslen(text)); - if (unicode == NULL) + if (unicode == NULL) { return NULL; + } bytes = _PyUnicode_AsUTF8String(unicode, "surrogateescape"); Py_DECREF(unicode); if (bytes == NULL) { PyErr_Clear(); - if (error_pos != NULL) + if (error_pos != NULL) { *error_pos = (size_t)-1; + } return NULL; } @@ -447,27 +443,24 @@ Py_EncodeLocale(const wchar_t *text, siz if (cpath == NULL) { PyErr_Clear(); Py_DECREF(bytes); - if (error_pos != NULL) + if (error_pos != NULL) { *error_pos = (size_t)-1; + } return NULL; } memcpy(cpath, PyBytes_AsString(bytes), len + 1); Py_DECREF(bytes); return cpath; -#else /* __APPLE__ */ +} + +static char* +encode_locale(const wchar_t *text, size_t *error_pos) +{ const size_t len = wcslen(text); char *result = NULL, *bytes = NULL; size_t i, size, converted; wchar_t c, buf[2]; -#ifndef MS_WINDOWS - if (force_ascii == -1) - force_ascii = check_force_ascii(); - - if (force_ascii) - return encode_ascii_surrogateescape(text, error_pos); -#endif - /* The function works in two steps: 1. compute the length of the output buffer in bytes (size) 2. outputs the bytes */ @@ -522,6 +515,39 @@ Py_EncodeLocale(const wchar_t *text, siz bytes = result; } return result; +} + +/* Encode a wide character string to the locale encoding with the + surrogateescape error handler: surrogate characters in the range + U+DC80..U+DCFF are converted to bytes 0x80..0xFF. + + Return a pointer to a newly allocated byte string, use PyMem_Free() to free + the memory. Return NULL on encoding or memory allocation error. + + If error_pos is not NULL, *error_pos is set to (size_t)-1 on success, or set + to the index of the invalid character on encoding error. 
+ + Use the Py_DecodeLocale() function to decode the bytes string back to a wide + character string. */ +char* +Py_EncodeLocale(const wchar_t *text, size_t *error_pos) +{ +#if defined(__APPLE__) || defined(__ANDROID__) + return _Py_EncodeLocaleUTF8(text, error_pos); +#else /* __APPLE__ */ + if (Py_UTF8Mode) { + return _Py_EncodeLocaleUTF8(text, error_pos); + } + +#ifndef MS_WINDOWS + if (force_ascii == -1) + force_ascii = check_force_ascii(); + + if (force_ascii) + return encode_ascii_surrogateescape(text, error_pos); +#endif + + return encode_locale(text, error_pos); #endif /* __APPLE__ or __ANDROID__ */ } diff -r 2b7b203e3909 Python/pylifecycle.c --- a/Python/pylifecycle.c Wed Jan 11 20:18:03 2017 +0200 +++ b/Python/pylifecycle.c Wed Jan 11 23:11:17 2017 +0100 @@ -302,6 +302,49 @@ import_init(PyInterpreterState *interp, } +static int +_PyUTF8Mode_Init(void) +{ + PyObject *xoptions, *key, *value; + int mode; + + xoptions = PySys_GetXOptions(); + if (xoptions == NULL) + return -1; + + key = PyUnicode_FromString("utf8"); + if (key == NULL) + return -1; + + value = PyDict_GetItemWithError(xoptions, key); + Py_DECREF(key); + if (value == NULL && PyErr_Occurred()) { + return -1; + } + if (value == NULL) { + return 0; + } + + if (value == Py_True) { + mode = 1; + } + else if (PyUnicode_CompareWithASCIIString(value, "strict") == 0) { + mode = 2; + } + else if (PyUnicode_CompareWithASCIIString(value, "0") == 0) { + mode = 0; + } + else { + fprintf(stderr, "Invalid UTF-8 mode (-X option)!\n"); + return -1; + } + + /* Py_Main() handles -X utf8 early: just make sure that it's consistent */ + assert(Py_UTF8Mode == mode); + return 0; +} + + void _Py_InitializeEx_Private(int install_sigs, int install_importlib) { @@ -344,6 +387,10 @@ void _PyRandom_Init(); + if (_PyUTF8Mode_Init() < 0) { + Py_FatalError("Py_Initialize: UTF-8 mode initialization failed"); + } + interp = PyInterpreterState_New(); if (interp == NULL) Py_FatalError("Py_Initialize: can't make first interpreter"); @@ 
-997,15 +1044,19 @@ initfsencoding(PyInterpreterState *inter Py_FileSystemDefaultEncodeErrors = "surrogatepass"; } #else - if (Py_FileSystemDefaultEncoding == NULL) - { - Py_FileSystemDefaultEncoding = get_locale_encoding(); - if (Py_FileSystemDefaultEncoding == NULL) - Py_FatalError("Py_Initialize: Unable to get the locale encoding"); + if (Py_FileSystemDefaultEncoding == NULL) { + if (!Py_UTF8Mode) { + Py_FileSystemDefaultEncoding = get_locale_encoding(); + if (Py_FileSystemDefaultEncoding == NULL) + Py_FatalError("Py_Initialize: Unable to get the locale encoding"); - Py_HasFileSystemDefaultEncoding = 0; - interp->fscodec_initialized = 1; - return 0; + Py_HasFileSystemDefaultEncoding = 0; + interp->fscodec_initialized = 1; + return 0; + } + + Py_FileSystemDefaultEncoding = "utf-8"; + Py_HasFileSystemDefaultEncoding = 1; } #endif @@ -1242,6 +1293,11 @@ initstdio(void) encoding = pythonioencoding; } } + else if (Py_UTF8Mode) { + encoding = "utf-8"; + errors = (Py_UTF8Mode == 2 ) ? "strict" : "surrogateescape"; + } + if (!errors && !(pythonioencoding && *pythonioencoding)) { /* When the LC_CTYPE locale is the POSIX locale ("C locale"), stdin and stdout use the surrogateescape error handler by diff -r 2b7b203e3909 Python/sysmodule.c --- a/Python/sysmodule.c Wed Jan 11 20:18:03 2017 +0200 +++ b/Python/sysmodule.c Wed Jan 11 23:11:17 2017 +0100 @@ -1713,6 +1713,7 @@ static PyStructSequence_Field flags_fiel {"quiet", "-q"}, {"hash_randomization", "-R"}, {"isolated", "-I"}, + {"utf8mode", "-X utf8"}, {0} }; @@ -1751,6 +1752,7 @@ make_flags(void) SetFlag(Py_QuietFlag); SetFlag(Py_HashRandomizationFlag); SetFlag(Py_IsolatedFlag); + SetFlag(Py_UTF8Mode); #undef SetFlag if (PyErr_Occurred()) {