diff -r 99ad6e871459 Doc/library/sys.rst --- a/Doc/library/sys.rst Wed Jan 11 06:57:55 2017 +0000 +++ b/Doc/library/sys.rst Wed Jan 11 12:23:56 2017 +0100 @@ -295,6 +295,7 @@ always available. :const:`bytes_warning` :option:`-b` :const:`quiet` :option:`-q` :const:`hash_randomization` :option:`-R` + :const:`utf8mode` :option:`-X utf8` ============================= ============================= .. versionchanged:: 3.2 @@ -306,6 +307,9 @@ always available. .. versionchanged:: 3.3 Removed obsolete ``division_warning`` attribute. + .. versionchanged:: 3.7 + Added ``utf8mode`` attribute for the new :option:`-X utf8` flag. + .. data:: float_info @@ -449,6 +453,9 @@ always available. :func:`os.fsencode` and :func:`os.fsdecode` should be used to ensure that the correct encoding and errors mode are used. + The UTF-8 mode uses ``utf-8`` encoding with ``surrogateescape`` error + handler, or the ``strict`` error handler for UTF-8 mode in strict mode. + * On Mac OS X, the encoding is ``'utf-8'``. * On Unix, the encoding is the locale encoding. @@ -463,6 +470,9 @@ always available. Windows is no longer guaranteed to return ``'mbcs'``. See :pep:`529` and :func:`_enablelegacywindowsfsencoding` for more information. + .. versionchanged:: 3.7 + The UTF-8 mode changes the encoding and error handler. + .. function:: getfilesystemencodeerrors() Return the name of the error mode used to convert between Unicode filenames diff -r 99ad6e871459 Doc/using/cmdline.rst --- a/Doc/using/cmdline.rst Wed Jan 11 06:57:55 2017 +0000 +++ b/Doc/using/cmdline.rst Wed Jan 11 12:23:56 2017 +0100 @@ -405,6 +405,7 @@ Miscellaneous options :func:`tracemalloc.start` for more information. * ``-X showalloccount`` to enable the output of the total count of allocated objects for each type (only works when built with ``COUNT_ALLOCS`` defined); + * ``-X utf8`` to enable the UTF-8 mode. It also allows passing arbitrary values and retrieving them through the :data:`sys._xoptions` dictionary. 
@@ -421,6 +422,9 @@ Miscellaneous options .. versionadded:: 3.6 The ``-X showalloccount`` option. + .. versionadded:: 3.7 + The ``-X utf8`` option. + Options you shouldn't use ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -711,6 +715,17 @@ conflict. .. versionadded:: 3.6 +.. envvar:: PYTHONUTF8 + + If set to ``1``, enable the UTF-8 mode. + + If set to ``strict``, enable the UTF-8 mode in strict mode. + + Setting it to ``0`` disables the UTF-8 mode; any other value causes an error. + + .. versionadded:: 3.7 + + Debug-mode variables ~~~~~~~~~~~~~~~~~~~~ diff -r 99ad6e871459 Include/fileobject.h --- a/Include/fileobject.h Wed Jan 11 06:57:55 2017 +0000 +++ b/Include/fileobject.h Wed Jan 11 12:23:56 2017 +0100 @@ -28,6 +28,10 @@ PyAPI_DATA(const char *) Py_FileSystemDe #endif PyAPI_DATA(int) Py_HasFileSystemDefaultEncoding; +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03070000 +PyAPI_DATA(int) Py_UTF8Mode; +#endif + /* Internal API The std printer acts as a preliminary sys.stderr until the new io diff -r 99ad6e871459 Lib/_pyio.py --- a/Lib/_pyio.py Wed Jan 11 06:57:55 2017 +0000 +++ b/Lib/_pyio.py Wed Jan 11 12:23:56 2017 +0100 @@ -1914,19 +1914,34 @@ class TextIOWrapper(TextIOBase): raise TypeError("illegal newline type: %r" % (type(newline),)) if newline not in (None, "", "\n", "\r", "\r\n"): raise ValueError("illegal newline value: %r" % (newline,)) + + if encoding and not errors: + errors = "strict" + if encoding is None: try: - encoding = os.device_encoding(buffer.fileno()) - except (AttributeError, UnsupportedOperation): - pass - if encoding is None: + utf8mode = sys.flags.utf8mode + except AttributeError: + # TextIOWrapper created during Python shutdown, sys.flags + # was already set to None. Consider that the UTF-8 mode is + # disabled. 
+ utf8mode = False + + if utf8mode: + encoding = "utf-8" + else: try: - import locale - except ImportError: - # Importing locale may fail if Python is being built - encoding = "ascii" - else: - encoding = locale.getpreferredencoding(False) + encoding = os.device_encoding(buffer.fileno()) + except (AttributeError, UnsupportedOperation): + pass + if encoding is None: + try: + import locale + except ImportError: + # Importing locale may fail if Python is being built + encoding = "ascii" + else: + encoding = locale.getpreferredencoding(False) if not isinstance(encoding, str): raise ValueError("invalid encoding: %r" % encoding) @@ -1937,7 +1952,12 @@ class TextIOWrapper(TextIOBase): raise LookupError(msg % encoding) if errors is None: - errors = "strict" + if sys.flags.utf8mode == 2: + errors = "strict" + elif sys.flags.utf8mode: + errors = "surrogateescape" + else: + errors = "strict" else: if not isinstance(errors, str): raise ValueError("invalid errors: %r" % errors) diff -r 99ad6e871459 Lib/subprocess.py --- a/Lib/subprocess.py Wed Jan 11 06:57:55 2017 +0000 +++ b/Lib/subprocess.py Wed Jan 11 12:23:56 2017 +0100 @@ -251,6 +251,10 @@ def _args_from_interpreter_flags(): v = getattr(sys.flags, flag) if v > 0: args.append('-' + opt * v) + if sys.flags.utf8mode == 2: + args.extend(('-X', 'utf8=strict')) + elif sys.flags.utf8mode: + args.extend(('-X', 'utf8')) for opt in sys.warnoptions: args.append('-W' + opt) return args diff -r 99ad6e871459 Lib/test/test_builtin.py --- a/Lib/test/test_builtin.py Wed Jan 11 06:57:55 2017 +0000 +++ b/Lib/test/test_builtin.py Wed Jan 11 12:23:56 2017 +0100 @@ -1002,6 +1002,7 @@ class BuiltinTest(unittest.TestCase): self.assertEqual(fp.read(300), 'XXX'*100) self.assertEqual(fp.read(1000), 'YYY'*100) + @unittest.skipIf(sys.flags.utf8mode, "utf-8 mode is enabled") def test_open_default_encoding(self): old_environ = dict(os.environ) try: diff -r 99ad6e871459 Lib/test/test_capi.py --- a/Lib/test/test_capi.py Wed Jan 11 06:57:55 2017 +0000 +++ 
b/Lib/test/test_capi.py Wed Jan 11 12:23:56 2017 +0100 @@ -401,6 +401,7 @@ class EmbeddingTests(unittest.TestCase): os.close(rp) return default_pipe_encoding + @unittest.skipIf(sys.flags.utf8mode, "utf-8 mode is enabled") def test_forced_io_encoding(self): # Checks forced configuration of embedded interpreter IO streams out, err = self.run_embedded_interpreter("forced_io_encoding") diff -r 99ad6e871459 Lib/test/test_io.py --- a/Lib/test/test_io.py Wed Jan 11 06:57:55 2017 +0000 +++ b/Lib/test/test_io.py Wed Jan 11 12:23:56 2017 +0100 @@ -2446,6 +2446,7 @@ class TextIOWrapperTest(unittest.TestCas t.write("A\rB") self.assertEqual(r.getvalue(), b"XY\nZA\rB") + @unittest.skipIf(sys.flags.utf8mode, "utf-8 mode is enabled") def test_default_encoding(self): old_environ = dict(os.environ) try: @@ -2465,6 +2466,7 @@ class TextIOWrapperTest(unittest.TestCas os.environ.update(old_environ) @support.cpython_only + @unittest.skipIf(sys.flags.utf8mode, "utf-8 mode is enabled") def test_device_encoding(self): # Issue 15989 import _testcapi @@ -2982,6 +2984,7 @@ class TextIOWrapperTest(unittest.TestCas with self.open(filename, 'rb') as f: self.assertEqual(f.read(), 'aaaxxx'.encode(charset)) + @unittest.skipIf(sys.flags.utf8mode, "utf-8 mode is enabled") def test_errors_property(self): with self.open(support.TESTFN, "w") as f: self.assertEqual(f.errors, "strict") diff -r 99ad6e871459 Lib/test/test_sys.py --- a/Lib/test/test_sys.py Wed Jan 11 06:57:55 2017 +0000 +++ b/Lib/test/test_sys.py Wed Jan 11 12:23:56 2017 +0100 @@ -559,6 +559,9 @@ class SysModuleTest(unittest.TestCase): self.assertTrue(repr(sys.flags)) self.assertEqual(len(sys.flags), len(attrs)) + def test_sys_flags_utf8mode(self): + self.assertIn(sys.flags.utf8mode, {0, 1, 2}) + def assert_raise_on_new_sys_type(self, sys_attr): # Users are intentionally prevented from creating new instances of # sys.flags, sys.version_info, and sys.getwindowsversion. 
diff -r 99ad6e871459 Lib/test/test_utf8mode.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Lib/test/test_utf8mode.py Wed Jan 11 12:23:56 2017 +0100 @@ -0,0 +1,146 @@ +import os +import textwrap +import unittest +from test.support.script_helper import assert_python_ok, assert_python_failure + + +class UTF8ModeTests(unittest.TestCase): + def test_xoption(self): + code = 'import sys; print(sys.flags.utf8mode)' + + out = assert_python_ok('-X', 'utf8', '-c', code) + self.assertEqual(out[1].rstrip(), b'1') + + out = assert_python_ok('-X', 'utf8=strict', '-c', code) + self.assertEqual(out[1].rstrip(), b'2') + + out = assert_python_ok('-X', 'utf8=0', '-c', code) + self.assertEqual(out[1].rstrip(), b'0') + + def test_env_var(self): + code = 'import sys; print(sys.flags.utf8mode)' + + out = assert_python_ok('-c', code, PYTHONUTF8='1') + self.assertEqual(out[1].rstrip(), b'1') + + out = assert_python_ok('-c', code, PYTHONUTF8='strict') + self.assertEqual(out[1].rstrip(), b'2') + + out = assert_python_ok('-c', code, PYTHONUTF8='0') + self.assertEqual(out[1].rstrip(), b'0') + + # env var overrides -X utf8 + out = assert_python_ok('-X utf8', '-c', code, PYTHONUTF8='strict') + self.assertEqual(out[1].rstrip(), b'2') + + + # invalid mode + out = assert_python_failure('-c', code, PYTHONUTF8='xxx') + self.assertIn(b'Error in PYTHONUTF8: invalid mode "xxx"!', out[2].rstrip()) + + def test_filesystemencoding(self): + code = 'import sys; print(sys.getfilesystemencoding(), sys.getfilesystemencodeerrors())' + + out = assert_python_ok('-c', code, PYTHONUTF8='1') + self.assertEqual(out[1].rstrip(), b'utf-8 surrogateescape') + + out = assert_python_ok('-c', code, PYTHONUTF8='strict') + self.assertEqual(out[1].rstrip(), b'utf-8 strict') + + def test_stdio(self): + code = textwrap.dedent(''' + import sys + print(f"stdin: {sys.stdin.encoding}/{sys.stdin.errors}") + print(f"stdout: {sys.stdout.encoding}/{sys.stdout.errors}") + print(f"stderr: {sys.stderr.encoding}/{sys.stderr.errors}") + 
''') + + # Use __cleanenv to ignore PYTHONIOENCODING + out = assert_python_ok('-c', code, + PYTHONUTF8='1', __cleanenv=True) + self.assertEqual(out[1].splitlines(), + [b'stdin: utf-8/surrogateescape', + b'stdout: utf-8/surrogateescape', + b'stderr: utf-8/backslashreplace']) + + out = assert_python_ok('-c', code, + PYTHONUTF8='strict', __cleanenv=True) + self.assertEqual(out[1].splitlines(), + [b'stdin: utf-8/strict', + b'stdout: utf-8/strict', + b'stderr: utf-8/backslashreplace']) + + # PYTHONIOENCODING has the priority over PYTHONUTF8 + out = assert_python_ok('-c', code, + PYTHONUTF8='1', PYTHONIOENCODING="latin1") + self.assertEqual(out[1].splitlines(), + [b'stdin: latin1/strict', + b'stdout: latin1/strict', + b'stderr: latin1/backslashreplace']) + + out = assert_python_ok('-c', code, + PYTHONUTF8='1', PYTHONIOENCODING=":namereplace") + self.assertEqual(out[1].splitlines(), + [b'stdin: utf-8/namereplace', + b'stdout: utf-8/namereplace', + b'stderr: utf-8/backslashreplace']) + + def test_io(self): + code = textwrap.dedent(''' + import sys + filename = sys.argv[1] + with open(filename) as fp: + print(f"{fp.encoding}/{fp.errors}") + ''') + filename = __file__ + + out = assert_python_ok('-c', code, filename, PYTHONUTF8='1') + self.assertEqual(out[1].rstrip(), b'utf-8/surrogateescape') + + out = assert_python_ok('-c', code, filename, PYTHONUTF8='strict') + self.assertEqual(out[1].rstrip(), b'utf-8/strict') + + def _check_io_encoding(self, module, encoding=None, errors=None): + filename = __file__ + + # Encoding explicitly set + args = [] + if encoding: + args.append(f'encoding={encoding!r}') + if errors: + args.append(f'errors={errors!r}') + code = textwrap.dedent(''' + import sys + from %s import open + filename = sys.argv[1] + with open(filename, %s) as fp: + print(f"{fp.encoding}/{fp.errors}") + ''') % (module, ', '.join(args)) + out = assert_python_ok('-c', code, filename, + PYTHONUTF8='1') + + if not encoding: + encoding = 'utf-8' + if not errors: + if encoding: 
+ errors = 'strict' + else: + errors = 'surrogateescape' + self.assertEqual(out[1].rstrip().decode(), + f'{encoding}/{errors}') + + def check_io_encoding(self, module): + self._check_io_encoding(module, encoding="latin1") + self._check_io_encoding(module, errors="namereplace") + self._check_io_encoding(module, + encoding="latin1", errors="namereplace") + + def test_io_encoding(self): + self.check_io_encoding('io') + + def test_pyio_encoding(self): + self.check_io_encoding('_pyio') + + +if __name__ == "__main__": + unittest.main() diff -r 99ad6e871459 Modules/_io/textio.c --- a/Modules/_io/textio.c Wed Jan 11 06:57:55 2017 +0000 +++ b/Modules/_io/textio.c Wed Jan 11 12:23:56 2017 +0100 @@ -864,7 +864,18 @@ static int self->encodefunc = NULL; self->b2cratio = 0.0; - if (encoding == NULL) { + if (encoding && !errors) { + errors = "strict"; + } + + if (encoding == NULL && Py_UTF8Mode) { + self->encoding = PyUnicode_FromString("utf-8"); + if (self->encoding == NULL) { + goto error; + } + } + + if (encoding == NULL && self->encoding == NULL) { /* Try os.device_encoding(fileno) */ PyObject *fileno; state = IO_STATE(); @@ -895,6 +906,7 @@ static int Py_CLEAR(self->encoding); } } + if (encoding == NULL && self->encoding == NULL) { PyObject *locale_module = _PyIO_get_locale_module(state); if (locale_module == NULL) @@ -946,8 +958,14 @@ static int * of the partially constructed object (like self->encoding) */ - if (errors == NULL) - errors = "strict"; + if (errors == NULL) { + if (Py_UTF8Mode == 2) + errors = "strict"; + else if (Py_UTF8Mode) + errors = "surrogateescape"; + else + errors = "strict"; + } self->errors = PyBytes_FromString(errors); if (self->errors == NULL) goto error; diff -r 99ad6e871459 Modules/main.c --- a/Modules/main.c Wed Jan 11 06:57:55 2017 +0000 +++ b/Modules/main.c Wed Jan 11 12:23:56 2017 +0100 @@ -356,8 +356,7 @@ Py_Main(int argc, wchar_t **argv) orig_argc = argc; /* For Py_GetArgcArgv() */ orig_argv = argv; - /* Hash randomization needed early for 
all string operations - (including -W and -X options). */ + /* Hash randomization and -X utf8 needed early */ _PyOS_opterr = 0; /* prevent printing the error in 1st pass */ while ((c = _PyOS_GetOpt(argc, argv, PROGRAM_OPTS)) != EOF) { if (c == 'm' || c == 'c') { @@ -367,7 +366,19 @@ Py_Main(int argc, wchar_t **argv) } if (c == 'E') { Py_IgnoreEnvironmentFlag++; - break; + } + else if (c == 'X') { + if (wcscmp(_PyOS_optarg, L"utf8") == 0) { + Py_UTF8Mode = 1; + } + else if (wcscmp(_PyOS_optarg, L"utf8=strict") == 0) { + Py_UTF8Mode = 2; + } + else if (wcscmp(_PyOS_optarg, L"utf8=0") == 0) { + Py_UTF8Mode = 0; + } + /* FIXME: need to encode argv with Py_EncodeLocale() + and decode from UTF-8 */ } } @@ -378,6 +389,24 @@ Py_Main(int argc, wchar_t **argv) exit(1); } + opt = Py_GETENV("PYTHONUTF8"); + if (opt) { + if (strcmp(opt, "1") == 0) { + Py_UTF8Mode = 1; + } + else if (strcmp(opt, "strict") == 0) { + Py_UTF8Mode = 2; + } + else if (strcmp(opt, "0") == 0) { + Py_UTF8Mode = 0; + } + else { + fprintf(stderr, + "Error in PYTHONUTF8: invalid mode \"%s\"!\n", opt); + exit(1); + } + } + Py_HashRandomizationFlag = 1; _PyRandom_Init(); diff -r 99ad6e871459 Objects/unicodeobject.c --- a/Objects/unicodeobject.c Wed Jan 11 06:57:55 2017 +0000 +++ b/Objects/unicodeobject.c Wed Jan 11 12:23:56 2017 +0100 @@ -5067,8 +5067,6 @@ onError: return NULL; } -#if defined(__APPLE__) || defined(__ANDROID__) - /* Simplified UTF-8 decoder using surrogateescape error handler, used to decode the command line arguments on Mac OS X and Android. @@ -5121,8 +5119,6 @@ wchar_t* return unicode; } -#endif /* __APPLE__ or __ANDROID__ */ - /* Primary internal function which creates utf8 encoded bytes objects. 
Allocation strategy: if the string is short, convert into a stack buffer diff -r 99ad6e871459 Programs/python.c --- a/Programs/python.c Wed Jan 11 06:57:55 2017 +0000 +++ b/Programs/python.c Wed Jan 11 12:23:56 2017 +0100 @@ -15,14 +15,22 @@ wmain(int argc, wchar_t **argv) } #else +static void _Py_NO_RETURN +fatal_error(const char *msg) +{ + fprintf(stderr, "Fatal Python error: %s\n", msg); + exit(1); +} + int main(int argc, char **argv) { wchar_t **argv_copy; - /* We need a second copy, as Python might modify the first one. */ + /* We need a second copy to release the memory: + Py_Main() modifies argv_copy */ wchar_t **argv_copy2; int i, res; - char *oldloc; + char *loc, *oldloc; /* Force malloc() allocator to bootstrap Python */ (void)_PyMem_SetupAllocators("malloc"); @@ -30,7 +38,7 @@ main(int argc, char **argv) argv_copy = (wchar_t **)PyMem_RawMalloc(sizeof(wchar_t*) * (argc+1)); argv_copy2 = (wchar_t **)PyMem_RawMalloc(sizeof(wchar_t*) * (argc+1)); if (!argv_copy || !argv_copy2) { - fprintf(stderr, "out of memory\n"); + fatal_error("out of memory"); return 1; } @@ -45,19 +53,25 @@ main(int argc, char **argv) oldloc = _PyMem_RawStrdup(setlocale(LC_ALL, NULL)); if (!oldloc) { - fprintf(stderr, "out of memory\n"); - return 1; + fatal_error("out of memory (failed to copy the LC_ALL locale)"); } setlocale(LC_ALL, ""); + + loc = setlocale(LC_CTYPE, NULL); + if (!loc) { + fatal_error("failed to get the LC_CTYPE locale"); + } + if (strcmp(loc, "C") == 0) { + /* The POSIX locale enables the UTF-8 mode */ + Py_UTF8Mode = 1; + } + for (i = 0; i < argc; i++) { argv_copy[i] = Py_DecodeLocale(argv[i], NULL); if (!argv_copy[i]) { PyMem_RawFree(oldloc); - fprintf(stderr, "Fatal Python error: " - "unable to decode the command line argument #%i\n", - i + 1); - return 1; + fatal_error("unable to decode the command line arguments"); } argv_copy2[i] = argv_copy[i]; } diff -r 99ad6e871459 Python/bltinmodule.c --- a/Python/bltinmodule.c Wed Jan 11 06:57:55 2017 +0000 +++ 
b/Python/bltinmodule.c Wed Jan 11 12:23:56 2017 +0100 @@ -33,6 +33,14 @@ const char *Py_FileSystemDefaultEncoding int Py_HasFileSystemDefaultEncoding = 0; #endif const char *Py_FileSystemDefaultEncodeErrors = "surrogateescape"; +/* UTF-8 mode (PEP 540): + 0: UTF-8 mode disabled, use the locale encoding with strict or + surrogateescape error handler depending on the case + 1: UTF-8 mode enabled, use UTF-8 with surrogateescape error handler by + default and ignore the locale + 2: UTF-8 mode in strict mode, use UTF-8 with strict error handler by default + and ignore the locale */ +int Py_UTF8Mode = 0; _Py_IDENTIFIER(__builtins__); _Py_IDENTIFIER(__dict__); diff -r 99ad6e871459 Python/fileutils.c --- a/Python/fileutils.c Wed Jan 11 06:57:55 2017 +0000 +++ b/Python/fileutils.c Wed Jan 11 12:23:56 2017 +0100 @@ -20,9 +20,7 @@ extern int winerror_to_errno(int); #include #endif /* HAVE_FCNTL_H */ -#if defined(__APPLE__) || defined(__ANDROID__) extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size); -#endif #ifdef O_CLOEXEC /* Does open() support the O_CLOEXEC flag? Possible values: @@ -251,6 +249,21 @@ decode_ascii_surrogateescape(const char #endif +static wchar_t* +_Py_DecodeLocaleUTF8(const char* arg, size_t *size) +{ + wchar_t *wstr; + wstr = _Py_DecodeUTF8_surrogateescape(arg, strlen(arg)); + if (size != NULL) { + if (wstr != NULL) + *size = wcslen(wstr); + else + *size = (size_t)-1; + } + return wstr; +} + + /* Decode a byte string from the locale encoding with the surrogateescape error handler: undecodable bytes are decoded as characters in range U+DC80..U+DCFF. 
If a byte sequence can be decoded as a surrogate @@ -274,15 +287,7 @@ wchar_t* Py_DecodeLocale(const char* arg, size_t *size) { #if defined(__APPLE__) || defined(__ANDROID__) - wchar_t *wstr; - wstr = _Py_DecodeUTF8_surrogateescape(arg, strlen(arg)); - if (size != NULL) { - if (wstr != NULL) - *size = wcslen(wstr); - else - *size = (size_t)-1; - } - return wstr; + return _Py_DecodeLocaleUTF8(arg, size); #else wchar_t *res; size_t argsize; @@ -293,6 +298,11 @@ Py_DecodeLocale(const char* arg, size_t mbstate_t mbs; #endif + if (Py_UTF8Mode) { + /* FIXME: support strict mode */ + return _Py_DecodeLocaleUTF8(arg, size); + } + #ifndef MS_WINDOWS if (force_ascii == -1) force_ascii = check_force_ascii(); @@ -409,22 +419,9 @@ oom: #endif /* __APPLE__ or __ANDROID__ */ } -/* Encode a wide character string to the locale encoding with the - surrogateescape error handler: surrogate characters in the range - U+DC80..U+DCFF are converted to bytes 0x80..0xFF. - - Return a pointer to a newly allocated byte string, use PyMem_Free() to free - the memory. Return NULL on encoding or memory allocation error. - - If error_pos is not NULL, *error_pos is set to the index of the invalid - character on encoding error, or set to (size_t)-1 otherwise. - - Use the Py_DecodeLocale() function to decode the bytes string back to a wide - character string. */ -char* -Py_EncodeLocale(const wchar_t *text, size_t *error_pos) +static char* +_Py_EncodeLocaleUTF8(const wchar_t *text, size_t *error_pos) { -#if defined(__APPLE__) || defined(__ANDROID__) Py_ssize_t len; PyObject *unicode, *bytes = NULL; char *cpath; @@ -454,12 +451,35 @@ Py_EncodeLocale(const wchar_t *text, siz memcpy(cpath, PyBytes_AsString(bytes), len + 1); Py_DECREF(bytes); return cpath; +} + +/* Encode a wide character string to the locale encoding with the + surrogateescape error handler: surrogate characters in the range + U+DC80..U+DCFF are converted to bytes 0x80..0xFF. 
+ + Return a pointer to a newly allocated byte string, use PyMem_Free() to free + the memory. Return NULL on encoding or memory allocation error. + + If error_pos is not NULL, *error_pos is set to the index of the invalid + character on encoding error, or set to (size_t)-1 otherwise. + + Use the Py_DecodeLocale() function to decode the bytes string back to a wide + character string. */ +char* +Py_EncodeLocale(const wchar_t *text, size_t *error_pos) +{ +#if defined(__APPLE__) || defined(__ANDROID__) + return _Py_EncodeLocaleUTF8(text, error_pos); #else /* __APPLE__ */ const size_t len = wcslen(text); char *result = NULL, *bytes = NULL; size_t i, size, converted; wchar_t c, buf[2]; + if (Py_UTF8Mode) { + return _Py_EncodeLocaleUTF8(text, error_pos); + } + #ifndef MS_WINDOWS if (force_ascii == -1) force_ascii = check_force_ascii(); diff -r 99ad6e871459 Python/pylifecycle.c --- a/Python/pylifecycle.c Wed Jan 11 06:57:55 2017 +0000 +++ b/Python/pylifecycle.c Wed Jan 11 12:23:56 2017 +0100 @@ -302,6 +302,49 @@ import_init(PyInterpreterState *interp, } +static int +_PyUTF8Mode_Init(void) +{ + PyObject *xoptions, *key, *value; + int mode; + + xoptions = PySys_GetXOptions(); + if (xoptions == NULL) + return -1; + + key = PyUnicode_FromString("utf8"); + if (key == NULL) + return -1; + + value = PyDict_GetItemWithError(xoptions, key); + Py_DECREF(key); + if (value == NULL && PyErr_Occurred()) { + return -1; + } + if (value == NULL) { + return 0; + } + + if (value == Py_True) { + mode = 1; + } + else if (PyUnicode_CompareWithASCIIString(value, "strict") == 0) { + mode = 2; + } + else if (PyUnicode_CompareWithASCIIString(value, "0") == 0) { + mode = 0; + } + else { + fprintf(stderr, "Invalid UTF-8 mode (-X option)!\n"); + return -1; + } + + /* Py_Main() handles -X utf8 early: just make sure that it's consistent */ + assert(Py_UTF8Mode == mode); + return 0; +} + + void _Py_InitializeEx_Private(int install_sigs, int install_importlib) { @@ -344,6 +387,10 @@ void _PyRandom_Init(); 
+ if (_PyUTF8Mode_Init() < 0) { + Py_FatalError("Py_Initialize: UTF-8 mode initialization failed"); + } + interp = PyInterpreterState_New(); if (interp == NULL) Py_FatalError("Py_Initialize: can't make first interpreter"); @@ -997,15 +1044,32 @@ initfsencoding(PyInterpreterState *inter Py_FileSystemDefaultEncodeErrors = "surrogatepass"; } #else - if (Py_FileSystemDefaultEncoding == NULL) - { - Py_FileSystemDefaultEncoding = get_locale_encoding(); - if (Py_FileSystemDefaultEncoding == NULL) - Py_FatalError("Py_Initialize: Unable to get the locale encoding"); + if (Py_FileSystemDefaultEncoding == NULL) { + char *encoding, *errors; + if (!Py_UTF8Mode) { + Py_FileSystemDefaultEncoding = get_locale_encoding(); + if (Py_FileSystemDefaultEncoding == NULL) + Py_FatalError("Py_Initialize: Unable to get the locale encoding"); - Py_HasFileSystemDefaultEncoding = 0; - interp->fscodec_initialized = 1; - return 0; + Py_HasFileSystemDefaultEncoding = 0; + interp->fscodec_initialized = 1; + return 0; + } + + encoding = _PyMem_RawStrdup("utf-8"); + if (!encoding) { + return -1; + } + + errors = Py_UTF8Mode == 2 ? "strict" : "surrogateescape"; + errors = _PyMem_RawStrdup(errors); + if (!errors) { + PyMem_RawFree(encoding); + return -1; + } + + Py_FileSystemDefaultEncoding = encoding; + Py_FileSystemDefaultEncodeErrors = errors; } #endif @@ -1242,6 +1306,11 @@ initstdio(void) encoding = pythonioencoding; } } + else if (Py_UTF8Mode) { + encoding = "utf-8"; + errors = (Py_UTF8Mode == 2 ) ? 
"strict" : "surrogateescape"; + } + if (!errors && !(pythonioencoding && *pythonioencoding)) { /* When the LC_CTYPE locale is the POSIX locale ("C locale"), stdin and stdout use the surrogateescape error handler by diff -r 99ad6e871459 Python/sysmodule.c --- a/Python/sysmodule.c Wed Jan 11 06:57:55 2017 +0000 +++ b/Python/sysmodule.c Wed Jan 11 12:23:56 2017 +0100 @@ -1713,6 +1713,7 @@ static PyStructSequence_Field flags_fiel {"quiet", "-q"}, {"hash_randomization", "-R"}, {"isolated", "-I"}, + {"utf8mode", "-X utf8"}, {0} }; @@ -1751,6 +1752,7 @@ make_flags(void) SetFlag(Py_QuietFlag); SetFlag(Py_HashRandomizationFlag); SetFlag(Py_IsolatedFlag); + SetFlag(Py_UTF8Mode); #undef SetFlag if (PyErr_Occurred()) {