--- Doc/using/cmdline.rst | 39 +++ Lib/test/support/script_helper.py | 130 ++++++++++++ Lib/test/test_c_locale_coercion.py | 381 +++++++++++++++++++++++++++++++++++++ Lib/test/test_cmd_line.py | 1 Modules/main.c | 15 - configure.ac | 34 +++ 6 files changed, 594 insertions(+), 6 deletions(-) --- a/Doc/using/cmdline.rst +++ b/Doc/using/cmdline.rst @@ -618,6 +618,45 @@ conflict. .. versionadded:: 3.4 + +.. envvar:: PYTHONCOERCECLOCALE + + If set to the value ``0``, causes the main Python command line application + to skip coercing the legacy ASCII-based C locale to a more capable UTF-8 + based alternative. Note that this setting is checked even when the + :option:`-E` or :option:`-I` options are used, as it is handled prior to + the processing of command line options. + + If this variable is *not* set, or is set to a value other than ``0``, and + the current locale reported for the ``LC_CTYPE`` category is the default + ``C`` locale, then the Python CLI will attempt to configure one of the + following locales for the given locale categories before loading the + interpreter runtime: + + * ``C.UTF-8`` (``LC_ALL``) + * ``C.utf8`` (``LC_ALL``) + * ``UTF-8`` (``LC_CTYPE``) + + If setting one of these locale categories succeeds, then the matching + environment variables will be set (both ``LC_ALL`` and ``LANG`` for the + ``LC_ALL`` category, and ``LC_CTYPE`` for the ``LC_CTYPE`` category) in + the current process environment before the Python runtime is initialized. + + Configuring one of these locales (either explicitly or via the above + implicit locale coercion) will automatically set the error handler for + :data:`sys.stdin` and :data:`sys.stdout` to ``surrogateescape``. This + behavior can be overridden using :envvar:`PYTHONIOENCODING` as usual. + + For debugging purposes, setting ``PYTHONCOERCECLOCALE=warn`` will cause + Python to emit warning messages on ``stderr`` if either the locale coercion + activates, or else if a locale that *would* have triggered coercion is + still active when the Python runtime is initialized. + + Availability: \*nix + + .. versionadded:: 3.7 + See :pep:`538` for more details. + Debug-mode variables ~~~~~~~~~~~~~~~~~~~~ --- /dev/null +++ b/Lib/test/test_c_locale_coercion.py @@ -0,0 +1,381 @@ +# Tests the attempted automatic coercion of the C locale to a UTF-8 locale + +import unittest +import itertools +import locale +import sys +import sysconfig +from collections import namedtuple + +import test.support +from test.support.script_helper import run_python_until_end + +# Set our expectation for the default encoding used in the C locale +# for the filesystem encoding and the standard streams + +# AIX uses iso8859-1 in the C locale, other *nix platforms use ASCII +if sys.platform.startswith("aix"): + C_LOCALE_STREAM_ENCODING = "iso8859-1" +else: + C_LOCALE_STREAM_ENCODING = "ascii" + +# FS encoding is UTF-8 on macOS, other *nix platforms use the locale encoding +if sys.platform == "darwin": + C_LOCALE_FS_ENCODING = "utf-8" +else: + C_LOCALE_FS_ENCODING = C_LOCALE_STREAM_ENCODING + +# Note that the above is probably still wrong in some cases, such as: +# * Windows when PYTHONLEGACYWINDOWSFSENCODING is set +# * AIX and any other platforms that use latin-1 in the C locale +# +# Options for dealing with this: +# * Don't set PYTHON_COERCE_C_LOCALE on such platforms (e.g. Windows doesn't) +# * Fix the test expectations to match the actual platform behaviour + +# In order to get the warning messages to match up as expected, the candidate +# order here must much the target locale order in Python/pylifecycle.c +_C_UTF8_LOCALES = ("C.UTF-8", "C.utf8", "UTF-8") + +# There's no reliable cross-platform way of checking locale alias +# lists, so the only way of knowing which of these locales will work +# is to try them with locale.setlocale(). We do that in a subprocess +# to avoid altering the locale of the test runner. +# +# If the relevant locale module attributes exist, and we're not on a platform +# where we expect it to always succeed, we also check that +# `locale.nl_langinfo(locale.CODESET)` works, as if it fails, the interpreter +# will skip locale coercion for that particular target locale +_check_nl_langinfo_CODESET = bool( + sys.platform not in ("darwin", "linux") and + hasattr(locale, "nl_langinfo") and + hasattr(locale, "CODESET") +) + +def _set_locale_in_subprocess(locale_name): + cmd_fmt = "import locale; print(locale.setlocale(locale.LC_CTYPE, '{}'))" + if _check_nl_langinfo_CODESET: + # If there's no valid CODESET, we expect coercion to be skipped + cmd_fmt += "; import sys; sys.exit(not locale.nl_langinfo(locale.CODESET))" + cmd = cmd_fmt.format(locale_name) + result, py_cmd = run_python_until_end("-c", cmd, __isolated=True) + return result.rc == 0 + + + +_fields = "fsencoding stdin_info stdout_info stderr_info lang lc_ctype lc_all" +_EncodingDetails = namedtuple("EncodingDetails", _fields) + +class EncodingDetails(_EncodingDetails): + # XXX (ncoghlan): Using JSON for child state reporting may be less fragile + CHILD_PROCESS_SCRIPT = ";".join([ + "import sys, os", + "print(sys.getfilesystemencoding())", + "print(sys.stdin.encoding + ':' + sys.stdin.errors)", + "print(sys.stdout.encoding + ':' + sys.stdout.errors)", + "print(sys.stderr.encoding + ':' + sys.stderr.errors)", + "print(os.environ.get('LANG', 'not set'))", + "print(os.environ.get('LC_CTYPE', 'not set'))", + "print(os.environ.get('LC_ALL', 'not set'))", + ]) + + # expected_details = EncodingDetails.get_expected_details( + # coercion_expected, + # expected_fs_encoding, + # expected_stream_encoding, + # env_vars + # ) + @classmethod + def get_expected_details(cls, coercion_expected, fs_encoding, stream_encoding, env_vars): + """Returns expected child process details for a given encoding""" + _stream = stream_encoding + ":{}" + # stdin and stdout should use surrogateescape either because the + # coercion triggered, or because the C locale was detected + stream_info = 2*[_stream.format("surrogateescape")] + # stderr should always use backslashreplace + stream_info.append(_stream.format("backslashreplace")) + expected_lang = env_vars.get("LANG", "not set").lower() + if coercion_expected: + expected_lc_ctype = CLI_COERCION_TARGET.lower() + else: + expected_lc_ctype = env_vars.get("LC_CTYPE", "not set").lower() + expected_lc_all = env_vars.get("LC_ALL", "not set").lower() + env_info = expected_lang, expected_lc_ctype, expected_lc_all + print('stream_info = {} ({})'.format(stream_info, type(stream_info))) + print('env_info = {} ({})'.format(env_info, type(env_info))) + # [ 289s] stream_info = ['ascii:surrogateescape', 'ascii:surrogateescape', 'ascii:backslashreplace'] () + # [ 289s] env_info = ('', 'invalid.ascii', 'c') () + params = itertools.chain(stream_info, env_info) + return dict(cls(fs_encoding, *params)._asdict()) + + @staticmethod + def _handle_output_variations(data): + """Adjust the output to handle platform specific idiosyncrasies + + * Some platforms report ASCII as ANSI_X3.4-1968 + * Some platforms report ASCII as US-ASCII + * Some platforms report UTF-8 instead of utf-8 + """ + data = data.replace(b"ANSI_X3.4-1968", b"ascii") + data = data.replace(b"US-ASCII", b"ascii") + data = data.lower() + return data + + @classmethod + def get_child_details(cls, env_vars): + """Retrieves fsencoding and standard stream details from a child process + + Returns (encoding_details, stderr_lines): + + - encoding_details: EncodingDetails for eager decoding + - stderr_lines: result of calling splitlines() on the stderr output + + The child is run in isolated mode if the current interpreter supports + that. + """ + result, py_cmd = run_python_until_end( + "-c", cls.CHILD_PROCESS_SCRIPT, + __isolated=True, + **env_vars + ) + if not result.rc == 0: + result.fail(py_cmd) + # All subprocess outputs in this test case should be pure ASCII + adjusted_output = cls._handle_output_variations(result.out) + print('adjusted_output = {} ({})'.format(adjusted_output.decode(), type(adjusted_output))) + stdout_lines = adjusted_output.decode("ascii").splitlines() + print('stdout_lines = {} ({})'.format(stdout_lines, type(stdout_lines))) + child_encoding_details = dict(cls(*stdout_lines)._asdict()) + print('child_encoding_details = {} ({})'.format(child_encoding_details, type(child_encoding_details))) + stderr_lines = result.err.decode("ascii").rstrip().splitlines() + print('stderr_lines = {} ({})'.format(stderr_lines, type(stderr_lines))) + return child_encoding_details, stderr_lines + + +# Details of the shared library warning emitted at runtime +LEGACY_LOCALE_WARNING = ( + "Python runtime initialized with LC_CTYPE=C (a locale with default ASCII " + "encoding), which may cause Unicode compatibility problems. Using C.UTF-8, " + "C.utf8, or UTF-8 (if available) as alternative Unicode-compatible " + "locales is recommended." +) + +# Details of the CLI locale coercion warning emitted at runtime +CLI_COERCION_WARNING_FMT = ( + "Python detected LC_CTYPE=C: LC_CTYPE coerced to {} (set another locale " + "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behavior)." +) + + +AVAILABLE_TARGETS = None +CLI_COERCION_TARGET = None +CLI_COERCION_WARNING = None + +def setUpModule(): + global AVAILABLE_TARGETS + global CLI_COERCION_TARGET + global CLI_COERCION_WARNING + + if AVAILABLE_TARGETS is not None: + # initialization already done + return + AVAILABLE_TARGETS = [] + + # Find the target locales available in the current system + for target_locale in _C_UTF8_LOCALES: + if _set_locale_in_subprocess(target_locale): + AVAILABLE_TARGETS.append(target_locale) + + if AVAILABLE_TARGETS: + # Coercion is expected to use the first available target locale + CLI_COERCION_TARGET = AVAILABLE_TARGETS[0] + CLI_COERCION_WARNING = CLI_COERCION_WARNING_FMT.format(CLI_COERCION_TARGET) + + +class _LocaleHandlingTestCase(unittest.TestCase): + # Base class to check expected locale handling behaviour + + def _check_child_encoding_details(self, + env_vars, + expected_fs_encoding, + expected_stream_encoding, + expected_warnings, + coercion_expected): + """Check the C locale handling for the given process environment + + Parameters: + expected_fs_encoding: expected sys.getfilesystemencoding() result + expected_stream_encoding: expected encoding for standard streams + expected_warning: stderr output to expect (if any) + """ + result = EncodingDetails.get_child_details(env_vars) + encoding_details, stderr_lines = result + expected_details = EncodingDetails.get_expected_details( + coercion_expected, + expected_fs_encoding, + expected_stream_encoding, + env_vars + ) + self.assertEqual(encoding_details, expected_details) + if expected_warnings is None: + expected_warnings = [] + self.assertEqual(stderr_lines, expected_warnings) + + +class LocaleConfigurationTests(_LocaleHandlingTestCase): + # Test explicit external configuration via the process environment + + def setUpClass(): + # This relies on setupModule() having been run, so it can't be + # handled via the @unittest.skipUnless decorator + if not AVAILABLE_TARGETS: + raise unittest.SkipTest("No C-with-UTF-8 locale available") + + def test_external_target_locale_configuration(self): + + # Explicitly setting a target locale should give the same behaviour as + # is seen when implicitly coercing to that target locale + self.maxDiff = None + + expected_fs_encoding = "utf-8" + expected_stream_encoding = "utf-8" + + base_var_dict = { + "LANG": "", + "LC_CTYPE": "", + "LC_ALL": "", + } + for env_var in ("LANG", "LC_CTYPE"): + for locale_to_set in AVAILABLE_TARGETS: + # XXX (ncoghlan): LANG=UTF-8 doesn't appear to work as + # expected, so skip that combination for now + # See https://bugs.python.org/issue30672 for discussion + if env_var == "LANG" and locale_to_set == "UTF-8": + continue + + with self.subTest(env_var=env_var, + configured_locale=locale_to_set): + var_dict = base_var_dict.copy() + var_dict[env_var] = locale_to_set + self._check_child_encoding_details(var_dict, + expected_fs_encoding, + expected_stream_encoding, + expected_warnings=None, + coercion_expected=False) + + + +@test.support.cpython_only +@unittest.skipUnless(sysconfig.get_config_var("PY_COERCE_C_LOCALE"), + "C locale coercion disabled at build time") +class LocaleCoercionTests(_LocaleHandlingTestCase): + # Test implicit reconfiguration of the environment during CLI startup + + def _check_c_locale_coercion(self, + fs_encoding, stream_encoding, + coerce_c_locale, + expected_warnings=None, + coercion_expected=True, + **extra_vars): + """Check the C locale handling for various configurations + + Parameters: + fs_encoding: expected sys.getfilesystemencoding() result + stream_encoding: expected encoding for standard streams + coerce_c_locale: setting to use for PYTHONCOERCECLOCALE + None: don't set the variable at all + str: the value set in the child's environment + expected_warnings: expected warning lines on stderr + extra_vars: additional environment variables to set in subprocess + """ + self.maxDiff = None + + if not AVAILABLE_TARGETS: + # Locale coercion is disabled when there aren't any target locales + fs_encoding = C_LOCALE_FS_ENCODING + stream_encoding = C_LOCALE_STREAM_ENCODING + coercion_expected = False + if expected_warnings: + expected_warnings = [LEGACY_LOCALE_WARNING] + + base_var_dict = { + "LANG": "", + "LC_CTYPE": "", + "LC_ALL": "", + } + base_var_dict.update(extra_vars) + for env_var in ("LANG", "LC_CTYPE"): + for locale_to_set in ("", "C", "POSIX", "invalid.ascii"): + # XXX (ncoghlan): *BSD platforms don't behave as expected in the + # POSIX locale, so we skip that for now + # See https://bugs.python.org/issue30672 for discussion + if locale_to_set == "POSIX": + continue + with self.subTest(env_var=env_var, + nominal_locale=locale_to_set, + PYTHONCOERCECLOCALE=coerce_c_locale): + var_dict = base_var_dict.copy() + var_dict[env_var] = locale_to_set + if coerce_c_locale is not None: + var_dict["PYTHONCOERCECLOCALE"] = coerce_c_locale + # Check behaviour on successful coercion + self._check_child_encoding_details(var_dict, + fs_encoding, + stream_encoding, + expected_warnings, + coercion_expected) + + def test_test_PYTHONCOERCECLOCALE_not_set(self): + # This should coerce to the first available target locale by default + self._check_c_locale_coercion("utf-8", "utf-8", coerce_c_locale=None) + + def test_PYTHONCOERCECLOCALE_not_zero(self): + # *Any* string other than "0" is considered "set" for our purposes + # and hence should result in the locale coercion being enabled + for setting in ("", "1", "true", "false"): + self._check_c_locale_coercion("utf-8", "utf-8", coerce_c_locale=setting) + + def test_PYTHONCOERCECLOCALE_set_to_warn(self): + # PYTHONCOERCECLOCALE=warn enables runtime warnings for legacy locales + self._check_c_locale_coercion("utf-8", "utf-8", + coerce_c_locale="warn", + expected_warnings=[CLI_COERCION_WARNING]) + + + def test_PYTHONCOERCECLOCALE_set_to_zero(self): + # The setting "0" should result in the locale coercion being disabled + self._check_c_locale_coercion(C_LOCALE_FS_ENCODING, + C_LOCALE_STREAM_ENCODING, + coerce_c_locale="0", + coercion_expected=False) + # Setting LC_ALL=C shouldn't make any difference to the behaviour + self._check_c_locale_coercion(C_LOCALE_FS_ENCODING, + C_LOCALE_STREAM_ENCODING, + coerce_c_locale="0", + LC_ALL="C", + coercion_expected=False) + + def test_LC_ALL_set_to_C(self): + # Setting LC_ALL should render the locale coercion ineffective + self._check_c_locale_coercion(C_LOCALE_FS_ENCODING, + C_LOCALE_STREAM_ENCODING, + coerce_c_locale=None, + LC_ALL="C", + coercion_expected=False) + # And result in a warning about a lack of locale compatibility + self._check_c_locale_coercion(C_LOCALE_FS_ENCODING, + C_LOCALE_STREAM_ENCODING, + coerce_c_locale="warn", + LC_ALL="C", + expected_warnings=[LEGACY_LOCALE_WARNING], + coercion_expected=False) + +def test_main(): + test.support.run_unittest( + LocaleConfigurationTests, + LocaleCoercionTests + ) + test.support.reap_children() + +if __name__ == "__main__": + test_main() --- a/Lib/test/test_cmd_line.py +++ b/Lib/test/test_cmd_line.py @@ -151,6 +151,7 @@ class CmdLineTest(unittest.TestCase): env = os.environ.copy() # Use C locale to get ascii for the locale encoding env['LC_ALL'] = 'C' + env['PYTHONCOERCECLOCALE'] = '0' code = ( b'import locale; ' b'print(ascii("' + undecodable + b'"), ' --- a/Modules/main.c +++ b/Modules/main.c @@ -96,12 +96,15 @@ static char *usage_5 = "PYTHONIOENCODING: Encoding[:errors] used for stdin/stdout/stderr.\n" "PYTHONFAULTHANDLER: dump the Python traceback on fatal errors.\n\ "; -static char *usage_6 = "\ -PYTHONHASHSEED: if this variable is set to 'random', a random value is used\n\ - to seed the hashes of str, bytes and datetime objects. It can also be\n\ - set to an integer in the range [0,4294967295] to get hash values with a\n\ - predictable seed.\n\ -"; +static char *usage_6 = +"PYTHONHASHSEED: if this variable is set to 'random', a random value is used\n" +" to seed the hashes of str, bytes and datetime objects. It can also be\n" +" set to an integer in the range [0,4294967295] to get hash values with a\n" +" predictable seed.\n" +"\n" +"PYTHONCOERCECLOCALE: if this variable is set to 0, it disables the locale\n" +" coercion behavior. Use PYTHONCOERCECLOCALE=warn to request display of\n" +" locale coercion and locale compatibility warnings on stderr.\n"; static int usage(int exitcode, wchar_t* program) --- a/configure.ac +++ b/configure.ac @@ -2905,6 +2905,40 @@ then fi AC_MSG_RESULT($with_pymalloc) +# Check for --with-c-locale-coercion +AC_MSG_CHECKING(for --with-c-locale-coercion) +AC_ARG_WITH(c-locale-coercion, + AS_HELP_STRING([--with(out)-c-locale-coercion], + [disable/enable C locale coercion to a UTF-8 based locale])) + +if test -z "$with_c_locale_coercion" +then + with_c_locale_coercion="yes" +fi +if test "$with_c_locale_coercion" != "no" +then + AC_DEFINE(PY_COERCE_C_LOCALE, 1, + [Define if you want to coerce the C locale to a UTF-8 based locale]) +fi +AC_MSG_RESULT($with_c_locale_coercion) + +# Check for --with-c-locale-warning +AC_MSG_CHECKING(for --with-c-locale-warning) +AC_ARG_WITH(c-locale-warning, + AS_HELP_STRING([--with(out)-c-locale-warning], + [disable/enable locale compatibility warning in the C locale])) + +if test -z "$with_c_locale_warning" +then + with_c_locale_warning="yes" +fi +if test "$with_c_locale_warning" != "no" +then + AC_DEFINE(PY_WARN_ON_C_LOCALE, 1, + [Define to emit a locale compatibility warning in the C locale]) +fi +AC_MSG_RESULT($with_c_locale_warning) + # Check for Valgrind support AC_MSG_CHECKING([for --with-valgrind]) AC_ARG_WITH([valgrind], --- /dev/null +++ b/Lib/test/support/script_helper.py @@ -0,0 +1,130 @@ +# Common utility functions used by various script execution tests +# e.g. test_cmd_line, test_cmd_line_script and test_runpy + +import collections +import sys +import os +import os.path +import subprocess + +# Cached result of the expensive test performed in the function below. +__cached_interp_requires_environment = None + +def interpreter_requires_environment(): + """ + Returns True if our sys.executable interpreter requires environment + variables in order to be able to run at all. + + This is designed to be used with @unittest.skipIf() to annotate tests + that need to use an assert_python*() function to launch an isolated + mode (-I) or no environment mode (-E) sub-interpreter process. + + A normal build & test does not run into this situation but it can happen + when trying to run the standard library test suite from an interpreter that + doesn't have an obvious home with Python's current home finding logic. + + Setting PYTHONHOME is one way to get most of the testsuite to run in that + situation. PYTHONPATH or PYTHONUSERSITE are other common environment + variables that might impact whether or not the interpreter can start. + """ + global __cached_interp_requires_environment + if __cached_interp_requires_environment is None: + # If PYTHONHOME is set, assume that we need it + if 'PYTHONHOME' in os.environ: + __cached_interp_requires_environment = True + return True + + # Try running an interpreter with -E to see if it works or not. + try: + subprocess.check_call([sys.executable, '-E', + '-c', 'import sys; sys.exit(0)']) + except subprocess.CalledProcessError: + __cached_interp_requires_environment = True + else: + __cached_interp_requires_environment = False + + return __cached_interp_requires_environment + + +class _PythonRunResult(collections.namedtuple("_PythonRunResult", + ("rc", "out", "err"))): + """Helper for reporting Python subprocess run results""" + def fail(self, cmd_line): + """Provide helpful details about failed subcommand runs""" + # Limit to 80 lines to ASCII characters + maxlen = 80 * 100 + out, err = self.out, self.err + if len(out) > maxlen: + out = b'(... truncated stdout ...)' + out[-maxlen:] + if len(err) > maxlen: + err = b'(... truncated stderr ...)' + err[-maxlen:] + out = out.decode('ascii', 'replace').rstrip() + err = err.decode('ascii', 'replace').rstrip() + raise AssertionError("Process return code is %d\n" + "command line: %r\n" + "\n" + "stdout:\n" + "---\n" + "%s\n" + "---\n" + "\n" + "stderr:\n" + "---\n" + "%s\n" + "---" + % (self.rc, cmd_line, + out, + err)) + + +# Executing the interpreter in a subprocess +def run_python_until_end(*args, **env_vars): + env_required = interpreter_requires_environment() + cwd = env_vars.pop('__cwd', None) + if '__isolated' in env_vars: + isolated = env_vars.pop('__isolated') + else: + isolated = not env_vars and not env_required + cmd_line = [sys.executable, '-X', 'faulthandler'] + if isolated: + # isolated mode: ignore Python environment variables, ignore user + # site-packages, and don't add the current directory to sys.path + cmd_line.append('-I') + elif not env_vars and not env_required: + # ignore Python environment variables + cmd_line.append('-E') + + # But a special flag that can be set to override -- in this case, the + # caller is responsible to pass the full environment. + if env_vars.pop('__cleanenv', None): + env = {} + if sys.platform == 'win32': + # Windows requires at least the SYSTEMROOT environment variable to + # start Python. + env['SYSTEMROOT'] = os.environ['SYSTEMROOT'] + + # Other interesting environment variables, not copied currently: + # COMSPEC, HOME, PATH, TEMP, TMPDIR, TMP. + else: + # Need to preserve the original environment, for in-place testing of + # shared library builds. + env = os.environ.copy() + + # set TERM='' unless the TERM environment variable is passed explicitly + # see issues #11390 and #18300 + if 'TERM' not in env_vars: + env['TERM'] = '' + + env.update(env_vars) + cmd_line.extend(args) + proc = subprocess.Popen(cmd_line, stdin=subprocess.PIPE, + stdout=subprocess.PIPE, stderr=subprocess.PIPE, + env=env, cwd=cwd) + with proc: + try: + out, err = proc.communicate() + finally: + proc.kill() + subprocess._cleanup() + rc = proc.returncode + return _PythonRunResult(rc, out, err), cmd_line