mirror of https://github.com/python/cpython
432 lines
19 KiB
Python
432 lines
19 KiB
Python
# Tests the attempted automatic coercion of the C locale to a UTF-8 locale
|
|
|
|
import locale
|
|
import os
|
|
import subprocess
|
|
import sys
|
|
import sysconfig
|
|
import unittest
|
|
from collections import namedtuple
|
|
|
|
from test import support
|
|
from test.support.script_helper import run_python_until_end
|
|
|
|
|
|
# Set the list of ways we expect to be able to ask for the "C" locale
# (kept as a mutable list: the platform dependent overrides further down
# may append additional names to it)
EXPECTED_C_LOCALE_EQUIVALENTS = ["C", "invalid.ascii"]

# Set our expectation for the default encoding used in the C locale
# for the filesystem encoding and the standard streams
EXPECTED_C_LOCALE_STREAM_ENCODING = "ascii"
EXPECTED_C_LOCALE_FS_ENCODING = "ascii"

# Set our expectation for the default locale used when none is specified
EXPECT_COERCION_IN_DEFAULT_LOCALE = True

# Locale names that test_PYTHONCOERCECLOCALE_set_to_one treats as an
# already coerced LC_CTYPE locale (the test is skipped for these)
TARGET_LOCALES = ["C.UTF-8", "C.utf8", "UTF-8"]
|
|
|
|
# Adjust the expectations above for platforms that are known to differ
if sys.platform.startswith("linux") and support.is_android:
    # Android defaults to using UTF-8 for all system interfaces
    EXPECTED_C_LOCALE_STREAM_ENCODING = EXPECTED_C_LOCALE_FS_ENCODING = "utf-8"
elif sys.platform.startswith("linux"):
    # Linux distros typically alias the POSIX locale directly to the C
    # locale.
    # TODO: Once https://bugs.python.org/issue30672 is addressed, we'll be
    #       able to check this case unconditionally
    EXPECTED_C_LOCALE_EQUIVALENTS.append("POSIX")
elif sys.platform.startswith("aix"):
    # AIX uses iso8859-1 in the C locale, other *nix platforms use ASCII
    EXPECTED_C_LOCALE_STREAM_ENCODING = EXPECTED_C_LOCALE_FS_ENCODING = "iso8859-1"
elif sys.platform == "darwin":
    # Only the FS encoding differs on macOS: it is UTF-8
    EXPECTED_C_LOCALE_FS_ENCODING = "utf-8"
elif sys.platform == "cygwin":
    # Cygwin defaults to using C.UTF-8
    # TODO: Work out a robust dynamic test for this that doesn't rely on
    #       CPython's own locale handling machinery
    EXPECT_COERCION_IN_DEFAULT_LOCALE = False
|
|
|
|
# Note that the above expectations are still wrong in some cases, such as:
|
|
# * Windows when PYTHONLEGACYWINDOWSFSENCODING is set
|
|
# * Any platform other than AIX that uses latin-1 in the C locale
|
|
# * Any Linux distro where POSIX isn't a simple alias for the C locale
|
|
# * Any Linux distro where the default locale is something other than "C"
|
|
#
|
|
# Options for dealing with this:
|
|
# * Don't set the PY_COERCE_C_LOCALE preprocessor definition on
|
|
# such platforms (e.g. it isn't set on Windows)
|
|
# * Fix the test expectations to match the actual platform behaviour
|
|
|
|
# In order to get the warning messages to match up as expected, the candidate
# order here must match the target locale order in Python/pylifecycle.c
_C_UTF8_LOCALES = ("C.UTF-8", "C.utf8", "UTF-8")
|
|
|
|
# Locale alias lists can't be inspected reliably across platforms, so the
# only way to learn which candidate locales work is to actually try them
# with locale.setlocale(). That is done in a subprocess, from setUpModule()
# below, so the locale of the test runner itself is never altered.
#
# The flag below additionally asks that subprocess to verify that
# `locale.nl_langinfo(locale.CODESET)` works, because when it fails the
# interpreter will skip locale coercion for that particular target locale.
# The extra check is only enabled when the relevant locale module attributes
# exist and we're not on a platform where we expect it to always succeed.
_check_nl_langinfo_CODESET = (
    sys.platform not in ("darwin", "linux")
    and all(hasattr(locale, attr) for attr in ("nl_langinfo", "CODESET"))
)
|
|
|
|
def _set_locale_in_subprocess(locale_name):
    """Report whether a child interpreter can switch LC_CTYPE to locale_name.

    The child calls locale.setlocale(locale.LC_CTYPE, locale_name) and, when
    _check_nl_langinfo_CODESET is true, also exits with a non-zero status if
    locale.nl_langinfo(locale.CODESET) returns a false value.

    Returns True when the child's return code is 0, False otherwise.
    """
    script_parts = [
        "import locale; print(locale.setlocale(locale.LC_CTYPE, '{}'))",
    ]
    if _check_nl_langinfo_CODESET:
        # If there's no valid CODESET, we expect coercion to be skipped
        script_parts.append(
            "; import sys; sys.exit(not locale.nl_langinfo(locale.CODESET))"
        )
    script = "".join(script_parts).format(locale_name)
    outcome, _ = run_python_until_end("-c", script, PYTHONCOERCECLOCALE='')
    return outcome.rc == 0
|
|
|
|
|
|
|
|
# Names of the values reported by the child process, in the same order as
# the print() calls in EncodingDetails.CHILD_PROCESS_SCRIPT
_fields = "fsencoding stdin_info stdout_info stderr_info lang lc_ctype lc_all"
_EncodingDetails = namedtuple("EncodingDetails", _fields)


class EncodingDetails(_EncodingDetails):
    """Encoding and locale environment details of a (child) interpreter."""

    # XXX (ncoghlan): Using JSON for child state reporting may be less fragile
    # Each statement prints one line; the lines map positionally onto _fields
    CHILD_PROCESS_SCRIPT = ";".join((
        "import sys, os",
        "print(sys.getfilesystemencoding())",
        "print(sys.stdin.encoding + ':' + sys.stdin.errors)",
        "print(sys.stdout.encoding + ':' + sys.stdout.errors)",
        "print(sys.stderr.encoding + ':' + sys.stderr.errors)",
        "print(os.environ.get('LANG', 'not set'))",
        "print(os.environ.get('LC_CTYPE', 'not set'))",
        "print(os.environ.get('LC_ALL', 'not set'))",
    ))

    @classmethod
    def get_expected_details(cls, coercion_expected, fs_encoding, stream_encoding, env_vars):
        """Return the details a child process is expected to report, as a dict

        Parameters:
          coercion_expected: if true, LC_CTYPE is expected to be reported as
            the module level CLI_COERCION_TARGET rather than the env_vars value
          fs_encoding: expected filesystem encoding
          stream_encoding: expected encoding of the three standard streams
          env_vars: mapping of the environment variables given to the child;
            LANG, LC_CTYPE and LC_ALL default to "not set" when absent
        """
        template = stream_encoding + ":{}"
        # stdin and stdout should use surrogateescape either because the
        # coercion triggered, or because the C locale was detected, while
        # stderr should always use backslashreplace
        handlers = ("surrogateescape", "surrogateescape", "backslashreplace")
        stdin_info, stdout_info, stderr_info = map(template.format, handlers)

        lang = env_vars.get("LANG", "not set")
        if coercion_expected:
            lc_ctype = CLI_COERCION_TARGET
        else:
            lc_ctype = env_vars.get("LC_CTYPE", "not set")
        lc_all = env_vars.get("LC_ALL", "not set")

        expected = cls(
            fsencoding=fs_encoding,
            stdin_info=stdin_info,
            stdout_info=stdout_info,
            stderr_info=stderr_info,
            lang=lang,
            lc_ctype=lc_ctype,
            lc_all=lc_all,
        )
        return dict(expected._asdict())

    @classmethod
    def get_child_details(cls, env_vars):
        """Run CHILD_PROCESS_SCRIPT in a child process and collect its report

        The child is started through run_python_until_end() with
        "-X utf8=0" and with env_vars passed on as keyword arguments. If the
        child's return code is not 0, result.fail() is called with the
        command line.

        Returns (encoding_details, stderr_lines):

        - encoding_details: dict built from the child's stdout lines, keyed
          by the EncodingDetails field names
        - stderr_lines: result of calling splitlines() on the right-stripped
          stderr output
        """
        outcome, command_line = run_python_until_end(
            "-X", "utf8=0", "-c", cls.CHILD_PROCESS_SCRIPT,
            **env_vars
        )
        if outcome.rc != 0:
            outcome.fail(command_line)
        # All subprocess outputs in this test case should be pure ASCII
        reported_values = outcome.out.decode("ascii").splitlines()
        stderr_lines = outcome.err.decode("ascii").rstrip().splitlines()
        return dict(cls(*reported_values)._asdict()), stderr_lines
|
|
|
|
|
|
# Details of the shared library warning emitted at runtime
# (compared verbatim against the child's stderr lines)
LEGACY_LOCALE_WARNING = (
    "Python runtime initialized with LC_CTYPE=C (a locale with default ASCII "
    "encoding), which may cause Unicode compatibility problems. Using C.UTF-8, "
    "C.utf8, or UTF-8 (if available) as alternative Unicode-compatible "
    "locales is recommended."
)

# Details of the CLI locale coercion warning emitted at runtime
# (setUpModule() fills the "{}" placeholder with the coercion target locale)
CLI_COERCION_WARNING_FMT = (
    "Python detected LC_CTYPE=C: LC_CTYPE coerced to {} (set another locale "
    "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behavior)."
)
|
|
|
|
|
|
# Filled in by setUpModule(). AVAILABLE_TARGETS stays None until the probing
# of target locales has been done; the other two stay None when no target
# locale turned out to be available.
AVAILABLE_TARGETS = None
CLI_COERCION_TARGET = None
CLI_COERCION_WARNING = None
|
|
|
|
def setUpModule():
    """Work out which coercion target locales exist on the current system

    Sets the module globals AVAILABLE_TARGETS, CLI_COERCION_TARGET and
    CLI_COERCION_WARNING. Calling it again after AVAILABLE_TARGETS has been
    set is a no-op.
    """
    global AVAILABLE_TARGETS, CLI_COERCION_TARGET, CLI_COERCION_WARNING

    if AVAILABLE_TARGETS is not None:
        # initialization already done
        return

    # Find the target locales available in the current system, keeping
    # the candidate order of _C_UTF8_LOCALES
    AVAILABLE_TARGETS = []
    AVAILABLE_TARGETS.extend(
        candidate for candidate in _C_UTF8_LOCALES
        if _set_locale_in_subprocess(candidate)
    )

    if AVAILABLE_TARGETS:
        # Coercion is expected to use the first available target locale
        CLI_COERCION_TARGET = AVAILABLE_TARGETS[0]
        CLI_COERCION_WARNING = CLI_COERCION_WARNING_FMT.format(CLI_COERCION_TARGET)

    if not support.verbose:
        return

    # Dump the effective expectations to ease debugging of failures
    print(f"AVAILABLE_TARGETS = {AVAILABLE_TARGETS!r}")
    print(f"EXPECTED_C_LOCALE_EQUIVALENTS = {EXPECTED_C_LOCALE_EQUIVALENTS!r}")
    print(f"EXPECTED_C_LOCALE_STREAM_ENCODING = {EXPECTED_C_LOCALE_STREAM_ENCODING!r}")
    print(f"EXPECTED_C_LOCALE_FS_ENCODING = {EXPECTED_C_LOCALE_FS_ENCODING!r}")
    print(f"EXPECT_COERCION_IN_DEFAULT_LOCALE = {EXPECT_COERCION_IN_DEFAULT_LOCALE!r}")
    print(f"_C_UTF8_LOCALES = {_C_UTF8_LOCALES!r}")
    print(f"_check_nl_langinfo_CODESET = {_check_nl_langinfo_CODESET!r}")
|
|
|
|
|
|
class _LocaleHandlingTestCase(unittest.TestCase):
    # Base class to check expected locale handling behaviour

    def _check_child_encoding_details(self,
                                      env_vars,
                                      expected_fs_encoding,
                                      expected_stream_encoding,
                                      expected_warnings,
                                      coercion_expected):
        """Check the C locale handling for the given process environment

        Parameters:
          env_vars: environment variables to run the child process with
          expected_fs_encoding: expected sys.getfilesystemencoding() result
          expected_stream_encoding: expected encoding for standard streams
          expected_warnings: list of stderr lines to expect; None means
            that no stderr output is expected
          coercion_expected: whether LC_CTYPE is expected to be reported as
            the coercion target locale
        """
        actual_details, actual_stderr = EncodingDetails.get_child_details(env_vars)
        wanted_details = EncodingDetails.get_expected_details(
            coercion_expected,
            expected_fs_encoding,
            expected_stream_encoding,
            env_vars
        )
        self.assertEqual(actual_details, wanted_details)

        wanted_stderr = [] if expected_warnings is None else expected_warnings
        self.assertEqual(actual_stderr, wanted_stderr)
|
|
|
|
|
|
class LocaleConfigurationTests(_LocaleHandlingTestCase):
    # Test explicit external configuration via the process environment

    @classmethod
    def setUpClass(cls):
        # AVAILABLE_TARGETS is only known once setUpModule() has run, so
        # this can't be handled via the @unittest.skipUnless decorator
        if AVAILABLE_TARGETS:
            return
        raise unittest.SkipTest("No C-with-UTF-8 locale available")

    def test_external_target_locale_configuration(self):
        # Explicitly setting a target locale should give the same behaviour as
        # is seen when implicitly coercing to that target locale
        self.maxDiff = None

        utf8 = "utf-8"
        clean_env = dict.fromkeys(
            ("LANG", "LC_CTYPE", "LC_ALL", "PYTHONCOERCECLOCALE"), ""
        )

        # XXX (ncoghlan): LANG=UTF-8 doesn't appear to work as
        #                 expected, so skip that combination for now
        # See https://bugs.python.org/issue30672 for discussion
        combinations = [
            (env_var, locale_to_set)
            for env_var in ("LANG", "LC_CTYPE")
            for locale_to_set in AVAILABLE_TARGETS
            if (env_var, locale_to_set) != ("LANG", "UTF-8")
        ]

        for env_var, locale_to_set in combinations:
            with self.subTest(env_var=env_var,
                              configured_locale=locale_to_set):
                child_env = {**clean_env, env_var: locale_to_set}
                self._check_child_encoding_details(child_env,
                                                   utf8,
                                                   utf8,
                                                   expected_warnings=None,
                                                   coercion_expected=False)
|
|
|
|
|
|
|
|
@support.cpython_only
@unittest.skipUnless(sysconfig.get_config_var("PY_COERCE_C_LOCALE"),
                     "C locale coercion disabled at build time")
class LocaleCoercionTests(_LocaleHandlingTestCase):
    # Test implicit reconfiguration of the environment during CLI startup

    def _check_c_locale_coercion(self,
                                 fs_encoding, stream_encoding,
                                 coerce_c_locale,
                                 expected_warnings=None,
                                 coercion_expected=True,
                                 **extra_vars):
        """Check the C locale handling for various configurations

        The child is checked once with the default (empty) locale settings
        and then once for every combination of the LANG/LC_CTYPE variables
        and the names in EXPECTED_C_LOCALE_EQUIVALENTS.

        Parameters:
          fs_encoding: expected sys.getfilesystemencoding() result
          stream_encoding: expected encoding for standard streams
          coerce_c_locale: setting to use for PYTHONCOERCECLOCALE
            None: leave the variable at its default (empty) value
            str: the value set in the child's environment
          expected_warnings: expected warning lines on stderr
          coercion_expected: whether LC_CTYPE is expected to be coerced
          extra_vars: additional environment variables to set in subprocess
        """
        self.maxDiff = None

        if not AVAILABLE_TARGETS:
            # Locale coercion is disabled when there aren't any target locales
            fs_encoding = EXPECTED_C_LOCALE_FS_ENCODING
            stream_encoding = EXPECTED_C_LOCALE_STREAM_ENCODING
            coercion_expected = False
            if expected_warnings:
                expected_warnings = [LEGACY_LOCALE_WARNING]

        # Start from a clean locale configuration, let the caller's extra
        # variables override it, and apply PYTHONCOERCECLOCALE last
        child_env = {
            "LANG": "",
            "LC_CTYPE": "",
            "LC_ALL": "",
            "PYTHONCOERCECLOCALE": "",
            **extra_vars,
        }
        if coerce_c_locale is not None:
            child_env["PYTHONCOERCECLOCALE"] = coerce_c_locale

        # Check behaviour for the default locale
        with self.subTest(default_locale=True,
                          PYTHONCOERCECLOCALE=coerce_c_locale):
            if EXPECT_COERCION_IN_DEFAULT_LOCALE:
                default_warnings, default_coercion = (
                    expected_warnings, coercion_expected)
            else:
                default_warnings, default_coercion = None, False
            # On Android CLI_COERCION_WARNING is not printed when all the
            # locale environment variables are undefined or empty. When
            # this code path is run with environ['LC_ALL'] == 'C', then
            # LEGACY_LOCALE_WARNING is printed.
            if (support.is_android and
                    default_warnings == [CLI_COERCION_WARNING]):
                default_warnings = None
            self._check_child_encoding_details(child_env,
                                               fs_encoding,
                                               stream_encoding,
                                               default_warnings,
                                               default_coercion)

        # Check behaviour for explicitly configured locales
        for locale_to_set in EXPECTED_C_LOCALE_EQUIVALENTS:
            for env_var in ("LANG", "LC_CTYPE"):
                with self.subTest(env_var=env_var,
                                  nominal_locale=locale_to_set,
                                  PYTHONCOERCECLOCALE=coerce_c_locale):
                    configured_env = {**child_env, env_var: locale_to_set}
                    # Check behaviour on successful coercion
                    self._check_child_encoding_details(configured_env,
                                                       fs_encoding,
                                                       stream_encoding,
                                                       expected_warnings,
                                                       coercion_expected)

    def test_PYTHONCOERCECLOCALE_not_set(self):
        # This should coerce to the first available target locale by default
        self._check_c_locale_coercion("utf-8", "utf-8", coerce_c_locale=None)

    def test_PYTHONCOERCECLOCALE_not_zero(self):
        # *Any* string other than "0" is considered "set" for our purposes
        # and hence should result in the locale coercion being enabled
        for setting in ("", "1", "true", "false"):
            self._check_c_locale_coercion("utf-8", "utf-8", coerce_c_locale=setting)

    def test_PYTHONCOERCECLOCALE_set_to_warn(self):
        # PYTHONCOERCECLOCALE=warn enables runtime warnings for legacy locales
        self._check_c_locale_coercion("utf-8", "utf-8",
                                      coerce_c_locale="warn",
                                      expected_warnings=[CLI_COERCION_WARNING])

    def test_PYTHONCOERCECLOCALE_set_to_zero(self):
        # The setting "0" should result in the locale coercion being disabled,
        # and setting LC_ALL=C shouldn't make any difference to the behaviour
        for extra_env in ({}, {"LC_ALL": "C"}):
            self._check_c_locale_coercion(EXPECTED_C_LOCALE_FS_ENCODING,
                                          EXPECTED_C_LOCALE_STREAM_ENCODING,
                                          coerce_c_locale="0",
                                          coercion_expected=False,
                                          **extra_env)

    def test_LC_ALL_set_to_C(self):
        # Setting LC_ALL should render the locale coercion ineffective
        # (first case), and result in a warning about a lack of locale
        # compatibility when warnings are requested (second case)
        cases = (
            (None, None),
            ("warn", [LEGACY_LOCALE_WARNING]),
        )
        for setting, warnings_expected in cases:
            self._check_c_locale_coercion(EXPECTED_C_LOCALE_FS_ENCODING,
                                          EXPECTED_C_LOCALE_STREAM_ENCODING,
                                          coerce_c_locale=setting,
                                          LC_ALL="C",
                                          expected_warnings=warnings_expected,
                                          coercion_expected=False)

    def test_PYTHONCOERCECLOCALE_set_to_one(self):
        # skip the test if the LC_CTYPE locale is C or coerced
        saved_locale = locale.setlocale(locale.LC_CTYPE, None)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_locale)
        current_locale = locale.setlocale(locale.LC_CTYPE, "")
        if current_locale == "C":
            self.skipTest("test requires LC_CTYPE locale different than C")
        if current_locale in TARGET_LOCALES:
            self.skipTest("coerced LC_CTYPE locale: %s" % current_locale)

        # bpo-35336: PYTHONCOERCECLOCALE=1 must not coerce the LC_CTYPE locale
        # if it's not equal to "C"
        child_env = {**os.environ, "PYTHONCOERCECLOCALE": "1"}
        child_code = "import locale; print(locale.setlocale(locale.LC_CTYPE, None))"
        completed = subprocess.run([sys.executable, "-c", child_code],
                                   stdout=subprocess.PIPE,
                                   env=child_env,
                                   text=True)
        self.assertEqual(completed.stdout.rstrip(), current_locale)
|
|
|
|
|
|
def test_main():
    """Run both locale test classes, then reap leftover child processes

    The tests in this module start Python subprocesses, so
    support.reap_children() is called once the run is over. It sits in a
    ``finally`` clause so that the cleanup also happens when
    support.run_unittest() raises.
    """
    try:
        support.run_unittest(
            LocaleConfigurationTests,
            LocaleCoercionTests
        )
    finally:
        # Don't leave child processes behind, even if the test run failed
        support.reap_children()
|
|
|
|
# Run the tests through test_main() when this file is executed as a script
if __name__ == "__main__":
    test_main()
|