bpo-30565: Add PYTHONCOERCECLOCALE=warn runtime flag (GH-2260)

- removes PY_WARN_ON_C_LOCALE build time flag
- locale coercion and compatibility warnings are now always compiled
  in, but are off by default
- adds PYTHONCOERCECLOCALE=warn runtime option to aid in
  debugging potentially locale related compatibility problems

Due to not-yet-resolved test failures on *BSD systems (including
Mac OS X), this also temporarily disables UTF-8 as a locale coercion
target, and skips testing the interpreter's behavior in the POSIX locale.
This commit is contained in:
Nick Coghlan 2017-06-18 12:29:42 +10:00 committed by GitHub
parent 6a98a04e21
commit eb81795d7d
5 changed files with 184 additions and 122 deletions

View File

@ -744,6 +744,11 @@ conflict.
:data:`sys.stdin` and :data:`sys.stdout` to ``surrogateescape``. This
behavior can be overridden using :envvar:`PYTHONIOENCODING` as usual.
For debugging purposes, setting ``PYTHONCOERCECLOCALE=warn`` will cause
Python to emit warning messages on ``stderr`` if either the locale coercion
activates, or else if a locale that *would* have triggered coercion is
still active when the Python runtime is initialized.
Availability: \*nix
.. versionadded:: 3.7

View File

@ -96,20 +96,11 @@ defined coercion target locales (currently ``C.UTF-8``, ``C.utf8``, and
``UTF-8``). The default error handler for ``stderr`` continues to be
``backslashreplace``, regardless of locale.
.. note::
In the current implementation, a warning message is printed directly to
``stderr`` even for successful implicit locale coercion. This gives
redistributors and system integrators the opportunity to determine if they
should be making an environmental change to avoid the need for implicit
coercion at the Python interpreter level.
However, it's not clear that this is going to be the best approach for
the final 3.7.0 release, and we may end up deciding to disable the warning
by default and provide some way of opting into it at runtime or build time.
Concrete examples of use cases where it would be preferrable to disable the
warning by default can be noted on :issue:`30565`.
Locale coercion is silent by default, but to assist in debugging potentially
locale related integration problems, explicit warnings (emitted directly on
``stderr``) can be requested by setting ``PYTHONCOERCECLOCALE=warn``. This
setting will also cause the Python runtime to emit a warning if the legacy C
locale remains active when the core interpreter is initialized.
.. seealso::

View File

@ -22,13 +22,23 @@ if sys.platform == "darwin":
else:
C_LOCALE_FS_ENCODING = C_LOCALE_STREAM_ENCODING
# XXX (ncoghlan): The above is probably still wrong for:
# Note that the above is probably still wrong in some cases, such as:
# * Windows when PYTHONLEGACYWINDOWSFSENCODING is set
# * AIX and any other platforms that use latin-1 in the C locale
#
# Options for dealing with this:
# * Don't set PYTHON_COERCE_C_LOCALE on such platforms (e.g. Windows doesn't)
# * Fix the test expectations to match the actual platform behaviour
# In order to get the warning messages to match up as expected, the candidate
# order here must match the target locale order in Python/pylifecycle.c
_C_UTF8_LOCALES = ("C.UTF-8", "C.utf8", "UTF-8")
_C_UTF8_LOCALES = ("C.UTF-8", "C.utf8") #, "UTF-8")
# XXX (ncoghlan): Using UTF-8 as a target locale is currently disabled due to
# problems encountered on *BSD systems with those test cases
# For additional details see:
# nl_langinfo CODESET error: https://bugs.python.org/issue30647
# locale handling differences: https://bugs.python.org/issue30672
# There's no reliable cross-platform way of checking locale alias
# lists, so the only way of knowing which of these locales will work
@ -40,20 +50,24 @@ def _set_locale_in_subprocess(locale_name):
result, py_cmd = run_python_until_end("-c", cmd, __isolated=True)
return result.rc == 0
_EncodingDetails = namedtuple("EncodingDetails",
"fsencoding stdin_info stdout_info stderr_info")
_fields = "fsencoding stdin_info stdout_info stderr_info lang lc_ctype lc_all"
_EncodingDetails = namedtuple("EncodingDetails", _fields)
class EncodingDetails(_EncodingDetails):
# XXX (ncoghlan): Using JSON for child state reporting may be less fragile
CHILD_PROCESS_SCRIPT = ";".join([
"import sys",
"import sys, os",
"print(sys.getfilesystemencoding())",
"print(sys.stdin.encoding + ':' + sys.stdin.errors)",
"print(sys.stdout.encoding + ':' + sys.stdout.errors)",
"print(sys.stderr.encoding + ':' + sys.stderr.errors)",
"print(os.environ.get('LANG', 'not set'))",
"print(os.environ.get('LC_CTYPE', 'not set'))",
"print(os.environ.get('LC_ALL', 'not set'))",
])
@classmethod
def get_expected_details(cls, fs_encoding, stream_encoding):
def get_expected_details(cls, coercion_expected, fs_encoding, stream_encoding, env_vars):
"""Returns expected child process details for a given encoding"""
_stream = stream_encoding + ":{}"
# stdin and stdout should use surrogateescape either because the
@ -61,7 +75,14 @@ class EncodingDetails(_EncodingDetails):
stream_info = 2*[_stream.format("surrogateescape")]
# stderr should always use backslashreplace
stream_info.append(_stream.format("backslashreplace"))
return dict(cls(fs_encoding, *stream_info)._asdict())
expected_lang = env_vars.get("LANG", "not set").lower()
if coercion_expected:
expected_lc_ctype = CLI_COERCION_TARGET.lower()
else:
expected_lc_ctype = env_vars.get("LC_CTYPE", "not set").lower()
expected_lc_all = env_vars.get("LC_ALL", "not set").lower()
env_info = expected_lang, expected_lc_ctype, expected_lc_all
return dict(cls(fs_encoding, *stream_info, *env_info)._asdict())
@staticmethod
def _handle_output_variations(data):
@ -97,64 +118,20 @@ class EncodingDetails(_EncodingDetails):
result.fail(py_cmd)
# All subprocess outputs in this test case should be pure ASCII
adjusted_output = cls._handle_output_variations(result.out)
stdout_lines = adjusted_output.decode("ascii").rstrip().splitlines()
stdout_lines = adjusted_output.decode("ascii").splitlines()
child_encoding_details = dict(cls(*stdout_lines)._asdict())
stderr_lines = result.err.decode("ascii").rstrip().splitlines()
return child_encoding_details, stderr_lines
class _ChildProcessEncodingTestCase(unittest.TestCase):
# Base class to check for expected encoding details in a child process
def _check_child_encoding_details(self,
env_vars,
expected_fs_encoding,
expected_stream_encoding,
expected_warning):
"""Check the C locale handling for the given process environment
Parameters:
expected_fs_encoding: expected sys.getfilesystemencoding() result
expected_stream_encoding: expected encoding for standard streams
expected_warning: stderr output to expect (if any)
"""
result = EncodingDetails.get_child_details(env_vars)
encoding_details, stderr_lines = result
self.assertEqual(encoding_details,
EncodingDetails.get_expected_details(
expected_fs_encoding,
expected_stream_encoding))
self.assertEqual(stderr_lines, expected_warning)
# Details of the shared library warning emitted at runtime
LIBRARY_C_LOCALE_WARNING = (
LEGACY_LOCALE_WARNING = (
"Python runtime initialized with LC_CTYPE=C (a locale with default ASCII "
"encoding), which may cause Unicode compatibility problems. Using C.UTF-8, "
"C.utf8, or UTF-8 (if available) as alternative Unicode-compatible "
"locales is recommended."
)
@unittest.skipUnless(sysconfig.get_config_var("PY_WARN_ON_C_LOCALE"),
"C locale runtime warning disabled at build time")
class LocaleWarningTests(_ChildProcessEncodingTestCase):
# Test warning emitted when running in the C locale
def test_library_c_locale_warning(self):
self.maxDiff = None
for locale_to_set in ("C", "POSIX", "invalid.ascii"):
# XXX (ncoghlan): Mac OS X doesn't behave as expected in the
# POSIX locale, so we skip that for now
if sys.platform == "darwin" and locale_to_set == "POSIX":
continue
var_dict = {
"LC_ALL": locale_to_set
}
with self.subTest(forced_locale=locale_to_set):
self._check_child_encoding_details(var_dict,
C_LOCALE_FS_ENCODING,
C_LOCALE_STREAM_ENCODING,
[LIBRARY_C_LOCALE_WARNING])
# Details of the CLI locale coercion warning emitted at runtime
CLI_COERCION_WARNING_FMT = (
"Python detected LC_CTYPE=C: LC_CTYPE coerced to {} (set another locale "
@ -163,9 +140,13 @@ CLI_COERCION_WARNING_FMT = (
AVAILABLE_TARGETS = None
CLI_COERCION_TARGET = None
CLI_COERCION_WARNING = None
def setUpModule():
global AVAILABLE_TARGETS
global CLI_COERCION_TARGET
global CLI_COERCION_WARNING
if AVAILABLE_TARGETS is not None:
# initialization already done
@ -177,26 +158,57 @@ def setUpModule():
if _set_locale_in_subprocess(target_locale):
AVAILABLE_TARGETS.append(target_locale)
if AVAILABLE_TARGETS:
# Coercion is expected to use the first available target locale
CLI_COERCION_TARGET = AVAILABLE_TARGETS[0]
CLI_COERCION_WARNING = CLI_COERCION_WARNING_FMT.format(CLI_COERCION_TARGET)
class _LocaleCoercionTargetsTestCase(_ChildProcessEncodingTestCase):
# Base class for test cases that rely on coercion targets being defined
class _LocaleHandlingTestCase(unittest.TestCase):
# Base class to check expected locale handling behaviour
@classmethod
def setUpClass(cls):
def _check_child_encoding_details(self,
env_vars,
expected_fs_encoding,
expected_stream_encoding,
expected_warnings,
coercion_expected):
"""Check the C locale handling for the given process environment
Parameters:
expected_fs_encoding: expected sys.getfilesystemencoding() result
expected_stream_encoding: expected encoding for standard streams
expected_warnings: stderr lines to expect (None means no output)
"""
result = EncodingDetails.get_child_details(env_vars)
encoding_details, stderr_lines = result
expected_details = EncodingDetails.get_expected_details(
coercion_expected,
expected_fs_encoding,
expected_stream_encoding,
env_vars
)
self.assertEqual(encoding_details, expected_details)
if expected_warnings is None:
expected_warnings = []
self.assertEqual(stderr_lines, expected_warnings)
class LocaleConfigurationTests(_LocaleHandlingTestCase):
# Test explicit external configuration via the process environment
def setUpClass():
# This relies on setupModule() having been run, so it can't be
# handled via the @unittest.skipUnless decorator
if not AVAILABLE_TARGETS:
raise unittest.SkipTest("No C-with-UTF-8 locale available")
class LocaleConfigurationTests(_LocaleCoercionTargetsTestCase):
# Test explicit external configuration via the process environment
def test_external_target_locale_configuration(self):
# Explicitly setting a target locale should give the same behaviour as
# is seen when implicitly coercing to that target locale
self.maxDiff = None
expected_warning = []
expected_fs_encoding = "utf-8"
expected_stream_encoding = "utf-8"
@ -209,6 +221,7 @@ class LocaleConfigurationTests(_LocaleCoercionTargetsTestCase):
for locale_to_set in AVAILABLE_TARGETS:
# XXX (ncoghlan): LANG=UTF-8 doesn't appear to work as
# expected, so skip that combination for now
# See https://bugs.python.org/issue30672 for discussion
if env_var == "LANG" and locale_to_set == "UTF-8":
continue
@ -219,17 +232,23 @@ class LocaleConfigurationTests(_LocaleCoercionTargetsTestCase):
self._check_child_encoding_details(var_dict,
expected_fs_encoding,
expected_stream_encoding,
expected_warning)
expected_warnings=None,
coercion_expected=False)
@test.support.cpython_only
@unittest.skipUnless(sysconfig.get_config_var("PY_COERCE_C_LOCALE"),
"C locale coercion disabled at build time")
class LocaleCoercionTests(_LocaleCoercionTargetsTestCase):
class LocaleCoercionTests(_LocaleHandlingTestCase):
# Test implicit reconfiguration of the environment during CLI startup
def _check_c_locale_coercion(self, fs_encoding, stream_encoding, coerce_c_locale):
def _check_c_locale_coercion(self,
fs_encoding, stream_encoding,
coerce_c_locale,
expected_warnings=None,
coercion_expected=True,
**extra_vars):
"""Check the C locale handling for various configurations
Parameters:
@ -238,27 +257,31 @@ class LocaleCoercionTests(_LocaleCoercionTargetsTestCase):
coerce_c_locale: setting to use for PYTHONCOERCECLOCALE
None: don't set the variable at all
str: the value set in the child's environment
expected_warnings: expected warning lines on stderr
extra_vars: additional environment variables to set in subprocess
"""
# Check for expected warning on stderr if C locale is coerced
self.maxDiff = None
expected_warning = []
if coerce_c_locale != "0":
# Expect coercion to use the first available locale
warning_msg = CLI_COERCION_WARNING_FMT.format(AVAILABLE_TARGETS[0])
expected_warning.append(warning_msg)
if not AVAILABLE_TARGETS:
# Locale coercion is disabled when there aren't any target locales
fs_encoding = C_LOCALE_FS_ENCODING
stream_encoding = C_LOCALE_STREAM_ENCODING
coercion_expected = False
if expected_warnings:
expected_warnings = [LEGACY_LOCALE_WARNING]
base_var_dict = {
"LANG": "",
"LC_CTYPE": "",
"LC_ALL": "",
}
base_var_dict.update(extra_vars)
for env_var in ("LANG", "LC_CTYPE"):
for locale_to_set in ("", "C", "POSIX", "invalid.ascii"):
# XXX (ncoghlan): Mac OS X doesn't behave as expected in the
# XXX (ncoghlan): *BSD platforms don't behave as expected in the
# POSIX locale, so we skip that for now
if sys.platform == "darwin" and locale_to_set == "POSIX":
# See https://bugs.python.org/issue30672 for discussion
if locale_to_set == "POSIX":
continue
with self.subTest(env_var=env_var,
nominal_locale=locale_to_set,
@ -267,33 +290,62 @@ class LocaleCoercionTests(_LocaleCoercionTargetsTestCase):
var_dict[env_var] = locale_to_set
if coerce_c_locale is not None:
var_dict["PYTHONCOERCECLOCALE"] = coerce_c_locale
# Check behaviour on successful coercion
self._check_child_encoding_details(var_dict,
fs_encoding,
stream_encoding,
expected_warning)
expected_warnings,
coercion_expected)
def test_test_PYTHONCOERCECLOCALE_not_set(self):
# This should coerce to the first available target locale by default
self._check_c_locale_coercion("utf-8", "utf-8", coerce_c_locale=None)
def test_PYTHONCOERCECLOCALE_not_zero(self):
# *Any* string other that "0" is considered "set" for our purposes
# *Any* string other than "0" is considered "set" for our purposes
# and hence should result in the locale coercion being enabled
for setting in ("", "1", "true", "false"):
self._check_c_locale_coercion("utf-8", "utf-8", coerce_c_locale=setting)
def test_PYTHONCOERCECLOCALE_set_to_warn(self):
# PYTHONCOERCECLOCALE=warn enables runtime warnings for legacy locales
self._check_c_locale_coercion("utf-8", "utf-8",
coerce_c_locale="warn",
expected_warnings=[CLI_COERCION_WARNING])
def test_PYTHONCOERCECLOCALE_set_to_zero(self):
# The setting "0" should result in the locale coercion being disabled
self._check_c_locale_coercion(C_LOCALE_FS_ENCODING,
C_LOCALE_STREAM_ENCODING,
coerce_c_locale="0")
coerce_c_locale="0",
coercion_expected=False)
# Setting LC_ALL=C shouldn't make any difference to the behaviour
self._check_c_locale_coercion(C_LOCALE_FS_ENCODING,
C_LOCALE_STREAM_ENCODING,
coerce_c_locale="0",
LC_ALL="C",
coercion_expected=False)
def test_LC_ALL_set_to_C(self):
# Setting LC_ALL should render the locale coercion ineffective
self._check_c_locale_coercion(C_LOCALE_FS_ENCODING,
C_LOCALE_STREAM_ENCODING,
coerce_c_locale=None,
LC_ALL="C",
coercion_expected=False)
# And result in a warning about a lack of locale compatibility
self._check_c_locale_coercion(C_LOCALE_FS_ENCODING,
C_LOCALE_STREAM_ENCODING,
coerce_c_locale="warn",
LC_ALL="C",
expected_warnings=[LEGACY_LOCALE_WARNING],
coercion_expected=False)
def test_main():
test.support.run_unittest(
LocaleConfigurationTests,
LocaleCoercionTests,
LocaleWarningTests
LocaleCoercionTests
)
test.support.reap_children()

View File

@ -105,10 +105,10 @@ static const char usage_6[] =
" predictable seed.\n"
"PYTHONMALLOC: set the Python memory allocators and/or install debug hooks\n"
" on Python memory allocators. Use PYTHONMALLOC=debug to install debug\n"
" hooks.\n";
static const char usage_7[] =
" hooks.\n"
"PYTHONCOERCECLOCALE: if this variable is set to 0, it disables the locale\n"
" coercion behavior\n";
" coercion behavior. Use PYTHONCOERCECLOCALE=warn to request display of\n"
" locale coercion and locale compatibility warnings on stderr.\n";
static int
usage(int exitcode, const wchar_t* program)
@ -125,7 +125,6 @@ usage(int exitcode, const wchar_t* program)
fprintf(f, usage_4, (wint_t)DELIM);
fprintf(f, usage_5, (wint_t)DELIM, PYTHONHOMEHELP);
fputs(usage_6, f);
fputs(usage_7, f);
}
return exitcode;
}

View File

@ -356,6 +356,10 @@ _Py_LegacyLocaleDetected(void)
{
#ifndef MS_WINDOWS
/* On non-Windows systems, the C locale is considered a legacy locale */
/* XXX (ncoghlan): some platforms (notably Mac OS X) don't appear to treat
* the POSIX locale as a simple alias for the C locale, so
* we may also want to check for that explicitly.
*/
const char *ctype_loc = setlocale(LC_CTYPE, NULL);
return ctype_loc != NULL && strcmp(ctype_loc, "C") == 0;
#else
@ -364,6 +368,30 @@ _Py_LegacyLocaleDetected(void)
#endif
}
/* Message written to stderr by _emit_stderr_warning_for_legacy_locale().
 * The trailing newline is part of the message text.
 */
static const char *_C_LOCALE_WARNING =
"Python runtime initialized with LC_CTYPE=C (a locale with default ASCII "
"encoding), which may cause Unicode compatibility problems. Using C.UTF-8, "
"C.utf8, or UTF-8 (if available) as alternative Unicode-compatible "
"locales is recommended.\n";
/* Report whether legacy locale warnings were requested at runtime.
 *
 * Returns 1 only when the PYTHONCOERCECLOCALE environment variable is set
 * to exactly "warn"; returns 0 when it is unset or holds any other value.
 */
static int
_legacy_locale_warnings_enabled(void)
{
    const char *setting = getenv("PYTHONCOERCECLOCALE");

    if (setting == NULL) {
        return 0;
    }
    /* Whole-string comparison: longer values such as "warning" don't count */
    return strcmp(setting, "warn") == 0;
}
/* Write the legacy C locale warning to stderr when it was requested.
 *
 * Nothing is written unless _legacy_locale_warnings_enabled() reports true
 * and _Py_LegacyLocaleDetected() then also reports true.
 */
static void
_emit_stderr_warning_for_legacy_locale(void)
{
    if (!_legacy_locale_warnings_enabled()) {
        return;
    }
    if (!_Py_LegacyLocaleDetected()) {
        return;
    }
    fputs(_C_LOCALE_WARNING, stderr);
}
/* Describes one candidate locale that locale coercion may switch to. */
typedef struct _CandidateLocale {
const char *locale_name; /* The locale to try as a coercion target */
} _LocaleCoercionTarget;
@ -371,10 +399,17 @@ typedef struct _CandidateLocale {
static _LocaleCoercionTarget _TARGET_LOCALES[] = {
{"C.UTF-8"},
{"C.utf8"},
{"UTF-8"},
/* {"UTF-8"}, */
{NULL}
};
/* XXX (ncoghlan): Using UTF-8 as a target locale is currently disabled due to
* problems encountered on *BSD systems with those test cases
* For additional details see:
* nl_langinfo CODESET error: https://bugs.python.org/issue30647
* locale handling differences: https://bugs.python.org/issue30672
*/
static char *
get_default_standard_stream_error_handler(void)
{
@ -419,7 +454,9 @@ _coerce_default_locale_settings(const _LocaleCoercionTarget *target)
"Error setting LC_CTYPE, skipping C locale coercion\n");
return;
}
if (_legacy_locale_warnings_enabled()) {
fprintf(stderr, _C_LOCALE_COERCION_WARNING, newloc);
}
/* Reconfigure with the overridden environment variables */
setlocale(LC_ALL, "");
@ -465,26 +502,6 @@ _Py_CoerceLegacyLocale(void)
}
#ifdef PY_WARN_ON_C_LOCALE
static const char *_C_LOCALE_WARNING =
"Python runtime initialized with LC_CTYPE=C (a locale with default ASCII "
"encoding), which may cause Unicode compatibility problems. Using C.UTF-8, "
"C.utf8, or UTF-8 (if available) as alternative Unicode-compatible "
"locales is recommended.\n";
static void
_emit_stderr_warning_for_c_locale(void)
{
const char *coerce_c_locale = getenv("PYTHONCOERCECLOCALE");
if (coerce_c_locale == NULL || strncmp(coerce_c_locale, "0", 2) != 0) {
if (_Py_LegacyLocaleDetected()) {
fprintf(stderr, "%s", _C_LOCALE_WARNING);
}
}
}
#endif
/* Global initializations. Can be undone by Py_Finalize(). Don't
call this twice without an intervening Py_Finalize() call.
@ -561,9 +578,7 @@ void _Py_InitializeCore(const _PyCoreConfig *config)
the locale's charset without having to switch
locales. */
setlocale(LC_CTYPE, "");
#ifdef PY_WARN_ON_C_LOCALE
_emit_stderr_warning_for_c_locale();
#endif
_emit_stderr_warning_for_legacy_locale();
#endif
#endif