From 9e4994d410970fb4e75168401d159ba47a8f7108 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 28 Aug 2018 23:26:33 +0200 Subject: [PATCH] bpo-34485: Enhance init_sys_streams() (GH-8978) Python now gets the locale encoding with C code to initialize the encoding of standard streams like sys.stdout. Moreover, the encoding is now initialized to the Python codec name to get a normalized encoding name and to ensure that the codec is loaded. The change avoids importing _bootlocale and _locale modules at startup by default. When the PYTHONIOENCODING environment variable only contains an encoding, the error handler is now set explicitly to "strict". Rename also get_default_standard_stream_error_handler() to get_stdio_errors(). Reduce the buffer to format the "cpXXX" string (Windows locale encoding). --- Lib/test/test_embed.py | 16 ++-- Lib/test/test_sys.py | 6 +- Lib/test/test_utf8_mode.py | 12 +-- .../2018-08-28-17-48-40.bpo-34485.aFwck2.rst | 5 ++ .../2018-08-28-23-01-14.bpo-34485.dq1Kqk.rst | 3 + Modules/_localemodule.c | 2 +- Programs/_testembed.c | 4 +- Python/pylifecycle.c | 86 ++++++++++++++----- 8 files changed, 91 insertions(+), 43 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2018-08-28-17-48-40.bpo-34485.aFwck2.rst create mode 100644 Misc/NEWS.d/next/Core and Builtins/2018-08-28-23-01-14.bpo-34485.dq1Kqk.rst diff --git a/Lib/test/test_embed.py b/Lib/test/test_embed.py index 25593bdf420..3922447c645 100644 --- a/Lib/test/test_embed.py +++ b/Lib/test/test_embed.py @@ -171,17 +171,17 @@ class EmbeddingTests(EmbeddingTestsMixin, unittest.TestCase): "stdout: {out_encoding}:ignore", "stderr: {out_encoding}:backslashreplace", "--- Set encoding only ---", - "Expected encoding: latin-1", + "Expected encoding: iso8859-1", "Expected errors: default", - "stdin: latin-1:{errors}", - "stdout: latin-1:{errors}", - "stderr: latin-1:backslashreplace", + "stdin: iso8859-1:{errors}", + "stdout: iso8859-1:{errors}", + "stderr: 
iso8859-1:backslashreplace", "--- Set encoding and errors ---", - "Expected encoding: latin-1", + "Expected encoding: iso8859-1", "Expected errors: replace", - "stdin: latin-1:replace", - "stdout: latin-1:replace", - "stderr: latin-1:backslashreplace"]) + "stdin: iso8859-1:replace", + "stdout: iso8859-1:replace", + "stderr: iso8859-1:backslashreplace"]) expected_output = expected_output.format( in_encoding=expected_stream_encoding, out_encoding=expected_stream_encoding, diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py index 336ae447a8d..005c82d13dc 100644 --- a/Lib/test/test_sys.py +++ b/Lib/test/test_sys.py @@ -668,7 +668,7 @@ class SysModuleTest(unittest.TestCase): 'dump("stdout")', 'dump("stderr")', )) - args = [sys.executable, "-c", code] + args = [sys.executable, "-X", "utf8=0", "-c", code] if isolated: args.append("-I") if encoding is not None: @@ -712,8 +712,8 @@ class SysModuleTest(unittest.TestCase): # have no any effect out = self.c_locale_get_error_handler(encoding=':') self.assertEqual(out, - 'stdin: strict\n' - 'stdout: strict\n' + 'stdin: surrogateescape\n' + 'stdout: surrogateescape\n' 'stderr: backslashreplace\n') out = self.c_locale_get_error_handler(encoding='') self.assertEqual(out, diff --git a/Lib/test/test_utf8_mode.py b/Lib/test/test_utf8_mode.py index df988c1fc9e..7280ce77ef8 100644 --- a/Lib/test/test_utf8_mode.py +++ b/Lib/test/test_utf8_mode.py @@ -139,16 +139,16 @@ class UTF8ModeTests(unittest.TestCase): out = self.get_output('-X', 'utf8', '-c', code, PYTHONIOENCODING="latin1") self.assertEqual(out.splitlines(), - ['stdin: latin1/strict', - 'stdout: latin1/strict', - 'stderr: latin1/backslashreplace']) + ['stdin: iso8859-1/strict', + 'stdout: iso8859-1/strict', + 'stderr: iso8859-1/backslashreplace']) out = self.get_output('-X', 'utf8', '-c', code, PYTHONIOENCODING=":namereplace") self.assertEqual(out.splitlines(), - ['stdin: UTF-8/namereplace', - 'stdout: UTF-8/namereplace', - 'stderr: UTF-8/backslashreplace']) + ['stdin: 
utf-8/namereplace', + 'stdout: utf-8/namereplace', + 'stderr: utf-8/backslashreplace']) def test_io(self): code = textwrap.dedent(''' diff --git a/Misc/NEWS.d/next/Core and Builtins/2018-08-28-17-48-40.bpo-34485.aFwck2.rst b/Misc/NEWS.d/next/Core and Builtins/2018-08-28-17-48-40.bpo-34485.aFwck2.rst new file mode 100644 index 00000000000..f6cd9515f2a --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2018-08-28-17-48-40.bpo-34485.aFwck2.rst @@ -0,0 +1,5 @@ +Python now gets the locale encoding with C code to initialize the encoding +of standard streams like sys.stdout. Moreover, the encoding is now +initialized to the Python codec name to get a normalized encoding name and +to ensure that the codec is loaded. The change avoids importing _bootlocale +and _locale modules at startup by default. diff --git a/Misc/NEWS.d/next/Core and Builtins/2018-08-28-23-01-14.bpo-34485.dq1Kqk.rst b/Misc/NEWS.d/next/Core and Builtins/2018-08-28-23-01-14.bpo-34485.dq1Kqk.rst new file mode 100644 index 00000000000..5ca373aeab6 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2018-08-28-23-01-14.bpo-34485.dq1Kqk.rst @@ -0,0 +1,3 @@ +Fix the error handler of standard streams like sys.stdout: +PYTHONIOENCODING=":" is now ignored instead of setting the error handler to +"strict". 
diff --git a/Modules/_localemodule.c b/Modules/_localemodule.c index 524886d4660..3fdbc5ea812 100644 --- a/Modules/_localemodule.c +++ b/Modules/_localemodule.c @@ -319,7 +319,7 @@ exit: static PyObject* PyLocale_getdefaultlocale(PyObject* self, PyObject *Py_UNUSED(ignored)) { - char encoding[100]; + char encoding[20]; char locale[100]; PyOS_snprintf(encoding, sizeof(encoding), "cp%d", GetACP()); diff --git a/Programs/_testembed.c b/Programs/_testembed.c index f1d30f6c54d..d0c00cfc6cd 100644 --- a/Programs/_testembed.c +++ b/Programs/_testembed.c @@ -113,9 +113,9 @@ static int test_forced_io_encoding(void) printf("--- Set errors only ---\n"); check_stdio_details(NULL, "ignore"); printf("--- Set encoding only ---\n"); - check_stdio_details("latin-1", NULL); + check_stdio_details("iso8859-1", NULL); printf("--- Set encoding and errors ---\n"); - check_stdio_details("latin-1", "replace"); + check_stdio_details("iso8859-1", "replace"); /* Check calling after initialization fails */ Py_Initialize(); diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index cc64cf956d2..29711dfc982 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -244,22 +244,26 @@ error: return NULL; } -static char* -get_locale_encoding(void) +static _PyInitError +get_locale_encoding(char **locale_encoding) { -#if defined(HAVE_LANGINFO_H) && defined(CODESET) - char* codeset = nl_langinfo(CODESET); - if (!codeset || codeset[0] == '\0') { - PyErr_SetString(PyExc_ValueError, "CODESET is not set or empty"); - return NULL; - } - return get_codec_name(codeset); +#ifdef MS_WINDOWS + char encoding[20]; + PyOS_snprintf(encoding, sizeof(encoding), "cp%d", GetACP()); #elif defined(__ANDROID__) - return get_codec_name("UTF-8"); + const char *encoding = "UTF-8"; #else - PyErr_SetNone(PyExc_NotImplementedError); - return NULL; + const char *encoding = nl_langinfo(CODESET); + if (!encoding || encoding[0] == '\0') { + return _Py_INIT_USER_ERR("failed to get the locale encoding: " + 
"nl_langinfo(CODESET) failed"); + } #endif + *locale_encoding = _PyMem_RawStrdup(encoding); + if (*locale_encoding == NULL) { + return _Py_INIT_NO_MEMORY(); + } + return _Py_INIT_OK(); } static _PyInitError @@ -397,7 +401,7 @@ static _LocaleCoercionTarget _TARGET_LOCALES[] = { }; static const char * -get_default_standard_stream_error_handler(void) +get_stdio_errors(void) { const char *ctype_loc = setlocale(LC_CTYPE, NULL); if (ctype_loc != NULL) { @@ -417,8 +421,7 @@ get_default_standard_stream_error_handler(void) #endif } - /* Otherwise return NULL to request the typical default error handler */ - return NULL; + return "strict"; } #ifdef PY_COERCE_C_LOCALE @@ -1586,9 +1589,17 @@ initfsencoding(PyInterpreterState *interp) Py_HasFileSystemDefaultEncoding = 1; } else { - Py_FileSystemDefaultEncoding = get_locale_encoding(); + char *locale_encoding; + _PyInitError err = get_locale_encoding(&locale_encoding); + if (_Py_INIT_FAILED(err)) { + return err; + } + + Py_FileSystemDefaultEncoding = get_codec_name(locale_encoding); + PyMem_RawFree(locale_encoding); if (Py_FileSystemDefaultEncoding == NULL) { - return _Py_INIT_ERR("Unable to get the locale encoding"); + return _Py_INIT_ERR("failed to get the Python codec " + "of the locale encoding"); } Py_HasFileSystemDefaultEncoding = 0; @@ -1787,6 +1798,8 @@ init_sys_streams(PyInterpreterState *interp) PyObject * encoding_attr; char *pythonioencoding = NULL; const char *encoding, *errors; + char *locale_encoding = NULL; + char *codec_name = NULL; _PyInitError res = _Py_INIT_OK(); /* Hack to avoid a nasty recursion issue when Python is invoked @@ -1838,21 +1851,46 @@ init_sys_streams(PyInterpreterState *interp) errors = err; } } - if (*pythonioencoding && !encoding) { + if (!encoding && *pythonioencoding) { encoding = pythonioencoding; + if (!errors) { + errors = "strict"; + } } } - else if (interp->core_config.utf8_mode) { - encoding = "utf-8"; - errors = "surrogateescape"; + + if (interp->core_config.utf8_mode) { + if 
(!encoding) { + encoding = "utf-8"; + } + if (!errors) { + errors = "surrogateescape"; + } } - if (!errors && !pythonioencoding) { + if (!errors) { /* Choose the default error handler based on the current locale */ - errors = get_default_standard_stream_error_handler(); + errors = get_stdio_errors(); } } + if (encoding == NULL) { + _PyInitError err = get_locale_encoding(&locale_encoding); + if (_Py_INIT_FAILED(err)) { + return err; + } + encoding = locale_encoding; + } + + codec_name = get_codec_name(encoding); + if (codec_name == NULL) { + PyErr_SetString(PyExc_RuntimeError, + "failed to get the Python codec name " + "of stdio encoding"); + goto error; + } + encoding = codec_name; + /* Set sys.stdin */ fd = fileno(stdin); /* Under some conditions stdin, stdout and stderr may not be connected @@ -1928,6 +1966,8 @@ done: PyMem_SetAllocator(PYMEM_DOMAIN_RAW, &old_alloc); + PyMem_RawFree(locale_encoding); + PyMem_RawFree(codec_name); PyMem_Free(pythonioencoding); Py_XDECREF(bimod); Py_XDECREF(iomod);