From 710e82630775774dceba5e8f24b1b10e6dfaf9b7 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Sat, 31 Oct 2020 01:02:09 +0100 Subject: [PATCH] bpo-42208: Add _Py_GetLocaleEncoding() (GH-23050) _io.TextIOWrapper no longer calls getpreferredencoding(False) of _bootlocale to get the locale encoding, but calls _Py_GetLocaleEncoding() instead. Add config_get_fs_encoding() sub-function. Reorganize also config_get_locale_encoding() code. --- Include/internal/pycore_fileutils.h | 2 + Modules/_io/_iomodule.c | 25 ------ Modules/_io/_iomodule.h | 1 - Modules/_io/textio.c | 26 +----- Python/fileutils.c | 43 +++++++++- Python/initconfig.c | 125 ++++++++++++++-------------- 6 files changed, 112 insertions(+), 110 deletions(-) diff --git a/Include/internal/pycore_fileutils.h b/Include/internal/pycore_fileutils.h index 9cb5fc66ee2..ff7bc4874c7 100644 --- a/Include/internal/pycore_fileutils.h +++ b/Include/internal/pycore_fileutils.h @@ -50,6 +50,8 @@ PyAPI_FUNC(int) _Py_GetLocaleconvNumeric( PyAPI_FUNC(void) _Py_closerange(int first, int last); +PyAPI_FUNC(PyObject*) _Py_GetLocaleEncoding(void); + #ifdef __cplusplus } #endif diff --git a/Modules/_io/_iomodule.c b/Modules/_io/_iomodule.c index e430352a48e..9147648b243 100644 --- a/Modules/_io/_iomodule.c +++ b/Modules/_io/_iomodule.c @@ -593,31 +593,6 @@ _PyIO_get_module_state(void) return state; } -PyObject * -_PyIO_get_locale_module(_PyIO_State *state) -{ - PyObject *mod; - if (state->locale_module != NULL) { - assert(PyWeakref_CheckRef(state->locale_module)); - mod = PyWeakref_GET_OBJECT(state->locale_module); - if (mod != Py_None) { - Py_INCREF(mod); - return mod; - } - Py_CLEAR(state->locale_module); - } - mod = PyImport_ImportModule("_bootlocale"); - if (mod == NULL) - return NULL; - state->locale_module = PyWeakref_NewRef(mod, NULL); - if (state->locale_module == NULL) { - Py_DECREF(mod); - return NULL; - } - return mod; -} - - static int iomodule_traverse(PyObject *mod, visitproc visit, void *arg) { _PyIO_State *state 
= get_io_state(mod); diff --git a/Modules/_io/_iomodule.h b/Modules/_io/_iomodule.h index a8f3951e57f..638797fd357 100644 --- a/Modules/_io/_iomodule.h +++ b/Modules/_io/_iomodule.h @@ -150,7 +150,6 @@ typedef struct { #define IO_STATE() _PyIO_get_module_state() extern _PyIO_State *_PyIO_get_module_state(void); -extern PyObject *_PyIO_get_locale_module(_PyIO_State *); #ifdef MS_WINDOWS extern char _PyIO_get_console_type(PyObject *); diff --git a/Modules/_io/textio.c b/Modules/_io/textio.c index 699b7e94c93..2078bb316b2 100644 --- a/Modules/_io/textio.c +++ b/Modules/_io/textio.c @@ -10,6 +10,7 @@ #include "Python.h" #include "pycore_interp.h" // PyInterpreterState.fs_codec #include "pycore_long.h" // _PyLong_GetZero() +#include "pycore_fileutils.h" // _Py_GetLocaleEncoding() #include "pycore_object.h" #include "pycore_pystate.h" // _PyInterpreterState_GET() #include "structmember.h" // PyMemberDef @@ -27,7 +28,6 @@ _Py_IDENTIFIER(_dealloc_warn); _Py_IDENTIFIER(decode); _Py_IDENTIFIER(fileno); _Py_IDENTIFIER(flush); -_Py_IDENTIFIER(getpreferredencoding); _Py_IDENTIFIER(isatty); _Py_IDENTIFIER(mode); _Py_IDENTIFIER(name); @@ -1155,29 +1155,11 @@ _io_TextIOWrapper___init___impl(textio *self, PyObject *buffer, } } if (encoding == NULL && self->encoding == NULL) { - PyObject *locale_module = _PyIO_get_locale_module(state); - if (locale_module == NULL) - goto catch_ImportError; - self->encoding = _PyObject_CallMethodIdOneArg( - locale_module, &PyId_getpreferredencoding, Py_False); - Py_DECREF(locale_module); + self->encoding = _Py_GetLocaleEncoding(); if (self->encoding == NULL) { - catch_ImportError: - /* - Importing locale can raise an ImportError because of - _functools, and locale.getpreferredencoding can raise an - ImportError if _locale is not available. These will happen - during module building. 
- */ - if (PyErr_ExceptionMatches(PyExc_ImportError)) { - PyErr_Clear(); - self->encoding = PyUnicode_FromString("ascii"); - } - else - goto error; + goto error; } - else if (!PyUnicode_Check(self->encoding)) - Py_CLEAR(self->encoding); + assert(PyUnicode_Check(self->encoding)); } if (self->encoding != NULL) { encoding = PyUnicode_AsUTF8(self->encoding); diff --git a/Python/fileutils.c b/Python/fileutils.c index e125ba46c21..ba2690429f3 100644 --- a/Python/fileutils.c +++ b/Python/fileutils.c @@ -1,5 +1,6 @@ #include "Python.h" -#include "pycore_fileutils.h" +#include "pycore_fileutils.h" // fileutils definitions +#include "pycore_runtime.h" // _PyRuntime #include "osdefs.h" // SEP #include <locale.h> @@ -820,6 +821,46 @@ _Py_EncodeLocaleEx(const wchar_t *text, char **str, } +// Get the current locale encoding: locale.getpreferredencoding(False). +// See also config_get_locale_encoding() +PyObject * +_Py_GetLocaleEncoding(void) +{ +#ifdef _Py_FORCE_UTF8_LOCALE + // On Android langinfo.h and CODESET are missing, + // and UTF-8 is always used in mbstowcs() and wcstombs(). + return PyUnicode_FromString("UTF-8"); +#else + const PyPreConfig *preconfig = &_PyRuntime.preconfig; + if (preconfig->utf8_mode) { + return PyUnicode_FromString("UTF-8"); + } + +#if defined(MS_WINDOWS) + return PyUnicode_FromFormat("cp%u", GetACP()); +#else + const char *encoding = nl_langinfo(CODESET); + if (!encoding || encoding[0] == '\0') { +#ifdef _Py_FORCE_UTF8_FS_ENCODING + // nl_langinfo() can return an empty string when the LC_CTYPE locale is + // not supported. Default to UTF-8 in that case, because UTF-8 is the + // default charset on macOS.
+ encoding = "UTF-8"; +#else + PyErr_SetString(PyExc_ValueError, + "failed to get the locale encoding: " + "nl_langinfo(CODESET) returns an empty string"); + return NULL; +#endif + } + // Decode from UTF-8 + return PyUnicode_FromString(encoding); +#endif // !CODESET + +#endif +} + + #ifdef MS_WINDOWS static __int64 secs_between_epochs = 11644473600; /* Seconds between 1.1.1601 and 1.1.1970 */ diff --git a/Python/initconfig.c b/Python/initconfig.c index 6a13dc52ed7..e129278d8f8 100644 --- a/Python/initconfig.c +++ b/Python/initconfig.c @@ -766,7 +766,7 @@ config_set_bytes_string(PyConfig *config, wchar_t **config_str, configured. */ PyStatus PyConfig_SetBytesString(PyConfig *config, wchar_t **config_str, - const char *str) + const char *str) { return CONFIG_SET_BYTES_STR(config, config_str, str, "string"); } @@ -1466,8 +1466,13 @@ config_read_complex_options(PyConfig *config) static const wchar_t * -config_get_stdio_errors(void) +config_get_stdio_errors(const PyPreConfig *preconfig) { + if (preconfig->utf8_mode) { + /* UTF-8 Mode uses UTF-8/surrogateescape */ + return L"surrogateescape"; + } + #ifndef MS_WINDOWS const char *loc = setlocale(LC_CTYPE, NULL); if (loc != NULL) { @@ -1492,26 +1497,41 @@ config_get_stdio_errors(void) } +// See also _Py_GetLocaleEncoding() and config_get_fs_encoding() static PyStatus -config_get_locale_encoding(PyConfig *config, wchar_t **locale_encoding) +config_get_locale_encoding(PyConfig *config, const PyPreConfig *preconfig, + wchar_t **locale_encoding) { +#ifdef _Py_FORCE_UTF8_LOCALE + return PyConfig_SetString(config, locale_encoding, L"utf-8"); +#else + if (preconfig->utf8_mode) { + return PyConfig_SetString(config, locale_encoding, L"utf-8"); + } + #ifdef MS_WINDOWS char encoding[20]; PyOS_snprintf(encoding, sizeof(encoding), "cp%u", GetACP()); return PyConfig_SetBytesString(config, locale_encoding, encoding); -#elif defined(_Py_FORCE_UTF8_LOCALE) - return PyConfig_SetString(config, locale_encoding, L"utf-8"); #else const char 
*encoding = nl_langinfo(CODESET); if (!encoding || encoding[0] == '\0') { +#ifdef _Py_FORCE_UTF8_FS_ENCODING + // nl_langinfo() can return an empty string when the LC_CTYPE locale is + // not supported. Default to UTF-8 in that case, because UTF-8 is the + // default charset on macOS. + encoding = "UTF-8"; +#else return _PyStatus_ERR("failed to get the locale encoding: " - "nl_langinfo(CODESET) failed"); + "nl_langinfo(CODESET) returns an empty string"); +#endif } /* nl_langinfo(CODESET) is decoded by Py_DecodeLocale() */ return CONFIG_SET_BYTES_STR(config, locale_encoding, encoding, "nl_langinfo(CODESET)"); -#endif +#endif // !MS_WINDOWS +#endif // !_Py_FORCE_UTF8_LOCALE } @@ -1596,33 +1616,16 @@ config_init_stdio_encoding(PyConfig *config, PyMem_RawFree(pythonioencoding); } - /* UTF-8 Mode uses UTF-8/surrogateescape */ - if (preconfig->utf8_mode) { - if (config->stdio_encoding == NULL) { - status = PyConfig_SetString(config, &config->stdio_encoding, - L"utf-8"); - if (_PyStatus_EXCEPTION(status)) { - return status; - } - } - if (config->stdio_errors == NULL) { - status = PyConfig_SetString(config, &config->stdio_errors, - L"surrogateescape"); - if (_PyStatus_EXCEPTION(status)) { - return status; - } - } - } - /* Choose the default error handler based on the current locale. 
*/ if (config->stdio_encoding == NULL) { - status = config_get_locale_encoding(config, &config->stdio_encoding); + status = config_get_locale_encoding(config, preconfig, + &config->stdio_encoding); if (_PyStatus_EXCEPTION(status)) { return status; } } if (config->stdio_errors == NULL) { - const wchar_t *errors = config_get_stdio_errors(); + const wchar_t *errors = config_get_stdio_errors(preconfig); assert(errors != NULL); status = PyConfig_SetString(config, &config->stdio_errors, errors); @@ -1635,46 +1638,46 @@ config_init_stdio_encoding(PyConfig *config, } +// See also config_get_locale_encoding() +static PyStatus +config_get_fs_encoding(PyConfig *config, const PyPreConfig *preconfig, + wchar_t **fs_encoding) +{ +#ifdef _Py_FORCE_UTF8_FS_ENCODING + return PyConfig_SetString(config, fs_encoding, L"utf-8"); +#elif defined(MS_WINDOWS) + const wchar_t *encoding; + if (preconfig->legacy_windows_fs_encoding) { + // Legacy Windows filesystem encoding: mbcs/replace + encoding = L"mbcs"; + } + else { + // Windows defaults to utf-8/surrogatepass (PEP 529) + encoding = L"utf-8"; + } + return PyConfig_SetString(config, fs_encoding, encoding); +#else // !MS_WINDOWS + if (preconfig->utf8_mode) { + return PyConfig_SetString(config, fs_encoding, L"utf-8"); + } + else if (_Py_GetForceASCII()) { + return PyConfig_SetString(config, fs_encoding, L"ascii"); + } + else { + return config_get_locale_encoding(config, preconfig, fs_encoding); + } +#endif // !MS_WINDOWS +} + + static PyStatus config_init_fs_encoding(PyConfig *config, const PyPreConfig *preconfig) { PyStatus status; if (config->filesystem_encoding == NULL) { -#ifdef _Py_FORCE_UTF8_FS_ENCODING - status = PyConfig_SetString(config, &config->filesystem_encoding, L"utf-8"); -#else - -#ifdef MS_WINDOWS - if (preconfig->legacy_windows_fs_encoding) { - /* Legacy Windows filesystem encoding: mbcs/replace */ - status = PyConfig_SetString(config, &config->filesystem_encoding, - L"mbcs"); - } - else -#endif - if 
(preconfig->utf8_mode) { - status = PyConfig_SetString(config, &config->filesystem_encoding, - L"utf-8"); - } -#ifndef MS_WINDOWS - else if (_Py_GetForceASCII()) { - status = PyConfig_SetString(config, &config->filesystem_encoding, - L"ascii"); - } -#endif - else { -#ifdef MS_WINDOWS - /* Windows defaults to utf-8/surrogatepass (PEP 529). */ - status = PyConfig_SetString(config, &config->filesystem_encoding, - L"utf-8"); -#else - status = config_get_locale_encoding(config, - &config->filesystem_encoding); -#endif - } -#endif /* !_Py_FORCE_UTF8_FS_ENCODING */ - + status = config_get_fs_encoding(config, preconfig, + &config->filesystem_encoding); if (_PyStatus_EXCEPTION(status)) { return status; }