bpo-42236: Enhance _locale._get_locale_encoding() (GH-23083)

* Rename _Py_GetLocaleEncoding() to _Py_GetLocaleEncodingObject()
* Add _Py_GetLocaleEncoding() which returns a wchar_t* string to
  share code between _Py_GetLocaleEncodingObject()
  and config_get_locale_encoding().
* _Py_GetLocaleEncodingObject() now decodes nl_langinfo(CODESET)
  from the current locale encoding with surrogateescape,
  rather than using UTF-8.
This commit is contained in:
Victor Stinner 2020-11-01 20:59:35 +01:00 committed by GitHub
parent 1f7dfb277e
commit 82458b6cdb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 76 additions and 52 deletions

View File

@ -50,7 +50,8 @@ PyAPI_FUNC(int) _Py_GetLocaleconvNumeric(
PyAPI_FUNC(void) _Py_closerange(int first, int last); PyAPI_FUNC(void) _Py_closerange(int first, int last);
PyAPI_FUNC(PyObject*) _Py_GetLocaleEncoding(void); PyAPI_FUNC(wchar_t*) _Py_GetLocaleEncoding(const char **errmsg);
PyAPI_FUNC(PyObject*) _Py_GetLocaleEncodingObject(void);
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -1155,7 +1155,7 @@ _io_TextIOWrapper___init___impl(textio *self, PyObject *buffer,
} }
} }
if (encoding == NULL && self->encoding == NULL) { if (encoding == NULL && self->encoding == NULL) {
self->encoding = _Py_GetLocaleEncoding(); self->encoding = _Py_GetLocaleEncodingObject();
if (self->encoding == NULL) { if (self->encoding == NULL) {
goto error; goto error;
} }

View File

@ -783,7 +783,7 @@ static PyObject *
_locale__get_locale_encoding_impl(PyObject *module) _locale__get_locale_encoding_impl(PyObject *module)
/*[clinic end generated code: output=e8e2f6f6f184591a input=513d9961d2f45c76]*/ /*[clinic end generated code: output=e8e2f6f6f184591a input=513d9961d2f45c76]*/
{ {
return _Py_GetLocaleEncoding(); return _Py_GetLocaleEncodingObject();
} }

View File

@ -821,23 +821,41 @@ _Py_EncodeLocaleEx(const wchar_t *text, char **str,
} }
// Get the current locale encoding: locale.getpreferredencoding(False). // Get the current locale encoding name:
//
// - Return "UTF-8" if _Py_FORCE_UTF8_LOCALE macro is defined (ex: on Android)
// - Return "UTF-8" if the UTF-8 Mode is enabled
// - On Windows, return the ANSI code page (ex: "cp1250")
// - Return "UTF-8" if nl_langinfo(CODESET) returns an empty string
// and if the _Py_FORCE_UTF8_FS_ENCODING macro is defined (ex: on macOS).
// - Otherwise, return nl_langinfo(CODESET).
//
// Return NULL and set errmsg to an error message
// if nl_langinfo(CODESET) fails.
//
// Return NULL and set errmsg to NULL on memory allocation failure.
//
// See also config_get_locale_encoding() // See also config_get_locale_encoding()
PyObject * wchar_t*
_Py_GetLocaleEncoding(void) _Py_GetLocaleEncoding(const char **errmsg)
{ {
*errmsg = NULL;
#ifdef _Py_FORCE_UTF8_LOCALE #ifdef _Py_FORCE_UTF8_LOCALE
// On Android langinfo.h and CODESET are missing, // On Android langinfo.h and CODESET are missing,
// and UTF-8 is always used in mbstowcs() and wcstombs(). // and UTF-8 is always used in mbstowcs() and wcstombs().
return PyUnicode_FromString("UTF-8"); return _PyMem_RawWcsdup(L"UTF-8");
#else #else
const PyPreConfig *preconfig = &_PyRuntime.preconfig; const PyPreConfig *preconfig = &_PyRuntime.preconfig;
if (preconfig->utf8_mode) { if (preconfig->utf8_mode) {
return PyUnicode_FromString("UTF-8"); return _PyMem_RawWcsdup(L"UTF-8");
} }
#if defined(MS_WINDOWS) #ifdef MS_WINDOWS
return PyUnicode_FromFormat("cp%u", GetACP()); wchar_t encoding[23];
unsigned int ansi_codepage = GetACP();
swprintf(encoding, Py_ARRAY_LENGTH(encoding), L"cp%u", ansi_codepage);
encoding[Py_ARRAY_LENGTH(encoding) - 1] = 0;
return _PyMem_RawWcsdup(encoding);
#else #else
const char *encoding = nl_langinfo(CODESET); const char *encoding = nl_langinfo(CODESET);
if (!encoding || encoding[0] == '\0') { if (!encoding || encoding[0] == '\0') {
@ -845,19 +863,45 @@ _Py_GetLocaleEncoding(void)
// nl_langinfo() can return an empty string when the LC_CTYPE locale is // nl_langinfo() can return an empty string when the LC_CTYPE locale is
// not supported. Default to UTF-8 in that case, because UTF-8 is the // not supported. Default to UTF-8 in that case, because UTF-8 is the
// default charset on macOS. // default charset on macOS.
encoding = "UTF-8"; return _PyMem_RawWcsdup(L"UTF-8");
#else #else
PyErr_SetString(PyExc_ValueError, *errmsg = "failed to get the locale encoding: "
"failed to get the locale encoding: " "nl_langinfo(CODESET) returns an empty string";
"nl_langinfo(CODESET) returns an empty string");
return NULL; return NULL;
#endif #endif
} }
// Decode from UTF-8
return PyUnicode_FromString(encoding);
#endif // !CODESET
#endif wchar_t *wstr;
int res = decode_current_locale(encoding, &wstr, NULL,
errmsg, _Py_ERROR_SURROGATEESCAPE);
if (res < 0) {
return NULL;
}
return wstr;
#endif // !MS_WINDOWS
#endif // !_Py_FORCE_UTF8_LOCALE
}
// Get the current locale encoding name as a Python str object.
//
// Thin wrapper around _Py_GetLocaleEncoding(): the wchar_t* string it returns
// is converted to a str object with PyUnicode_FromWideChar().
//
// Return NULL with an exception set on error:
// - ValueError carrying errmsg if _Py_GetLocaleEncoding() reported a message;
// - MemoryError (PyErr_NoMemory()) if it returned NULL without a message.
PyObject *
_Py_GetLocaleEncodingObject(void)
{
const char *errmsg;
wchar_t *encoding = _Py_GetLocaleEncoding(&errmsg);
if (encoding == NULL) {
if (errmsg != NULL) {
PyErr_SetString(PyExc_ValueError, errmsg);
}
else {
// NULL result with a NULL errmsg is the memory allocation failure case
// documented on _Py_GetLocaleEncoding().
PyErr_NoMemory();
}
return NULL;
}
// Length -1: the length is computed from the null-terminated wide string.
PyObject *str = PyUnicode_FromWideChar(encoding, -1);
// The wide string is released here on both paths: str is returned as-is,
// even if the conversion above failed and str is NULL.
PyMem_RawFree(encoding);
return str;
} }

View File

@ -11,11 +11,7 @@
#include "osdefs.h" // DELIM #include "osdefs.h" // DELIM
#include <locale.h> // setlocale() #include <locale.h> // setlocale()
#ifdef HAVE_LANGINFO_H
# include <langinfo.h> // nl_langinfo(CODESET)
#endif
#if defined(MS_WINDOWS) || defined(__CYGWIN__) #if defined(MS_WINDOWS) || defined(__CYGWIN__)
# include <windows.h> // GetACP()
# ifdef HAVE_IO_H # ifdef HAVE_IO_H
# include <io.h> # include <io.h>
# endif # endif
@ -1497,41 +1493,24 @@ config_get_stdio_errors(const PyPreConfig *preconfig)
} }
// See also _Py_GetLocaleEncoding() and config_get_fs_encoding() // See also config_get_fs_encoding()
static PyStatus static PyStatus
config_get_locale_encoding(PyConfig *config, const PyPreConfig *preconfig, config_get_locale_encoding(PyConfig *config, const PyPreConfig *preconfig,
wchar_t **locale_encoding) wchar_t **locale_encoding)
{ {
#ifdef _Py_FORCE_UTF8_LOCALE const char *errmsg;
return PyConfig_SetString(config, locale_encoding, L"utf-8"); wchar_t *encoding = _Py_GetLocaleEncoding(&errmsg);
#else if (encoding == NULL) {
if (preconfig->utf8_mode) { if (errmsg != NULL) {
return PyConfig_SetString(config, locale_encoding, L"utf-8"); return _PyStatus_ERR(errmsg);
}
else {
return _PyStatus_NO_MEMORY();
}
} }
PyStatus status = PyConfig_SetString(config, locale_encoding, encoding);
#ifdef MS_WINDOWS PyMem_RawFree(encoding);
char encoding[20]; return status;
PyOS_snprintf(encoding, sizeof(encoding), "cp%u", GetACP());
return PyConfig_SetBytesString(config, locale_encoding, encoding);
#else
const char *encoding = nl_langinfo(CODESET);
if (!encoding || encoding[0] == '\0') {
#ifdef _Py_FORCE_UTF8_FS_ENCODING
// nl_langinfo() can return an empty string when the LC_CTYPE locale is
// not supported. Default to UTF-8 in that case, because UTF-8 is the
// default charset on macOS.
encoding = "UTF-8";
#else
return _PyStatus_ERR("failed to get the locale encoding: "
"nl_langinfo(CODESET) returns an empty string");
#endif
}
/* nl_langinfo(CODESET) is decoded by Py_DecodeLocale() */
return CONFIG_SET_BYTES_STR(config,
locale_encoding, encoding,
"nl_langinfo(CODESET)");
#endif // !MS_WINDOWS
#endif // !_Py_FORCE_UTF8_LOCALE
} }