bpo-42236: Enhance _locale._get_locale_encoding() (GH-23083)
* Rename _Py_GetLocaleEncoding() to _Py_GetLocaleEncodingObject().
* Add _Py_GetLocaleEncoding(), which returns a wchar_t* string, to share code between _Py_GetLocaleEncodingObject() and config_get_locale_encoding().
* _Py_GetLocaleEncodingObject() now decodes nl_langinfo(CODESET) from the current locale encoding with the surrogateescape error handler, rather than decoding it from UTF-8.
This commit is contained in:
parent
1f7dfb277e
commit
82458b6cdb
|
@ -50,7 +50,8 @@ PyAPI_FUNC(int) _Py_GetLocaleconvNumeric(
|
|||
|
||||
PyAPI_FUNC(void) _Py_closerange(int first, int last);
|
||||
|
||||
PyAPI_FUNC(PyObject*) _Py_GetLocaleEncoding(void);
|
||||
PyAPI_FUNC(wchar_t*) _Py_GetLocaleEncoding(const char **errmsg);
|
||||
PyAPI_FUNC(PyObject*) _Py_GetLocaleEncodingObject(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
@ -1155,7 +1155,7 @@ _io_TextIOWrapper___init___impl(textio *self, PyObject *buffer,
|
|||
}
|
||||
}
|
||||
if (encoding == NULL && self->encoding == NULL) {
|
||||
self->encoding = _Py_GetLocaleEncoding();
|
||||
self->encoding = _Py_GetLocaleEncodingObject();
|
||||
if (self->encoding == NULL) {
|
||||
goto error;
|
||||
}
|
||||
|
|
|
@ -783,7 +783,7 @@ static PyObject *
|
|||
_locale__get_locale_encoding_impl(PyObject *module)
|
||||
/*[clinic end generated code: output=e8e2f6f6f184591a input=513d9961d2f45c76]*/
|
||||
{
|
||||
return _Py_GetLocaleEncoding();
|
||||
return _Py_GetLocaleEncodingObject();
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -821,23 +821,41 @@ _Py_EncodeLocaleEx(const wchar_t *text, char **str,
|
|||
}
|
||||
|
||||
|
||||
// Get the current locale encoding: locale.getpreferredencoding(False).
|
||||
// Get the current locale encoding name:
|
||||
//
|
||||
// - Return "UTF-8" if _Py_FORCE_UTF8_LOCALE macro is defined (ex: on Android)
|
||||
// - Return "UTF-8" if the UTF-8 Mode is enabled
|
||||
// - On Windows, return the ANSI code page (ex: "cp1250")
|
||||
// - Return "UTF-8" if nl_langinfo(CODESET) returns an empty string
|
||||
// and if the _Py_FORCE_UTF8_FS_ENCODING macro is defined (ex: on macOS).
|
||||
// - Otherwise, return nl_langinfo(CODESET).
|
||||
//
|
||||
// Return NULL and set errmsg to an error message
|
||||
// if nl_langinfo(CODESET) fails.
|
||||
//
|
||||
// Return NULL and set errmsg to NULL on memory allocation failure.
|
||||
//
|
||||
// See also config_get_locale_encoding()
|
||||
PyObject *
|
||||
_Py_GetLocaleEncoding(void)
|
||||
wchar_t*
|
||||
_Py_GetLocaleEncoding(const char **errmsg)
|
||||
{
|
||||
*errmsg = NULL;
|
||||
#ifdef _Py_FORCE_UTF8_LOCALE
|
||||
// On Android langinfo.h and CODESET are missing,
|
||||
// and UTF-8 is always used in mbstowcs() and wcstombs().
|
||||
return PyUnicode_FromString("UTF-8");
|
||||
return _PyMem_RawWcsdup(L"UTF-8");
|
||||
#else
|
||||
const PyPreConfig *preconfig = &_PyRuntime.preconfig;
|
||||
if (preconfig->utf8_mode) {
|
||||
return PyUnicode_FromString("UTF-8");
|
||||
return _PyMem_RawWcsdup(L"UTF-8");
|
||||
}
|
||||
|
||||
#if defined(MS_WINDOWS)
|
||||
return PyUnicode_FromFormat("cp%u", GetACP());
|
||||
#ifdef MS_WINDOWS
|
||||
wchar_t encoding[23];
|
||||
unsigned int ansi_codepage = GetACP();
|
||||
swprintf(encoding, Py_ARRAY_LENGTH(encoding), L"cp%u", ansi_codepage);
|
||||
encoding[Py_ARRAY_LENGTH(encoding) - 1] = 0;
|
||||
return _PyMem_RawWcsdup(encoding);
|
||||
#else
|
||||
const char *encoding = nl_langinfo(CODESET);
|
||||
if (!encoding || encoding[0] == '\0') {
|
||||
|
@ -845,19 +863,45 @@ _Py_GetLocaleEncoding(void)
|
|||
// nl_langinfo() can return an empty string when the LC_CTYPE locale is
|
||||
// not supported. Default to UTF-8 in that case, because UTF-8 is the
|
||||
// default charset on macOS.
|
||||
encoding = "UTF-8";
|
||||
return _PyMem_RawWcsdup(L"UTF-8");
|
||||
#else
|
||||
PyErr_SetString(PyExc_ValueError,
|
||||
"failed to get the locale encoding: "
|
||||
"nl_langinfo(CODESET) returns an empty string");
|
||||
*errmsg = "failed to get the locale encoding: "
|
||||
"nl_langinfo(CODESET) returns an empty string";
|
||||
return NULL;
|
||||
#endif
|
||||
}
|
||||
// Decode from UTF-8
|
||||
return PyUnicode_FromString(encoding);
|
||||
#endif // !CODESET
|
||||
|
||||
#endif
|
||||
wchar_t *wstr;
|
||||
int res = decode_current_locale(encoding, &wstr, NULL,
|
||||
errmsg, _Py_ERROR_SURROGATEESCAPE);
|
||||
if (res < 0) {
|
||||
return NULL;
|
||||
}
|
||||
return wstr;
|
||||
#endif // !MS_WINDOWS
|
||||
|
||||
#endif // !_Py_FORCE_UTF8_LOCALE
|
||||
}
|
||||
|
||||
|
||||
PyObject *
|
||||
_Py_GetLocaleEncodingObject(void)
|
||||
{
|
||||
const char *errmsg;
|
||||
wchar_t *encoding = _Py_GetLocaleEncoding(&errmsg);
|
||||
if (encoding == NULL) {
|
||||
if (errmsg != NULL) {
|
||||
PyErr_SetString(PyExc_ValueError, errmsg);
|
||||
}
|
||||
else {
|
||||
PyErr_NoMemory();
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
PyObject *str = PyUnicode_FromWideChar(encoding, -1);
|
||||
PyMem_RawFree(encoding);
|
||||
return str;
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -11,11 +11,7 @@
|
|||
|
||||
#include "osdefs.h" // DELIM
|
||||
#include <locale.h> // setlocale()
|
||||
#ifdef HAVE_LANGINFO_H
|
||||
# include <langinfo.h> // nl_langinfo(CODESET)
|
||||
#endif
|
||||
#if defined(MS_WINDOWS) || defined(__CYGWIN__)
|
||||
# include <windows.h> // GetACP()
|
||||
# ifdef HAVE_IO_H
|
||||
# include <io.h>
|
||||
# endif
|
||||
|
@ -1497,41 +1493,24 @@ config_get_stdio_errors(const PyPreConfig *preconfig)
|
|||
}
|
||||
|
||||
|
||||
// See also _Py_GetLocaleEncoding() and config_get_fs_encoding()
|
||||
// See also config_get_fs_encoding()
|
||||
static PyStatus
|
||||
config_get_locale_encoding(PyConfig *config, const PyPreConfig *preconfig,
|
||||
wchar_t **locale_encoding)
|
||||
{
|
||||
#ifdef _Py_FORCE_UTF8_LOCALE
|
||||
return PyConfig_SetString(config, locale_encoding, L"utf-8");
|
||||
#else
|
||||
if (preconfig->utf8_mode) {
|
||||
return PyConfig_SetString(config, locale_encoding, L"utf-8");
|
||||
const char *errmsg;
|
||||
wchar_t *encoding = _Py_GetLocaleEncoding(&errmsg);
|
||||
if (encoding == NULL) {
|
||||
if (errmsg != NULL) {
|
||||
return _PyStatus_ERR(errmsg);
|
||||
}
|
||||
else {
|
||||
return _PyStatus_NO_MEMORY();
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef MS_WINDOWS
|
||||
char encoding[20];
|
||||
PyOS_snprintf(encoding, sizeof(encoding), "cp%u", GetACP());
|
||||
return PyConfig_SetBytesString(config, locale_encoding, encoding);
|
||||
#else
|
||||
const char *encoding = nl_langinfo(CODESET);
|
||||
if (!encoding || encoding[0] == '\0') {
|
||||
#ifdef _Py_FORCE_UTF8_FS_ENCODING
|
||||
// nl_langinfo() can return an empty string when the LC_CTYPE locale is
|
||||
// not supported. Default to UTF-8 in that case, because UTF-8 is the
|
||||
// default charset on macOS.
|
||||
encoding = "UTF-8";
|
||||
#else
|
||||
return _PyStatus_ERR("failed to get the locale encoding: "
|
||||
"nl_langinfo(CODESET) returns an empty string");
|
||||
#endif
|
||||
}
|
||||
/* nl_langinfo(CODESET) is decoded by Py_DecodeLocale() */
|
||||
return CONFIG_SET_BYTES_STR(config,
|
||||
locale_encoding, encoding,
|
||||
"nl_langinfo(CODESET)");
|
||||
#endif // !MS_WINDOWS
|
||||
#endif // !_Py_FORCE_UTF8_LOCALE
|
||||
PyStatus status = PyConfig_SetString(config, locale_encoding, encoding);
|
||||
PyMem_RawFree(encoding);
|
||||
return status;
|
||||
}
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue