[3.6] bpo-32555: Fix locale encodings (#5193)

On FreeBSD and Solaris, os.strerror() now always decodes the byte
string from the current locale encoding, rather than using
ASCII/surrogateescape in some cases.

Changes:

* Add _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx() which have an
  additional current_locale parameter.
* PyUnicode_DecodeLocale(), PyUnicode_DecodeLocaleAndSize() and
  PyUnicode_EncodeLocale() now always use the current locale
  encoding, instead of using Py_DecodeLocale()/Py_EncodeLocale().
* Document the encoding in the Py_DecodeLocale() and Py_EncodeLocale()
  documentation.
* Add USE_FORCE_ASCII define to not define
  decode_ascii_surrogateescape() on Android.
This commit is contained in:
Victor Stinner 2018-01-15 23:43:24 +01:00 committed by GitHub
parent 5f959c4f9e
commit b92c159efa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 227 additions and 105 deletions

View File

@ -66,9 +66,18 @@ Operating System Utilities
surrogate character, escape the bytes using the surrogateescape error surrogate character, escape the bytes using the surrogateescape error
handler instead of decoding them. handler instead of decoding them.
Encoding, highest priority to lowest priority:
* ``UTF-8`` on macOS and Android;
* ``ASCII`` if the ``LC_CTYPE`` locale is ``"C"``,
``nl_langinfo(CODESET)`` returns the ``ASCII`` encoding (or an alias),
and :c:func:`mbstowcs` and :c:func:`wcstombs` functions use the
``ISO-8859-1`` encoding.
* the current locale encoding (``LC_CTYPE`` locale).
Return a pointer to a newly allocated wide character string, use Return a pointer to a newly allocated wide character string, use
:c:func:`PyMem_RawFree` to free the memory. If size is not ``NULL``, write :c:func:`PyMem_RawFree` to free the memory. If size is not ``NULL``, write
the number of wide characters excluding the null character into ``*size`` the number of wide characters excluding the null character into ``*size``.
Return ``NULL`` on decoding error or memory allocation error. If *size* is Return ``NULL`` on decoding error or memory allocation error. If *size* is
not ``NULL``, ``*size`` is set to ``(size_t)-1`` on memory error or set to not ``NULL``, ``*size`` is set to ``(size_t)-1`` on memory error or set to
@ -94,6 +103,15 @@ Operating System Utilities
:ref:`surrogateescape error handler <surrogateescape>`: surrogate characters :ref:`surrogateescape error handler <surrogateescape>`: surrogate characters
in the range U+DC80..U+DCFF are converted to bytes 0x80..0xFF. in the range U+DC80..U+DCFF are converted to bytes 0x80..0xFF.
Encoding, highest priority to lowest priority:
* ``UTF-8`` on macOS and Android;
* ``ASCII`` if the ``LC_CTYPE`` locale is ``"C"``,
``nl_langinfo(CODESET)`` returns the ``ASCII`` encoding (or an alias),
and :c:func:`mbstowcs` and :c:func:`wcstombs` functions use the
``ISO-8859-1`` encoding.
* the current locale encoding.
Return a pointer to a newly allocated byte string, use :c:func:`PyMem_Free` Return a pointer to a newly allocated byte string, use :c:func:`PyMem_Free`
to free the memory. Return ``NULL`` on encoding error or memory allocation to free the memory. Return ``NULL`` on encoding error or memory allocation
error error

View File

@ -773,6 +773,12 @@ system.
.. versionadded:: 3.3 .. versionadded:: 3.3
.. versionchanged:: 3.6.5
The function now also uses the current locale encoding for the
``surrogateescape`` error handler. Previously, :c:func:`Py_DecodeLocale`
was used for the ``surrogateescape``, and the current locale encoding was
used for ``strict``.
.. c:function:: PyObject* PyUnicode_DecodeLocale(const char *str, const char *errors) .. c:function:: PyObject* PyUnicode_DecodeLocale(const char *str, const char *errors)
@ -800,6 +806,12 @@ system.
.. versionadded:: 3.3 .. versionadded:: 3.3
.. versionchanged:: 3.6.5
The function now also uses the current locale encoding for the
``surrogateescape`` error handler. Previously, :c:func:`Py_EncodeLocale`
was used for the ``surrogateescape``, and the current locale encoding was
used for ``strict``.
File System Encoding File System Encoding
"""""""""""""""""""" """"""""""""""""""""

View File

@ -17,6 +17,16 @@ PyAPI_FUNC(char*) Py_EncodeLocale(
#ifndef Py_LIMITED_API #ifndef Py_LIMITED_API
PyAPI_FUNC(wchar_t *) _Py_DecodeLocaleEx(
const char *arg,
size_t *size,
int current_locale);
PyAPI_FUNC(char*) _Py_EncodeLocaleEx(
const wchar_t *text,
size_t *error_pos,
int current_locale);
PyAPI_FUNC(PyObject *) _Py_device_encoding(int); PyAPI_FUNC(PyObject *) _Py_device_encoding(int);
#ifdef MS_WINDOWS #ifdef MS_WINDOWS

View File

@ -0,0 +1,3 @@
On FreeBSD and Solaris, os.strerror() now always decodes the byte string from
the current locale encoding, rather than using ASCII/surrogateescape in some
cases.

View File

@ -3439,8 +3439,9 @@ locale_error_handler(const char *errors, int *surrogateescape)
} }
} }
PyObject * static PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) unicode_encode_locale(PyObject *unicode, const char *errors,
int current_locale)
{ {
Py_ssize_t wlen, wlen2; Py_ssize_t wlen, wlen2;
wchar_t *wstr; wchar_t *wstr;
@ -3469,7 +3470,7 @@ PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
/* "surrogateescape" error handler */ /* "surrogateescape" error handler */
char *str; char *str;
str = Py_EncodeLocale(wstr, &error_pos); str = _Py_EncodeLocaleEx(wstr, &error_pos, current_locale);
if (str == NULL) { if (str == NULL) {
if (error_pos == (size_t)-1) { if (error_pos == (size_t)-1) {
PyErr_NoMemory(); PyErr_NoMemory();
@ -3549,6 +3550,12 @@ encode_error:
return NULL; return NULL;
} }
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
return unicode_encode_locale(unicode, errors, 1);
}
PyObject * PyObject *
PyUnicode_EncodeFSDefault(PyObject *unicode) PyUnicode_EncodeFSDefault(PyObject *unicode)
{ {
@ -3571,7 +3578,8 @@ PyUnicode_EncodeFSDefault(PyObject *unicode)
Py_FileSystemDefaultEncodeErrors); Py_FileSystemDefaultEncodeErrors);
} }
else { else {
return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors); return unicode_encode_locale(unicode,
Py_FileSystemDefaultEncodeErrors, 0);
} }
#endif #endif
} }
@ -3741,9 +3749,9 @@ mbstowcs_errorpos(const char *str, size_t len)
return 0; return 0;
} }
PyObject* static PyObject*
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, unicode_decode_locale(const char *str, Py_ssize_t len,
const char *errors) const char *errors, int current_locale)
{ {
wchar_t smallbuf[256]; wchar_t smallbuf[256];
size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf); size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
@ -3766,7 +3774,7 @@ PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
if (surrogateescape) { if (surrogateescape) {
/* "surrogateescape" error handler */ /* "surrogateescape" error handler */
wstr = Py_DecodeLocale(str, &wlen); wstr = _Py_DecodeLocaleEx(str, &wlen, current_locale);
if (wstr == NULL) { if (wstr == NULL) {
if (wlen == (size_t)-1) if (wlen == (size_t)-1)
PyErr_NoMemory(); PyErr_NoMemory();
@ -3844,11 +3852,18 @@ decode_error:
return NULL; return NULL;
} }
PyObject*
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t size,
const char *errors)
{
return unicode_decode_locale(str, size, errors, 1);
}
PyObject* PyObject*
PyUnicode_DecodeLocale(const char *str, const char *errors) PyUnicode_DecodeLocale(const char *str, const char *errors)
{ {
Py_ssize_t size = (Py_ssize_t)strlen(str); Py_ssize_t size = (Py_ssize_t)strlen(str);
return PyUnicode_DecodeLocaleAndSize(str, size, errors); return unicode_decode_locale(str, size, errors, 1);
} }
@ -3880,7 +3895,8 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
Py_FileSystemDefaultEncodeErrors); Py_FileSystemDefaultEncodeErrors);
} }
else { else {
return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors); return unicode_decode_locale(s, size,
Py_FileSystemDefaultEncodeErrors, 0);
} }
#endif #endif
} }

View File

@ -70,7 +70,10 @@ _Py_device_encoding(int fd)
Py_RETURN_NONE; Py_RETURN_NONE;
} }
#if !defined(__APPLE__) && !defined(MS_WINDOWS) #if !defined(__APPLE__) && !defined(__ANDROID__) && !defined(MS_WINDOWS)
#define USE_FORCE_ASCII
extern int _Py_normalize_encoding(const char *, char *, size_t); extern int _Py_normalize_encoding(const char *, char *, size_t);
/* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale. /* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale.
@ -221,7 +224,7 @@ encode_ascii_surrogateescape(const wchar_t *text, size_t *error_pos)
} }
#endif /* !defined(__APPLE__) && !defined(MS_WINDOWS) */ #endif /* !defined(__APPLE__) && !defined(MS_WINDOWS) */
#if !defined(__APPLE__) && (!defined(MS_WINDOWS) || !defined(HAVE_MBRTOWC)) #if !defined(HAVE_MBRTOWC) || defined(USE_FORCE_ASCII)
static wchar_t* static wchar_t*
decode_ascii_surrogateescape(const char *arg, size_t *size) decode_ascii_surrogateescape(const char *arg, size_t *size)
{ {
@ -251,39 +254,9 @@ decode_ascii_surrogateescape(const char *arg, size_t *size)
#endif #endif
/* Decode a byte string from the locale encoding with the static wchar_t*
surrogateescape error handler: undecodable bytes are decoded as characters decode_current_locale(const char* arg, size_t *size)
in range U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate
character, escape the bytes using the surrogateescape error handler instead
of decoding them.
Return a pointer to a newly allocated wide character string, use
PyMem_RawFree() to free the memory. If size is not NULL, write the number of
wide characters excluding the null character into *size
Return NULL on decoding error or memory allocation error. If *size* is not
NULL, *size is set to (size_t)-1 on memory error or set to (size_t)-2 on
decoding error.
Decoding errors should never happen, unless there is a bug in the C
library.
Use the Py_EncodeLocale() function to encode the character string back to a
byte string. */
wchar_t*
Py_DecodeLocale(const char* arg, size_t *size)
{ {
#if defined(__APPLE__) || defined(__ANDROID__)
wchar_t *wstr;
wstr = _Py_DecodeUTF8_surrogateescape(arg, strlen(arg));
if (size != NULL) {
if (wstr != NULL)
*size = wcslen(wstr);
else
*size = (size_t)-1;
}
return wstr;
#else
wchar_t *res; wchar_t *res;
size_t argsize; size_t argsize;
size_t count; size_t count;
@ -293,19 +266,6 @@ Py_DecodeLocale(const char* arg, size_t *size)
mbstate_t mbs; mbstate_t mbs;
#endif #endif
#ifndef MS_WINDOWS
if (force_ascii == -1)
force_ascii = check_force_ascii();
if (force_ascii) {
/* force ASCII encoding to workaround mbstowcs() issue */
res = decode_ascii_surrogateescape(arg, size);
if (res == NULL)
goto oom;
return res;
}
#endif
#ifdef HAVE_BROKEN_MBSTOWCS #ifdef HAVE_BROKEN_MBSTOWCS
/* Some platforms have a broken implementation of /* Some platforms have a broken implementation of
* mbstowcs which does not count the characters that * mbstowcs which does not count the characters that
@ -402,72 +362,96 @@ Py_DecodeLocale(const char* arg, size_t *size)
goto oom; goto oom;
#endif /* HAVE_MBRTOWC */ #endif /* HAVE_MBRTOWC */
return res; return res;
oom: oom:
if (size != NULL) if (size != NULL)
*size = (size_t)-1; *size = (size_t)-1;
return NULL; return NULL;
}
static wchar_t*
decode_locale(const char* arg, size_t *size, int current_locale)
{
if (current_locale) {
return decode_current_locale(arg, size);
}
#if defined(__APPLE__) || defined(__ANDROID__)
wchar_t *wstr;
wstr = _Py_DecodeUTF8_surrogateescape(arg, strlen(arg));
if (size != NULL) {
if (wstr != NULL)
*size = wcslen(wstr);
else
*size = (size_t)-1;
}
return wstr;
#else
#ifdef USE_FORCE_ASCII
if (force_ascii == -1) {
force_ascii = check_force_ascii();
}
if (force_ascii) {
/* force ASCII encoding to workaround mbstowcs() issue */
wchar_t *res = decode_ascii_surrogateescape(arg, size);
if (res == NULL) {
if (size != NULL)
*size = (size_t)-1;
return NULL;
}
return res;
}
#endif
return decode_current_locale(arg, size);
#endif /* __APPLE__ or __ANDROID__ */ #endif /* __APPLE__ or __ANDROID__ */
} }
/* Encode a wide character string to the locale encoding with the
surrogateescape error handler: surrogate characters in the range
U+DC80..U+DCFF are converted to bytes 0x80..0xFF.
Return a pointer to a newly allocated byte string, use PyMem_Free() to free /* Decode a byte string from the locale encoding with the
the memory. Return NULL on encoding or memory allocation error. surrogateescape error handler: undecodable bytes are decoded as characters
in range U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate
character, escape the bytes using the surrogateescape error handler instead
of decoding them.
If error_pos is not NULL, *error_pos is set to the index of the invalid Return a pointer to a newly allocated wide character string, use
character on encoding error, or set to (size_t)-1 otherwise. PyMem_RawFree() to free the memory. If size is not NULL, write the number of
wide characters excluding the null character into *size
Use the Py_DecodeLocale() function to decode the bytes string back to a wide Return NULL on decoding error or memory allocation error. If *size* is not
character string. */ NULL, *size is set to (size_t)-1 on memory error or set to (size_t)-2 on
char* decoding error.
Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
Decoding errors should never happen, unless there is a bug in the C
library.
Use the Py_EncodeLocale() function to encode the character string back to a
byte string. */
wchar_t*
Py_DecodeLocale(const char* arg, size_t *size)
{ {
#if defined(__APPLE__) || defined(__ANDROID__) return decode_locale(arg, size, 0);
Py_ssize_t len; }
PyObject *unicode, *bytes = NULL;
char *cpath;
unicode = PyUnicode_FromWideChar(text, wcslen(text));
if (unicode == NULL)
return NULL;
bytes = _PyUnicode_AsUTF8String(unicode, "surrogateescape"); wchar_t*
Py_DECREF(unicode); _Py_DecodeLocaleEx(const char* arg, size_t *size, int current_locale)
if (bytes == NULL) { {
PyErr_Clear(); return decode_locale(arg, size, current_locale);
if (error_pos != NULL) }
*error_pos = (size_t)-1;
return NULL;
}
len = PyBytes_GET_SIZE(bytes);
cpath = PyMem_Malloc(len+1); static char*
if (cpath == NULL) { encode_current_locale(const wchar_t *text, size_t *error_pos)
PyErr_Clear(); {
Py_DECREF(bytes);
if (error_pos != NULL)
*error_pos = (size_t)-1;
return NULL;
}
memcpy(cpath, PyBytes_AsString(bytes), len + 1);
Py_DECREF(bytes);
return cpath;
#else /* __APPLE__ */
const size_t len = wcslen(text); const size_t len = wcslen(text);
char *result = NULL, *bytes = NULL; char *result = NULL, *bytes = NULL;
size_t i, size, converted; size_t i, size, converted;
wchar_t c, buf[2]; wchar_t c, buf[2];
#ifndef MS_WINDOWS
if (force_ascii == -1)
force_ascii = check_force_ascii();
if (force_ascii)
return encode_ascii_surrogateescape(text, error_pos);
#endif
/* The function works in two steps: /* The function works in two steps:
1. compute the length of the output buffer in bytes (size) 1. compute the length of the output buffer in bytes (size)
2. outputs the bytes */ 2. outputs the bytes */
@ -522,10 +506,89 @@ Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
bytes = result; bytes = result;
} }
return result; return result;
}
static char*
encode_locale(const wchar_t *text, size_t *error_pos, int current_locale)
{
if (current_locale) {
return encode_current_locale(text, error_pos);
}
#if defined(__APPLE__) || defined(__ANDROID__)
Py_ssize_t len;
PyObject *unicode, *bytes = NULL;
char *cpath;
unicode = PyUnicode_FromWideChar(text, wcslen(text));
if (unicode == NULL)
return NULL;
bytes = _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Py_DECREF(unicode);
if (bytes == NULL) {
PyErr_Clear();
if (error_pos != NULL)
*error_pos = (size_t)-1;
return NULL;
}
len = PyBytes_GET_SIZE(bytes);
cpath = PyMem_Malloc(len+1);
if (cpath == NULL) {
PyErr_Clear();
Py_DECREF(bytes);
if (error_pos != NULL)
*error_pos = (size_t)-1;
return NULL;
}
memcpy(cpath, PyBytes_AsString(bytes), len + 1);
Py_DECREF(bytes);
return cpath;
#else /* __APPLE__ */
#ifdef USE_FORCE_ASCII
if (force_ascii == -1) {
force_ascii = check_force_ascii();
}
if (force_ascii) {
return encode_ascii_surrogateescape(text, error_pos);
}
#endif
return encode_current_locale(text, error_pos);
#endif /* __APPLE__ or __ANDROID__ */ #endif /* __APPLE__ or __ANDROID__ */
} }
/* Encode a wide character string to the locale encoding with the
surrogateescape error handler: surrogate characters in the range
U+DC80..U+DCFF are converted to bytes 0x80..0xFF.
Return a pointer to a newly allocated byte string, use PyMem_Free() to free
the memory. Return NULL on encoding or memory allocation error.
If error_pos is not NULL, *error_pos is set to the index of the invalid
character on encoding error, or set to (size_t)-1 otherwise.
Use the Py_DecodeLocale() function to decode the bytes string back to a wide
character string. */
char*
Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
{
return encode_locale(text, error_pos, 0);
}
char*
_Py_EncodeLocaleEx(const wchar_t *text, size_t *error_pos, int current_locale)
{
return encode_locale(text, error_pos, current_locale);
}
#ifdef MS_WINDOWS #ifdef MS_WINDOWS
static __int64 secs_between_epochs = 11644473600; /* Seconds between 1.1.1601 and 1.1.1970 */ static __int64 secs_between_epochs = 11644473600; /* Seconds between 1.1.1601 and 1.1.1970 */