bpo-29240: Fix locale encodings in UTF-8 Mode (#5170)

Modify locale.localeconv(), time.tzname, os.strerror() and other
functions to ignore the UTF-8 Mode: always use the current locale
encoding.

Changes:

* Add _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx(). On decoding or
  encoding error, they return the position of the error and an error
  message which are used to raise Unicode errors in
  PyUnicode_DecodeLocale() and PyUnicode_EncodeLocale().
* Replace _Py_DecodeCurrentLocale() with _Py_DecodeLocaleEx().
* PyUnicode_DecodeLocale() now uses _Py_DecodeLocaleEx() for all
  cases, especially for the strict error handler.
* Add _Py_DecodeUTF8Ex(): it returns more information on decoding
  errors and supports the strict error handler.
* Rename _Py_EncodeUTF8_surrogateescape() to _Py_EncodeUTF8Ex().
* Replace _Py_EncodeCurrentLocale() with _Py_EncodeLocaleEx().
* Ignore the UTF-8 mode to encode/decode localeconv(), strerror()
  and time zone name.
* PyUnicode_DecodeLocale(), PyUnicode_DecodeLocaleAndSize() and
  PyUnicode_EncodeLocale() now ignore the UTF-8 mode: they always use
  the "current" locale.
* Remove _PyUnicode_DecodeCurrentLocale(),
  _PyUnicode_DecodeCurrentLocaleAndSize() and
  _PyUnicode_EncodeCurrentLocale().
This commit is contained in:
Victor Stinner 2018-01-15 10:45:49 +01:00 committed by GitHub
parent ee3b83547c
commit 7ed7aead95
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 484 additions and 517 deletions

View File

@ -106,6 +106,16 @@ Operating System Utilities
surrogate character, escape the bytes using the surrogateescape error surrogate character, escape the bytes using the surrogateescape error
handler instead of decoding them. handler instead of decoding them.
Encoding, highest priority to lowest priority:
* ``UTF-8`` on macOS and Android;
* ``UTF-8`` if the Python UTF-8 mode is enabled;
* ``ASCII`` if the ``LC_CTYPE`` locale is ``"C"``,
``nl_langinfo(CODESET)`` returns the ``ASCII`` encoding (or an alias),
and :c:func:`mbstowcs` and :c:func:`wcstombs` functions use the
``ISO-8859-1`` encoding.
* the current locale encoding.
Return a pointer to a newly allocated wide character string, use Return a pointer to a newly allocated wide character string, use
:c:func:`PyMem_RawFree` to free the memory. If size is not ``NULL``, write :c:func:`PyMem_RawFree` to free the memory. If size is not ``NULL``, write
the number of wide characters excluding the null character into ``*size`` the number of wide characters excluding the null character into ``*size``
@ -137,6 +147,18 @@ Operating System Utilities
:ref:`surrogateescape error handler <surrogateescape>`: surrogate characters :ref:`surrogateescape error handler <surrogateescape>`: surrogate characters
in the range U+DC80..U+DCFF are converted to bytes 0x80..0xFF. in the range U+DC80..U+DCFF are converted to bytes 0x80..0xFF.
Encoding, highest priority to lowest priority:
* ``UTF-8`` on macOS and Android;
* ``UTF-8`` if the Python UTF-8 mode is enabled;
* ``ASCII`` if the ``LC_CTYPE`` locale is ``"C"``,
``nl_langinfo(CODESET)`` returns the ``ASCII`` encoding (or an alias),
and :c:func:`mbstowcs` and :c:func:`wcstombs` functions use the
``ISO-8859-1`` encoding.
* the current locale encoding.
The function uses the UTF-8 encoding in the Python UTF-8 mode.
Return a pointer to a newly allocated byte string, use :c:func:`PyMem_Free` Return a pointer to a newly allocated byte string, use :c:func:`PyMem_Free`
to free the memory. Return ``NULL`` on encoding error or memory allocation to free the memory. Return ``NULL`` on encoding error or memory allocation
error error

View File

@ -770,12 +770,20 @@ system.
:c:data:`Py_FileSystemDefaultEncoding` (the locale encoding read at :c:data:`Py_FileSystemDefaultEncoding` (the locale encoding read at
Python startup). Python startup).
This function ignores the Python UTF-8 mode.
.. seealso:: .. seealso::
The :c:func:`Py_DecodeLocale` function. The :c:func:`Py_DecodeLocale` function.
.. versionadded:: 3.3 .. versionadded:: 3.3
.. versionchanged:: 3.7
The function now also uses the current locale encoding for the
``surrogateescape`` error handler. Previously, :c:func:`Py_DecodeLocale`
was used for the ``surrogateescape``, and the current locale encoding was
used for ``strict``.
.. c:function:: PyObject* PyUnicode_DecodeLocale(const char *str, const char *errors) .. c:function:: PyObject* PyUnicode_DecodeLocale(const char *str, const char *errors)
@ -797,12 +805,20 @@ system.
:c:data:`Py_FileSystemDefaultEncoding` (the locale encoding read at :c:data:`Py_FileSystemDefaultEncoding` (the locale encoding read at
Python startup). Python startup).
This function ignores the Python UTF-8 mode.
.. seealso:: .. seealso::
The :c:func:`Py_EncodeLocale` function. The :c:func:`Py_EncodeLocale` function.
.. versionadded:: 3.3 .. versionadded:: 3.3
.. versionchanged:: 3.7
The function now also uses the current locale encoding for the
``surrogateescape`` error handler. Previously, :c:func:`Py_EncodeLocale`
was used for the ``surrogateescape``, and the current locale encoding was
used for ``strict``.
File System Encoding File System Encoding
"""""""""""""""""""" """"""""""""""""""""

View File

@ -20,18 +20,41 @@ PyAPI_FUNC(char*) _Py_EncodeLocaleRaw(
#endif #endif
#ifdef Py_BUILD_CORE #ifdef Py_BUILD_CORE
PyAPI_FUNC(wchar_t*) _Py_DecodeUTF8_surrogateescape( PyAPI_FUNC(int) _Py_DecodeUTF8Ex(
const char *s,
Py_ssize_t size,
size_t *p_wlen);
PyAPI_FUNC(wchar_t *) _Py_DecodeCurrentLocale(
const char *arg, const char *arg,
size_t *size); Py_ssize_t arglen,
wchar_t **wstr,
size_t *wlen,
const char **reason,
int surrogateescape);
PyAPI_FUNC(char*) _Py_EncodeCurrentLocale( PyAPI_FUNC(int) _Py_EncodeUTF8Ex(
const wchar_t *text, const wchar_t *text,
size_t *error_pos); char **str,
size_t *error_pos,
const char **reason,
int raw_malloc,
int surrogateescape);
PyAPI_FUNC(wchar_t*) _Py_DecodeUTF8_surrogateescape(
const char *arg,
Py_ssize_t arglen);
PyAPI_FUNC(int) _Py_DecodeLocaleEx(
const char *arg,
wchar_t **wstr,
size_t *wlen,
const char **reason,
int current_locale,
int surrogateescape);
PyAPI_FUNC(int) _Py_EncodeLocaleEx(
const wchar_t *text,
char **str,
size_t *error_pos,
const char **reason,
int current_locale,
int surrogateescape);
#endif #endif
#ifndef Py_LIMITED_API #ifndef Py_LIMITED_API

View File

@ -1810,20 +1810,6 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
PyObject *unicode, PyObject *unicode,
const char *errors const char *errors
); );
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeCurrentLocale(
const char *str,
const char *errors);
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeCurrentLocaleAndSize(
const char *str,
Py_ssize_t len,
const char *errors);
PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCurrentLocale(
PyObject *unicode,
const char *errors
);
#endif #endif
/* --- File system encoding ---------------------------------------------- */ /* --- File system encoding ---------------------------------------------- */

View File

@ -696,7 +696,7 @@ static int parse_isoformat_date(const char *dtstr,
if (NULL == p) { if (NULL == p) {
return -1; return -1;
} }
if (*(p++) != '-') { if (*(p++) != '-') {
return -2; return -2;
} }

View File

@ -572,8 +572,9 @@ PyIntl_bind_textdomain_codeset(PyObject* self,PyObject*args)
if (!PyArg_ParseTuple(args, "sz", &domain, &codeset)) if (!PyArg_ParseTuple(args, "sz", &domain, &codeset))
return NULL; return NULL;
codeset = bind_textdomain_codeset(domain, codeset); codeset = bind_textdomain_codeset(domain, codeset);
if (codeset) if (codeset) {
return PyUnicode_DecodeLocale(codeset, NULL); return PyUnicode_DecodeLocale(codeset, NULL);
}
Py_RETURN_NONE; Py_RETURN_NONE;
} }
#endif #endif

View File

@ -449,8 +449,8 @@ search_for_exec_prefix(const _PyCoreConfig *core_config,
n = fread(buf, 1, MAXPATHLEN, f); n = fread(buf, 1, MAXPATHLEN, f);
buf[n] = '\0'; buf[n] = '\0';
fclose(f); fclose(f);
rel_builddir_path = _Py_DecodeUTF8_surrogateescape(buf, n, NULL); rel_builddir_path = _Py_DecodeUTF8_surrogateescape(buf, n);
if (rel_builddir_path != NULL) { if (rel_builddir_path) {
wcsncpy(exec_prefix, calculate->argv0_path, MAXPATHLEN); wcsncpy(exec_prefix, calculate->argv0_path, MAXPATHLEN);
exec_prefix[MAXPATHLEN] = L'\0'; exec_prefix[MAXPATHLEN] = L'\0';
joinpath(exec_prefix, rel_builddir_path); joinpath(exec_prefix, rel_builddir_path);

View File

@ -132,13 +132,13 @@ static PyModuleDef readlinemodule;
static PyObject * static PyObject *
encode(PyObject *b) encode(PyObject *b)
{ {
return _PyUnicode_EncodeCurrentLocale(b, "surrogateescape"); return PyUnicode_EncodeLocale(b, "surrogateescape");
} }
static PyObject * static PyObject *
decode(const char *s) decode(const char *s)
{ {
return _PyUnicode_DecodeCurrentLocale(s, "surrogateescape"); return PyUnicode_DecodeLocale(s, "surrogateescape");
} }

View File

@ -418,11 +418,11 @@ tmtotuple(struct tm *p
SET(8, p->tm_isdst); SET(8, p->tm_isdst);
#ifdef HAVE_STRUCT_TM_TM_ZONE #ifdef HAVE_STRUCT_TM_TM_ZONE
PyStructSequence_SET_ITEM(v, 9, PyStructSequence_SET_ITEM(v, 9,
_PyUnicode_DecodeCurrentLocale(p->tm_zone, "surrogateescape")); PyUnicode_DecodeLocale(p->tm_zone, "surrogateescape"));
SET(10, p->tm_gmtoff); SET(10, p->tm_gmtoff);
#else #else
PyStructSequence_SET_ITEM(v, 9, PyStructSequence_SET_ITEM(v, 9,
_PyUnicode_DecodeCurrentLocale(zone, "surrogateescape")); PyUnicode_DecodeLocale(zone, "surrogateescape"));
PyStructSequence_SET_ITEM(v, 10, _PyLong_FromTime_t(gmtoff)); PyStructSequence_SET_ITEM(v, 10, _PyLong_FromTime_t(gmtoff));
#endif /* HAVE_STRUCT_TM_TM_ZONE */ #endif /* HAVE_STRUCT_TM_TM_ZONE */
#undef SET #undef SET
@ -809,8 +809,7 @@ time_strftime(PyObject *self, PyObject *args)
#ifdef HAVE_WCSFTIME #ifdef HAVE_WCSFTIME
ret = PyUnicode_FromWideChar(outbuf, buflen); ret = PyUnicode_FromWideChar(outbuf, buflen);
#else #else
ret = _PyUnicode_DecodeCurrentLocaleAndSize(outbuf, buflen, ret = PyUnicode_DecodeLocaleAndSize(outbuf, buflen, "surrogateescape");
"surrogateescape");
#endif #endif
PyMem_Free(outbuf); PyMem_Free(outbuf);
break; break;
@ -1541,8 +1540,8 @@ PyInit_timezone(PyObject *m) {
PyModule_AddIntConstant(m, "altzone", timezone-3600); PyModule_AddIntConstant(m, "altzone", timezone-3600);
#endif #endif
PyModule_AddIntConstant(m, "daylight", daylight); PyModule_AddIntConstant(m, "daylight", daylight);
otz0 = _PyUnicode_DecodeCurrentLocale(tzname[0], "surrogateescape"); otz0 = PyUnicode_DecodeLocale(tzname[0], "surrogateescape");
otz1 = _PyUnicode_DecodeCurrentLocale(tzname[1], "surrogateescape"); otz1 = PyUnicode_DecodeLocale(tzname[1], "surrogateescape");
PyModule_AddObject(m, "tzname", Py_BuildValue("(NN)", otz0, otz1)); PyModule_AddObject(m, "tzname", Py_BuildValue("(NN)", otz0, otz1));
#else /* !HAVE_TZNAME || __GLIBC__ || __CYGWIN__*/ #else /* !HAVE_TZNAME || __GLIBC__ || __CYGWIN__*/
{ {

View File

@ -3327,53 +3327,6 @@ PyUnicode_AsEncodedObject(PyObject *unicode,
return NULL; return NULL;
} }
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
size_t len;
#if SIZEOF_WCHAR_T == 2
wchar_t buf[3];
#else
wchar_t buf[2];
#endif
char outbuf[MB_LEN_MAX];
const wchar_t *start, *previous;
#if SIZEOF_WCHAR_T == 2
buf[2] = 0;
#else
buf[1] = 0;
#endif
start = wstr;
while (*wstr != L'\0')
{
previous = wstr;
#if SIZEOF_WCHAR_T == 2
if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[0])
&& Py_UNICODE_IS_LOW_SURROGATE(wstr[1]))
{
buf[0] = wstr[0];
buf[1] = wstr[1];
wstr += 2;
}
else {
buf[0] = *wstr;
buf[1] = 0;
wstr++;
}
#else
buf[0] = *wstr;
wstr++;
#endif
len = wcstombs(outbuf, buf, sizeof(outbuf));
if (len == (size_t)-1)
return previous - start;
}
/* failed to find the unencodable character */
return 0;
}
static int static int
locale_error_handler(const char *errors, int *surrogateescape) locale_error_handler(const char *errors, int *surrogateescape)
{ {
@ -3396,130 +3349,60 @@ locale_error_handler(const char *errors, int *surrogateescape)
} }
static PyObject * static PyObject *
unicode_encode_locale(PyObject *unicode, const char *errors, int current_locale) unicode_encode_locale(PyObject *unicode, const char *errors,
int current_locale)
{ {
Py_ssize_t wlen, wlen2;
wchar_t *wstr;
char *errmsg;
PyObject *bytes, *reason, *exc;
size_t error_pos, errlen;
int surrogateescape; int surrogateescape;
if (locale_error_handler(errors, &surrogateescape) < 0) if (locale_error_handler(errors, &surrogateescape) < 0)
return NULL; return NULL;
wstr = PyUnicode_AsWideCharString(unicode, &wlen); Py_ssize_t wlen;
if (wstr == NULL) wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
if (wstr == NULL) {
return NULL; return NULL;
}
wlen2 = wcslen(wstr); Py_ssize_t wlen2 = wcslen(wstr);
if (wlen2 != wlen) { if (wlen2 != wlen) {
PyMem_Free(wstr); PyMem_Free(wstr);
PyErr_SetString(PyExc_ValueError, "embedded null character"); PyErr_SetString(PyExc_ValueError, "embedded null character");
return NULL; return NULL;
} }
if (surrogateescape) { char *str;
/* "surrogateescape" error handler */ size_t error_pos;
char *str; const char *reason;
int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
if (current_locale) { current_locale, surrogateescape);
str = _Py_EncodeCurrentLocale(wstr, &error_pos); if (res != 0) {
if (res == -2) {
PyObject *exc;
exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
"locale", unicode,
(Py_ssize_t)error_pos,
(Py_ssize_t)(error_pos+1),
reason);
if (exc != NULL) {
PyCodec_StrictErrors(exc);
Py_DECREF(exc);
}
return NULL;
} }
else { else {
str = Py_EncodeLocale(wstr, &error_pos); PyErr_NoMemory();
}
if (str == NULL) {
if (error_pos == (size_t)-1) {
PyErr_NoMemory();
PyMem_Free(wstr);
return NULL;
}
else {
goto encode_error;
}
}
PyMem_Free(wstr);
bytes = PyBytes_FromString(str);
if (current_locale) {
PyMem_RawFree(str);
}
else {
PyMem_Free(str);
}
}
else {
/* strict mode */
size_t len, len2;
len = wcstombs(NULL, wstr, 0);
if (len == (size_t)-1) {
error_pos = (size_t)-1;
goto encode_error;
}
bytes = PyBytes_FromStringAndSize(NULL, len);
if (bytes == NULL) {
PyMem_Free(wstr); PyMem_Free(wstr);
return NULL; return NULL;
} }
len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
if (len2 == (size_t)-1 || len2 > len) {
Py_DECREF(bytes);
error_pos = (size_t)-1;
goto encode_error;
}
PyMem_Free(wstr);
} }
return bytes;
encode_error:
errmsg = strerror(errno);
assert(errmsg != NULL);
if (error_pos == (size_t)-1)
error_pos = wcstombs_errorpos(wstr);
PyMem_Free(wstr); PyMem_Free(wstr);
wstr = Py_DecodeLocale(errmsg, &errlen); PyObject *bytes = PyBytes_FromString(str);
if (wstr != NULL) { PyMem_RawFree(str);
reason = PyUnicode_FromWideChar(wstr, errlen); return bytes;
PyMem_RawFree(wstr);
} else {
errmsg = NULL;
}
if (errmsg == NULL)
reason = PyUnicode_FromString(
"wcstombs() encountered an unencodable "
"wide character");
if (reason == NULL)
return NULL;
exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
"locale", unicode,
(Py_ssize_t)error_pos,
(Py_ssize_t)(error_pos+1),
reason);
Py_DECREF(reason);
if (exc != NULL) {
PyCodec_StrictErrors(exc);
Py_DECREF(exc);
}
return NULL;
} }
PyObject * PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
return unicode_encode_locale(unicode, errors, 0);
}
PyObject *
_PyUnicode_EncodeCurrentLocale(PyObject *unicode, const char *errors)
{ {
return unicode_encode_locale(unicode, errors, 1); return unicode_encode_locale(unicode, errors, 1);
} }
@ -3687,51 +3570,11 @@ PyUnicode_AsEncodedUnicode(PyObject *unicode,
return NULL; return NULL;
} }
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
const char *start = str;
mbstate_t mbs;
size_t converted;
wchar_t ch;
memset(&mbs, 0, sizeof mbs);
while (len)
{
converted = mbrtowc(&ch, str, len, &mbs);
if (converted == 0)
/* Reached end of string */
break;
if (converted == (size_t)-1 || converted == (size_t)-2) {
/* Conversion error or incomplete character */
return str - start;
}
else {
str += converted;
len -= converted;
}
}
/* failed to find the undecodable byte sequence */
return 0;
#endif
return 0;
}
static PyObject* static PyObject*
unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors, unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
int current_locale) int current_locale)
{ {
wchar_t smallbuf[256];
size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
wchar_t *wstr;
size_t wlen, wlen2;
PyObject *unicode;
int surrogateescape; int surrogateescape;
size_t error_pos, errlen;
char *errmsg;
PyObject *exc, *reason = NULL; /* initialize to prevent gcc warning */
if (locale_error_handler(errors, &surrogateescape) < 0) if (locale_error_handler(errors, &surrogateescape) < 0)
return NULL; return NULL;
@ -3740,113 +3583,47 @@ unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
return NULL; return NULL;
} }
if (surrogateescape) { wchar_t *wstr;
/* "surrogateescape" error handler */ size_t wlen;
if (current_locale) { const char *reason;
wstr = _Py_DecodeCurrentLocale(str, &wlen); int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
current_locale, surrogateescape);
if (res != 0) {
if (res == -2) {
PyObject *exc;
exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
"locale", str, len,
(Py_ssize_t)wlen,
(Py_ssize_t)(wlen + 1),
reason);
if (exc != NULL) {
PyCodec_StrictErrors(exc);
Py_DECREF(exc);
}
} }
else { else {
wstr = Py_DecodeLocale(str, &wlen); PyErr_NoMemory();
} }
if (wstr == NULL) {
if (wlen == (size_t)-1)
PyErr_NoMemory();
else
PyErr_SetFromErrno(PyExc_OSError);
return NULL;
}
unicode = PyUnicode_FromWideChar(wstr, wlen);
PyMem_RawFree(wstr);
}
else {
/* strict mode */
#ifndef HAVE_BROKEN_MBSTOWCS
wlen = mbstowcs(NULL, str, 0);
#else
wlen = len;
#endif
if (wlen == (size_t)-1)
goto decode_error;
if (wlen+1 <= smallbuf_len) {
wstr = smallbuf;
}
else {
wstr = PyMem_New(wchar_t, wlen+1);
if (!wstr)
return PyErr_NoMemory();
}
wlen2 = mbstowcs(wstr, str, wlen+1);
if (wlen2 == (size_t)-1) {
if (wstr != smallbuf)
PyMem_Free(wstr);
goto decode_error;
}
#ifdef HAVE_BROKEN_MBSTOWCS
assert(wlen2 == wlen);
#endif
unicode = PyUnicode_FromWideChar(wstr, wlen2);
if (wstr != smallbuf)
PyMem_Free(wstr);
}
return unicode;
decode_error:
errmsg = strerror(errno);
assert(errmsg != NULL);
error_pos = mbstowcs_errorpos(str, len);
wstr = Py_DecodeLocale(errmsg, &errlen);
if (wstr != NULL) {
reason = PyUnicode_FromWideChar(wstr, errlen);
PyMem_RawFree(wstr);
}
if (reason == NULL)
reason = PyUnicode_FromString(
"mbstowcs() encountered an invalid multibyte sequence");
if (reason == NULL)
return NULL; return NULL;
exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
"locale", str, len,
(Py_ssize_t)error_pos,
(Py_ssize_t)(error_pos+1),
reason);
Py_DECREF(reason);
if (exc != NULL) {
PyCodec_StrictErrors(exc);
Py_DECREF(exc);
} }
return NULL;
PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
PyMem_RawFree(wstr);
return unicode;
} }
PyObject* PyObject*
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
const char *errors) const char *errors)
{
return unicode_decode_locale(str, len, errors, 0);
}
PyObject*
_PyUnicode_DecodeCurrentLocaleAndSize(const char *str, Py_ssize_t len,
const char *errors)
{ {
return unicode_decode_locale(str, len, errors, 1); return unicode_decode_locale(str, len, errors, 1);
} }
PyObject*
_PyUnicode_DecodeCurrentLocale(const char *str, const char *errors)
{
return unicode_decode_locale(str, (Py_ssize_t)strlen(str), errors, 1);
}
PyObject* PyObject*
PyUnicode_DecodeLocale(const char *str, const char *errors) PyUnicode_DecodeLocale(const char *str, const char *errors)
{ {
Py_ssize_t size = (Py_ssize_t)strlen(str); Py_ssize_t size = (Py_ssize_t)strlen(str);
return unicode_decode_locale(str, size, errors, 0); return unicode_decode_locale(str, size, errors, 1);
} }
@ -3878,7 +3655,8 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
Py_FileSystemDefaultEncodeErrors); Py_FileSystemDefaultEncodeErrors);
} }
else { else {
return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors); return unicode_decode_locale(s, size,
Py_FileSystemDefaultEncodeErrors, 0);
} }
#endif #endif
} }
@ -5128,17 +4906,23 @@ onError:
} }
/* UTF-8 decoder using the surrogateescape error handler . /* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
non-zero, use strict error handler otherwise.
On success, return a pointer to a newly allocated wide character string (use On success, write a pointer to a newly allocated wide character string into
PyMem_RawFree() to free the memory) and write the output length (in number *wstr (use PyMem_RawFree() to free the memory) and write the output length
of wchar_t units) into *p_wlen (if p_wlen is set). (in number of wchar_t units) into *wlen (if wlen is set).
On memory allocation failure, return -1 and write (size_t)-1 into *p_wlen On memory allocation failure, return -1.
(if p_wlen is set). */
wchar_t* On decoding error (if surrogateescape is zero), return -2. If wlen is
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen) non-NULL, write the start of the illegal byte sequence into *wlen. If reason
is not NULL, write the decoding error message into *reason. */
int
_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
const char **reason, int surrogateescape)
{ {
const char *orig_s = s;
const char *e; const char *e;
wchar_t *unicode; wchar_t *unicode;
Py_ssize_t outpos; Py_ssize_t outpos;
@ -5146,18 +4930,12 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
/* Note: size will always be longer than the resulting Unicode /* Note: size will always be longer than the resulting Unicode
character count */ character count */
if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) { if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
if (p_wlen) { return -1;
*p_wlen = (size_t)-1;
}
return NULL;
} }
unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t)); unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
if (!unicode) { if (!unicode) {
if (p_wlen) { return -1;
*p_wlen = (size_t)-1;
}
return NULL;
} }
/* Unpack UTF-8 encoded data */ /* Unpack UTF-8 encoded data */
@ -5175,7 +4953,7 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
Py_UNREACHABLE(); Py_UNREACHABLE();
#else #else
assert(ch > 0xFFFF && ch <= MAX_UNICODE); assert(ch > 0xFFFF && ch <= MAX_UNICODE);
/* compute and append the two surrogates: */ /* write a surrogate pair */
unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch); unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch); unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif #endif
@ -5183,60 +4961,88 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
else { else {
if (!ch && s == e) if (!ch && s == e)
break; break;
if (!surrogateescape) {
PyMem_RawFree(unicode );
if (reason != NULL) {
switch (ch) {
case 0:
*reason = "unexpected end of data";
break;
case 1:
*reason = "invalid start byte";
break;
/* 2, 3, 4 */
default:
*reason = "invalid continuation byte";
break;
}
}
if (wlen != NULL) {
*wlen = s - orig_s;
}
return -2;
}
/* surrogateescape */ /* surrogateescape */
unicode[outpos++] = 0xDC00 + (unsigned char)*s++; unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
} }
} }
unicode[outpos] = L'\0'; unicode[outpos] = L'\0';
if (p_wlen) { if (wlen) {
*p_wlen = outpos; *wlen = outpos;
} }
return unicode; *wstr = unicode;
return 0;
}
wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen)
{
wchar_t *wstr;
int res = _Py_DecodeUTF8Ex(arg, arglen, &wstr, NULL, NULL, 1);
if (res != 0) {
return NULL;
}
return wstr;
} }
/* UTF-8 encoder using the surrogateescape error handler . /* UTF-8 encoder using the surrogateescape error handler .
On success, return a pointer to a newly allocated character string (use On success, return 0 and write the newly allocated character string (use
PyMem_Free() to free the memory). PyMem_Free() to free the memory) into *str.
On encoding failure, return NULL and write the position of the invalid On encoding failure, return -2 and write the position of the invalid
surrogate character into *error_pos (if error_pos is set). surrogate character into *error_pos (if error_pos is set) and the decoding
error message into *reason (if reason is set).
On memory allocation failure, return NULL and write (size_t)-1 into On memory allocation failure, return -1. */
*error_pos (if error_pos is set). */ int
char* _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
_Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos, const char **reason, int raw_malloc, int surrogateescape)
int raw_malloc)
{ {
const Py_ssize_t max_char_size = 4; const Py_ssize_t max_char_size = 4;
Py_ssize_t len = wcslen(text); Py_ssize_t len = wcslen(text);
assert(len >= 0); assert(len >= 0);
if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
return -1;
}
char *bytes; char *bytes;
if (len <= PY_SSIZE_T_MAX / max_char_size - 1) { if (raw_malloc) {
if (raw_malloc) { bytes = PyMem_RawMalloc((len + 1) * max_char_size);
bytes = PyMem_RawMalloc((len + 1) * max_char_size);
}
else {
bytes = PyMem_Malloc((len + 1) * max_char_size);
}
} }
else { else {
bytes = NULL; bytes = PyMem_Malloc((len + 1) * max_char_size);
} }
if (bytes == NULL) { if (bytes == NULL) {
if (error_pos != NULL) { return -1;
*error_pos = (size_t)-1;
}
return NULL;
} }
char *p = bytes; char *p = bytes;
Py_ssize_t i; Py_ssize_t i;
for (i = 0; i < len;) { for (i = 0; i < len; i++) {
Py_UCS4 ch = text[i++]; Py_UCS4 ch = text[i];
if (ch < 0x80) { if (ch < 0x80) {
/* Encode ASCII */ /* Encode ASCII */
@ -5250,11 +5056,20 @@ _Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos,
} }
else if (Py_UNICODE_IS_SURROGATE(ch)) { else if (Py_UNICODE_IS_SURROGATE(ch)) {
/* surrogateescape error handler */ /* surrogateescape error handler */
if (!(0xDC80 <= ch && ch <= 0xDCFF)) { if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
if (error_pos != NULL) { if (error_pos != NULL) {
*error_pos = (size_t)i - 1; *error_pos = (size_t)i;
} }
goto error; if (reason != NULL) {
*reason = "encoding error";
}
if (raw_malloc) {
PyMem_RawFree(bytes);
}
else {
PyMem_Free(bytes);
}
return -2;
} }
*p++ = (char)(ch & 0xff); *p++ = (char)(ch & 0xff);
} }
@ -5286,18 +5101,16 @@ _Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos,
if (error_pos != NULL) { if (error_pos != NULL) {
*error_pos = (size_t)-1; *error_pos = (size_t)-1;
} }
goto error; if (raw_malloc) {
PyMem_RawFree(bytes);
}
else {
PyMem_Free(bytes);
}
return -1;
} }
return bytes2; *str = bytes2;
return 0;
error:
if (raw_malloc) {
PyMem_RawFree(bytes);
}
else {
PyMem_Free(bytes);
}
return NULL;
} }

View File

@ -20,9 +20,6 @@ extern int winerror_to_errno(int);
#include <fcntl.h> #include <fcntl.h>
#endif /* HAVE_FCNTL_H */ #endif /* HAVE_FCNTL_H */
extern char* _Py_EncodeUTF8_surrogateescape(const wchar_t *text,
size_t *error_pos, int raw_malloc);
#ifdef O_CLOEXEC #ifdef O_CLOEXEC
/* Does open() support the O_CLOEXEC flag? Possible values: /* Does open() support the O_CLOEXEC flag? Possible values:
@ -69,7 +66,10 @@ _Py_device_encoding(int fd)
Py_RETURN_NONE; Py_RETURN_NONE;
} }
#if !defined(__APPLE__) && !defined(MS_WINDOWS) #if !defined(__APPLE__) && !defined(__ANDROID__) && !defined(MS_WINDOWS)
#define USE_FORCE_ASCII
extern int _Py_normalize_encoding(const char *, char *, size_t); extern int _Py_normalize_encoding(const char *, char *, size_t);
/* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale. /* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale.
@ -90,7 +90,7 @@ extern int _Py_normalize_encoding(const char *, char *, size_t);
1: the workaround is used: Py_EncodeLocale() uses 1: the workaround is used: Py_EncodeLocale() uses
encode_ascii_surrogateescape() and Py_DecodeLocale() uses encode_ascii_surrogateescape() and Py_DecodeLocale() uses
decode_ascii_surrogateescape() decode_ascii()
0: the workaround is not used: Py_EncodeLocale() uses wcstombs() and 0: the workaround is not used: Py_EncodeLocale() uses wcstombs() and
Py_DecodeLocale() uses mbstowcs() Py_DecodeLocale() uses mbstowcs()
-1: unknown, need to call check_force_ascii() to get the value -1: unknown, need to call check_force_ascii() to get the value
@ -180,16 +180,15 @@ error:
return 1; return 1;
} }
static char* static int
encode_ascii_surrogateescape(const wchar_t *text, size_t *error_pos, int raw_malloc) encode_ascii(const wchar_t *text, char **str,
size_t *error_pos, const char **reason,
int raw_malloc, int surrogateescape)
{ {
char *result = NULL, *out; char *result = NULL, *out;
size_t len, i; size_t len, i;
wchar_t ch; wchar_t ch;
if (error_pos != NULL)
*error_pos = (size_t)-1;
len = wcslen(text); len = wcslen(text);
/* +1 for NULL byte */ /* +1 for NULL byte */
@ -199,8 +198,9 @@ encode_ascii_surrogateescape(const wchar_t *text, size_t *error_pos, int raw_mal
else { else {
result = PyMem_Malloc(len + 1); result = PyMem_Malloc(len + 1);
} }
if (result == NULL) if (result == NULL) {
return NULL; return -1;
}
out = result; out = result;
for (i=0; i<len; i++) { for (i=0; i<len; i++) {
@ -210,60 +210,84 @@ encode_ascii_surrogateescape(const wchar_t *text, size_t *error_pos, int raw_mal
/* ASCII character */ /* ASCII character */
*out++ = (char)ch; *out++ = (char)ch;
} }
else if (0xdc80 <= ch && ch <= 0xdcff) { else if (surrogateescape && 0xdc80 <= ch && ch <= 0xdcff) {
/* UTF-8b surrogate */ /* UTF-8b surrogate */
*out++ = (char)(ch - 0xdc00); *out++ = (char)(ch - 0xdc00);
} }
else { else {
if (error_pos != NULL) {
*error_pos = i;
}
if (raw_malloc) { if (raw_malloc) {
PyMem_RawFree(result); PyMem_RawFree(result);
} }
else { else {
PyMem_Free(result); PyMem_Free(result);
} }
return NULL; if (error_pos != NULL) {
*error_pos = i;
}
if (reason) {
*reason = "encoding error";
}
return -2;
} }
} }
*out = '\0'; *out = '\0';
return result; *str = result;
return 0;
} }
#endif /* !defined(__APPLE__) && !defined(MS_WINDOWS) */ #endif /* !defined(__APPLE__) && !defined(__ANDROID__) && !defined(MS_WINDOWS) */
#if !defined(__APPLE__) && (!defined(MS_WINDOWS) || !defined(HAVE_MBRTOWC))
static wchar_t* #if !defined(HAVE_MBRTOWC) || defined(USE_FORCE_ASCII)
decode_ascii_surrogateescape(const char *arg, size_t *size) static int
decode_ascii(const char *arg, wchar_t **wstr, size_t *wlen,
const char **reason, int surrogateescape)
{ {
wchar_t *res; wchar_t *res;
unsigned char *in; unsigned char *in;
wchar_t *out; wchar_t *out;
size_t argsize = strlen(arg) + 1; size_t argsize = strlen(arg) + 1;
if (argsize > PY_SSIZE_T_MAX/sizeof(wchar_t)) if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t)) {
return NULL; return -1;
res = PyMem_RawMalloc(argsize*sizeof(wchar_t)); }
if (!res) res = PyMem_RawMalloc(argsize * sizeof(wchar_t));
return NULL; if (!res) {
return -1;
}
in = (unsigned char*)arg;
out = res; out = res;
while(*in) for (in = (unsigned char*)arg; *in; in++) {
if(*in < 128) unsigned char ch = *in;
*out++ = *in++; if (ch < 128) {
else *out++ = ch;
*out++ = 0xdc00 + *in++; }
else {
if (!surrogateescape) {
PyMem_RawFree(res);
if (wlen) {
*wlen = in - (unsigned char*)arg;
}
if (reason) {
*reason = "decoding error";
}
return -2;
}
*out++ = 0xdc00 + ch;
}
}
*out = 0; *out = 0;
if (size != NULL)
*size = out - res;
return res;
}
#endif
#if !defined(__APPLE__) && !defined(__ANDROID__) if (wlen != NULL) {
static wchar_t* *wlen = out - res;
decode_current_locale(const char* arg, size_t *size) }
*wstr = res;
return 0;
}
#endif /* !HAVE_MBRTOWC */
static int
decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
const char **reason, int surrogateescape)
{ {
wchar_t *res; wchar_t *res;
size_t argsize; size_t argsize;
@ -284,15 +308,15 @@ decode_current_locale(const char* arg, size_t *size)
argsize = mbstowcs(NULL, arg, 0); argsize = mbstowcs(NULL, arg, 0);
#endif #endif
if (argsize != (size_t)-1) { if (argsize != (size_t)-1) {
if (argsize == PY_SSIZE_T_MAX) if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
goto oom; return -1;
argsize += 1; }
if (argsize > PY_SSIZE_T_MAX/sizeof(wchar_t)) res = (wchar_t *)PyMem_RawMalloc((argsize + 1) * sizeof(wchar_t));
goto oom; if (!res) {
res = (wchar_t *)PyMem_RawMalloc(argsize*sizeof(wchar_t)); return -1;
if (!res) }
goto oom;
count = mbstowcs(res, arg, argsize); count = mbstowcs(res, arg, argsize + 1);
if (count != (size_t)-1) { if (count != (size_t)-1) {
wchar_t *tmp; wchar_t *tmp;
/* Only use the result if it contains no /* Only use the result if it contains no
@ -301,13 +325,16 @@ decode_current_locale(const char* arg, size_t *size)
!Py_UNICODE_IS_SURROGATE(*tmp); tmp++) !Py_UNICODE_IS_SURROGATE(*tmp); tmp++)
; ;
if (*tmp == 0) { if (*tmp == 0) {
if (size != NULL) if (wlen != NULL) {
*size = count; *wlen = count;
return res; }
*wstr = res;
return 0;
} }
} }
PyMem_RawFree(res); PyMem_RawFree(res);
} }
/* Conversion failed. Fall back to escaping with surrogateescape. */ /* Conversion failed. Fall back to escaping with surrogateescape. */
#ifdef HAVE_MBRTOWC #ifdef HAVE_MBRTOWC
/* Try conversion with mbrtowc (C99), and escape non-decodable bytes. */ /* Try conversion with mbrtowc (C99), and escape non-decodable bytes. */
@ -315,30 +342,37 @@ decode_current_locale(const char* arg, size_t *size)
/* Overallocate; as multi-byte characters are in the argument, the /* Overallocate; as multi-byte characters are in the argument, the
actual output could use less memory. */ actual output could use less memory. */
argsize = strlen(arg) + 1; argsize = strlen(arg) + 1;
if (argsize > PY_SSIZE_T_MAX/sizeof(wchar_t)) if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t)) {
goto oom; return -1;
res = (wchar_t*)PyMem_RawMalloc(argsize*sizeof(wchar_t)); }
if (!res) res = (wchar_t*)PyMem_RawMalloc(argsize * sizeof(wchar_t));
goto oom; if (!res) {
return -1;
}
in = (unsigned char*)arg; in = (unsigned char*)arg;
out = res; out = res;
memset(&mbs, 0, sizeof mbs); memset(&mbs, 0, sizeof mbs);
while (argsize) { while (argsize) {
size_t converted = mbrtowc(out, (char*)in, argsize, &mbs); size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
if (converted == 0) if (converted == 0) {
/* Reached end of string; null char stored. */ /* Reached end of string; null char stored. */
break; break;
}
if (converted == (size_t)-2) { if (converted == (size_t)-2) {
/* Incomplete character. This should never happen, /* Incomplete character. This should never happen,
since we provide everything that we have - since we provide everything that we have -
unless there is a bug in the C library, or I unless there is a bug in the C library, or I
misunderstood how mbrtowc works. */ misunderstood how mbrtowc works. */
PyMem_RawFree(res); goto decode_error;
if (size != NULL)
*size = (size_t)-2;
return NULL;
} }
if (converted == (size_t)-1) { if (converted == (size_t)-1) {
if (!surrogateescape) {
goto decode_error;
}
/* Conversion error. Escape as UTF-8b, and start over /* Conversion error. Escape as UTF-8b, and start over
in the initial shift state. */ in the initial shift state. */
*out++ = 0xdc00 + *in++; *out++ = 0xdc00 + *in++;
@ -346,12 +380,18 @@ decode_current_locale(const char* arg, size_t *size)
memset(&mbs, 0, sizeof mbs); memset(&mbs, 0, sizeof mbs);
continue; continue;
} }
if (Py_UNICODE_IS_SURROGATE(*out)) { if (Py_UNICODE_IS_SURROGATE(*out)) {
if (!surrogateescape) {
goto decode_error;
}
/* Surrogate character. Escape the original /* Surrogate character. Escape the original
byte sequence with surrogateescape. */ byte sequence with surrogateescape. */
argsize -= converted; argsize -= converted;
while (converted--) while (converted--) {
*out++ = 0xdc00 + *in++; *out++ = 0xdc00 + *in++;
}
continue; continue;
} }
/* successfully converted some bytes */ /* successfully converted some bytes */
@ -359,55 +399,80 @@ decode_current_locale(const char* arg, size_t *size)
argsize -= converted; argsize -= converted;
out++; out++;
} }
if (size != NULL) if (wlen != NULL) {
*size = out - res; *wlen = out - res;
}
*wstr = res;
return 0;
decode_error:
PyMem_RawFree(res);
if (wlen) {
*wlen = in - (unsigned char*)arg;
}
if (reason) {
*reason = "decoding error";
}
return -2;
#else /* HAVE_MBRTOWC */ #else /* HAVE_MBRTOWC */
/* Cannot use C locale for escaping; manually escape as if charset /* Cannot use C locale for escaping; manually escape as if charset
is ASCII (i.e. escape all bytes >= 128). This will still roundtrip is ASCII (i.e. escape all bytes >= 128). This will still roundtrip
correctly in the locale's charset, which must be an ASCII superset. */ correctly in the locale's charset, which must be an ASCII superset. */
res = decode_ascii_surrogateescape(arg, size); return decode_ascii(arg, wstr, wlen, reason, surrogateescape);
if (res == NULL)
goto oom;
#endif /* HAVE_MBRTOWC */ #endif /* HAVE_MBRTOWC */
return res;
oom:
if (size != NULL) {
*size = (size_t)-1;
}
return NULL;
} }
#endif
static wchar_t* /* Decode a byte string from the locale encoding.
decode_locale(const char* arg, size_t *size, int ignore_utf8_mode)
Use the strict error handler if 'surrogateescape' is zero. Use the
surrogateescape error handler if 'surrogateescape' is non-zero: undecodable
bytes are decoded as characters in range U+DC80..U+DCFF. If a byte sequence
can be decoded as a surrogate character, escape the bytes using the
surrogateescape error handler instead of decoding them.
On success, return 0 and write the newly allocated wide character string into
*wstr (use PyMem_RawFree() to free the memory). If wlen is not NULL, write
the number of wide characters excluding the null character into *wlen.
On memory allocation failure, return -1.
On decoding error, return -2. If wlen is not NULL, write the start of
invalid byte sequence in the input string into *wlen. If reason is not NULL,
write the decoding error message into *reason.
Use the _Py_EncodeLocaleEx() function to encode the character string back to
a byte string. */
int
_Py_DecodeLocaleEx(const char* arg, wchar_t **wstr, size_t *wlen,
const char **reason,
int current_locale, int surrogateescape)
{ {
#if defined(__APPLE__) || defined(__ANDROID__) if (current_locale) {
return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size); return decode_current_locale(arg, wstr, wlen, reason, surrogateescape);
#else
if (!ignore_utf8_mode && Py_UTF8Mode == 1) {
return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size);
} }
#ifndef MS_WINDOWS #if defined(__APPLE__) || defined(__ANDROID__)
if (force_ascii == -1) return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason,
surrogateescape);
#else
if (Py_UTF8Mode == 1) {
return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason,
surrogateescape);
}
#ifdef USE_FORCE_ASCII
if (force_ascii == -1) {
force_ascii = check_force_ascii(); force_ascii = check_force_ascii();
}
if (force_ascii) { if (force_ascii) {
/* force ASCII encoding to workaround mbstowcs() issue */ /* force ASCII encoding to workaround mbstowcs() issue */
wchar_t *wstr = decode_ascii_surrogateescape(arg, size); return decode_ascii(arg, wstr, wlen, reason, surrogateescape);
if (wstr == NULL) {
if (size != NULL) {
*size = (size_t)-1;
}
return NULL;
}
return wstr;
} }
#endif #endif
return decode_current_locale(arg, size); return decode_current_locale(arg, wstr, wlen, reason, surrogateescape);
#endif /* __APPLE__ or __ANDROID__ */ #endif /* __APPLE__ or __ANDROID__ */
} }
@ -432,23 +497,24 @@ decode_locale(const char* arg, size_t *size, int ignore_utf8_mode)
Use the Py_EncodeLocale() function to encode the character string back to a Use the Py_EncodeLocale() function to encode the character string back to a
byte string. */ byte string. */
wchar_t* wchar_t*
Py_DecodeLocale(const char* arg, size_t *size) Py_DecodeLocale(const char* arg, size_t *wlen)
{ {
return decode_locale(arg, size, 0); wchar_t *wstr;
int res = _Py_DecodeLocaleEx(arg, &wstr, wlen, NULL, 0, 1);
if (res != 0) {
if (wlen != NULL) {
*wlen = (size_t)res;
}
return NULL;
}
return wstr;
} }
/* Similar to Py_DecodeLocale() but ignore the UTF-8 mode */ static int
wchar_t* encode_current_locale(const wchar_t *text, char **str,
_Py_DecodeCurrentLocale(const char* arg, size_t *size) size_t *error_pos, const char **reason,
{ int raw_malloc, int surrogateescape)
return decode_locale(arg, size, 1);
}
#if !defined(__APPLE__) && !defined(__ANDROID__)
static char*
encode_current_locale(const wchar_t *text, size_t *error_pos, int raw_malloc)
{ {
const size_t len = wcslen(text); const size_t len = wcslen(text);
char *result = NULL, *bytes = NULL; char *result = NULL, *bytes = NULL;
@ -464,38 +530,37 @@ encode_current_locale(const wchar_t *text, size_t *error_pos, int raw_malloc)
for (i=0; i < len; i++) { for (i=0; i < len; i++) {
c = text[i]; c = text[i];
if (c >= 0xdc80 && c <= 0xdcff) { if (c >= 0xdc80 && c <= 0xdcff) {
if (!surrogateescape) {
goto encode_error;
}
/* UTF-8b surrogate */ /* UTF-8b surrogate */
if (bytes != NULL) { if (bytes != NULL) {
*bytes++ = c - 0xdc00; *bytes++ = c - 0xdc00;
size--; size--;
} }
else else {
size++; size++;
}
continue; continue;
} }
else { else {
buf[0] = c; buf[0] = c;
if (bytes != NULL) if (bytes != NULL) {
converted = wcstombs(bytes, buf, size); converted = wcstombs(bytes, buf, size);
else }
else {
converted = wcstombs(NULL, buf, 0); converted = wcstombs(NULL, buf, 0);
}
if (converted == (size_t)-1) { if (converted == (size_t)-1) {
if (raw_malloc) { goto encode_error;
PyMem_RawFree(result);
}
else {
PyMem_Free(result);
}
if (error_pos != NULL)
*error_pos = i;
return NULL;
} }
if (bytes != NULL) { if (bytes != NULL) {
bytes += converted; bytes += converted;
size -= converted; size -= converted;
} }
else else {
size += converted; size += converted;
}
} }
} }
if (result != NULL) { if (result != NULL) {
@ -511,38 +576,78 @@ encode_current_locale(const wchar_t *text, size_t *error_pos, int raw_malloc)
result = PyMem_Malloc(size); result = PyMem_Malloc(size);
} }
if (result == NULL) { if (result == NULL) {
if (error_pos != NULL) { return -1;
*error_pos = (size_t)-1;
}
return NULL;
} }
bytes = result; bytes = result;
} }
return result; *str = result;
return 0;
encode_error:
if (raw_malloc) {
PyMem_RawFree(result);
}
else {
PyMem_Free(result);
}
if (error_pos != NULL) {
*error_pos = i;
}
if (reason) {
*reason = "encoding error";
}
return -2;
} }
static int
encode_locale_ex(const wchar_t *text, char **str, size_t *error_pos,
const char **reason,
int raw_malloc, int current_locale, int surrogateescape)
{
if (current_locale) {
return encode_current_locale(text, str, error_pos, reason,
raw_malloc, surrogateescape);
}
#if defined(__APPLE__) || defined(__ANDROID__)
return _Py_EncodeUTF8Ex(text, str, error_pos, reason,
raw_malloc, surrogateescape);
#else /* __APPLE__ */
if (Py_UTF8Mode == 1) {
return _Py_EncodeUTF8Ex(text, str, error_pos, reason,
raw_malloc, surrogateescape);
}
#ifdef USE_FORCE_ASCII
if (force_ascii == -1) {
force_ascii = check_force_ascii();
}
if (force_ascii) {
return encode_ascii(text, str, error_pos, reason,
raw_malloc, surrogateescape);
}
#endif #endif
return encode_current_locale(text, str, error_pos, reason,
raw_malloc, surrogateescape);
#endif /* __APPLE__ or __ANDROID__ */
}
static char* static char*
encode_locale(const wchar_t *text, size_t *error_pos, encode_locale(const wchar_t *text, size_t *error_pos,
int raw_malloc, int ignore_utf8_mode) int raw_malloc, int current_locale)
{ {
#if defined(__APPLE__) || defined(__ANDROID__) char *str;
return _Py_EncodeUTF8_surrogateescape(text, error_pos, raw_malloc); int res = encode_locale_ex(text, &str, error_pos, NULL,
#else /* __APPLE__ */ raw_malloc, current_locale, 1);
if (!ignore_utf8_mode && Py_UTF8Mode == 1) { if (res != -2 && error_pos) {
return _Py_EncodeUTF8_surrogateescape(text, error_pos, raw_malloc); *error_pos = (size_t)-1;
} }
if (res != 0) {
#ifndef MS_WINDOWS return NULL;
if (force_ascii == -1) }
force_ascii = check_force_ascii(); return str;
if (force_ascii)
return encode_ascii_surrogateescape(text, error_pos, raw_malloc);
#endif
return encode_current_locale(text, error_pos, raw_malloc);
#endif /* __APPLE__ or __ANDROID__ */
} }
/* Encode a wide character string to the locale encoding with the /* Encode a wide character string to the locale encoding with the
@ -573,11 +678,13 @@ _Py_EncodeLocaleRaw(const wchar_t *text, size_t *error_pos)
} }
/* Similar to _Py_EncodeLocaleRaw() but ignore the UTF-8 Mode */ int
char* _Py_EncodeLocaleEx(const wchar_t *text, char **str,
_Py_EncodeCurrentLocale(const wchar_t *text, size_t *error_pos) size_t *error_pos, const char **reason,
int current_locale, int surrogateescape)
{ {
return encode_locale(text, error_pos, 1, 1); return encode_locale_ex(text, str, error_pos, reason, 1,
current_locale, surrogateescape);
} }

View File

@ -382,8 +382,8 @@ _Py_FindEnvConfigValue(FILE *env_file, const wchar_t *key,
/* Comment - skip */ /* Comment - skip */
continue; continue;
} }
tmpbuffer = _Py_DecodeUTF8_surrogateescape(buffer, n, NULL); tmpbuffer = _Py_DecodeUTF8_surrogateescape(buffer, n);
if (tmpbuffer != NULL) { if (tmpbuffer) {
wchar_t * state; wchar_t * state;
wchar_t * tok = wcstok(tmpbuffer, L" \t\r\n", &state); wchar_t * tok = wcstok(tmpbuffer, L" \t\r\n", &state);
if ((tok != NULL) && !wcscmp(tok, key)) { if ((tok != NULL) && !wcscmp(tok, key)) {