bpo-29240: Fix locale encodings in UTF-8 Mode (#5170)

Modify locale.localeconv(), time.tzname, os.strerror() and other functions to ignore the UTF-8 Mode: always use the current locale encoding. Changes: * Add _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx(). On decoding or encoding error, they return the position of the error and an error message which are used to raise Unicode errors in PyUnicode_DecodeLocale() and PyUnicode_EncodeLocale(). * Replace _Py_DecodeCurrentLocale() with _Py_DecodeLocaleEx(). * PyUnicode_DecodeLocale() now uses _Py_DecodeLocaleEx() for all cases, especially for the strict error handler. * Add _Py_DecodeUTF8Ex(): it returns more information on decoding error and supports the strict error handler. * Rename _Py_EncodeUTF8_surrogateescape() to _Py_EncodeUTF8Ex(). * Replace _Py_EncodeCurrentLocale() with _Py_EncodeLocaleEx(). * Ignore the UTF-8 mode to encode/decode localeconv(), strerror() and time zone name. * PyUnicode_DecodeLocale(), PyUnicode_DecodeLocaleAndSize() and PyUnicode_EncodeLocale() now ignore the UTF-8 mode: always use the "current" locale. * Remove _PyUnicode_DecodeCurrentLocale(), _PyUnicode_DecodeCurrentLocaleAndSize() and _PyUnicode_EncodeCurrentLocale().
2018-01-15 10:45:49 +01:00 · 2018-01-15 10:45:49 +01:00 · 7ed7aead95
parent ee3b83547c
commit 7ed7aead95
12 changed files with 484 additions and 517 deletions
--- a/Doc/c-api/sys.rst
+++ b/Doc/c-api/sys.rst
@ -106,6 +106,16 @@ Operating System Utilities
   surrogate character, escape the bytes using the surrogateescape error
   handler instead of decoding them.

+   Encoding, highest priority to lowest priority:
+
+   * ``UTF-8`` on macOS and Android;
+   * ``UTF-8`` if the Python UTF-8 mode is enabled;
+   * ``ASCII`` if the ``LC_CTYPE`` locale is ``"C"``,
+     ``nl_langinfo(CODESET)`` returns the ``ASCII`` encoding (or an alias),
+     and :c:func:`mbstowcs` and :c:func:`wcstombs` functions use the
+     ``ISO-8859-1`` encoding;
+   * the current locale encoding.
+
   Return a pointer to a newly allocated wide character string, use
   :c:func:`PyMem_RawFree` to free the memory. If size is not ``NULL``, write
   the number of wide characters excluding the null character into ``*size``
@ -137,6 +147,18 @@ Operating System Utilities
   :ref:`surrogateescape error handler <surrogateescape>`: surrogate characters
   in the range U+DC80..U+DCFF are converted to bytes 0x80..0xFF.

+   Encoding, highest priority to lowest priority:
+
+   * ``UTF-8`` on macOS and Android;
+   * ``UTF-8`` if the Python UTF-8 mode is enabled;
+   * ``ASCII`` if the ``LC_CTYPE`` locale is ``"C"``,
+     ``nl_langinfo(CODESET)`` returns the ``ASCII`` encoding (or an alias),
+     and :c:func:`mbstowcs` and :c:func:`wcstombs` functions use the
+     ``ISO-8859-1`` encoding;
+   * the current locale encoding.
+
+   The function uses the UTF-8 encoding in the Python UTF-8 mode.
+
   Return a pointer to a newly allocated byte string, use :c:func:`PyMem_Free`
   to free the memory. Return ``NULL`` on encoding error or memory allocation
   error
--- a/Doc/c-api/unicode.rst
+++ b/Doc/c-api/unicode.rst
@ -770,12 +770,20 @@ system.
   :c:data:`Py_FileSystemDefaultEncoding` (the locale encoding read at
   Python startup).

+   This function ignores the Python UTF-8 mode.
+
   .. seealso::

      The :c:func:`Py_DecodeLocale` function.

   .. versionadded:: 3.3

+   .. versionchanged:: 3.7
+      The function now also uses the current locale encoding for the
+      ``surrogateescape`` error handler. Previously, :c:func:`Py_DecodeLocale`
+      was used for ``surrogateescape``, and the current locale encoding was
+      used for ``strict``.
+

 .. c:function:: PyObject* PyUnicode_DecodeLocale(const char *str, const char *errors)

@ -797,12 +805,20 @@ system.
   :c:data:`Py_FileSystemDefaultEncoding` (the locale encoding read at
   Python startup).

+   This function ignores the Python UTF-8 mode.
+
   .. seealso::

      The :c:func:`Py_EncodeLocale` function.

   .. versionadded:: 3.3

+   .. versionchanged:: 3.7
+      The function now also uses the current locale encoding for the
+      ``surrogateescape`` error handler. Previously, :c:func:`Py_EncodeLocale`
+      was used for ``surrogateescape``, and the current locale encoding was
+      used for ``strict``.
+

 File System Encoding
 """"""""""""""""""""
--- a/Include/fileutils.h
+++ b/Include/fileutils.h
@ -20,18 +20,41 @@ PyAPI_FUNC(char*) _Py_EncodeLocaleRaw(
 #endif

 #ifdef Py_BUILD_CORE
-PyAPI_FUNC(wchar_t*) _Py_DecodeUTF8_surrogateescape(
-    const char *s,
-    Py_ssize_t size,
-    size_t *p_wlen);
-
-PyAPI_FUNC(wchar_t *) _Py_DecodeCurrentLocale(
+PyAPI_FUNC(int) _Py_DecodeUTF8Ex(
    const char *arg,
-    size_t *size);
+    Py_ssize_t arglen,
+    wchar_t **wstr,
+    size_t *wlen,
+    const char **reason,
+    int surrogateescape);

-PyAPI_FUNC(char*) _Py_EncodeCurrentLocale(
+PyAPI_FUNC(int) _Py_EncodeUTF8Ex(
    const wchar_t *text,
-    size_t *error_pos);
+    char **str,
+    size_t *error_pos,
+    const char **reason,
+    int raw_malloc,
+    int surrogateescape);
+
+PyAPI_FUNC(wchar_t*) _Py_DecodeUTF8_surrogateescape(
+    const char *arg,
+    Py_ssize_t arglen);
+
+PyAPI_FUNC(int) _Py_DecodeLocaleEx(
+    const char *arg,
+    wchar_t **wstr,
+    size_t *wlen,
+    const char **reason,
+    int current_locale,
+    int surrogateescape);
+
+PyAPI_FUNC(int) _Py_EncodeLocaleEx(
+    const wchar_t *text,
+    char **str,
+    size_t *error_pos,
+    const char **reason,
+    int current_locale,
+    int surrogateescape);
 #endif

 #ifndef Py_LIMITED_API
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@ -1810,20 +1810,6 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
    PyObject *unicode,
    const char *errors
    );
-
-PyAPI_FUNC(PyObject*) _PyUnicode_DecodeCurrentLocale(
-    const char *str,
-    const char *errors);
-
-PyAPI_FUNC(PyObject*) _PyUnicode_DecodeCurrentLocaleAndSize(
-    const char *str,
-    Py_ssize_t len,
-    const char *errors);
-
-PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCurrentLocale(
-    PyObject *unicode,
-    const char *errors
-    );
 #endif

 /* --- File system encoding ---------------------------------------------- */
--- a/Modules/_datetimemodule.c
+++ b/Modules/_datetimemodule.c
@ -696,7 +696,7 @@ static int parse_isoformat_date(const char *dtstr,
    if (NULL == p) {
        return -1;
    }
-    
+
    if (*(p++) != '-') {
        return -2;
    }
--- a/Modules/_localemodule.c
+++ b/Modules/_localemodule.c
@ -572,8 +572,9 @@ PyIntl_bind_textdomain_codeset(PyObject* self,PyObject*args)
    if (!PyArg_ParseTuple(args, "sz", &domain, &codeset))
        return NULL;
    codeset = bind_textdomain_codeset(domain, codeset);
-    if (codeset)
+    if (codeset) {
        return PyUnicode_DecodeLocale(codeset, NULL);
+    }
    Py_RETURN_NONE;
 }
 #endif
--- a/Modules/getpath.c
+++ b/Modules/getpath.c
@ -449,8 +449,8 @@ search_for_exec_prefix(const _PyCoreConfig *core_config,
            n = fread(buf, 1, MAXPATHLEN, f);
            buf[n] = '\0';
            fclose(f);
-            rel_builddir_path = _Py_DecodeUTF8_surrogateescape(buf, n, NULL);
-            if (rel_builddir_path != NULL) {
+            rel_builddir_path = _Py_DecodeUTF8_surrogateescape(buf, n);
+            if (rel_builddir_path) {
                wcsncpy(exec_prefix, calculate->argv0_path, MAXPATHLEN);
                exec_prefix[MAXPATHLEN] = L'\0';
                joinpath(exec_prefix, rel_builddir_path);
--- a/Modules/readline.c
+++ b/Modules/readline.c
@ -132,13 +132,13 @@ static PyModuleDef readlinemodule;
 static PyObject *
 encode(PyObject *b)
 {
-    return _PyUnicode_EncodeCurrentLocale(b, "surrogateescape");
+    return PyUnicode_EncodeLocale(b, "surrogateescape");
 }

 static PyObject *
 decode(const char *s)
 {
-    return _PyUnicode_DecodeCurrentLocale(s, "surrogateescape");
+    return PyUnicode_DecodeLocale(s, "surrogateescape");
 }


--- a/Modules/timemodule.c
+++ b/Modules/timemodule.c
@ -418,11 +418,11 @@ tmtotuple(struct tm *p
    SET(8, p->tm_isdst);
 #ifdef HAVE_STRUCT_TM_TM_ZONE
    PyStructSequence_SET_ITEM(v, 9,
-        _PyUnicode_DecodeCurrentLocale(p->tm_zone, "surrogateescape"));
+        PyUnicode_DecodeLocale(p->tm_zone, "surrogateescape"));
    SET(10, p->tm_gmtoff);
 #else
    PyStructSequence_SET_ITEM(v, 9,
-        _PyUnicode_DecodeCurrentLocale(zone, "surrogateescape"));
+        PyUnicode_DecodeLocale(zone, "surrogateescape"));
    PyStructSequence_SET_ITEM(v, 10, _PyLong_FromTime_t(gmtoff));
 #endif /* HAVE_STRUCT_TM_TM_ZONE */
 #undef SET
@ -809,8 +809,7 @@ time_strftime(PyObject *self, PyObject *args)
 #ifdef HAVE_WCSFTIME
            ret = PyUnicode_FromWideChar(outbuf, buflen);
 #else
-            ret = _PyUnicode_DecodeCurrentLocaleAndSize(outbuf, buflen,
-                                                        "surrogateescape");
+            ret = PyUnicode_DecodeLocaleAndSize(outbuf, buflen, "surrogateescape");
 #endif
            PyMem_Free(outbuf);
            break;
@ -1541,8 +1540,8 @@ PyInit_timezone(PyObject *m) {
    PyModule_AddIntConstant(m, "altzone", timezone-3600);
 #endif
    PyModule_AddIntConstant(m, "daylight", daylight);
-    otz0 = _PyUnicode_DecodeCurrentLocale(tzname[0], "surrogateescape");
-    otz1 = _PyUnicode_DecodeCurrentLocale(tzname[1], "surrogateescape");
+    otz0 = PyUnicode_DecodeLocale(tzname[0], "surrogateescape");
+    otz1 = PyUnicode_DecodeLocale(tzname[1], "surrogateescape");
    PyModule_AddObject(m, "tzname", Py_BuildValue("(NN)", otz0, otz1));
 #else /* !HAVE_TZNAME || __GLIBC__ || __CYGWIN__*/
    {
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -3327,53 +3327,6 @@ PyUnicode_AsEncodedObject(PyObject *unicode,
    return NULL;
 }

-static size_t
-wcstombs_errorpos(const wchar_t *wstr)
-{
-    size_t len;
-#if SIZEOF_WCHAR_T == 2
-    wchar_t buf[3];
-#else
-    wchar_t buf[2];
-#endif
-    char outbuf[MB_LEN_MAX];
-    const wchar_t *start, *previous;
-
-#if SIZEOF_WCHAR_T == 2
-    buf[2] = 0;
-#else
-    buf[1] = 0;
-#endif
-    start = wstr;
-    while (*wstr != L'\0')
-    {
-        previous = wstr;
-#if SIZEOF_WCHAR_T == 2
-        if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[0])
-            && Py_UNICODE_IS_LOW_SURROGATE(wstr[1]))
-        {
-            buf[0] = wstr[0];
-            buf[1] = wstr[1];
-            wstr += 2;
-        }
-        else {
-            buf[0] = *wstr;
-            buf[1] = 0;
-            wstr++;
-        }
-#else
-        buf[0] = *wstr;
-        wstr++;
-#endif
-        len = wcstombs(outbuf, buf, sizeof(outbuf));
-        if (len == (size_t)-1)
-            return previous - start;
-    }
-
-    /* failed to find the unencodable character */
-    return 0;
-}
-
 static int
 locale_error_handler(const char *errors, int *surrogateescape)
 {
@ -3396,130 +3349,60 @@ locale_error_handler(const char *errors, int *surrogateescape)
 }

 static PyObject *
-unicode_encode_locale(PyObject *unicode, const char *errors, int current_locale)
+unicode_encode_locale(PyObject *unicode, const char *errors,
+                      int current_locale)
 {
-    Py_ssize_t wlen, wlen2;
-    wchar_t *wstr;
-    char *errmsg;
-    PyObject *bytes, *reason, *exc;
-    size_t error_pos, errlen;
    int surrogateescape;
-
    if (locale_error_handler(errors, &surrogateescape) < 0)
        return NULL;

-    wstr = PyUnicode_AsWideCharString(unicode, &wlen);
-    if (wstr == NULL)
+    Py_ssize_t wlen;
+    wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
+    if (wstr == NULL) {
        return NULL;
+    }

-    wlen2 = wcslen(wstr);
+    Py_ssize_t wlen2 = wcslen(wstr);
    if (wlen2 != wlen) {
        PyMem_Free(wstr);
        PyErr_SetString(PyExc_ValueError, "embedded null character");
        return NULL;
    }

-    if (surrogateescape) {
-        /* "surrogateescape" error handler */
-        char *str;
-
-        if (current_locale) {
-            str = _Py_EncodeCurrentLocale(wstr, &error_pos);
+    char *str;
+    size_t error_pos;
+    const char *reason;
+    int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
+                                 current_locale, surrogateescape);
+    if (res != 0) {
+        if (res == -2) {
+            PyObject *exc;
+            exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
+                    "locale", unicode,
+                    (Py_ssize_t)error_pos,
+                    (Py_ssize_t)(error_pos+1),
+                    reason);
+            if (exc != NULL) {
+                PyCodec_StrictErrors(exc);
+                Py_DECREF(exc);
+            }
+            return NULL;
        }
        else {
-            str = Py_EncodeLocale(wstr, &error_pos);
-        }
-        if (str == NULL) {
-            if (error_pos == (size_t)-1) {
-                PyErr_NoMemory();
-                PyMem_Free(wstr);
-                return NULL;
-            }
-            else {
-                goto encode_error;
-            }
-        }
-        PyMem_Free(wstr);
-
-        bytes = PyBytes_FromString(str);
-        if (current_locale) {
-            PyMem_RawFree(str);
-        }
-        else {
-            PyMem_Free(str);
-        }
-    }
-    else {
-        /* strict mode */
-        size_t len, len2;
-
-        len = wcstombs(NULL, wstr, 0);
-        if (len == (size_t)-1) {
-            error_pos = (size_t)-1;
-            goto encode_error;
-        }
-
-        bytes = PyBytes_FromStringAndSize(NULL, len);
-        if (bytes == NULL) {
+            PyErr_NoMemory();
            PyMem_Free(wstr);
            return NULL;
        }
-
-        len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
-        if (len2 == (size_t)-1 || len2 > len) {
-            Py_DECREF(bytes);
-            error_pos = (size_t)-1;
-            goto encode_error;
-        }
-        PyMem_Free(wstr);
    }
-    return bytes;
-
-encode_error:
-    errmsg = strerror(errno);
-    assert(errmsg != NULL);
-
-    if (error_pos == (size_t)-1)
-        error_pos = wcstombs_errorpos(wstr);
-
    PyMem_Free(wstr);

-    wstr = Py_DecodeLocale(errmsg, &errlen);
-    if (wstr != NULL) {
-        reason = PyUnicode_FromWideChar(wstr, errlen);
-        PyMem_RawFree(wstr);
-    } else {
-        errmsg = NULL;
-    }
-
-    if (errmsg == NULL)
-        reason = PyUnicode_FromString(
-            "wcstombs() encountered an unencodable "
-            "wide character");
-    if (reason == NULL)
-        return NULL;
-
-    exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
-                                "locale", unicode,
-                                (Py_ssize_t)error_pos,
-                                (Py_ssize_t)(error_pos+1),
-                                reason);
-    Py_DECREF(reason);
-    if (exc != NULL) {
-        PyCodec_StrictErrors(exc);
-        Py_DECREF(exc);
-    }
-    return NULL;
+    PyObject *bytes = PyBytes_FromString(str);
+    PyMem_RawFree(str);
+    return bytes;
 }

 PyObject *
 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
-{
-    return unicode_encode_locale(unicode, errors, 0);
-}
-
-PyObject *
-_PyUnicode_EncodeCurrentLocale(PyObject *unicode, const char *errors)
 {
    return unicode_encode_locale(unicode, errors, 1);
 }
@ -3687,51 +3570,11 @@ PyUnicode_AsEncodedUnicode(PyObject *unicode,
    return NULL;
 }

-static size_t
-mbstowcs_errorpos(const char *str, size_t len)
-{
-#ifdef HAVE_MBRTOWC
-    const char *start = str;
-    mbstate_t mbs;
-    size_t converted;
-    wchar_t ch;
-
-    memset(&mbs, 0, sizeof mbs);
-    while (len)
-    {
-        converted = mbrtowc(&ch, str, len, &mbs);
-        if (converted == 0)
-            /* Reached end of string */
-            break;
-        if (converted == (size_t)-1 || converted == (size_t)-2) {
-            /* Conversion error or incomplete character */
-            return str - start;
-        }
-        else {
-            str += converted;
-            len -= converted;
-        }
-    }
-    /* failed to find the undecodable byte sequence */
-    return 0;
-#endif
-    return 0;
-}
-
 static PyObject*
 unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
                      int current_locale)
 {
-    wchar_t smallbuf[256];
-    size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
-    wchar_t *wstr;
-    size_t wlen, wlen2;
-    PyObject *unicode;
    int surrogateescape;
-    size_t error_pos, errlen;
-    char *errmsg;
-    PyObject *exc, *reason = NULL;   /* initialize to prevent gcc warning */
-
    if (locale_error_handler(errors, &surrogateescape) < 0)
        return NULL;

@ -3740,113 +3583,47 @@ unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
        return NULL;
    }

-    if (surrogateescape) {
-        /* "surrogateescape" error handler */
-        if (current_locale) {
-            wstr = _Py_DecodeCurrentLocale(str, &wlen);
+    wchar_t *wstr;
+    size_t wlen;
+    const char *reason;
+    int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
+                                 current_locale, surrogateescape);
+    if (res != 0) {
+        if (res == -2) {
+            PyObject *exc;
+            exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
+                                        "locale", str, len,
+                                        (Py_ssize_t)wlen,
+                                        (Py_ssize_t)(wlen + 1),
+                                        reason);
+            if (exc != NULL) {
+                PyCodec_StrictErrors(exc);
+                Py_DECREF(exc);
+            }
        }
        else {
-            wstr = Py_DecodeLocale(str, &wlen);
+            PyErr_NoMemory();
        }
-        if (wstr == NULL) {
-            if (wlen == (size_t)-1)
-                PyErr_NoMemory();
-            else
-                PyErr_SetFromErrno(PyExc_OSError);
-            return NULL;
-        }
-
-        unicode = PyUnicode_FromWideChar(wstr, wlen);
-        PyMem_RawFree(wstr);
-    }
-    else {
-        /* strict mode */
-#ifndef HAVE_BROKEN_MBSTOWCS
-        wlen = mbstowcs(NULL, str, 0);
-#else
-        wlen = len;
-#endif
-        if (wlen == (size_t)-1)
-            goto decode_error;
-        if (wlen+1 <= smallbuf_len) {
-            wstr = smallbuf;
-        }
-        else {
-            wstr = PyMem_New(wchar_t, wlen+1);
-            if (!wstr)
-                return PyErr_NoMemory();
-        }
-
-        wlen2 = mbstowcs(wstr, str, wlen+1);
-        if (wlen2 == (size_t)-1) {
-            if (wstr != smallbuf)
-                PyMem_Free(wstr);
-            goto decode_error;
-        }
-#ifdef HAVE_BROKEN_MBSTOWCS
-        assert(wlen2 == wlen);
-#endif
-        unicode = PyUnicode_FromWideChar(wstr, wlen2);
-        if (wstr != smallbuf)
-            PyMem_Free(wstr);
-    }
-    return unicode;
-
-decode_error:
-    errmsg = strerror(errno);
-    assert(errmsg != NULL);
-
-    error_pos = mbstowcs_errorpos(str, len);
-    wstr = Py_DecodeLocale(errmsg, &errlen);
-    if (wstr != NULL) {
-        reason = PyUnicode_FromWideChar(wstr, errlen);
-        PyMem_RawFree(wstr);
-    }
-
-    if (reason == NULL)
-        reason = PyUnicode_FromString(
-            "mbstowcs() encountered an invalid multibyte sequence");
-    if (reason == NULL)
        return NULL;
-
-    exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
-                                "locale", str, len,
-                                (Py_ssize_t)error_pos,
-                                (Py_ssize_t)(error_pos+1),
-                                reason);
-    Py_DECREF(reason);
-    if (exc != NULL) {
-        PyCodec_StrictErrors(exc);
-        Py_DECREF(exc);
    }
-    return NULL;
+
+    PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
+    PyMem_RawFree(wstr);
+    return unicode;
 }

 PyObject*
 PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
                              const char *errors)
-{
-    return unicode_decode_locale(str, len, errors, 0);
-}
-
-PyObject*
-_PyUnicode_DecodeCurrentLocaleAndSize(const char *str, Py_ssize_t len,
-                                      const char *errors)
 {
    return unicode_decode_locale(str, len, errors, 1);
 }

-PyObject*
-_PyUnicode_DecodeCurrentLocale(const char *str, const char *errors)
-{
-    return unicode_decode_locale(str, (Py_ssize_t)strlen(str), errors, 1);
-}
-
 PyObject*
 PyUnicode_DecodeLocale(const char *str, const char *errors)
 {
    Py_ssize_t size = (Py_ssize_t)strlen(str);
-    return unicode_decode_locale(str, size, errors, 0);
+    return unicode_decode_locale(str, size, errors, 1);
 }


@ -3878,7 +3655,8 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
                                Py_FileSystemDefaultEncodeErrors);
    }
    else {
-        return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
+        return unicode_decode_locale(s, size,
+                                     Py_FileSystemDefaultEncodeErrors, 0);
    }
 #endif
 }
@ -5128,17 +4906,23 @@ onError:
 }


-/* UTF-8 decoder using the surrogateescape error handler .
+/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
+   non-zero, use strict error handler otherwise.

-   On success, return a pointer to a newly allocated wide character string (use
-   PyMem_RawFree() to free the memory) and write the output length (in number
-   of wchar_t units) into *p_wlen (if p_wlen is set).
+   On success, return 0, write a pointer to a newly allocated wide character
+   string into *wstr (use PyMem_RawFree() to free the memory) and write the
+   output length (in number of wchar_t units) into *wlen (if wlen is set).

-   On memory allocation failure, return -1 and write (size_t)-1 into *p_wlen
-   (if p_wlen is set). */
-wchar_t*
-_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
+   On memory allocation failure, return -1.
+
+   On decoding error (if surrogateescape is zero), return -2. If wlen is
+   non-NULL, write the start of the illegal byte sequence into *wlen. If reason
+   is not NULL, write the decoding error message into *reason. */
+int
+_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
+                 const char **reason, int surrogateescape)
 {
+    const char *orig_s = s;
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;
@ -5146,18 +4930,12 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
-        if (p_wlen) {
-            *p_wlen = (size_t)-1;
-        }
-        return NULL;
+        return -1;
    }

    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode) {
-        if (p_wlen) {
-            *p_wlen = (size_t)-1;
-        }
-        return NULL;
+        return -1;
    }

    /* Unpack UTF-8 encoded data */
@ -5175,7 +4953,7 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
            Py_UNREACHABLE();
 #else
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
-            /*  compute and append the two surrogates: */
+            /* write a surrogate pair */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
 #endif
@ -5183,60 +4961,88 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
        else {
            if (!ch && s == e)
                break;
+            if (!surrogateescape) {
+                PyMem_RawFree(unicode );
+                if (reason != NULL) {
+                    switch (ch) {
+                    case 0:
+                        *reason = "unexpected end of data";
+                        break;
+                    case 1:
+                        *reason = "invalid start byte";
+                        break;
+                    /* 2, 3, 4 */
+                    default:
+                        *reason = "invalid continuation byte";
+                        break;
+                    }
+                }
+                if (wlen != NULL) {
+                    *wlen = s - orig_s;
+                }
+                return -2;
+            }
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
-    if (p_wlen) {
-        *p_wlen = outpos;
+    if (wlen) {
+        *wlen = outpos;
    }
-    return unicode;
+    *wstr = unicode;
+    return 0;
+}
+
+wchar_t*
+_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen)
+{
+    wchar_t *wstr;
+    int res = _Py_DecodeUTF8Ex(arg, arglen, &wstr, NULL, NULL, 1);
+    if (res != 0) {
+        return NULL;
+    }
+    return wstr;
 }


 /* UTF-8 encoder using the surrogateescape error handler .

-   On success, return a pointer to a newly allocated character string (use
-   PyMem_Free() to free the memory).
+   On success, return 0 and write the newly allocated character string into
+   *str (free it with PyMem_RawFree() if raw_malloc is set, else PyMem_Free()).

-   On encoding failure, return NULL and write the position of the invalid
-   surrogate character into *error_pos (if error_pos is set).
+   On encoding failure, return -2 and write the position of the invalid
+   surrogate character into *error_pos (if error_pos is set) and the encoding
+   error message into *reason (if reason is set).

-   On memory allocation failure, return NULL and write (size_t)-1 into
-   *error_pos (if error_pos is set). */
-char*
-_Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos,
-                               int raw_malloc)
+   On memory allocation failure, return -1. */
+int
+_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
+                 const char **reason, int raw_malloc, int surrogateescape)
 {
    const Py_ssize_t max_char_size = 4;
    Py_ssize_t len = wcslen(text);

    assert(len >= 0);

+    if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
+        return -1;
+    }
    char *bytes;
-    if (len <= PY_SSIZE_T_MAX / max_char_size - 1) {
-        if (raw_malloc) {
-            bytes = PyMem_RawMalloc((len + 1) * max_char_size);
-        }
-        else {
-            bytes = PyMem_Malloc((len + 1) * max_char_size);
-        }
+    if (raw_malloc) {
+        bytes = PyMem_RawMalloc((len + 1) * max_char_size);
    }
    else {
-        bytes = NULL;
+        bytes = PyMem_Malloc((len + 1) * max_char_size);
    }
    if (bytes == NULL) {
-        if (error_pos != NULL) {
-            *error_pos = (size_t)-1;
-        }
-        return NULL;
+        return -1;
    }

    char *p = bytes;
    Py_ssize_t i;
-    for (i = 0; i < len;) {
-        Py_UCS4 ch = text[i++];
+    for (i = 0; i < len; i++) {
+        Py_UCS4 ch = text[i];

        if (ch < 0x80) {
            /* Encode ASCII */
@ -5250,11 +5056,20 @@ _Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos,
        }
        else if (Py_UNICODE_IS_SURROGATE(ch)) {
            /* surrogateescape error handler */
-            if (!(0xDC80 <= ch && ch <= 0xDCFF)) {
+            if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
                if (error_pos != NULL) {
-                    *error_pos = (size_t)i - 1;
+                    *error_pos = (size_t)i;
                }
-                goto error;
+                if (reason != NULL) {
+                    *reason = "encoding error";
+                }
+                if (raw_malloc) {
+                    PyMem_RawFree(bytes);
+                }
+                else {
+                    PyMem_Free(bytes);
+                }
+                return -2;
            }
            *p++ = (char)(ch & 0xff);
        }
@ -5286,18 +5101,16 @@ _Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos,
        if (error_pos != NULL) {
            *error_pos = (size_t)-1;
        }
-        goto error;
+        if (raw_malloc) {
+            PyMem_RawFree(bytes);
+        }
+        else {
+            PyMem_Free(bytes);
+        }
+        return -1;
    }
-    return bytes2;
-
- error:
-    if (raw_malloc) {
-        PyMem_RawFree(bytes);
-    }
-    else {
-        PyMem_Free(bytes);
-    }
-    return NULL;
+    *str = bytes2;
+    return 0;
 }


--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@ -20,9 +20,6 @@ extern int winerror_to_errno(int);
 #include <fcntl.h>
 #endif /* HAVE_FCNTL_H */

-extern char* _Py_EncodeUTF8_surrogateescape(const wchar_t *text,
-                                            size_t *error_pos, int raw_malloc);
-
 #ifdef O_CLOEXEC
 /* Does open() support the O_CLOEXEC flag? Possible values:

@ -69,7 +66,10 @@ _Py_device_encoding(int fd)
    Py_RETURN_NONE;
 }

-#if !defined(__APPLE__) && !defined(MS_WINDOWS)
+#if !defined(__APPLE__) && !defined(__ANDROID__) && !defined(MS_WINDOWS)
+
+#define USE_FORCE_ASCII
+
 extern int _Py_normalize_encoding(const char *, char *, size_t);

 /* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale.
@ -90,7 +90,7 @@ extern int _Py_normalize_encoding(const char *, char *, size_t);

       1: the workaround is used: Py_EncodeLocale() uses
          encode_ascii_surrogateescape() and Py_DecodeLocale() uses
-          decode_ascii_surrogateescape()
+          decode_ascii()
       0: the workaround is not used: Py_EncodeLocale() uses wcstombs() and
          Py_DecodeLocale() uses mbstowcs()
      -1: unknown, need to call check_force_ascii() to get the value
@ -180,16 +180,15 @@ error:
    return 1;
 }

-static char*
-encode_ascii_surrogateescape(const wchar_t *text, size_t *error_pos, int raw_malloc)
+static int
+encode_ascii(const wchar_t *text, char **str,
+             size_t *error_pos, const char **reason,
+             int raw_malloc, int surrogateescape)
 {
    char *result = NULL, *out;
    size_t len, i;
    wchar_t ch;

-    if (error_pos != NULL)
-        *error_pos = (size_t)-1;
-
    len = wcslen(text);

    /* +1 for NULL byte */
@ -199,8 +198,9 @@ encode_ascii_surrogateescape(const wchar_t *text, size_t *error_pos, int raw_mal
    else {
        result = PyMem_Malloc(len + 1);
    }
-    if (result == NULL)
-        return NULL;
+    if (result == NULL) {
+        return -1;
+    }

    out = result;
    for (i=0; i<len; i++) {
@ -210,60 +210,84 @@ encode_ascii_surrogateescape(const wchar_t *text, size_t *error_pos, int raw_mal
            /* ASCII character */
            *out++ = (char)ch;
        }
-        else if (0xdc80 <= ch && ch <= 0xdcff) {
+        else if (surrogateescape && 0xdc80 <= ch && ch <= 0xdcff) {
            /* UTF-8b surrogate */
            *out++ = (char)(ch - 0xdc00);
        }
        else {
-            if (error_pos != NULL) {
-                *error_pos = i;
-            }
            if (raw_malloc) {
                PyMem_RawFree(result);
            }
            else {
                PyMem_Free(result);
            }
-            return NULL;
+            if (error_pos != NULL) {
+                *error_pos = i;
+            }
+            if (reason) {
+                *reason = "encoding error";
+            }
+            return -2;
        }
    }
    *out = '\0';
-    return result;
+    *str = result;
+    return 0;
 }
-#endif   /* !defined(__APPLE__) && !defined(MS_WINDOWS) */
+#endif   /* !defined(__APPLE__) && !defined(__ANDROID__) && !defined(MS_WINDOWS) */

-#if !defined(__APPLE__) && (!defined(MS_WINDOWS) || !defined(HAVE_MBRTOWC))
-static wchar_t*
-decode_ascii_surrogateescape(const char *arg, size_t *size)
+
+#if !defined(HAVE_MBRTOWC) || defined(USE_FORCE_ASCII)
+static int
+decode_ascii(const char *arg, wchar_t **wstr, size_t *wlen,
+             const char **reason, int surrogateescape)
 {
    wchar_t *res;
    unsigned char *in;
    wchar_t *out;
    size_t argsize = strlen(arg) + 1;

-    if (argsize > PY_SSIZE_T_MAX/sizeof(wchar_t))
-        return NULL;
-    res = PyMem_RawMalloc(argsize*sizeof(wchar_t));
-    if (!res)
-        return NULL;
+    if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t)) {
+        return -1;
+    }
+    res = PyMem_RawMalloc(argsize * sizeof(wchar_t));
+    if (!res) {
+        return -1;
+    }

-    in = (unsigned char*)arg;
    out = res;
-    while(*in)
-        if(*in < 128)
-            *out++ = *in++;
-        else
-            *out++ = 0xdc00 + *in++;
+    for (in = (unsigned char*)arg; *in; in++) {
+        unsigned char ch = *in;
+        if (ch < 128) {
+            *out++ = ch;
+        }
+        else {
+            if (!surrogateescape) {
+                PyMem_RawFree(res);
+                if (wlen) {
+                    *wlen = in - (unsigned char*)arg;
+                }
+                if (reason) {
+                    *reason = "decoding error";
+                }
+                return -2;
+            }
+            *out++ = 0xdc00 + ch;
+        }
+    }
    *out = 0;
-    if (size != NULL)
-        *size = out - res;
-    return res;
-}
-#endif

-#if !defined(__APPLE__) && !defined(__ANDROID__)
-static wchar_t*
-decode_current_locale(const char* arg, size_t *size)
+    if (wlen != NULL) {
+        *wlen = out - res;
+    }
+    *wstr = res;
+    return 0;
+}
+#endif   /* !defined(HAVE_MBRTOWC) || defined(USE_FORCE_ASCII) */
+
+static int
+decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
+                      const char **reason, int surrogateescape)
 {
    wchar_t *res;
    size_t argsize;
@ -284,15 +308,15 @@ decode_current_locale(const char* arg, size_t *size)
    argsize = mbstowcs(NULL, arg, 0);
 #endif
    if (argsize != (size_t)-1) {
-        if (argsize == PY_SSIZE_T_MAX)
-            goto oom;
-        argsize += 1;
-        if (argsize > PY_SSIZE_T_MAX/sizeof(wchar_t))
-            goto oom;
-        res = (wchar_t *)PyMem_RawMalloc(argsize*sizeof(wchar_t));
-        if (!res)
-            goto oom;
-        count = mbstowcs(res, arg, argsize);
+        if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
+            return -1;
+        }
+        res = (wchar_t *)PyMem_RawMalloc((argsize + 1) * sizeof(wchar_t));
+        if (!res) {
+            return -1;
+        }
+
+        count = mbstowcs(res, arg, argsize + 1);
        if (count != (size_t)-1) {
            wchar_t *tmp;
            /* Only use the result if it contains no
@ -301,13 +325,16 @@ decode_current_locale(const char* arg, size_t *size)
                         !Py_UNICODE_IS_SURROGATE(*tmp); tmp++)
                ;
            if (*tmp == 0) {
-                if (size != NULL)
-                    *size = count;
-                return res;
+                if (wlen != NULL) {
+                    *wlen = count;
+                }
+                *wstr = res;
+                return 0;
            }
        }
        PyMem_RawFree(res);
    }
+
    /* Conversion failed. Fall back to escaping with surrogateescape. */
 #ifdef HAVE_MBRTOWC
    /* Try conversion with mbrtwoc (C99), and escape non-decodable bytes. */
@ -315,30 +342,37 @@ decode_current_locale(const char* arg, size_t *size)
    /* Overallocate; as multi-byte characters are in the argument, the
       actual output could use less memory. */
    argsize = strlen(arg) + 1;
-    if (argsize > PY_SSIZE_T_MAX/sizeof(wchar_t))
-        goto oom;
-    res = (wchar_t*)PyMem_RawMalloc(argsize*sizeof(wchar_t));
-    if (!res)
-        goto oom;
+    if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t)) {
+        return -1;
+    }
+    res = (wchar_t*)PyMem_RawMalloc(argsize * sizeof(wchar_t));
+    if (!res) {
+        return -1;
+    }
+
    in = (unsigned char*)arg;
    out = res;
    memset(&mbs, 0, sizeof mbs);
    while (argsize) {
        size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
-        if (converted == 0)
+        if (converted == 0) {
            /* Reached end of string; null char stored. */
            break;
+        }
+
        if (converted == (size_t)-2) {
            /* Incomplete character. This should never happen,
               since we provide everything that we have -
               unless there is a bug in the C library, or I
               misunderstood how mbrtowc works. */
-            PyMem_RawFree(res);
-            if (size != NULL)
-                *size = (size_t)-2;
-            return NULL;
+            goto decode_error;
        }
+
        if (converted == (size_t)-1) {
+            if (!surrogateescape) {
+                goto decode_error;
+            }
+
            /* Conversion error. Escape as UTF-8b, and start over
               in the initial shift state. */
            *out++ = 0xdc00 + *in++;
@ -346,12 +380,18 @@ decode_current_locale(const char* arg, size_t *size)
            memset(&mbs, 0, sizeof mbs);
            continue;
        }
+
        if (Py_UNICODE_IS_SURROGATE(*out)) {
+            if (!surrogateescape) {
+                goto decode_error;
+            }
+
            /* Surrogate character.  Escape the original
               byte sequence with surrogateescape. */
            argsize -= converted;
-            while (converted--)
+            while (converted--) {
                *out++ = 0xdc00 + *in++;
+            }
            continue;
        }
        /* successfully converted some bytes */
@ -359,55 +399,80 @@ decode_current_locale(const char* arg, size_t *size)
        argsize -= converted;
        out++;
    }
-    if (size != NULL)
-        *size = out - res;
+    if (wlen != NULL) {
+        *wlen = out - res;
+    }
+    *wstr = res;
+    return 0;
+
+decode_error:
+    PyMem_RawFree(res);
+    if (wlen) {
+        *wlen = in - (unsigned char*)arg;
+    }
+    if (reason) {
+        *reason = "decoding error";
+    }
+    return -2;
 #else   /* HAVE_MBRTOWC */
    /* Cannot use C locale for escaping; manually escape as if charset
       is ASCII (i.e. escape all bytes > 128. This will still roundtrip
       correctly in the locale's charset, which must be an ASCII superset. */
-    res = decode_ascii_surrogateescape(arg, size);
-    if (res == NULL)
-        goto oom;
+    return decode_ascii(arg, wstr, wlen, reason, surrogateescape);
 #endif   /* HAVE_MBRTOWC */
-    return res;
-
-oom:
-    if (size != NULL) {
-        *size = (size_t)-1;
-    }
-    return NULL;
 }
-#endif


-static wchar_t*
-decode_locale(const char* arg, size_t *size, int ignore_utf8_mode)
+/* Decode a byte string from the locale encoding.
+
+   Use the strict error handler if 'surrogateescape' is zero.  Use the
+   surrogateescape error handler if 'surrogateescape' is non-zero: undecodable
+   bytes are decoded as characters in range U+DC80..U+DCFF. If a byte sequence
+   can be decoded as a surrogate character, escape the bytes using the
+   surrogateescape error handler instead of decoding them.
+
+   On success, return 0 and write the newly allocated wide character string into
+   *wstr (use PyMem_RawFree() to free the memory). If wlen is not NULL, write
+   the number of wide characters excluding the null character into *wlen.
+
+   On memory allocation failure, return -1.
+
+   On decoding error, return -2. If wlen is not NULL, write the offset of the
+   start of the invalid byte sequence in the input string into *wlen. If reason
+   is not NULL, write the decoding error message into *reason.
+
+   Use the _Py_EncodeLocaleEx() function to encode the character string back to
+   a byte string. */
+int
+_Py_DecodeLocaleEx(const char* arg, wchar_t **wstr, size_t *wlen,
+                   const char **reason,
+                   int current_locale, int surrogateescape)
 {
-#if defined(__APPLE__) || defined(__ANDROID__)
-    return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size);
-#else
-    if (!ignore_utf8_mode && Py_UTF8Mode == 1) {
-        return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size);
+    if (current_locale) {
+        return decode_current_locale(arg, wstr, wlen, reason, surrogateescape);
    }

-#ifndef MS_WINDOWS
-    if (force_ascii == -1)
+#if defined(__APPLE__) || defined(__ANDROID__)
+    return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason,
+                            surrogateescape);
+#else
+    if (Py_UTF8Mode == 1) {
+        return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason,
+                                surrogateescape);
+    }
+
+#ifdef USE_FORCE_ASCII
+    if (force_ascii == -1) {
        force_ascii = check_force_ascii();
+    }

    if (force_ascii) {
        /* force ASCII encoding to workaround mbstowcs() issue */
-        wchar_t *wstr = decode_ascii_surrogateescape(arg, size);
-        if (wstr == NULL) {
-            if (size != NULL) {
-                *size = (size_t)-1;
-            }
-            return NULL;
-        }
-        return wstr;
+        return decode_ascii(arg, wstr, wlen, reason, surrogateescape);
    }
 #endif

-    return decode_current_locale(arg, size);
+    return decode_current_locale(arg, wstr, wlen, reason, surrogateescape);
 #endif   /* __APPLE__ or __ANDROID__ */
 }

@ -432,23 +497,24 @@ decode_locale(const char* arg, size_t *size, int ignore_utf8_mode)
   Use the Py_EncodeLocale() function to encode the character string back to a
   byte string. */
 wchar_t*
-Py_DecodeLocale(const char* arg, size_t *size)
+Py_DecodeLocale(const char* arg, size_t *wlen)
 {
-    return decode_locale(arg, size, 0);
+    wchar_t *wstr;
+    int res = _Py_DecodeLocaleEx(arg, &wstr, wlen, NULL, 0, 1);
+    if (res != 0) {
+        if (wlen != NULL) {
+            *wlen = (size_t)res;
+        }
+        return NULL;
+    }
+    return wstr;
 }


-/* Similar to Py_DecodeLocale() but ignore the UTF-8 mode */
-wchar_t*
-_Py_DecodeCurrentLocale(const char* arg, size_t *size)
-{
-    return decode_locale(arg, size, 1);
-}
-
-
-#if !defined(__APPLE__) && !defined(__ANDROID__)
-static char*
-encode_current_locale(const wchar_t *text, size_t *error_pos, int raw_malloc)
+static int
+encode_current_locale(const wchar_t *text, char **str,
+                      size_t *error_pos, const char **reason,
+                      int raw_malloc, int surrogateescape)
 {
    const size_t len = wcslen(text);
    char *result = NULL, *bytes = NULL;
@ -464,38 +530,37 @@ encode_current_locale(const wchar_t *text, size_t *error_pos, int raw_malloc)
        for (i=0; i < len; i++) {
            c = text[i];
            if (c >= 0xdc80 && c <= 0xdcff) {
+                if (!surrogateescape) {
+                    goto encode_error;
+                }
                /* UTF-8b surrogate */
                if (bytes != NULL) {
                    *bytes++ = c - 0xdc00;
                    size--;
                }
-                else
+                else {
                    size++;
+                }
                continue;
            }
            else {
                buf[0] = c;
-                if (bytes != NULL)
+                if (bytes != NULL) {
                    converted = wcstombs(bytes, buf, size);
-                else
+                }
+                else {
                    converted = wcstombs(NULL, buf, 0);
+                }
                if (converted == (size_t)-1) {
-                    if (raw_malloc) {
-                        PyMem_RawFree(result);
-                    }
-                    else {
-                        PyMem_Free(result);
-                    }
-                    if (error_pos != NULL)
-                        *error_pos = i;
-                    return NULL;
+                    goto encode_error;
                }
                if (bytes != NULL) {
                    bytes += converted;
                    size -= converted;
                }
-                else
+                else {
                    size += converted;
+                }
            }
        }
        if (result != NULL) {
@ -511,38 +576,78 @@ encode_current_locale(const wchar_t *text, size_t *error_pos, int raw_malloc)
            result = PyMem_Malloc(size);
        }
        if (result == NULL) {
-            if (error_pos != NULL) {
-                *error_pos = (size_t)-1;
-            }
-            return NULL;
+            return -1;
        }
        bytes = result;
    }
-    return result;
+    *str = result;
+    return 0;
+
+encode_error:
+    if (raw_malloc) {
+        PyMem_RawFree(result);
+    }
+    else {
+        PyMem_Free(result);
+    }
+    if (error_pos != NULL) {
+        *error_pos = i;
+    }
+    if (reason) {
+        *reason = "encoding error";
+    }
+    return -2;
 }
+
+static int
+encode_locale_ex(const wchar_t *text, char **str, size_t *error_pos,
+                 const char **reason,
+                 int raw_malloc, int current_locale, int surrogateescape)
+{
+    if (current_locale) {
+        return encode_current_locale(text, str, error_pos, reason,
+                                     raw_malloc, surrogateescape);
+    }
+
+#if defined(__APPLE__) || defined(__ANDROID__)
+    return _Py_EncodeUTF8Ex(text, str, error_pos, reason,
+                            raw_malloc, surrogateescape);
+#else   /* __APPLE__ or __ANDROID__ */
+    if (Py_UTF8Mode == 1) {
+        return _Py_EncodeUTF8Ex(text, str, error_pos, reason,
+                                raw_malloc, surrogateescape);
+    }
+
+#ifdef USE_FORCE_ASCII
+    if (force_ascii == -1) {
+        force_ascii = check_force_ascii();
+    }
+
+    if (force_ascii) {
+        return encode_ascii(text, str, error_pos, reason,
+                            raw_malloc, surrogateescape);
+    }
 #endif

+    return encode_current_locale(text, str, error_pos, reason,
+                                 raw_malloc, surrogateescape);
+#endif   /* __APPLE__ or __ANDROID__ */
+}
+
 static char*
 encode_locale(const wchar_t *text, size_t *error_pos,
-              int raw_malloc, int ignore_utf8_mode)
+              int raw_malloc, int current_locale)
 {
-#if defined(__APPLE__) || defined(__ANDROID__)
-    return _Py_EncodeUTF8_surrogateescape(text, error_pos, raw_malloc);
-#else   /* __APPLE__ */
-    if (!ignore_utf8_mode && Py_UTF8Mode == 1) {
-        return _Py_EncodeUTF8_surrogateescape(text, error_pos, raw_malloc);
+    char *str;
+    int res = encode_locale_ex(text, &str, error_pos, NULL,
+                               raw_malloc, current_locale, 1);
+    if (res != -2 && error_pos) {
+        *error_pos = (size_t)-1;
    }
-
-#ifndef MS_WINDOWS
-    if (force_ascii == -1)
-        force_ascii = check_force_ascii();
-
-    if (force_ascii)
-        return encode_ascii_surrogateescape(text, error_pos, raw_malloc);
-#endif
-
-    return encode_current_locale(text, error_pos, raw_malloc);
-#endif   /* __APPLE__ or __ANDROID__ */
+    if (res != 0) {
+        return NULL;
+    }
+    return str;
 }

 /* Encode a wide character string to the locale encoding with the
@ -573,11 +678,13 @@ _Py_EncodeLocaleRaw(const wchar_t *text, size_t *error_pos)
 }


-/* Similar to _Py_EncodeLocaleRaw() but ignore the UTF-8 Mode */
-char*
-_Py_EncodeCurrentLocale(const wchar_t *text, size_t *error_pos)
+int
+_Py_EncodeLocaleEx(const wchar_t *text, char **str,
+                   size_t *error_pos, const char **reason,
+                   int current_locale, int surrogateescape)
 {
-    return encode_locale(text, error_pos, 1, 1);
+    return encode_locale_ex(text, str, error_pos, reason, 1,
+                            current_locale, surrogateescape);
 }


--- a/Python/pathconfig.c
+++ b/Python/pathconfig.c
@ -382,8 +382,8 @@ _Py_FindEnvConfigValue(FILE *env_file, const wchar_t *key,
            /* Comment - skip */
            continue;
        }
-        tmpbuffer = _Py_DecodeUTF8_surrogateescape(buffer, n, NULL);
-        if (tmpbuffer != NULL) {
+        tmpbuffer = _Py_DecodeUTF8_surrogateescape(buffer, n);
+        if (tmpbuffer) {
            wchar_t * state;
            wchar_t * tok = wcstok(tmpbuffer, L" \t\r\n", &state);
            if ((tok != NULL) && !wcscmp(tok, key)) {