bpo-34523: Support surrogatepass in locale codecs (GH-8995)

Add support for the "surrogatepass" error handler in PyUnicode_DecodeFSDefault() and PyUnicode_EncodeFSDefault() for the UTF-8 encoding. Changes: * _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex() now support the surrogatepass error handler (_Py_ERROR_SURROGATEPASS). * _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx() now use the _Py_error_handler enum instead of "int surrogateescape" to pass the error handler. These functions now return -3 if the error handler is unknown. * Add unit tests on _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx() in test_codecs. * Rename get_error_handler() to _Py_GetErrorHandler() and expose it as a private function. * _freeze_importlib doesn't need config.filesystem_errors="strict" workaround anymore.
2018-08-29 22:21:32 +02:00 · 2018-08-29 22:21:32 +02:00 · 3d4226a832
parent c5989cd876
commit 3d4226a832
7 changed files with 423 additions and 117 deletions
--- a/Include/fileutils.h
+++ b/Include/fileutils.h
@ -5,6 +5,24 @@
 extern "C" {
 #endif

+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03080000
+typedef enum {
+    _Py_ERROR_UNKNOWN=0,
+    _Py_ERROR_STRICT,
+    _Py_ERROR_SURROGATEESCAPE,
+    _Py_ERROR_REPLACE,
+    _Py_ERROR_IGNORE,
+    _Py_ERROR_BACKSLASHREPLACE,
+    _Py_ERROR_SURROGATEPASS,
+    _Py_ERROR_XMLCHARREFREPLACE,
+    _Py_ERROR_OTHER
+} _Py_error_handler;
+
+PyAPI_FUNC(_Py_error_handler) _Py_GetErrorHandler(const char *errors);
+#endif
+
+
 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000
 PyAPI_FUNC(wchar_t *) Py_DecodeLocale(
    const char *arg,
@ -26,7 +44,7 @@ PyAPI_FUNC(int) _Py_DecodeUTF8Ex(
    wchar_t **wstr,
    size_t *wlen,
    const char **reason,
-    int surrogateescape);
+    _Py_error_handler errors);

 PyAPI_FUNC(int) _Py_EncodeUTF8Ex(
    const wchar_t *text,
@ -34,19 +52,22 @@ PyAPI_FUNC(int) _Py_EncodeUTF8Ex(
    size_t *error_pos,
    const char **reason,
    int raw_malloc,
-    int surrogateescape);
+    _Py_error_handler errors);

 PyAPI_FUNC(wchar_t*) _Py_DecodeUTF8_surrogateescape(
    const char *arg,
    Py_ssize_t arglen);
+#endif

+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03080000
 PyAPI_FUNC(int) _Py_DecodeLocaleEx(
    const char *arg,
    wchar_t **wstr,
    size_t *wlen,
    const char **reason,
    int current_locale,
-    int surrogateescape);
+    _Py_error_handler errors);

 PyAPI_FUNC(int) _Py_EncodeLocaleEx(
    const wchar_t *text,
@ -54,7 +75,7 @@ PyAPI_FUNC(int) _Py_EncodeLocaleEx(
    size_t *error_pos,
    const char **reason,
    int current_locale,
-    int surrogateescape);
+    _Py_error_handler errors);
 #endif

 #ifndef Py_LIMITED_API
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@ -9,6 +9,11 @@ from unittest import mock

 from test import support

+try:
+    import _testcapi
+except ImportError as exc:
+    _testcapi = None
+
 try:
    import ctypes
 except ImportError:
@ -2051,13 +2056,12 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):

    @support.cpython_only
    def test_basics_capi(self):
-        from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            if encoding not in broken_unicode_with_stateful:
                # check incremental decoder/encoder (fetched via the C API)
                try:
-                    cencoder = codec_incrementalencoder(encoding)
+                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
@ -2066,7 +2070,7 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
-                    cdecoder = codec_incrementaldecoder(encoding)
+                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
@ -2077,12 +2081,12 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
-                        cencoder = codec_incrementalencoder(encoding, "ignore")
+                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(cencoder.encode(c) for c in s)
-                        cdecoder = codec_incrementaldecoder(encoding, "ignore")
+                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c]))
                                                for c in encodedresult)
                        self.assertEqual(decodedresult, s,
@ -3263,5 +3267,109 @@ class Latin1Test(unittest.TestCase):
                self.assertEqual(data.decode('latin1'), expected)


+@unittest.skipIf(_testcapi is None, 'need _testcapi module')
+class LocaleCodecTest(unittest.TestCase):
+    """
+    Test indirectly _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex().
+    """
+    ENCODING = sys.getfilesystemencoding()
+    STRINGS = ("ascii", "ulatin1:\xa7\xe9",
+               "u255:\xff",
+               "UCS:\xe9\u20ac\U0010ffff",
+               "surrogates:\uDC80\uDCFF")
+    BYTES_STRINGS = (b"blatin1:\xa7\xe9", b"b255:\xff")
+    SURROGATES = "\uDC80\uDCFF"
+
+    def encode(self, text, errors="strict"):
+        return _testcapi.EncodeLocaleEx(text, 0, errors)
+
+    def check_encode_strings(self, errors):
+        for text in self.STRINGS:
+            with self.subTest(text=text):
+                try:
+                    expected = text.encode(self.ENCODING, errors)
+                except UnicodeEncodeError:
+                    with self.assertRaises(RuntimeError) as cm:
+                        self.encode(self.SURROGATES)
+                    errmsg = str(cm.exception)
+                    self.assertTrue(errmsg.startswith("encode error: pos=0, reason="), errmsg)
+                else:
+                    encoded = self.encode(text, errors)
+                    self.assertEqual(encoded, expected)
+
+    def test_encode_strict(self):
+        self.check_encode_strings("strict")
+
+    def test_encode_surrogateescape(self):
+        self.check_encode_strings("surrogateescape")
+
+    def test_encode_surrogatepass(self):
+        try:
+            self.encode('', 'surrogatepass')
+        except ValueError as exc:
+            if str(exc) == 'unsupported error handler':
+                self.skipTest(f"{self.ENCODING!r} encoder doesn't support "
+                              f"surrogatepass error handler")
+            else:
+                raise
+
+        self.check_encode_strings("surrogatepass")
+
+    def decode(self, encoded, errors="strict"):
+        return _testcapi.DecodeLocaleEx(encoded, 0, errors)
+
+    def check_decode_strings(self, errors):
+        is_utf8 = (self.ENCODING == "utf-8")
+        if is_utf8:
+            encode_errors = 'surrogateescape'
+        else:
+            encode_errors = 'strict'
+
+        strings = list(self.BYTES_STRINGS)
+        for text in self.STRINGS:
+            try:
+                encoded = text.encode(self.ENCODING, encode_errors)
+                if encoded not in strings:
+                    strings.append(encoded)
+            except UnicodeEncodeError:
+                encoded = None
+
+            if is_utf8:
+                encoded2 = text.encode(self.ENCODING, 'surrogatepass')
+                if encoded2 != encoded:
+                    strings.append(encoded2)
+
+        for encoded in strings:
+            with self.subTest(encoded=encoded):
+                try:
+                    expected = encoded.decode(self.ENCODING, errors)
+                except UnicodeDecodeError:
+                    with self.assertRaises(RuntimeError) as cm:
+                        self.decode(encoded, errors)
+                    errmsg = str(cm.exception)
+                    self.assertTrue(errmsg.startswith("decode error: "), errmsg)
+                else:
+                    decoded = self.decode(encoded, errors)
+                    self.assertEqual(decoded, expected)
+
+    def test_decode_strict(self):
+        self.check_decode_strings("strict")
+
+    def test_decode_surrogateescape(self):
+        self.check_decode_strings("surrogateescape")
+
+    def test_decode_surrogatepass(self):
+        try:
+            self.decode(b'', 'surrogatepass')
+        except ValueError as exc:
+            if str(exc) == 'unsupported error handler':
+                self.skipTest(f"{self.ENCODING!r} decoder doesn't support "
+                              f"surrogatepass error handler")
+            else:
+                raise
+
+        self.check_decode_strings("surrogatepass")
+
+
 if __name__ == "__main__":
    unittest.main()
--- a/Modules/_testcapimodule.c
+++ b/Modules/_testcapimodule.c
@ -4550,6 +4550,98 @@ new_hamt(PyObject *self, PyObject *args)
 }


+static PyObject *
+encode_locale_ex(PyObject *self, PyObject *args)
+{
+    PyObject *unicode;
+    int current_locale = 0;
+    wchar_t *wstr;
+    PyObject *res = NULL;
+    const char *errors = NULL;
+
+    if (!PyArg_ParseTuple(args, "U|is", &unicode, &current_locale, &errors)) {
+        return NULL;
+    }
+    wstr = PyUnicode_AsWideCharString(unicode, NULL);
+    if (wstr == NULL) {
+        return NULL;
+    }
+    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
+
+    char *str = NULL;
+    size_t error_pos;
+    const char *reason = NULL;
+    int ret = _Py_EncodeLocaleEx(wstr,
+                                 &str, &error_pos, &reason,
+                                 current_locale, error_handler);
+    PyMem_Free(wstr);
+
+    switch(ret) {
+    case 0:
+        res = PyBytes_FromString(str);
+        PyMem_RawFree(str);
+        break;
+    case -1:
+        PyErr_NoMemory();
+        break;
+    case -2:
+        PyErr_Format(PyExc_RuntimeError, "encode error: pos=%zu, reason=%s",
+                     error_pos, reason);
+        break;
+    case -3:
+        PyErr_SetString(PyExc_ValueError, "unsupported error handler");
+        break;
+    default:
+        PyErr_SetString(PyExc_ValueError, "unknow error code");
+        break;
+    }
+    return res;
+}
+
+
+static PyObject *
+decode_locale_ex(PyObject *self, PyObject *args)
+{
+    char *str;
+    int current_locale = 0;
+    PyObject *res = NULL;
+    const char *errors = NULL;
+
+    if (!PyArg_ParseTuple(args, "y|is", &str, &current_locale, &errors)) {
+        return NULL;
+    }
+    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
+
+    wchar_t *wstr = NULL;
+    size_t wlen = 0;
+    const char *reason = NULL;
+    int ret = _Py_DecodeLocaleEx(str,
+                                 &wstr, &wlen, &reason,
+                                 current_locale, error_handler);
+
+    switch(ret) {
+    case 0:
+        res = PyUnicode_FromWideChar(wstr, wlen);
+        PyMem_RawFree(wstr);
+        break;
+    case -1:
+        PyErr_NoMemory();
+        break;
+    case -2:
+        PyErr_Format(PyExc_RuntimeError, "decode error: pos=%zu, reason=%s",
+                     wlen, reason);
+        break;
+    case -3:
+        PyErr_SetString(PyExc_ValueError, "unsupported error handler");
+        break;
+    default:
+        PyErr_SetString(PyExc_ValueError, "unknow error code");
+        break;
+    }
+    return res;
+}
+
+
 static PyMethodDef TestMethods[] = {
    {"raise_exception",         raise_exception,                 METH_VARARGS},
    {"raise_memoryerror",       raise_memoryerror,               METH_NOARGS},
@ -4771,6 +4863,8 @@ static PyMethodDef TestMethods[] = {
    {"get_mapping_items", get_mapping_items, METH_O},
    {"test_pythread_tss_key_state", test_pythread_tss_key_state, METH_VARARGS},
    {"hamt", new_hamt, METH_NOARGS},
+    {"EncodeLocaleEx", encode_locale_ex, METH_VARARGS},
+    {"DecodeLocaleEx", decode_locale_ex, METH_VARARGS},
    {NULL, NULL} /* sentinel */
 };

--- a/Objects/stringlib/codecs.h
+++ b/Objects/stringlib/codecs.h
@ -313,7 +313,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
            Py_ssize_t startpos, endpos, newpos;
            Py_ssize_t k;
            if (error_handler == _Py_ERROR_UNKNOWN) {
-                error_handler = get_error_handler(errors);
+                error_handler = _Py_GetErrorHandler(errors);
            }

            startpos = i-1;
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -318,20 +318,8 @@ static int convert_uc(PyObject *obj, void *addr);

 #include "clinic/unicodeobject.c.h"

-typedef enum {
-    _Py_ERROR_UNKNOWN=0,
-    _Py_ERROR_STRICT,
-    _Py_ERROR_SURROGATEESCAPE,
-    _Py_ERROR_REPLACE,
-    _Py_ERROR_IGNORE,
-    _Py_ERROR_BACKSLASHREPLACE,
-    _Py_ERROR_SURROGATEPASS,
-    _Py_ERROR_XMLCHARREFREPLACE,
-    _Py_ERROR_OTHER
-} _Py_error_handler;
-
-static _Py_error_handler
-get_error_handler(const char *errors)
+_Py_error_handler
+_Py_GetErrorHandler(const char *errors)
 {
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        return _Py_ERROR_STRICT;
@ -3327,34 +3315,12 @@ PyUnicode_AsEncodedObject(PyObject *unicode,
    return NULL;
 }

-static int
-locale_error_handler(const char *errors, int *surrogateescape)
-{
-    _Py_error_handler error_handler = get_error_handler(errors);
-    switch (error_handler)
-    {
-    case _Py_ERROR_STRICT:
-        *surrogateescape = 0;
-        return 0;
-    case _Py_ERROR_SURROGATEESCAPE:
-        *surrogateescape = 1;
-        return 0;
-    default:
-        PyErr_Format(PyExc_ValueError,
-                     "only 'strict' and 'surrogateescape' error handlers "
-                     "are supported, not '%s'",
-                     errors);
-        return -1;
-    }
-}

 static PyObject *
 unicode_encode_locale(PyObject *unicode, const char *errors,
                      int current_locale)
 {
-    int surrogateescape;
-    if (locale_error_handler(errors, &surrogateescape) < 0)
-        return NULL;
+    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);

    Py_ssize_t wlen;
    wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
@ -3373,7 +3339,7 @@ unicode_encode_locale(PyObject *unicode, const char *errors,
    size_t error_pos;
    const char *reason;
    int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
-                                 current_locale, surrogateescape);
+                                 current_locale, error_handler);
    if (res != 0) {
        if (res == -2) {
            PyObject *exc;
@ -3388,6 +3354,9 @@ unicode_encode_locale(PyObject *unicode, const char *errors,
            }
            return NULL;
        }
+        else if (res == -3) {
+            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
+        }
        else {
            PyErr_NoMemory();
            PyMem_Free(wstr);
@ -3571,9 +3540,7 @@ static PyObject*
 unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
                      int current_locale)
 {
-    int surrogateescape;
-    if (locale_error_handler(errors, &surrogateescape) < 0)
-        return NULL;
+    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);

    if (str[len] != '\0' || (size_t)len != strlen(str))  {
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
@ -3584,7 +3551,7 @@ unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
    size_t wlen;
    const char *reason;
    int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
-                                 current_locale, surrogateescape);
+                                 current_locale, error_handler);
    if (res != 0) {
        if (res == -2) {
            PyObject *exc;
@ -3598,6 +3565,9 @@ unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
                Py_DECREF(exc);
            }
        }
+        else if (res == -3) {
+            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
+        }
        else {
            PyErr_NoMemory();
        }
@ -4863,7 +4833,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
        }

        if (error_handler == _Py_ERROR_UNKNOWN)
-            error_handler = get_error_handler(errors);
+            error_handler = _Py_GetErrorHandler(errors);

        switch (error_handler) {
        case _Py_ERROR_IGNORE:
@ -4932,13 +4902,29 @@ onError:
   is not NULL, write the decoding error message into *reason. */
 int
 _Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
-                 const char **reason, int surrogateescape)
+                 const char **reason, _Py_error_handler errors)
 {
    const char *orig_s = s;
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

+    int surrogateescape = 0;
+    int surrogatepass = 0;
+    switch (errors)
+    {
+    case _Py_ERROR_STRICT:
+        break;
+    case _Py_ERROR_SURROGATEESCAPE:
+        surrogateescape = 1;
+        break;
+    case _Py_ERROR_SURROGATEPASS:
+        surrogatepass = 1;
+        break;
+    default:
+        return -3;
+    }
+
    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
@ -4971,31 +4957,47 @@ _Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
 #endif
        }
        else {
-            if (!ch && s == e)
+            if (!ch && s == e) {
                break;
-            if (!surrogateescape) {
-                PyMem_RawFree(unicode );
-                if (reason != NULL) {
-                    switch (ch) {
-                    case 0:
-                        *reason = "unexpected end of data";
-                        break;
-                    case 1:
-                        *reason = "invalid start byte";
-                        break;
-                    /* 2, 3, 4 */
-                    default:
-                        *reason = "invalid continuation byte";
-                        break;
-                    }
-                }
-                if (wlen != NULL) {
-                    *wlen = s - orig_s;
-                }
-                return -2;
            }
-            /* surrogateescape */
-            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
+
+            if (surrogateescape) {
+                unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
+            }
+            else {
+                /* Is it a valid three-byte code? */
+                if (surrogatepass
+                    && (e - s) >= 3
+                    && (s[0] & 0xf0) == 0xe0
+                    && (s[1] & 0xc0) == 0x80
+                    && (s[2] & 0xc0) == 0x80)
+                {
+                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
+                    s += 3;
+                    unicode[outpos++] = ch;
+                }
+                else {
+                    PyMem_RawFree(unicode );
+                    if (reason != NULL) {
+                        switch (ch) {
+                        case 0:
+                            *reason = "unexpected end of data";
+                            break;
+                        case 1:
+                            *reason = "invalid start byte";
+                            break;
+                        /* 2, 3, 4 */
+                        default:
+                            *reason = "invalid continuation byte";
+                            break;
+                        }
+                    }
+                    if (wlen != NULL) {
+                        *wlen = s - orig_s;
+                    }
+                    return -2;
+                }
+            }
        }
    }
    unicode[outpos] = L'\0';
@ -5030,13 +5032,29 @@ _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen)
   On memory allocation failure, return -1. */
 int
 _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
-                 const char **reason, int raw_malloc, int surrogateescape)
+                 const char **reason, int raw_malloc, _Py_error_handler errors)
 {
    const Py_ssize_t max_char_size = 4;
    Py_ssize_t len = wcslen(text);

    assert(len >= 0);

+    int surrogateescape = 0;
+    int surrogatepass = 0;
+    switch (errors)
+    {
+    case _Py_ERROR_STRICT:
+        break;
+    case _Py_ERROR_SURROGATEESCAPE:
+        surrogateescape = 1;
+        break;
+    case _Py_ERROR_SURROGATEPASS:
+        surrogatepass = 1;
+        break;
+    default:
+        return -3;
+    }
+
    if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
        return -1;
    }
@ -5053,8 +5071,19 @@ _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,

    char *p = bytes;
    Py_ssize_t i;
-    for (i = 0; i < len; i++) {
+    for (i = 0; i < len; ) {
+        Py_ssize_t ch_pos = i;
        Py_UCS4 ch = text[i];
+        i++;
+#if Py_UNICODE_SIZE == 2
+        if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
+            && i < len
+            && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
+        {
+            ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
+            i++;
+        }
+#endif

        if (ch < 0x80) {
            /* Encode ASCII */
@ -5066,11 +5095,11 @@ _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
            *p++ = (char)(0xc0 | (ch >> 6));
            *p++ = (char)(0x80 | (ch & 0x3f));
        }
-        else if (Py_UNICODE_IS_SURROGATE(ch)) {
+        else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
            /* surrogateescape error handler */
            if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
                if (error_pos != NULL) {
-                    *error_pos = (size_t)i;
+                    *error_pos = (size_t)ch_pos;
                }
                if (reason != NULL) {
                    *reason = "encoding error";
@ -6741,7 +6770,7 @@ unicode_encode_ucs1(PyObject *unicode,

            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
            if (error_handler == _Py_ERROR_UNKNOWN)
-                error_handler = get_error_handler(errors);
+                error_handler = _Py_GetErrorHandler(errors);

            switch (error_handler) {
            case _Py_ERROR_STRICT:
@ -6945,7 +6974,7 @@ PyUnicode_DecodeASCII(const char *s,
        /* byte outsize range 0x00..0x7f: call the error handler */

        if (error_handler == _Py_ERROR_UNKNOWN)
-            error_handler = get_error_handler(errors);
+            error_handler = _Py_GetErrorHandler(errors);

        switch (error_handler)
        {
@ -8404,7 +8433,7 @@ charmap_encoding_error(
    /* cache callback name lookup
     * (if not done yet, i.e. it's the first error) */
    if (*error_handler == _Py_ERROR_UNKNOWN)
-        *error_handler = get_error_handler(errors);
+        *error_handler = _Py_GetErrorHandler(errors);

    switch (*error_handler) {
    case _Py_ERROR_STRICT:
--- a/Programs/_freeze_importlib.c
+++ b/Programs/_freeze_importlib.c
@ -82,14 +82,6 @@ main(int argc, char *argv[])
    /* Don't install importlib, since it could execute outdated bytecode. */
    config._install_importlib = 0;
    config._frozen = 1;
-#ifdef MS_WINDOWS
-    /* bpo-34523: initfsencoding() is not called if _install_importlib=0,
-       so interp->fscodec_initialized value remains 0.
-       PyUnicode_EncodeFSDefault() doesn't support the "surrogatepass" error
-       handler in such case, whereas it's the default error handler on Windows.
-       Force the "strict" error handler to work around this bootstrap issue. */
-    config.filesystem_errors = "strict";
-#endif

    _PyInitError err = _Py_InitializeFromConfig(&config);
    /* No need to call _PyCoreConfig_Clear() since we didn't allocate any
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@ -32,6 +32,24 @@ extern int winerror_to_errno(int);
 int _Py_open_cloexec_works = -1;
 #endif

+
+static int
+get_surrogateescape(_Py_error_handler errors, int *surrogateescape)
+{
+    switch (errors)
+    {
+    case _Py_ERROR_STRICT:
+        *surrogateescape = 0;
+        return 0;
+    case _Py_ERROR_SURROGATEESCAPE:
+        *surrogateescape = 1;
+        return 0;
+    default:
+        return -1;
+    }
+}
+
+
 PyObject *
 _Py_device_encoding(int fd)
 {
@ -215,12 +233,17 @@ _Py_GetForceASCII(void)
 static int
 encode_ascii(const wchar_t *text, char **str,
             size_t *error_pos, const char **reason,
-             int raw_malloc, int surrogateescape)
+             int raw_malloc, _Py_error_handler errors)
 {
    char *result = NULL, *out;
    size_t len, i;
    wchar_t ch;

+    int surrogateescape;
+    if (get_surrogateescape(errors, &surrogateescape) < 0) {
+        return -3;
+    }
+
    len = wcslen(text);

    /* +1 for NULL byte */
@ -278,13 +301,18 @@ _Py_GetForceASCII(void)
 #if !defined(HAVE_MBRTOWC) || defined(USE_FORCE_ASCII)
 static int
 decode_ascii(const char *arg, wchar_t **wstr, size_t *wlen,
-             const char **reason, int surrogateescape)
+             const char **reason, _Py_error_handler errors)
 {
    wchar_t *res;
    unsigned char *in;
    wchar_t *out;
    size_t argsize = strlen(arg) + 1;

+    int surrogateescape;
+    if (get_surrogateescape(errors, &surrogateescape) < 0) {
+        return -3;
+    }
+
    if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t)) {
        return -1;
    }
@ -325,7 +353,7 @@ decode_ascii(const char *arg, wchar_t **wstr, size_t *wlen,

 static int
 decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
-                      const char **reason, int surrogateescape)
+                      const char **reason, _Py_error_handler errors)
 {
    wchar_t *res;
    size_t argsize;
@ -336,6 +364,11 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
    mbstate_t mbs;
 #endif

+    int surrogateescape;
+    if (get_surrogateescape(errors, &surrogateescape) < 0) {
+        return -3;
+    }
+
 #ifdef HAVE_BROKEN_MBSTOWCS
    /* Some platforms have a broken implementation of
     * mbstowcs which does not count the characters that
@ -456,7 +489,7 @@ decode_error:
    /* Cannot use C locale for escaping; manually escape as if charset
       is ASCII (i.e. escape all bytes > 128. This will still roundtrip
       correctly in the locale's charset, which must be an ASCII superset. */
-    return decode_ascii(arg, wstr, wlen, reason, surrogateescape);
+    return decode_ascii(arg, wstr, wlen, reason, errors);
 #endif   /* HAVE_MBRTOWC */
 }

@ -479,33 +512,35 @@ decode_error:
   invalid byte sequence in the input string into *wlen. If reason is not NULL,
   write the decoding error message into *reason.

+   Return -3 if the error handler 'errors' is not supported.
+
   Use the Py_EncodeLocaleEx() function to encode the character string back to
   a byte string. */
 int
 _Py_DecodeLocaleEx(const char* arg, wchar_t **wstr, size_t *wlen,
                   const char **reason,
-                   int current_locale, int surrogateescape)
+                   int current_locale, _Py_error_handler errors)
 {
    if (current_locale) {
 #ifdef __ANDROID__
        return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason,
-                                surrogateescape);
+                                errors);
 #else
-        return decode_current_locale(arg, wstr, wlen, reason, surrogateescape);
+        return decode_current_locale(arg, wstr, wlen, reason, errors);
 #endif
    }

 #if defined(__APPLE__) || defined(__ANDROID__)
    return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason,
-                            surrogateescape);
+                            errors);
 #else
    int use_utf8 = (Py_UTF8Mode == 1);
 #ifdef MS_WINDOWS
    use_utf8 |= !Py_LegacyWindowsFSEncodingFlag;
 #endif
    if (use_utf8) {
-        return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen,
-                                reason, surrogateescape);
+        return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason,
+                                errors);
    }

 #ifdef USE_FORCE_ASCII
@ -515,11 +550,11 @@ _Py_DecodeLocaleEx(const char* arg, wchar_t **wstr, size_t *wlen,

    if (force_ascii) {
        /* force ASCII encoding to workaround mbstowcs() issue */
-        return decode_ascii(arg, wstr, wlen, reason, surrogateescape);
+        return decode_ascii(arg, wstr, wlen, reason, errors);
    }
 #endif

-    return decode_current_locale(arg, wstr, wlen, reason, surrogateescape);
+    return decode_current_locale(arg, wstr, wlen, reason, errors);
 #endif   /* __APPLE__ or __ANDROID__ */
 }

@ -547,8 +582,11 @@ wchar_t*
 Py_DecodeLocale(const char* arg, size_t *wlen)
 {
    wchar_t *wstr;
-    int res = _Py_DecodeLocaleEx(arg, &wstr, wlen, NULL, 0, 1);
+    int res = _Py_DecodeLocaleEx(arg, &wstr, wlen,
+                                 NULL, 0,
+                                 _Py_ERROR_SURROGATEESCAPE);
    if (res != 0) {
+        assert(res != -3);
        if (wlen != NULL) {
            *wlen = (size_t)res;
        }
@ -561,13 +599,18 @@ Py_DecodeLocale(const char* arg, size_t *wlen)
 static int
 encode_current_locale(const wchar_t *text, char **str,
                      size_t *error_pos, const char **reason,
-                      int raw_malloc, int surrogateescape)
+                      int raw_malloc, _Py_error_handler errors)
 {
    const size_t len = wcslen(text);
    char *result = NULL, *bytes = NULL;
    size_t i, size, converted;
    wchar_t c, buf[2];

+    int surrogateescape;
+    if (get_surrogateescape(errors, &surrogateescape) < 0) {
+        return -3;
+    }
+
    /* The function works in two steps:
       1. compute the length of the output buffer in bytes (size)
       2. outputs the bytes */
@ -646,32 +689,50 @@ encode_error:
    return -2;
 }

+
+/* Encode a string to the locale encoding.
+
+   Parameters:
+
+   * raw_malloc: if non-zero, allocate memory using PyMem_RawMalloc() instead
+     of PyMem_Malloc().
+   * current_locale: if non-zero, use the current LC_CTYPE, otherwise use
+     Python filesystem encoding.
+   * errors: error handler like "strict" or "surrogateescape".
+
+   Return value:
+
+    0: success, *str is set to a newly allocated decoded string.
+   -1: memory allocation failure
+   -2: encoding error, set *error_pos and *reason (if set).
+   -3: the error handler 'errors' is not supported.
+ */
 static int
 encode_locale_ex(const wchar_t *text, char **str, size_t *error_pos,
                 const char **reason,
-                 int raw_malloc, int current_locale, int surrogateescape)
+                 int raw_malloc, int current_locale, _Py_error_handler errors)
 {
    if (current_locale) {
 #ifdef __ANDROID__
        return _Py_EncodeUTF8Ex(text, str, error_pos, reason,
-                                raw_malloc, surrogateescape);
+                                raw_malloc, errors);
 #else
        return encode_current_locale(text, str, error_pos, reason,
-                                     raw_malloc, surrogateescape);
+                                     raw_malloc, errors);
 #endif
    }

 #if defined(__APPLE__) || defined(__ANDROID__)
    return _Py_EncodeUTF8Ex(text, str, error_pos, reason,
-                            raw_malloc, surrogateescape);
-#else   /* __APPLE__ */
+                            raw_malloc, errors);
+#else
    int use_utf8 = (Py_UTF8Mode == 1);
 #ifdef MS_WINDOWS
    use_utf8 |= !Py_LegacyWindowsFSEncodingFlag;
 #endif
    if (use_utf8) {
        return _Py_EncodeUTF8Ex(text, str, error_pos, reason,
-                                raw_malloc, surrogateescape);
+                                raw_malloc, errors);
    }

 #ifdef USE_FORCE_ASCII
@ -681,12 +742,12 @@ encode_locale_ex(const wchar_t *text, char **str, size_t *error_pos,

    if (force_ascii) {
        return encode_ascii(text, str, error_pos, reason,
-                            raw_malloc, surrogateescape);
+                            raw_malloc, errors);
    }
 #endif

    return encode_current_locale(text, str, error_pos, reason,
-                                 raw_malloc, surrogateescape);
+                                 raw_malloc, errors);
 #endif   /* __APPLE__ or __ANDROID__ */
 }

@ -696,7 +757,8 @@ encode_locale(const wchar_t *text, size_t *error_pos,
 {
    char *str;
    int res = encode_locale_ex(text, &str, error_pos, NULL,
-                               raw_malloc, current_locale, 1);
+                               raw_malloc, current_locale,
+                               _Py_ERROR_SURROGATEESCAPE);
    if (res != -2 && error_pos) {
        *error_pos = (size_t)-1;
    }
@ -737,10 +799,10 @@ _Py_EncodeLocaleRaw(const wchar_t *text, size_t *error_pos)
 int
 _Py_EncodeLocaleEx(const wchar_t *text, char **str,
                   size_t *error_pos, const char **reason,
-                   int current_locale, int surrogateescape)
+                   int current_locale, _Py_error_handler errors)
 {
    return encode_locale_ex(text, str, error_pos, reason, 1,
-                            current_locale, surrogateescape);
+                            current_locale, errors);
 }