From 3d4226a832cabc630402589cc671cc4035d504e5 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Wed, 29 Aug 2018 22:21:32 +0200 Subject: [PATCH] bpo-34523: Support surrogatepass in locale codecs (GH-8995) Add support for the "surrogatepass" error handler in PyUnicode_DecodeFSDefault() and PyUnicode_EncodeFSDefault() for the UTF-8 encoding. Changes: * _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex() now support the surrogatepass error handler (_Py_ERROR_SURROGATEPASS). * _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx() now use the _Py_error_handler enum instead of "int surrogateescape" to pass the error handler. These functions now return -3 if the error handler is unknown. * Add unit tests on _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx() in test_codecs. * Rename get_error_handler() to _Py_GetErrorHandler() and expose it as a private function. * _freeze_importlib doesn't need config.filesystem_errors="strict" workaround anymore. --- Include/fileutils.h | 29 +++++- Lib/test/test_codecs.py | 118 ++++++++++++++++++++++- Modules/_testcapimodule.c | 94 +++++++++++++++++++ Objects/stringlib/codecs.h | 2 +- Objects/unicodeobject.c | 177 ++++++++++++++++++++--------------- Programs/_freeze_importlib.c | 8 -- Python/fileutils.c | 112 +++++++++++++++++----- 7 files changed, 423 insertions(+), 117 deletions(-) diff --git a/Include/fileutils.h b/Include/fileutils.h index 370878469df..f0a8e2c61a4 100644 --- a/Include/fileutils.h +++ b/Include/fileutils.h @@ -5,6 +5,24 @@ extern "C" { #endif + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03080000 +typedef enum { + _Py_ERROR_UNKNOWN=0, + _Py_ERROR_STRICT, + _Py_ERROR_SURROGATEESCAPE, + _Py_ERROR_REPLACE, + _Py_ERROR_IGNORE, + _Py_ERROR_BACKSLASHREPLACE, + _Py_ERROR_SURROGATEPASS, + _Py_ERROR_XMLCHARREFREPLACE, + _Py_ERROR_OTHER +} _Py_error_handler; + +PyAPI_FUNC(_Py_error_handler) _Py_GetErrorHandler(const char *errors); +#endif + + #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000 PyAPI_FUNC(wchar_t *) 
Py_DecodeLocale( const char *arg, @@ -26,7 +44,7 @@ PyAPI_FUNC(int) _Py_DecodeUTF8Ex( wchar_t **wstr, size_t *wlen, const char **reason, - int surrogateescape); + _Py_error_handler errors); PyAPI_FUNC(int) _Py_EncodeUTF8Ex( const wchar_t *text, @@ -34,19 +52,22 @@ PyAPI_FUNC(int) _Py_EncodeUTF8Ex( size_t *error_pos, const char **reason, int raw_malloc, - int surrogateescape); + _Py_error_handler errors); PyAPI_FUNC(wchar_t*) _Py_DecodeUTF8_surrogateescape( const char *arg, Py_ssize_t arglen); +#endif + +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03080000 PyAPI_FUNC(int) _Py_DecodeLocaleEx( const char *arg, wchar_t **wstr, size_t *wlen, const char **reason, int current_locale, - int surrogateescape); + _Py_error_handler errors); PyAPI_FUNC(int) _Py_EncodeLocaleEx( const wchar_t *text, @@ -54,7 +75,7 @@ PyAPI_FUNC(int) _Py_EncodeLocaleEx( size_t *error_pos, const char **reason, int current_locale, - int surrogateescape); + _Py_error_handler errors); #endif #ifndef Py_LIMITED_API diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 86d0dde1705..00b5d317c40 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -9,6 +9,11 @@ from unittest import mock from test import support +try: + import _testcapi +except ImportError as exc: + _testcapi = None + try: import ctypes except ImportError: @@ -2051,13 +2056,12 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling): @support.cpython_only def test_basics_capi(self): - from _testcapi import codec_incrementalencoder, codec_incrementaldecoder s = "abc123" # all codecs should be able to encode these for encoding in all_unicode_encodings: if encoding not in broken_unicode_with_stateful: # check incremental decoder/encoder (fetched via the C API) try: - cencoder = codec_incrementalencoder(encoding) + cencoder = _testcapi.codec_incrementalencoder(encoding) except LookupError: # no IncrementalEncoder pass else: @@ -2066,7 +2070,7 @@ class BasicUnicodeTest(unittest.TestCase, 
MixInCheckStateHandling): for c in s: encodedresult += cencoder.encode(c) encodedresult += cencoder.encode("", True) - cdecoder = codec_incrementaldecoder(encoding) + cdecoder = _testcapi.codec_incrementaldecoder(encoding) decodedresult = "" for c in encodedresult: decodedresult += cdecoder.decode(bytes([c])) @@ -2077,12 +2081,12 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling): if encoding not in ("idna", "mbcs"): # check incremental decoder/encoder with errors argument try: - cencoder = codec_incrementalencoder(encoding, "ignore") + cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore") except LookupError: # no IncrementalEncoder pass else: encodedresult = b"".join(cencoder.encode(c) for c in s) - cdecoder = codec_incrementaldecoder(encoding, "ignore") + cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore") decodedresult = "".join(cdecoder.decode(bytes([c])) for c in encodedresult) self.assertEqual(decodedresult, s, @@ -3263,5 +3267,109 @@ class Latin1Test(unittest.TestCase): self.assertEqual(data.decode('latin1'), expected) +@unittest.skipIf(_testcapi is None, 'need _testcapi module') +class LocaleCodecTest(unittest.TestCase): + """ + Test indirectly _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex(). 
+ """ + ENCODING = sys.getfilesystemencoding() + STRINGS = ("ascii", "ulatin1:\xa7\xe9", + "u255:\xff", + "UCS:\xe9\u20ac\U0010ffff", + "surrogates:\uDC80\uDCFF") + BYTES_STRINGS = (b"blatin1:\xa7\xe9", b"b255:\xff") + SURROGATES = "\uDC80\uDCFF" + + def encode(self, text, errors="strict"): + return _testcapi.EncodeLocaleEx(text, 0, errors) + + def check_encode_strings(self, errors): + for text in self.STRINGS: + with self.subTest(text=text): + try: + expected = text.encode(self.ENCODING, errors) + except UnicodeEncodeError: + with self.assertRaises(RuntimeError) as cm: + self.encode(self.SURROGATES) + errmsg = str(cm.exception) + self.assertTrue(errmsg.startswith("encode error: pos=0, reason="), errmsg) + else: + encoded = self.encode(text, errors) + self.assertEqual(encoded, expected) + + def test_encode_strict(self): + self.check_encode_strings("strict") + + def test_encode_surrogateescape(self): + self.check_encode_strings("surrogateescape") + + def test_encode_surrogatepass(self): + try: + self.encode('', 'surrogatepass') + except ValueError as exc: + if str(exc) == 'unsupported error handler': + self.skipTest(f"{self.ENCODING!r} encoder doesn't support " + f"surrogatepass error handler") + else: + raise + + self.check_encode_strings("surrogatepass") + + def decode(self, encoded, errors="strict"): + return _testcapi.DecodeLocaleEx(encoded, 0, errors) + + def check_decode_strings(self, errors): + is_utf8 = (self.ENCODING == "utf-8") + if is_utf8: + encode_errors = 'surrogateescape' + else: + encode_errors = 'strict' + + strings = list(self.BYTES_STRINGS) + for text in self.STRINGS: + try: + encoded = text.encode(self.ENCODING, encode_errors) + if encoded not in strings: + strings.append(encoded) + except UnicodeEncodeError: + encoded = None + + if is_utf8: + encoded2 = text.encode(self.ENCODING, 'surrogatepass') + if encoded2 != encoded: + strings.append(encoded2) + + for encoded in strings: + with self.subTest(encoded=encoded): + try: + expected = 
encoded.decode(self.ENCODING, errors) + except UnicodeDecodeError: + with self.assertRaises(RuntimeError) as cm: + self.decode(encoded, errors) + errmsg = str(cm.exception) + self.assertTrue(errmsg.startswith("decode error: "), errmsg) + else: + decoded = self.decode(encoded, errors) + self.assertEqual(decoded, expected) + + def test_decode_strict(self): + self.check_decode_strings("strict") + + def test_decode_surrogateescape(self): + self.check_decode_strings("surrogateescape") + + def test_decode_surrogatepass(self): + try: + self.decode(b'', 'surrogatepass') + except ValueError as exc: + if str(exc) == 'unsupported error handler': + self.skipTest(f"{self.ENCODING!r} decoder doesn't support " + f"surrogatepass error handler") + else: + raise + + self.check_decode_strings("surrogatepass") + + if __name__ == "__main__": unittest.main() diff --git a/Modules/_testcapimodule.c b/Modules/_testcapimodule.c index 014c2f325af..7c2c57b9800 100644 --- a/Modules/_testcapimodule.c +++ b/Modules/_testcapimodule.c @@ -4550,6 +4550,98 @@ new_hamt(PyObject *self, PyObject *args) } +static PyObject * +encode_locale_ex(PyObject *self, PyObject *args) +{ + PyObject *unicode; + int current_locale = 0; + wchar_t *wstr; + PyObject *res = NULL; + const char *errors = NULL; + + if (!PyArg_ParseTuple(args, "U|is", &unicode, &current_locale, &errors)) { + return NULL; + } + wstr = PyUnicode_AsWideCharString(unicode, NULL); + if (wstr == NULL) { + return NULL; + } + _Py_error_handler error_handler = _Py_GetErrorHandler(errors); + + char *str = NULL; + size_t error_pos; + const char *reason = NULL; + int ret = _Py_EncodeLocaleEx(wstr, + &str, &error_pos, &reason, + current_locale, error_handler); + PyMem_Free(wstr); + + switch(ret) { + case 0: + res = PyBytes_FromString(str); + PyMem_RawFree(str); + break; + case -1: + PyErr_NoMemory(); + break; + case -2: + PyErr_Format(PyExc_RuntimeError, "encode error: pos=%zu, reason=%s", + error_pos, reason); + break; + case -3: +
PyErr_SetString(PyExc_ValueError, "unsupported error handler"); + break; + default: + PyErr_SetString(PyExc_ValueError, "unknow error code"); + break; + } + return res; +} + + +static PyObject * +decode_locale_ex(PyObject *self, PyObject *args) +{ + char *str; + int current_locale = 0; + PyObject *res = NULL; + const char *errors = NULL; + + if (!PyArg_ParseTuple(args, "y|is", &str, &current_locale, &errors)) { + return NULL; + } + _Py_error_handler error_handler = _Py_GetErrorHandler(errors); + + wchar_t *wstr = NULL; + size_t wlen = 0; + const char *reason = NULL; + int ret = _Py_DecodeLocaleEx(str, + &wstr, &wlen, &reason, + current_locale, error_handler); + + switch(ret) { + case 0: + res = PyUnicode_FromWideChar(wstr, wlen); + PyMem_RawFree(wstr); + break; + case -1: + PyErr_NoMemory(); + break; + case -2: + PyErr_Format(PyExc_RuntimeError, "decode error: pos=%zu, reason=%s", + wlen, reason); + break; + case -3: + PyErr_SetString(PyExc_ValueError, "unsupported error handler"); + break; + default: + PyErr_SetString(PyExc_ValueError, "unknow error code"); + break; + } + return res; +} + + static PyMethodDef TestMethods[] = { {"raise_exception", raise_exception, METH_VARARGS}, {"raise_memoryerror", raise_memoryerror, METH_NOARGS}, @@ -4771,6 +4863,8 @@ static PyMethodDef TestMethods[] = { {"get_mapping_items", get_mapping_items, METH_O}, {"test_pythread_tss_key_state", test_pythread_tss_key_state, METH_VARARGS}, {"hamt", new_hamt, METH_NOARGS}, + {"EncodeLocaleEx", encode_locale_ex, METH_VARARGS}, + {"DecodeLocaleEx", decode_locale_ex, METH_VARARGS}, {NULL, NULL} /* sentinel */ }; diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h index f019d9a96bf..0abb4c8abb9 100644 --- a/Objects/stringlib/codecs.h +++ b/Objects/stringlib/codecs.h @@ -313,7 +313,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode, Py_ssize_t startpos, endpos, newpos; Py_ssize_t k; if (error_handler == _Py_ERROR_UNKNOWN) { - error_handler = get_error_handler(errors); + error_handler = 
_Py_GetErrorHandler(errors); } startpos = i-1; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 60adcd9c88d..a797f838eb4 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -318,20 +318,8 @@ static int convert_uc(PyObject *obj, void *addr); #include "clinic/unicodeobject.c.h" -typedef enum { - _Py_ERROR_UNKNOWN=0, - _Py_ERROR_STRICT, - _Py_ERROR_SURROGATEESCAPE, - _Py_ERROR_REPLACE, - _Py_ERROR_IGNORE, - _Py_ERROR_BACKSLASHREPLACE, - _Py_ERROR_SURROGATEPASS, - _Py_ERROR_XMLCHARREFREPLACE, - _Py_ERROR_OTHER -} _Py_error_handler; - -static _Py_error_handler -get_error_handler(const char *errors) +_Py_error_handler +_Py_GetErrorHandler(const char *errors) { if (errors == NULL || strcmp(errors, "strict") == 0) { return _Py_ERROR_STRICT; @@ -3327,34 +3315,12 @@ PyUnicode_AsEncodedObject(PyObject *unicode, return NULL; } -static int -locale_error_handler(const char *errors, int *surrogateescape) -{ - _Py_error_handler error_handler = get_error_handler(errors); - switch (error_handler) - { - case _Py_ERROR_STRICT: - *surrogateescape = 0; - return 0; - case _Py_ERROR_SURROGATEESCAPE: - *surrogateescape = 1; - return 0; - default: - PyErr_Format(PyExc_ValueError, - "only 'strict' and 'surrogateescape' error handlers " - "are supported, not '%s'", - errors); - return -1; - } -} static PyObject * unicode_encode_locale(PyObject *unicode, const char *errors, int current_locale) { - int surrogateescape; - if (locale_error_handler(errors, &surrogateescape) < 0) - return NULL; + _Py_error_handler error_handler = _Py_GetErrorHandler(errors); Py_ssize_t wlen; wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen); @@ -3373,7 +3339,7 @@ unicode_encode_locale(PyObject *unicode, const char *errors, size_t error_pos; const char *reason; int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason, - current_locale, surrogateescape); + current_locale, error_handler); if (res != 0) { if (res == -2) { PyObject *exc; @@ -3388,6 +3354,9 @@ 
unicode_encode_locale(PyObject *unicode, const char *errors, } return NULL; } + else if (res == -3) { + PyErr_SetString(PyExc_ValueError, "unsupported error handler"); + } else { PyErr_NoMemory(); PyMem_Free(wstr); @@ -3571,9 +3540,7 @@ static PyObject* unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors, int current_locale) { - int surrogateescape; - if (locale_error_handler(errors, &surrogateescape) < 0) - return NULL; + _Py_error_handler error_handler = _Py_GetErrorHandler(errors); if (str[len] != '\0' || (size_t)len != strlen(str)) { PyErr_SetString(PyExc_ValueError, "embedded null byte"); @@ -3584,7 +3551,7 @@ unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors, size_t wlen; const char *reason; int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason, - current_locale, surrogateescape); + current_locale, error_handler); if (res != 0) { if (res == -2) { PyObject *exc; @@ -3598,6 +3565,9 @@ unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors, Py_DECREF(exc); } } + else if (res == -3) { + PyErr_SetString(PyExc_ValueError, "unsupported error handler"); + } else { PyErr_NoMemory(); } @@ -4863,7 +4833,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s, } if (error_handler == _Py_ERROR_UNKNOWN) - error_handler = get_error_handler(errors); + error_handler = _Py_GetErrorHandler(errors); switch (error_handler) { case _Py_ERROR_IGNORE: @@ -4932,13 +4902,29 @@ onError: is not NULL, write the decoding error message into *reason. 
*/ int _Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen, - const char **reason, int surrogateescape) + const char **reason, _Py_error_handler errors) { const char *orig_s = s; const char *e; wchar_t *unicode; Py_ssize_t outpos; + int surrogateescape = 0; + int surrogatepass = 0; + switch (errors) + { + case _Py_ERROR_STRICT: + break; + case _Py_ERROR_SURROGATEESCAPE: + surrogateescape = 1; + break; + case _Py_ERROR_SURROGATEPASS: + surrogatepass = 1; + break; + default: + return -3; + } + /* Note: size will always be longer than the resulting Unicode character count */ if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) { @@ -4971,31 +4957,47 @@ _Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen, #endif } else { - if (!ch && s == e) + if (!ch && s == e) { break; - if (!surrogateescape) { - PyMem_RawFree(unicode ); - if (reason != NULL) { - switch (ch) { - case 0: - *reason = "unexpected end of data"; - break; - case 1: - *reason = "invalid start byte"; - break; - /* 2, 3, 4 */ - default: - *reason = "invalid continuation byte"; - break; - } - } - if (wlen != NULL) { - *wlen = s - orig_s; - } - return -2; } - /* surrogateescape */ - unicode[outpos++] = 0xDC00 + (unsigned char)*s++; + + if (surrogateescape) { + unicode[outpos++] = 0xDC00 + (unsigned char)*s++; + } + else { + /* Is it a valid three-byte code? 
*/ + if (surrogatepass + && (e - s) >= 3 + && (s[0] & 0xf0) == 0xe0 + && (s[1] & 0xc0) == 0x80 + && (s[2] & 0xc0) == 0x80) + { + ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); + s += 3; + unicode[outpos++] = ch; + } + else { + PyMem_RawFree(unicode ); + if (reason != NULL) { + switch (ch) { + case 0: + *reason = "unexpected end of data"; + break; + case 1: + *reason = "invalid start byte"; + break; + /* 2, 3, 4 */ + default: + *reason = "invalid continuation byte"; + break; + } + } + if (wlen != NULL) { + *wlen = s - orig_s; + } + return -2; + } + } } } unicode[outpos] = L'\0'; @@ -5030,13 +5032,29 @@ _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen) On memory allocation failure, return -1. */ int _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos, - const char **reason, int raw_malloc, int surrogateescape) + const char **reason, int raw_malloc, _Py_error_handler errors) { const Py_ssize_t max_char_size = 4; Py_ssize_t len = wcslen(text); assert(len >= 0); + int surrogateescape = 0; + int surrogatepass = 0; + switch (errors) + { + case _Py_ERROR_STRICT: + break; + case _Py_ERROR_SURROGATEESCAPE: + surrogateescape = 1; + break; + case _Py_ERROR_SURROGATEPASS: + surrogatepass = 1; + break; + default: + return -3; + } + if (len > PY_SSIZE_T_MAX / max_char_size - 1) { return -1; } @@ -5053,8 +5071,19 @@ _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos, char *p = bytes; Py_ssize_t i; - for (i = 0; i < len; i++) { + for (i = 0; i < len; ) { + Py_ssize_t ch_pos = i; Py_UCS4 ch = text[i]; + i++; +#if Py_UNICODE_SIZE == 2 + if (Py_UNICODE_IS_HIGH_SURROGATE(ch) + && i < len + && Py_UNICODE_IS_LOW_SURROGATE(text[i])) + { + ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]); + i++; + } +#endif if (ch < 0x80) { /* Encode ASCII */ @@ -5066,11 +5095,11 @@ _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos, *p++ = (char)(0xc0 | (ch >> 6)); *p++ = (char)(0x80 | (ch & 0x3f)); } - else if 
(Py_UNICODE_IS_SURROGATE(ch)) { + else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) { /* surrogateescape error handler */ if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) { if (error_pos != NULL) { - *error_pos = (size_t)i; + *error_pos = (size_t)ch_pos; } if (reason != NULL) { *reason = "encoding error"; @@ -6741,7 +6770,7 @@ unicode_encode_ucs1(PyObject *unicode, /* cache callback name lookup (if not done yet, i.e. it's the first error) */ if (error_handler == _Py_ERROR_UNKNOWN) - error_handler = get_error_handler(errors); + error_handler = _Py_GetErrorHandler(errors); switch (error_handler) { case _Py_ERROR_STRICT: @@ -6945,7 +6974,7 @@ PyUnicode_DecodeASCII(const char *s, /* byte outsize range 0x00..0x7f: call the error handler */ if (error_handler == _Py_ERROR_UNKNOWN) - error_handler = get_error_handler(errors); + error_handler = _Py_GetErrorHandler(errors); switch (error_handler) { @@ -8404,7 +8433,7 @@ charmap_encoding_error( /* cache callback name lookup * (if not done yet, i.e. it's the first error) */ if (*error_handler == _Py_ERROR_UNKNOWN) - *error_handler = get_error_handler(errors); + *error_handler = _Py_GetErrorHandler(errors); switch (*error_handler) { case _Py_ERROR_STRICT: diff --git a/Programs/_freeze_importlib.c b/Programs/_freeze_importlib.c index 2621a7687ed..8830d131d6f 100644 --- a/Programs/_freeze_importlib.c +++ b/Programs/_freeze_importlib.c @@ -82,14 +82,6 @@ main(int argc, char *argv[]) /* Don't install importlib, since it could execute outdated bytecode. */ config._install_importlib = 0; config._frozen = 1; -#ifdef MS_WINDOWS - /* bpo-34523: initfsencoding() is not called if _install_importlib=0, - so interp->fscodec_initialized value remains 0. - PyUnicode_EncodeFSDefault() doesn't support the "surrogatepass" error - handler in such case, whereas it's the default error handler on Windows. - Force the "strict" error handler to work around this bootstrap issue. 
*/ - config.filesystem_errors = "strict"; -#endif _PyInitError err = _Py_InitializeFromConfig(&config); /* No need to call _PyCoreConfig_Clear() since we didn't allocate any diff --git a/Python/fileutils.c b/Python/fileutils.c index 9a3c334d43b..0486f865924 100644 --- a/Python/fileutils.c +++ b/Python/fileutils.c @@ -32,6 +32,24 @@ extern int winerror_to_errno(int); int _Py_open_cloexec_works = -1; #endif + +static int +get_surrogateescape(_Py_error_handler errors, int *surrogateescape) +{ + switch (errors) + { + case _Py_ERROR_STRICT: + *surrogateescape = 0; + return 0; + case _Py_ERROR_SURROGATEESCAPE: + *surrogateescape = 1; + return 0; + default: + return -1; + } +} + + PyObject * _Py_device_encoding(int fd) { @@ -215,12 +233,17 @@ _Py_GetForceASCII(void) static int encode_ascii(const wchar_t *text, char **str, size_t *error_pos, const char **reason, - int raw_malloc, int surrogateescape) + int raw_malloc, _Py_error_handler errors) { char *result = NULL, *out; size_t len, i; wchar_t ch; + int surrogateescape; + if (get_surrogateescape(errors, &surrogateescape) < 0) { + return -3; + } + len = wcslen(text); /* +1 for NULL byte */ @@ -278,13 +301,18 @@ _Py_GetForceASCII(void) #if !defined(HAVE_MBRTOWC) || defined(USE_FORCE_ASCII) static int decode_ascii(const char *arg, wchar_t **wstr, size_t *wlen, - const char **reason, int surrogateescape) + const char **reason, _Py_error_handler errors) { wchar_t *res; unsigned char *in; wchar_t *out; size_t argsize = strlen(arg) + 1; + int surrogateescape; + if (get_surrogateescape(errors, &surrogateescape) < 0) { + return -3; + } + if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t)) { return -1; } @@ -325,7 +353,7 @@ decode_ascii(const char *arg, wchar_t **wstr, size_t *wlen, static int decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen, - const char **reason, int surrogateescape) + const char **reason, _Py_error_handler errors) { wchar_t *res; size_t argsize; @@ -336,6 +364,11 @@ decode_current_locale(const 
char* arg, wchar_t **wstr, size_t *wlen, mbstate_t mbs; #endif + int surrogateescape; + if (get_surrogateescape(errors, &surrogateescape) < 0) { + return -3; + } + #ifdef HAVE_BROKEN_MBSTOWCS /* Some platforms have a broken implementation of * mbstowcs which does not count the characters that @@ -456,7 +489,7 @@ decode_error: /* Cannot use C locale for escaping; manually escape as if charset is ASCII (i.e. escape all bytes > 128. This will still roundtrip correctly in the locale's charset, which must be an ASCII superset. */ - return decode_ascii(arg, wstr, wlen, reason, surrogateescape); + return decode_ascii(arg, wstr, wlen, reason, errors); #endif /* HAVE_MBRTOWC */ } @@ -479,33 +512,35 @@ decode_error: invalid byte sequence in the input string into *wlen. If reason is not NULL, write the decoding error message into *reason. + Return -3 if the error handler 'errors' is not supported. + Use the Py_EncodeLocaleEx() function to encode the character string back to a byte string. */ int _Py_DecodeLocaleEx(const char* arg, wchar_t **wstr, size_t *wlen, const char **reason, - int current_locale, int surrogateescape) + int current_locale, _Py_error_handler errors) { if (current_locale) { #ifdef __ANDROID__ return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason, - surrogateescape); + errors); #else - return decode_current_locale(arg, wstr, wlen, reason, surrogateescape); + return decode_current_locale(arg, wstr, wlen, reason, errors); #endif } #if defined(__APPLE__) || defined(__ANDROID__) return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason, - surrogateescape); + errors); #else int use_utf8 = (Py_UTF8Mode == 1); #ifdef MS_WINDOWS use_utf8 |= !Py_LegacyWindowsFSEncodingFlag; #endif if (use_utf8) { - return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, - reason, surrogateescape); + return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason, + errors); } #ifdef USE_FORCE_ASCII @@ -515,11 +550,11 @@ _Py_DecodeLocaleEx(const char* arg, wchar_t **wstr, size_t 
*wlen, if (force_ascii) { /* force ASCII encoding to workaround mbstowcs() issue */ - return decode_ascii(arg, wstr, wlen, reason, surrogateescape); + return decode_ascii(arg, wstr, wlen, reason, errors); } #endif - return decode_current_locale(arg, wstr, wlen, reason, surrogateescape); + return decode_current_locale(arg, wstr, wlen, reason, errors); #endif /* __APPLE__ or __ANDROID__ */ } @@ -547,8 +582,11 @@ wchar_t* Py_DecodeLocale(const char* arg, size_t *wlen) { wchar_t *wstr; - int res = _Py_DecodeLocaleEx(arg, &wstr, wlen, NULL, 0, 1); + int res = _Py_DecodeLocaleEx(arg, &wstr, wlen, + NULL, 0, + _Py_ERROR_SURROGATEESCAPE); if (res != 0) { + assert(res != -3); if (wlen != NULL) { *wlen = (size_t)res; } @@ -561,13 +599,18 @@ Py_DecodeLocale(const char* arg, size_t *wlen) static int encode_current_locale(const wchar_t *text, char **str, size_t *error_pos, const char **reason, - int raw_malloc, int surrogateescape) + int raw_malloc, _Py_error_handler errors) { const size_t len = wcslen(text); char *result = NULL, *bytes = NULL; size_t i, size, converted; wchar_t c, buf[2]; + int surrogateescape; + if (get_surrogateescape(errors, &surrogateescape) < 0) { + return -3; + } + /* The function works in two steps: 1. compute the length of the output buffer in bytes (size) 2. outputs the bytes */ @@ -646,32 +689,50 @@ encode_error: return -2; } + +/* Encode a string to the locale encoding. + + Parameters: + + * raw_malloc: if non-zero, allocate memory using PyMem_RawMalloc() instead + of PyMem_Malloc(). + * current_locale: if non-zero, use the current LC_CTYPE, otherwise use + Python filesystem encoding. + * errors: error handler like "strict" or "surrogateescape". + + Return value: + + 0: success, *str is set to a newly allocated encoded string. + -1: memory allocation failure + -2: encoding error, set *error_pos and *reason (if set). + -3: the error handler 'errors' is not supported. 
+ */ static int encode_locale_ex(const wchar_t *text, char **str, size_t *error_pos, const char **reason, - int raw_malloc, int current_locale, int surrogateescape) + int raw_malloc, int current_locale, _Py_error_handler errors) { if (current_locale) { #ifdef __ANDROID__ return _Py_EncodeUTF8Ex(text, str, error_pos, reason, - raw_malloc, surrogateescape); + raw_malloc, errors); #else return encode_current_locale(text, str, error_pos, reason, - raw_malloc, surrogateescape); + raw_malloc, errors); #endif } #if defined(__APPLE__) || defined(__ANDROID__) return _Py_EncodeUTF8Ex(text, str, error_pos, reason, - raw_malloc, surrogateescape); -#else /* __APPLE__ */ + raw_malloc, errors); +#else int use_utf8 = (Py_UTF8Mode == 1); #ifdef MS_WINDOWS use_utf8 |= !Py_LegacyWindowsFSEncodingFlag; #endif if (use_utf8) { return _Py_EncodeUTF8Ex(text, str, error_pos, reason, - raw_malloc, surrogateescape); + raw_malloc, errors); } #ifdef USE_FORCE_ASCII @@ -681,12 +742,12 @@ encode_locale_ex(const wchar_t *text, char **str, size_t *error_pos, if (force_ascii) { return encode_ascii(text, str, error_pos, reason, - raw_malloc, surrogateescape); + raw_malloc, errors); } #endif return encode_current_locale(text, str, error_pos, reason, - raw_malloc, surrogateescape); + raw_malloc, errors); #endif /* __APPLE__ or __ANDROID__ */ } @@ -696,7 +757,8 @@ encode_locale(const wchar_t *text, size_t *error_pos, { char *str; int res = encode_locale_ex(text, &str, error_pos, NULL, - raw_malloc, current_locale, 1); + raw_malloc, current_locale, + _Py_ERROR_SURROGATEESCAPE); if (res != -2 && error_pos) { *error_pos = (size_t)-1; } @@ -737,10 +799,10 @@ _Py_EncodeLocaleRaw(const wchar_t *text, size_t *error_pos) int _Py_EncodeLocaleEx(const wchar_t *text, char **str, size_t *error_pos, const char **reason, - int current_locale, int surrogateescape) + int current_locale, _Py_error_handler errors) { return encode_locale_ex(text, str, error_pos, reason, 1, - current_locale, surrogateescape); + 
current_locale, errors); }