bpo-34523: Support surrogatepass in locale codecs (GH-8995)
Add support for the "surrogatepass" error handler in PyUnicode_DecodeFSDefault() and PyUnicode_EncodeFSDefault() for the UTF-8 encoding.

Changes:

* _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex() now support the surrogatepass error handler (_Py_ERROR_SURROGATEPASS).
* _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx() now use the _Py_error_handler enum instead of "int surrogateescape" to pass the error handler. These functions now return -3 if the error handler is unknown.
* Add unit tests on _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx() in test_codecs.
* Rename get_error_handler() to _Py_GetErrorHandler() and expose it as a private function.
* _freeze_importlib doesn't need config.filesystem_errors="strict" workaround anymore.
This commit is contained in:
parent
c5989cd876
commit
3d4226a832
|
@ -5,6 +5,24 @@
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03080000
|
||||||
|
typedef enum {
|
||||||
|
_Py_ERROR_UNKNOWN=0,
|
||||||
|
_Py_ERROR_STRICT,
|
||||||
|
_Py_ERROR_SURROGATEESCAPE,
|
||||||
|
_Py_ERROR_REPLACE,
|
||||||
|
_Py_ERROR_IGNORE,
|
||||||
|
_Py_ERROR_BACKSLASHREPLACE,
|
||||||
|
_Py_ERROR_SURROGATEPASS,
|
||||||
|
_Py_ERROR_XMLCHARREFREPLACE,
|
||||||
|
_Py_ERROR_OTHER
|
||||||
|
} _Py_error_handler;
|
||||||
|
|
||||||
|
PyAPI_FUNC(_Py_error_handler) _Py_GetErrorHandler(const char *errors);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000
|
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000
|
||||||
PyAPI_FUNC(wchar_t *) Py_DecodeLocale(
|
PyAPI_FUNC(wchar_t *) Py_DecodeLocale(
|
||||||
const char *arg,
|
const char *arg,
|
||||||
|
@ -26,7 +44,7 @@ PyAPI_FUNC(int) _Py_DecodeUTF8Ex(
|
||||||
wchar_t **wstr,
|
wchar_t **wstr,
|
||||||
size_t *wlen,
|
size_t *wlen,
|
||||||
const char **reason,
|
const char **reason,
|
||||||
int surrogateescape);
|
_Py_error_handler errors);
|
||||||
|
|
||||||
PyAPI_FUNC(int) _Py_EncodeUTF8Ex(
|
PyAPI_FUNC(int) _Py_EncodeUTF8Ex(
|
||||||
const wchar_t *text,
|
const wchar_t *text,
|
||||||
|
@ -34,19 +52,22 @@ PyAPI_FUNC(int) _Py_EncodeUTF8Ex(
|
||||||
size_t *error_pos,
|
size_t *error_pos,
|
||||||
const char **reason,
|
const char **reason,
|
||||||
int raw_malloc,
|
int raw_malloc,
|
||||||
int surrogateescape);
|
_Py_error_handler errors);
|
||||||
|
|
||||||
PyAPI_FUNC(wchar_t*) _Py_DecodeUTF8_surrogateescape(
|
PyAPI_FUNC(wchar_t*) _Py_DecodeUTF8_surrogateescape(
|
||||||
const char *arg,
|
const char *arg,
|
||||||
Py_ssize_t arglen);
|
Py_ssize_t arglen);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03080000
|
||||||
PyAPI_FUNC(int) _Py_DecodeLocaleEx(
|
PyAPI_FUNC(int) _Py_DecodeLocaleEx(
|
||||||
const char *arg,
|
const char *arg,
|
||||||
wchar_t **wstr,
|
wchar_t **wstr,
|
||||||
size_t *wlen,
|
size_t *wlen,
|
||||||
const char **reason,
|
const char **reason,
|
||||||
int current_locale,
|
int current_locale,
|
||||||
int surrogateescape);
|
_Py_error_handler errors);
|
||||||
|
|
||||||
PyAPI_FUNC(int) _Py_EncodeLocaleEx(
|
PyAPI_FUNC(int) _Py_EncodeLocaleEx(
|
||||||
const wchar_t *text,
|
const wchar_t *text,
|
||||||
|
@ -54,7 +75,7 @@ PyAPI_FUNC(int) _Py_EncodeLocaleEx(
|
||||||
size_t *error_pos,
|
size_t *error_pos,
|
||||||
const char **reason,
|
const char **reason,
|
||||||
int current_locale,
|
int current_locale,
|
||||||
int surrogateescape);
|
_Py_error_handler errors);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef Py_LIMITED_API
|
#ifndef Py_LIMITED_API
|
||||||
|
|
|
@ -9,6 +9,11 @@ from unittest import mock
|
||||||
|
|
||||||
from test import support
|
from test import support
|
||||||
|
|
||||||
|
try:
|
||||||
|
import _testcapi
|
||||||
|
except ImportError as exc:
|
||||||
|
_testcapi = None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import ctypes
|
import ctypes
|
||||||
except ImportError:
|
except ImportError:
|
||||||
|
@ -2051,13 +2056,12 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
|
||||||
|
|
||||||
@support.cpython_only
|
@support.cpython_only
|
||||||
def test_basics_capi(self):
|
def test_basics_capi(self):
|
||||||
from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
|
|
||||||
s = "abc123" # all codecs should be able to encode these
|
s = "abc123" # all codecs should be able to encode these
|
||||||
for encoding in all_unicode_encodings:
|
for encoding in all_unicode_encodings:
|
||||||
if encoding not in broken_unicode_with_stateful:
|
if encoding not in broken_unicode_with_stateful:
|
||||||
# check incremental decoder/encoder (fetched via the C API)
|
# check incremental decoder/encoder (fetched via the C API)
|
||||||
try:
|
try:
|
||||||
cencoder = codec_incrementalencoder(encoding)
|
cencoder = _testcapi.codec_incrementalencoder(encoding)
|
||||||
except LookupError: # no IncrementalEncoder
|
except LookupError: # no IncrementalEncoder
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
|
@ -2066,7 +2070,7 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
|
||||||
for c in s:
|
for c in s:
|
||||||
encodedresult += cencoder.encode(c)
|
encodedresult += cencoder.encode(c)
|
||||||
encodedresult += cencoder.encode("", True)
|
encodedresult += cencoder.encode("", True)
|
||||||
cdecoder = codec_incrementaldecoder(encoding)
|
cdecoder = _testcapi.codec_incrementaldecoder(encoding)
|
||||||
decodedresult = ""
|
decodedresult = ""
|
||||||
for c in encodedresult:
|
for c in encodedresult:
|
||||||
decodedresult += cdecoder.decode(bytes([c]))
|
decodedresult += cdecoder.decode(bytes([c]))
|
||||||
|
@ -2077,12 +2081,12 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
|
||||||
if encoding not in ("idna", "mbcs"):
|
if encoding not in ("idna", "mbcs"):
|
||||||
# check incremental decoder/encoder with errors argument
|
# check incremental decoder/encoder with errors argument
|
||||||
try:
|
try:
|
||||||
cencoder = codec_incrementalencoder(encoding, "ignore")
|
cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
|
||||||
except LookupError: # no IncrementalEncoder
|
except LookupError: # no IncrementalEncoder
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
encodedresult = b"".join(cencoder.encode(c) for c in s)
|
encodedresult = b"".join(cencoder.encode(c) for c in s)
|
||||||
cdecoder = codec_incrementaldecoder(encoding, "ignore")
|
cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
|
||||||
decodedresult = "".join(cdecoder.decode(bytes([c]))
|
decodedresult = "".join(cdecoder.decode(bytes([c]))
|
||||||
for c in encodedresult)
|
for c in encodedresult)
|
||||||
self.assertEqual(decodedresult, s,
|
self.assertEqual(decodedresult, s,
|
||||||
|
@ -3263,5 +3267,109 @@ class Latin1Test(unittest.TestCase):
|
||||||
self.assertEqual(data.decode('latin1'), expected)
|
self.assertEqual(data.decode('latin1'), expected)
|
||||||
|
|
||||||
|
|
||||||
|
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
|
||||||
|
class LocaleCodecTest(unittest.TestCase):
|
||||||
|
"""
|
||||||
|
Test indirectly _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex().
|
||||||
|
"""
|
||||||
|
ENCODING = sys.getfilesystemencoding()
|
||||||
|
STRINGS = ("ascii", "ulatin1:\xa7\xe9",
|
||||||
|
"u255:\xff",
|
||||||
|
"UCS:\xe9\u20ac\U0010ffff",
|
||||||
|
"surrogates:\uDC80\uDCFF")
|
||||||
|
BYTES_STRINGS = (b"blatin1:\xa7\xe9", b"b255:\xff")
|
||||||
|
SURROGATES = "\uDC80\uDCFF"
|
||||||
|
|
||||||
|
def encode(self, text, errors="strict"):
|
||||||
|
return _testcapi.EncodeLocaleEx(text, 0, errors)
|
||||||
|
|
||||||
|
def check_encode_strings(self, errors):
|
||||||
|
for text in self.STRINGS:
|
||||||
|
with self.subTest(text=text):
|
||||||
|
try:
|
||||||
|
expected = text.encode(self.ENCODING, errors)
|
||||||
|
except UnicodeEncodeError:
|
||||||
|
with self.assertRaises(RuntimeError) as cm:
|
||||||
|
self.encode(self.SURROGATES)
|
||||||
|
errmsg = str(cm.exception)
|
||||||
|
self.assertTrue(errmsg.startswith("encode error: pos=0, reason="), errmsg)
|
||||||
|
else:
|
||||||
|
encoded = self.encode(text, errors)
|
||||||
|
self.assertEqual(encoded, expected)
|
||||||
|
|
||||||
|
def test_encode_strict(self):
|
||||||
|
self.check_encode_strings("strict")
|
||||||
|
|
||||||
|
def test_encode_surrogateescape(self):
|
||||||
|
self.check_encode_strings("surrogateescape")
|
||||||
|
|
||||||
|
def test_encode_surrogatepass(self):
|
||||||
|
try:
|
||||||
|
self.encode('', 'surrogatepass')
|
||||||
|
except ValueError as exc:
|
||||||
|
if str(exc) == 'unsupported error handler':
|
||||||
|
self.skipTest(f"{self.ENCODING!r} encoder doesn't support "
|
||||||
|
f"surrogatepass error handler")
|
||||||
|
else:
|
||||||
|
raise
|
||||||
|
|
||||||
|
self.check_encode_strings("surrogatepass")
|
||||||
|
|
||||||
|
def decode(self, encoded, errors="strict"):
|
||||||
|
return _testcapi.DecodeLocaleEx(encoded, 0, errors)
|
||||||
|
|
||||||
|
def check_decode_strings(self, errors):
|
||||||
|
is_utf8 = (self.ENCODING == "utf-8")
|
||||||
|
if is_utf8:
|
||||||
|
encode_errors = 'surrogateescape'
|
||||||
|
else:
|
||||||
|
encode_errors = 'strict'
|
||||||
|
|
||||||
|
strings = list(self.BYTES_STRINGS)
|
||||||
|
for text in self.STRINGS:
|
||||||
|
try:
|
||||||
|
encoded = text.encode(self.ENCODING, encode_errors)
|
||||||
|
if encoded not in strings:
|
||||||
|
strings.append(encoded)
|
||||||
|
except UnicodeEncodeError:
|
||||||
|
encoded = None
|
||||||
|
|
||||||
|
if is_utf8:
|
||||||
|
encoded2 = text.encode(self.ENCODING, 'surrogatepass')
|
||||||
|
if encoded2 != encoded:
|
||||||
|
strings.append(encoded2)
|
||||||
|
|
||||||
|
for encoded in strings:
|
||||||
|
with self.subTest(encoded=encoded):
|
||||||
|
try:
|
||||||
|
expected = encoded.decode(self.ENCODING, errors)
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
with self.assertRaises(RuntimeError) as cm:
|
||||||
|
self.decode(encoded, errors)
|
||||||
|
errmsg = str(cm.exception)
|
||||||
|
self.assertTrue(errmsg.startswith("decode error: "), errmsg)
|
||||||
|
else:
|
||||||
|
decoded = self.decode(encoded, errors)
|
||||||
|
self.assertEqual(decoded, expected)
|
||||||
|
|
||||||
|
def test_decode_strict(self):
|
||||||
|
self.check_decode_strings("strict")
|
||||||
|
|
||||||
|
def test_decode_surrogateescape(self):
|
||||||
|
self.check_decode_strings("surrogateescape")
|
||||||
|
|
||||||
|
def test_decode_surrogatepass(self):
|
||||||
|
try:
|
||||||
|
self.decode(b'', 'surrogatepass')
|
||||||
|
except ValueError as exc:
|
||||||
|
if str(exc) == 'unsupported error handler':
|
||||||
|
self.skipTest(f"{self.ENCODING!r} decoder doesn't support "
|
||||||
|
f"surrogatepass error handler")
|
||||||
|
else:
|
||||||
|
raise
|
||||||
|
|
||||||
|
self.check_decode_strings("surrogatepass")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|
|
@ -4550,6 +4550,98 @@ new_hamt(PyObject *self, PyObject *args)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static PyObject *
|
||||||
|
encode_locale_ex(PyObject *self, PyObject *args)
|
||||||
|
{
|
||||||
|
PyObject *unicode;
|
||||||
|
int current_locale = 0;
|
||||||
|
wchar_t *wstr;
|
||||||
|
PyObject *res = NULL;
|
||||||
|
const char *errors = NULL;
|
||||||
|
|
||||||
|
if (!PyArg_ParseTuple(args, "U|is", &unicode, &current_locale, &errors)) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
wstr = PyUnicode_AsWideCharString(unicode, NULL);
|
||||||
|
if (wstr == NULL) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
_Py_error_handler error_handler = _Py_GetErrorHandler(errors);
|
||||||
|
|
||||||
|
char *str = NULL;
|
||||||
|
size_t error_pos;
|
||||||
|
const char *reason = NULL;
|
||||||
|
int ret = _Py_EncodeLocaleEx(wstr,
|
||||||
|
&str, &error_pos, &reason,
|
||||||
|
current_locale, error_handler);
|
||||||
|
PyMem_Free(wstr);
|
||||||
|
|
||||||
|
switch(ret) {
|
||||||
|
case 0:
|
||||||
|
res = PyBytes_FromString(str);
|
||||||
|
PyMem_RawFree(str);
|
||||||
|
break;
|
||||||
|
case -1:
|
||||||
|
PyErr_NoMemory();
|
||||||
|
break;
|
||||||
|
case -2:
|
||||||
|
PyErr_Format(PyExc_RuntimeError, "encode error: pos=%zu, reason=%s",
|
||||||
|
error_pos, reason);
|
||||||
|
break;
|
||||||
|
case -3:
|
||||||
|
PyErr_SetString(PyExc_ValueError, "unsupported error handler");
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
PyErr_SetString(PyExc_ValueError, "unknow error code");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static PyObject *
|
||||||
|
decode_locale_ex(PyObject *self, PyObject *args)
|
||||||
|
{
|
||||||
|
char *str;
|
||||||
|
int current_locale = 0;
|
||||||
|
PyObject *res = NULL;
|
||||||
|
const char *errors = NULL;
|
||||||
|
|
||||||
|
if (!PyArg_ParseTuple(args, "y|is", &str, &current_locale, &errors)) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
_Py_error_handler error_handler = _Py_GetErrorHandler(errors);
|
||||||
|
|
||||||
|
wchar_t *wstr = NULL;
|
||||||
|
size_t wlen = 0;
|
||||||
|
const char *reason = NULL;
|
||||||
|
int ret = _Py_DecodeLocaleEx(str,
|
||||||
|
&wstr, &wlen, &reason,
|
||||||
|
current_locale, error_handler);
|
||||||
|
|
||||||
|
switch(ret) {
|
||||||
|
case 0:
|
||||||
|
res = PyUnicode_FromWideChar(wstr, wlen);
|
||||||
|
PyMem_RawFree(wstr);
|
||||||
|
break;
|
||||||
|
case -1:
|
||||||
|
PyErr_NoMemory();
|
||||||
|
break;
|
||||||
|
case -2:
|
||||||
|
PyErr_Format(PyExc_RuntimeError, "decode error: pos=%zu, reason=%s",
|
||||||
|
wlen, reason);
|
||||||
|
break;
|
||||||
|
case -3:
|
||||||
|
PyErr_SetString(PyExc_ValueError, "unsupported error handler");
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
PyErr_SetString(PyExc_ValueError, "unknow error code");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
static PyMethodDef TestMethods[] = {
|
static PyMethodDef TestMethods[] = {
|
||||||
{"raise_exception", raise_exception, METH_VARARGS},
|
{"raise_exception", raise_exception, METH_VARARGS},
|
||||||
{"raise_memoryerror", raise_memoryerror, METH_NOARGS},
|
{"raise_memoryerror", raise_memoryerror, METH_NOARGS},
|
||||||
|
@ -4771,6 +4863,8 @@ static PyMethodDef TestMethods[] = {
|
||||||
{"get_mapping_items", get_mapping_items, METH_O},
|
{"get_mapping_items", get_mapping_items, METH_O},
|
||||||
{"test_pythread_tss_key_state", test_pythread_tss_key_state, METH_VARARGS},
|
{"test_pythread_tss_key_state", test_pythread_tss_key_state, METH_VARARGS},
|
||||||
{"hamt", new_hamt, METH_NOARGS},
|
{"hamt", new_hamt, METH_NOARGS},
|
||||||
|
{"EncodeLocaleEx", encode_locale_ex, METH_VARARGS},
|
||||||
|
{"DecodeLocaleEx", decode_locale_ex, METH_VARARGS},
|
||||||
{NULL, NULL} /* sentinel */
|
{NULL, NULL} /* sentinel */
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -313,7 +313,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
|
||||||
Py_ssize_t startpos, endpos, newpos;
|
Py_ssize_t startpos, endpos, newpos;
|
||||||
Py_ssize_t k;
|
Py_ssize_t k;
|
||||||
if (error_handler == _Py_ERROR_UNKNOWN) {
|
if (error_handler == _Py_ERROR_UNKNOWN) {
|
||||||
error_handler = get_error_handler(errors);
|
error_handler = _Py_GetErrorHandler(errors);
|
||||||
}
|
}
|
||||||
|
|
||||||
startpos = i-1;
|
startpos = i-1;
|
||||||
|
|
|
@ -318,20 +318,8 @@ static int convert_uc(PyObject *obj, void *addr);
|
||||||
|
|
||||||
#include "clinic/unicodeobject.c.h"
|
#include "clinic/unicodeobject.c.h"
|
||||||
|
|
||||||
typedef enum {
|
_Py_error_handler
|
||||||
_Py_ERROR_UNKNOWN=0,
|
_Py_GetErrorHandler(const char *errors)
|
||||||
_Py_ERROR_STRICT,
|
|
||||||
_Py_ERROR_SURROGATEESCAPE,
|
|
||||||
_Py_ERROR_REPLACE,
|
|
||||||
_Py_ERROR_IGNORE,
|
|
||||||
_Py_ERROR_BACKSLASHREPLACE,
|
|
||||||
_Py_ERROR_SURROGATEPASS,
|
|
||||||
_Py_ERROR_XMLCHARREFREPLACE,
|
|
||||||
_Py_ERROR_OTHER
|
|
||||||
} _Py_error_handler;
|
|
||||||
|
|
||||||
static _Py_error_handler
|
|
||||||
get_error_handler(const char *errors)
|
|
||||||
{
|
{
|
||||||
if (errors == NULL || strcmp(errors, "strict") == 0) {
|
if (errors == NULL || strcmp(errors, "strict") == 0) {
|
||||||
return _Py_ERROR_STRICT;
|
return _Py_ERROR_STRICT;
|
||||||
|
@ -3327,34 +3315,12 @@ PyUnicode_AsEncodedObject(PyObject *unicode,
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int
|
|
||||||
locale_error_handler(const char *errors, int *surrogateescape)
|
|
||||||
{
|
|
||||||
_Py_error_handler error_handler = get_error_handler(errors);
|
|
||||||
switch (error_handler)
|
|
||||||
{
|
|
||||||
case _Py_ERROR_STRICT:
|
|
||||||
*surrogateescape = 0;
|
|
||||||
return 0;
|
|
||||||
case _Py_ERROR_SURROGATEESCAPE:
|
|
||||||
*surrogateescape = 1;
|
|
||||||
return 0;
|
|
||||||
default:
|
|
||||||
PyErr_Format(PyExc_ValueError,
|
|
||||||
"only 'strict' and 'surrogateescape' error handlers "
|
|
||||||
"are supported, not '%s'",
|
|
||||||
errors);
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
unicode_encode_locale(PyObject *unicode, const char *errors,
|
unicode_encode_locale(PyObject *unicode, const char *errors,
|
||||||
int current_locale)
|
int current_locale)
|
||||||
{
|
{
|
||||||
int surrogateescape;
|
_Py_error_handler error_handler = _Py_GetErrorHandler(errors);
|
||||||
if (locale_error_handler(errors, &surrogateescape) < 0)
|
|
||||||
return NULL;
|
|
||||||
|
|
||||||
Py_ssize_t wlen;
|
Py_ssize_t wlen;
|
||||||
wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
|
wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
|
||||||
|
@ -3373,7 +3339,7 @@ unicode_encode_locale(PyObject *unicode, const char *errors,
|
||||||
size_t error_pos;
|
size_t error_pos;
|
||||||
const char *reason;
|
const char *reason;
|
||||||
int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
|
int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
|
||||||
current_locale, surrogateescape);
|
current_locale, error_handler);
|
||||||
if (res != 0) {
|
if (res != 0) {
|
||||||
if (res == -2) {
|
if (res == -2) {
|
||||||
PyObject *exc;
|
PyObject *exc;
|
||||||
|
@ -3388,6 +3354,9 @@ unicode_encode_locale(PyObject *unicode, const char *errors,
|
||||||
}
|
}
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
else if (res == -3) {
|
||||||
|
PyErr_SetString(PyExc_ValueError, "unsupported error handler");
|
||||||
|
}
|
||||||
else {
|
else {
|
||||||
PyErr_NoMemory();
|
PyErr_NoMemory();
|
||||||
PyMem_Free(wstr);
|
PyMem_Free(wstr);
|
||||||
|
@ -3571,9 +3540,7 @@ static PyObject*
|
||||||
unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
|
unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
|
||||||
int current_locale)
|
int current_locale)
|
||||||
{
|
{
|
||||||
int surrogateescape;
|
_Py_error_handler error_handler = _Py_GetErrorHandler(errors);
|
||||||
if (locale_error_handler(errors, &surrogateescape) < 0)
|
|
||||||
return NULL;
|
|
||||||
|
|
||||||
if (str[len] != '\0' || (size_t)len != strlen(str)) {
|
if (str[len] != '\0' || (size_t)len != strlen(str)) {
|
||||||
PyErr_SetString(PyExc_ValueError, "embedded null byte");
|
PyErr_SetString(PyExc_ValueError, "embedded null byte");
|
||||||
|
@ -3584,7 +3551,7 @@ unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
|
||||||
size_t wlen;
|
size_t wlen;
|
||||||
const char *reason;
|
const char *reason;
|
||||||
int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
|
int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
|
||||||
current_locale, surrogateescape);
|
current_locale, error_handler);
|
||||||
if (res != 0) {
|
if (res != 0) {
|
||||||
if (res == -2) {
|
if (res == -2) {
|
||||||
PyObject *exc;
|
PyObject *exc;
|
||||||
|
@ -3598,6 +3565,9 @@ unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
|
||||||
Py_DECREF(exc);
|
Py_DECREF(exc);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
else if (res == -3) {
|
||||||
|
PyErr_SetString(PyExc_ValueError, "unsupported error handler");
|
||||||
|
}
|
||||||
else {
|
else {
|
||||||
PyErr_NoMemory();
|
PyErr_NoMemory();
|
||||||
}
|
}
|
||||||
|
@ -4863,7 +4833,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
|
||||||
}
|
}
|
||||||
|
|
||||||
if (error_handler == _Py_ERROR_UNKNOWN)
|
if (error_handler == _Py_ERROR_UNKNOWN)
|
||||||
error_handler = get_error_handler(errors);
|
error_handler = _Py_GetErrorHandler(errors);
|
||||||
|
|
||||||
switch (error_handler) {
|
switch (error_handler) {
|
||||||
case _Py_ERROR_IGNORE:
|
case _Py_ERROR_IGNORE:
|
||||||
|
@ -4932,13 +4902,29 @@ onError:
|
||||||
is not NULL, write the decoding error message into *reason. */
|
is not NULL, write the decoding error message into *reason. */
|
||||||
int
|
int
|
||||||
_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
|
_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
|
||||||
const char **reason, int surrogateescape)
|
const char **reason, _Py_error_handler errors)
|
||||||
{
|
{
|
||||||
const char *orig_s = s;
|
const char *orig_s = s;
|
||||||
const char *e;
|
const char *e;
|
||||||
wchar_t *unicode;
|
wchar_t *unicode;
|
||||||
Py_ssize_t outpos;
|
Py_ssize_t outpos;
|
||||||
|
|
||||||
|
int surrogateescape = 0;
|
||||||
|
int surrogatepass = 0;
|
||||||
|
switch (errors)
|
||||||
|
{
|
||||||
|
case _Py_ERROR_STRICT:
|
||||||
|
break;
|
||||||
|
case _Py_ERROR_SURROGATEESCAPE:
|
||||||
|
surrogateescape = 1;
|
||||||
|
break;
|
||||||
|
case _Py_ERROR_SURROGATEPASS:
|
||||||
|
surrogatepass = 1;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
return -3;
|
||||||
|
}
|
||||||
|
|
||||||
/* Note: size will always be longer than the resulting Unicode
|
/* Note: size will always be longer than the resulting Unicode
|
||||||
character count */
|
character count */
|
||||||
if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
|
if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
|
||||||
|
@ -4971,31 +4957,47 @@ _Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
if (!ch && s == e)
|
if (!ch && s == e) {
|
||||||
break;
|
break;
|
||||||
if (!surrogateescape) {
|
|
||||||
PyMem_RawFree(unicode );
|
|
||||||
if (reason != NULL) {
|
|
||||||
switch (ch) {
|
|
||||||
case 0:
|
|
||||||
*reason = "unexpected end of data";
|
|
||||||
break;
|
|
||||||
case 1:
|
|
||||||
*reason = "invalid start byte";
|
|
||||||
break;
|
|
||||||
/* 2, 3, 4 */
|
|
||||||
default:
|
|
||||||
*reason = "invalid continuation byte";
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (wlen != NULL) {
|
|
||||||
*wlen = s - orig_s;
|
|
||||||
}
|
|
||||||
return -2;
|
|
||||||
}
|
}
|
||||||
/* surrogateescape */
|
|
||||||
unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
|
if (surrogateescape) {
|
||||||
|
unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
/* Is it a valid three-byte code? */
|
||||||
|
if (surrogatepass
|
||||||
|
&& (e - s) >= 3
|
||||||
|
&& (s[0] & 0xf0) == 0xe0
|
||||||
|
&& (s[1] & 0xc0) == 0x80
|
||||||
|
&& (s[2] & 0xc0) == 0x80)
|
||||||
|
{
|
||||||
|
ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
|
||||||
|
s += 3;
|
||||||
|
unicode[outpos++] = ch;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
PyMem_RawFree(unicode );
|
||||||
|
if (reason != NULL) {
|
||||||
|
switch (ch) {
|
||||||
|
case 0:
|
||||||
|
*reason = "unexpected end of data";
|
||||||
|
break;
|
||||||
|
case 1:
|
||||||
|
*reason = "invalid start byte";
|
||||||
|
break;
|
||||||
|
/* 2, 3, 4 */
|
||||||
|
default:
|
||||||
|
*reason = "invalid continuation byte";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (wlen != NULL) {
|
||||||
|
*wlen = s - orig_s;
|
||||||
|
}
|
||||||
|
return -2;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
unicode[outpos] = L'\0';
|
unicode[outpos] = L'\0';
|
||||||
|
@ -5030,13 +5032,29 @@ _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen)
|
||||||
On memory allocation failure, return -1. */
|
On memory allocation failure, return -1. */
|
||||||
int
|
int
|
||||||
_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
|
_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
|
||||||
const char **reason, int raw_malloc, int surrogateescape)
|
const char **reason, int raw_malloc, _Py_error_handler errors)
|
||||||
{
|
{
|
||||||
const Py_ssize_t max_char_size = 4;
|
const Py_ssize_t max_char_size = 4;
|
||||||
Py_ssize_t len = wcslen(text);
|
Py_ssize_t len = wcslen(text);
|
||||||
|
|
||||||
assert(len >= 0);
|
assert(len >= 0);
|
||||||
|
|
||||||
|
int surrogateescape = 0;
|
||||||
|
int surrogatepass = 0;
|
||||||
|
switch (errors)
|
||||||
|
{
|
||||||
|
case _Py_ERROR_STRICT:
|
||||||
|
break;
|
||||||
|
case _Py_ERROR_SURROGATEESCAPE:
|
||||||
|
surrogateescape = 1;
|
||||||
|
break;
|
||||||
|
case _Py_ERROR_SURROGATEPASS:
|
||||||
|
surrogatepass = 1;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
return -3;
|
||||||
|
}
|
||||||
|
|
||||||
if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
|
if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
@ -5053,8 +5071,19 @@ _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
|
||||||
|
|
||||||
char *p = bytes;
|
char *p = bytes;
|
||||||
Py_ssize_t i;
|
Py_ssize_t i;
|
||||||
for (i = 0; i < len; i++) {
|
for (i = 0; i < len; ) {
|
||||||
|
Py_ssize_t ch_pos = i;
|
||||||
Py_UCS4 ch = text[i];
|
Py_UCS4 ch = text[i];
|
||||||
|
i++;
|
||||||
|
#if Py_UNICODE_SIZE == 2
|
||||||
|
if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
|
||||||
|
&& i < len
|
||||||
|
&& Py_UNICODE_IS_LOW_SURROGATE(text[i]))
|
||||||
|
{
|
||||||
|
ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
if (ch < 0x80) {
|
if (ch < 0x80) {
|
||||||
/* Encode ASCII */
|
/* Encode ASCII */
|
||||||
|
@ -5066,11 +5095,11 @@ _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
|
||||||
*p++ = (char)(0xc0 | (ch >> 6));
|
*p++ = (char)(0xc0 | (ch >> 6));
|
||||||
*p++ = (char)(0x80 | (ch & 0x3f));
|
*p++ = (char)(0x80 | (ch & 0x3f));
|
||||||
}
|
}
|
||||||
else if (Py_UNICODE_IS_SURROGATE(ch)) {
|
else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
|
||||||
/* surrogateescape error handler */
|
/* surrogateescape error handler */
|
||||||
if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
|
if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
|
||||||
if (error_pos != NULL) {
|
if (error_pos != NULL) {
|
||||||
*error_pos = (size_t)i;
|
*error_pos = (size_t)ch_pos;
|
||||||
}
|
}
|
||||||
if (reason != NULL) {
|
if (reason != NULL) {
|
||||||
*reason = "encoding error";
|
*reason = "encoding error";
|
||||||
|
@ -6741,7 +6770,7 @@ unicode_encode_ucs1(PyObject *unicode,
|
||||||
|
|
||||||
/* cache callback name lookup (if not done yet, i.e. it's the first error) */
|
/* cache callback name lookup (if not done yet, i.e. it's the first error) */
|
||||||
if (error_handler == _Py_ERROR_UNKNOWN)
|
if (error_handler == _Py_ERROR_UNKNOWN)
|
||||||
error_handler = get_error_handler(errors);
|
error_handler = _Py_GetErrorHandler(errors);
|
||||||
|
|
||||||
switch (error_handler) {
|
switch (error_handler) {
|
||||||
case _Py_ERROR_STRICT:
|
case _Py_ERROR_STRICT:
|
||||||
|
@ -6945,7 +6974,7 @@ PyUnicode_DecodeASCII(const char *s,
|
||||||
/* byte outside range 0x00..0x7f: call the error handler */
|
/* byte outside range 0x00..0x7f: call the error handler */
|
||||||
|
|
||||||
if (error_handler == _Py_ERROR_UNKNOWN)
|
if (error_handler == _Py_ERROR_UNKNOWN)
|
||||||
error_handler = get_error_handler(errors);
|
error_handler = _Py_GetErrorHandler(errors);
|
||||||
|
|
||||||
switch (error_handler)
|
switch (error_handler)
|
||||||
{
|
{
|
||||||
|
@ -8404,7 +8433,7 @@ charmap_encoding_error(
|
||||||
/* cache callback name lookup
|
/* cache callback name lookup
|
||||||
* (if not done yet, i.e. it's the first error) */
|
* (if not done yet, i.e. it's the first error) */
|
||||||
if (*error_handler == _Py_ERROR_UNKNOWN)
|
if (*error_handler == _Py_ERROR_UNKNOWN)
|
||||||
*error_handler = get_error_handler(errors);
|
*error_handler = _Py_GetErrorHandler(errors);
|
||||||
|
|
||||||
switch (*error_handler) {
|
switch (*error_handler) {
|
||||||
case _Py_ERROR_STRICT:
|
case _Py_ERROR_STRICT:
|
||||||
|
|
|
@ -82,14 +82,6 @@ main(int argc, char *argv[])
|
||||||
/* Don't install importlib, since it could execute outdated bytecode. */
|
/* Don't install importlib, since it could execute outdated bytecode. */
|
||||||
config._install_importlib = 0;
|
config._install_importlib = 0;
|
||||||
config._frozen = 1;
|
config._frozen = 1;
|
||||||
#ifdef MS_WINDOWS
|
|
||||||
/* bpo-34523: initfsencoding() is not called if _install_importlib=0,
|
|
||||||
so interp->fscodec_initialized value remains 0.
|
|
||||||
PyUnicode_EncodeFSDefault() doesn't support the "surrogatepass" error
|
|
||||||
handler in such case, whereas it's the default error handler on Windows.
|
|
||||||
Force the "strict" error handler to work around this bootstrap issue. */
|
|
||||||
config.filesystem_errors = "strict";
|
|
||||||
#endif
|
|
||||||
|
|
||||||
_PyInitError err = _Py_InitializeFromConfig(&config);
|
_PyInitError err = _Py_InitializeFromConfig(&config);
|
||||||
/* No need to call _PyCoreConfig_Clear() since we didn't allocate any
|
/* No need to call _PyCoreConfig_Clear() since we didn't allocate any
|
||||||
|
|
|
@ -32,6 +32,24 @@ extern int winerror_to_errno(int);
|
||||||
int _Py_open_cloexec_works = -1;
|
int _Py_open_cloexec_works = -1;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
static int
|
||||||
|
get_surrogateescape(_Py_error_handler errors, int *surrogateescape)
|
||||||
|
{
|
||||||
|
switch (errors)
|
||||||
|
{
|
||||||
|
case _Py_ERROR_STRICT:
|
||||||
|
*surrogateescape = 0;
|
||||||
|
return 0;
|
||||||
|
case _Py_ERROR_SURROGATEESCAPE:
|
||||||
|
*surrogateescape = 1;
|
||||||
|
return 0;
|
||||||
|
default:
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
PyObject *
|
PyObject *
|
||||||
_Py_device_encoding(int fd)
|
_Py_device_encoding(int fd)
|
||||||
{
|
{
|
||||||
|
@ -215,12 +233,17 @@ _Py_GetForceASCII(void)
|
||||||
static int
|
static int
|
||||||
encode_ascii(const wchar_t *text, char **str,
|
encode_ascii(const wchar_t *text, char **str,
|
||||||
size_t *error_pos, const char **reason,
|
size_t *error_pos, const char **reason,
|
||||||
int raw_malloc, int surrogateescape)
|
int raw_malloc, _Py_error_handler errors)
|
||||||
{
|
{
|
||||||
char *result = NULL, *out;
|
char *result = NULL, *out;
|
||||||
size_t len, i;
|
size_t len, i;
|
||||||
wchar_t ch;
|
wchar_t ch;
|
||||||
|
|
||||||
|
int surrogateescape;
|
||||||
|
if (get_surrogateescape(errors, &surrogateescape) < 0) {
|
||||||
|
return -3;
|
||||||
|
}
|
||||||
|
|
||||||
len = wcslen(text);
|
len = wcslen(text);
|
||||||
|
|
||||||
/* +1 for NULL byte */
|
/* +1 for NULL byte */
|
||||||
|
@ -278,13 +301,18 @@ _Py_GetForceASCII(void)
|
||||||
#if !defined(HAVE_MBRTOWC) || defined(USE_FORCE_ASCII)
|
#if !defined(HAVE_MBRTOWC) || defined(USE_FORCE_ASCII)
|
||||||
static int
|
static int
|
||||||
decode_ascii(const char *arg, wchar_t **wstr, size_t *wlen,
|
decode_ascii(const char *arg, wchar_t **wstr, size_t *wlen,
|
||||||
const char **reason, int surrogateescape)
|
const char **reason, _Py_error_handler errors)
|
||||||
{
|
{
|
||||||
wchar_t *res;
|
wchar_t *res;
|
||||||
unsigned char *in;
|
unsigned char *in;
|
||||||
wchar_t *out;
|
wchar_t *out;
|
||||||
size_t argsize = strlen(arg) + 1;
|
size_t argsize = strlen(arg) + 1;
|
||||||
|
|
||||||
|
int surrogateescape;
|
||||||
|
if (get_surrogateescape(errors, &surrogateescape) < 0) {
|
||||||
|
return -3;
|
||||||
|
}
|
||||||
|
|
||||||
if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t)) {
|
if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t)) {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
@ -325,7 +353,7 @@ decode_ascii(const char *arg, wchar_t **wstr, size_t *wlen,
|
||||||
|
|
||||||
static int
|
static int
|
||||||
decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
|
decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
|
||||||
const char **reason, int surrogateescape)
|
const char **reason, _Py_error_handler errors)
|
||||||
{
|
{
|
||||||
wchar_t *res;
|
wchar_t *res;
|
||||||
size_t argsize;
|
size_t argsize;
|
||||||
|
@ -336,6 +364,11 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
|
||||||
mbstate_t mbs;
|
mbstate_t mbs;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
int surrogateescape;
|
||||||
|
if (get_surrogateescape(errors, &surrogateescape) < 0) {
|
||||||
|
return -3;
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef HAVE_BROKEN_MBSTOWCS
|
#ifdef HAVE_BROKEN_MBSTOWCS
|
||||||
/* Some platforms have a broken implementation of
|
/* Some platforms have a broken implementation of
|
||||||
* mbstowcs which does not count the characters that
|
* mbstowcs which does not count the characters that
|
||||||
|
@ -456,7 +489,7 @@ decode_error:
|
||||||
/* Cannot use C locale for escaping; manually escape as if charset
|
/* Cannot use C locale for escaping; manually escape as if charset
|
||||||
is ASCII (i.e. escape all bytes > 128. This will still roundtrip
|
is ASCII (i.e. escape all bytes > 128. This will still roundtrip
|
||||||
correctly in the locale's charset, which must be an ASCII superset. */
|
correctly in the locale's charset, which must be an ASCII superset. */
|
||||||
return decode_ascii(arg, wstr, wlen, reason, surrogateescape);
|
return decode_ascii(arg, wstr, wlen, reason, errors);
|
||||||
#endif /* HAVE_MBRTOWC */
|
#endif /* HAVE_MBRTOWC */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -479,33 +512,35 @@ decode_error:
|
||||||
invalid byte sequence in the input string into *wlen. If reason is not NULL,
|
invalid byte sequence in the input string into *wlen. If reason is not NULL,
|
||||||
write the decoding error message into *reason.
|
write the decoding error message into *reason.
|
||||||
|
|
||||||
|
Return -3 if the error handler 'errors' is not supported.
|
||||||
|
|
||||||
Use the Py_EncodeLocaleEx() function to encode the character string back to
|
Use the Py_EncodeLocaleEx() function to encode the character string back to
|
||||||
a byte string. */
|
a byte string. */
|
||||||
int
|
int
|
||||||
_Py_DecodeLocaleEx(const char* arg, wchar_t **wstr, size_t *wlen,
|
_Py_DecodeLocaleEx(const char* arg, wchar_t **wstr, size_t *wlen,
|
||||||
const char **reason,
|
const char **reason,
|
||||||
int current_locale, int surrogateescape)
|
int current_locale, _Py_error_handler errors)
|
||||||
{
|
{
|
||||||
if (current_locale) {
|
if (current_locale) {
|
||||||
#ifdef __ANDROID__
|
#ifdef __ANDROID__
|
||||||
return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason,
|
return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason,
|
||||||
surrogateescape);
|
errors);
|
||||||
#else
|
#else
|
||||||
return decode_current_locale(arg, wstr, wlen, reason, surrogateescape);
|
return decode_current_locale(arg, wstr, wlen, reason, errors);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(__APPLE__) || defined(__ANDROID__)
|
#if defined(__APPLE__) || defined(__ANDROID__)
|
||||||
return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason,
|
return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason,
|
||||||
surrogateescape);
|
errors);
|
||||||
#else
|
#else
|
||||||
int use_utf8 = (Py_UTF8Mode == 1);
|
int use_utf8 = (Py_UTF8Mode == 1);
|
||||||
#ifdef MS_WINDOWS
|
#ifdef MS_WINDOWS
|
||||||
use_utf8 |= !Py_LegacyWindowsFSEncodingFlag;
|
use_utf8 |= !Py_LegacyWindowsFSEncodingFlag;
|
||||||
#endif
|
#endif
|
||||||
if (use_utf8) {
|
if (use_utf8) {
|
||||||
return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen,
|
return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason,
|
||||||
reason, surrogateescape);
|
errors);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef USE_FORCE_ASCII
|
#ifdef USE_FORCE_ASCII
|
||||||
|
@ -515,11 +550,11 @@ _Py_DecodeLocaleEx(const char* arg, wchar_t **wstr, size_t *wlen,
|
||||||
|
|
||||||
if (force_ascii) {
|
if (force_ascii) {
|
||||||
/* force ASCII encoding to workaround mbstowcs() issue */
|
/* force ASCII encoding to workaround mbstowcs() issue */
|
||||||
return decode_ascii(arg, wstr, wlen, reason, surrogateescape);
|
return decode_ascii(arg, wstr, wlen, reason, errors);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return decode_current_locale(arg, wstr, wlen, reason, surrogateescape);
|
return decode_current_locale(arg, wstr, wlen, reason, errors);
|
||||||
#endif /* __APPLE__ or __ANDROID__ */
|
#endif /* __APPLE__ or __ANDROID__ */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -547,8 +582,11 @@ wchar_t*
|
||||||
Py_DecodeLocale(const char* arg, size_t *wlen)
|
Py_DecodeLocale(const char* arg, size_t *wlen)
|
||||||
{
|
{
|
||||||
wchar_t *wstr;
|
wchar_t *wstr;
|
||||||
int res = _Py_DecodeLocaleEx(arg, &wstr, wlen, NULL, 0, 1);
|
int res = _Py_DecodeLocaleEx(arg, &wstr, wlen,
|
||||||
|
NULL, 0,
|
||||||
|
_Py_ERROR_SURROGATEESCAPE);
|
||||||
if (res != 0) {
|
if (res != 0) {
|
||||||
|
assert(res != -3);
|
||||||
if (wlen != NULL) {
|
if (wlen != NULL) {
|
||||||
*wlen = (size_t)res;
|
*wlen = (size_t)res;
|
||||||
}
|
}
|
||||||
|
@ -561,13 +599,18 @@ Py_DecodeLocale(const char* arg, size_t *wlen)
|
||||||
static int
|
static int
|
||||||
encode_current_locale(const wchar_t *text, char **str,
|
encode_current_locale(const wchar_t *text, char **str,
|
||||||
size_t *error_pos, const char **reason,
|
size_t *error_pos, const char **reason,
|
||||||
int raw_malloc, int surrogateescape)
|
int raw_malloc, _Py_error_handler errors)
|
||||||
{
|
{
|
||||||
const size_t len = wcslen(text);
|
const size_t len = wcslen(text);
|
||||||
char *result = NULL, *bytes = NULL;
|
char *result = NULL, *bytes = NULL;
|
||||||
size_t i, size, converted;
|
size_t i, size, converted;
|
||||||
wchar_t c, buf[2];
|
wchar_t c, buf[2];
|
||||||
|
|
||||||
|
int surrogateescape;
|
||||||
|
if (get_surrogateescape(errors, &surrogateescape) < 0) {
|
||||||
|
return -3;
|
||||||
|
}
|
||||||
|
|
||||||
/* The function works in two steps:
|
/* The function works in two steps:
|
||||||
1. compute the length of the output buffer in bytes (size)
|
1. compute the length of the output buffer in bytes (size)
|
||||||
2. outputs the bytes */
|
2. outputs the bytes */
|
||||||
|
@ -646,32 +689,50 @@ encode_error:
|
||||||
return -2;
|
return -2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Encode a string to the locale encoding.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
|
||||||
|
* raw_malloc: if non-zero, allocate memory using PyMem_RawMalloc() instead
|
||||||
|
of PyMem_Malloc().
|
||||||
|
* current_locale: if non-zero, use the current LC_CTYPE, otherwise use
|
||||||
|
Python filesystem encoding.
|
||||||
|
* errors: error handler like "strict" or "surrogateescape".
|
||||||
|
|
||||||
|
Return value:
|
||||||
|
|
||||||
|
0: success, *str is set to a newly allocated decoded string.
|
||||||
|
-1: memory allocation failure
|
||||||
|
-2: encoding error, set *error_pos and *reason (if set).
|
||||||
|
-3: the error handler 'errors' is not supported.
|
||||||
|
*/
|
||||||
static int
|
static int
|
||||||
encode_locale_ex(const wchar_t *text, char **str, size_t *error_pos,
|
encode_locale_ex(const wchar_t *text, char **str, size_t *error_pos,
|
||||||
const char **reason,
|
const char **reason,
|
||||||
int raw_malloc, int current_locale, int surrogateescape)
|
int raw_malloc, int current_locale, _Py_error_handler errors)
|
||||||
{
|
{
|
||||||
if (current_locale) {
|
if (current_locale) {
|
||||||
#ifdef __ANDROID__
|
#ifdef __ANDROID__
|
||||||
return _Py_EncodeUTF8Ex(text, str, error_pos, reason,
|
return _Py_EncodeUTF8Ex(text, str, error_pos, reason,
|
||||||
raw_malloc, surrogateescape);
|
raw_malloc, errors);
|
||||||
#else
|
#else
|
||||||
return encode_current_locale(text, str, error_pos, reason,
|
return encode_current_locale(text, str, error_pos, reason,
|
||||||
raw_malloc, surrogateescape);
|
raw_malloc, errors);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(__APPLE__) || defined(__ANDROID__)
|
#if defined(__APPLE__) || defined(__ANDROID__)
|
||||||
return _Py_EncodeUTF8Ex(text, str, error_pos, reason,
|
return _Py_EncodeUTF8Ex(text, str, error_pos, reason,
|
||||||
raw_malloc, surrogateescape);
|
raw_malloc, errors);
|
||||||
#else /* __APPLE__ */
|
#else
|
||||||
int use_utf8 = (Py_UTF8Mode == 1);
|
int use_utf8 = (Py_UTF8Mode == 1);
|
||||||
#ifdef MS_WINDOWS
|
#ifdef MS_WINDOWS
|
||||||
use_utf8 |= !Py_LegacyWindowsFSEncodingFlag;
|
use_utf8 |= !Py_LegacyWindowsFSEncodingFlag;
|
||||||
#endif
|
#endif
|
||||||
if (use_utf8) {
|
if (use_utf8) {
|
||||||
return _Py_EncodeUTF8Ex(text, str, error_pos, reason,
|
return _Py_EncodeUTF8Ex(text, str, error_pos, reason,
|
||||||
raw_malloc, surrogateescape);
|
raw_malloc, errors);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef USE_FORCE_ASCII
|
#ifdef USE_FORCE_ASCII
|
||||||
|
@ -681,12 +742,12 @@ encode_locale_ex(const wchar_t *text, char **str, size_t *error_pos,
|
||||||
|
|
||||||
if (force_ascii) {
|
if (force_ascii) {
|
||||||
return encode_ascii(text, str, error_pos, reason,
|
return encode_ascii(text, str, error_pos, reason,
|
||||||
raw_malloc, surrogateescape);
|
raw_malloc, errors);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return encode_current_locale(text, str, error_pos, reason,
|
return encode_current_locale(text, str, error_pos, reason,
|
||||||
raw_malloc, surrogateescape);
|
raw_malloc, errors);
|
||||||
#endif /* __APPLE__ or __ANDROID__ */
|
#endif /* __APPLE__ or __ANDROID__ */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -696,7 +757,8 @@ encode_locale(const wchar_t *text, size_t *error_pos,
|
||||||
{
|
{
|
||||||
char *str;
|
char *str;
|
||||||
int res = encode_locale_ex(text, &str, error_pos, NULL,
|
int res = encode_locale_ex(text, &str, error_pos, NULL,
|
||||||
raw_malloc, current_locale, 1);
|
raw_malloc, current_locale,
|
||||||
|
_Py_ERROR_SURROGATEESCAPE);
|
||||||
if (res != -2 && error_pos) {
|
if (res != -2 && error_pos) {
|
||||||
*error_pos = (size_t)-1;
|
*error_pos = (size_t)-1;
|
||||||
}
|
}
|
||||||
|
@ -737,10 +799,10 @@ _Py_EncodeLocaleRaw(const wchar_t *text, size_t *error_pos)
|
||||||
int
|
int
|
||||||
_Py_EncodeLocaleEx(const wchar_t *text, char **str,
|
_Py_EncodeLocaleEx(const wchar_t *text, char **str,
|
||||||
size_t *error_pos, const char **reason,
|
size_t *error_pos, const char **reason,
|
||||||
int current_locale, int surrogateescape)
|
int current_locale, _Py_error_handler errors)
|
||||||
{
|
{
|
||||||
return encode_locale_ex(text, str, error_pos, reason, 1,
|
return encode_locale_ex(text, str, error_pos, reason, 1,
|
||||||
current_locale, surrogateescape);
|
current_locale, errors);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue