bpo-29240: Fix locale encodings in UTF-8 Mode (#5170)
Modify locale.localeconv(), time.tzname, os.strerror() and other functions to ignore the UTF-8 Mode: always use the current locale encoding. Changes: * Add _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx(). On decoding or encoding error, they return the position of the error and an error message which are used to raise Unicode errors in PyUnicode_DecodeLocale() and PyUnicode_EncodeLocale(). * Replace _Py_DecodeCurrentLocale() with _Py_DecodeLocaleEx(). * PyUnicode_DecodeLocale() now uses _Py_DecodeLocaleEx() for all cases, especially for the strict error handler. * Add _Py_DecodeUTF8Ex(): returns more information on decoding error and supports the strict error handler. * Rename _Py_EncodeUTF8_surrogateescape() to _Py_EncodeUTF8Ex(). * Replace _Py_EncodeCurrentLocale() with _Py_EncodeLocaleEx(). * Ignore the UTF-8 mode to encode/decode localeconv(), strerror() and time zone name. * PyUnicode_DecodeLocale(), PyUnicode_DecodeLocaleAndSize() and PyUnicode_EncodeLocale() now ignore the UTF-8 mode: always use the "current" locale. * Remove _PyUnicode_DecodeCurrentLocale(), _PyUnicode_DecodeCurrentLocaleAndSize() and _PyUnicode_EncodeCurrentLocale().
This commit is contained in:
parent
ee3b83547c
commit
7ed7aead95
|
@ -106,6 +106,16 @@ Operating System Utilities
|
||||||
surrogate character, escape the bytes using the surrogateescape error
|
surrogate character, escape the bytes using the surrogateescape error
|
||||||
handler instead of decoding them.
|
handler instead of decoding them.
|
||||||
|
|
||||||
|
Encoding, highest priority to lowest priority:
|
||||||
|
|
||||||
|
* ``UTF-8`` on macOS and Android;
|
||||||
|
* ``UTF-8`` if the Python UTF-8 mode is enabled;
|
||||||
|
* ``ASCII`` if the ``LC_CTYPE`` locale is ``"C"``,
|
||||||
|
``nl_langinfo(CODESET)`` returns the ``ASCII`` encoding (or an alias),
|
||||||
|
and :c:func:`mbstowcs` and :c:func:`wcstombs` functions use the
|
||||||
|
``ISO-8859-1`` encoding.
|
||||||
|
* the current locale encoding.
|
||||||
|
|
||||||
Return a pointer to a newly allocated wide character string, use
|
Return a pointer to a newly allocated wide character string, use
|
||||||
:c:func:`PyMem_RawFree` to free the memory. If size is not ``NULL``, write
|
:c:func:`PyMem_RawFree` to free the memory. If size is not ``NULL``, write
|
||||||
the number of wide characters excluding the null character into ``*size``
|
the number of wide characters excluding the null character into ``*size``
|
||||||
|
@ -137,6 +147,18 @@ Operating System Utilities
|
||||||
:ref:`surrogateescape error handler <surrogateescape>`: surrogate characters
|
:ref:`surrogateescape error handler <surrogateescape>`: surrogate characters
|
||||||
in the range U+DC80..U+DCFF are converted to bytes 0x80..0xFF.
|
in the range U+DC80..U+DCFF are converted to bytes 0x80..0xFF.
|
||||||
|
|
||||||
|
Encoding, highest priority to lowest priority:
|
||||||
|
|
||||||
|
* ``UTF-8`` on macOS and Android;
|
||||||
|
* ``UTF-8`` if the Python UTF-8 mode is enabled;
|
||||||
|
* ``ASCII`` if the ``LC_CTYPE`` locale is ``"C"``,
|
||||||
|
``nl_langinfo(CODESET)`` returns the ``ASCII`` encoding (or an alias),
|
||||||
|
and :c:func:`mbstowcs` and :c:func:`wcstombs` functions use the
|
||||||
|
``ISO-8859-1`` encoding.
|
||||||
|
* the current locale encoding.
|
||||||
|
|
||||||
|
The function uses the UTF-8 encoding in the Python UTF-8 mode.
|
||||||
|
|
||||||
Return a pointer to a newly allocated byte string, use :c:func:`PyMem_Free`
|
Return a pointer to a newly allocated byte string, use :c:func:`PyMem_Free`
|
||||||
to free the memory. Return ``NULL`` on encoding error or memory allocation
|
to free the memory. Return ``NULL`` on encoding error or memory allocation
|
||||||
error
|
error
|
||||||
|
|
|
@ -770,12 +770,20 @@ system.
|
||||||
:c:data:`Py_FileSystemDefaultEncoding` (the locale encoding read at
|
:c:data:`Py_FileSystemDefaultEncoding` (the locale encoding read at
|
||||||
Python startup).
|
Python startup).
|
||||||
|
|
||||||
|
This function ignores the Python UTF-8 mode.
|
||||||
|
|
||||||
.. seealso::
|
.. seealso::
|
||||||
|
|
||||||
The :c:func:`Py_DecodeLocale` function.
|
The :c:func:`Py_DecodeLocale` function.
|
||||||
|
|
||||||
.. versionadded:: 3.3
|
.. versionadded:: 3.3
|
||||||
|
|
||||||
|
.. versionchanged:: 3.7
|
||||||
|
The function now also uses the current locale encoding for the
|
||||||
|
``surrogateescape`` error handler. Previously, :c:func:`Py_DecodeLocale`
|
||||||
|
was used for the ``surrogateescape``, and the current locale encoding was
|
||||||
|
used for ``strict``.
|
||||||
|
|
||||||
|
|
||||||
.. c:function:: PyObject* PyUnicode_DecodeLocale(const char *str, const char *errors)
|
.. c:function:: PyObject* PyUnicode_DecodeLocale(const char *str, const char *errors)
|
||||||
|
|
||||||
|
@ -797,12 +805,20 @@ system.
|
||||||
:c:data:`Py_FileSystemDefaultEncoding` (the locale encoding read at
|
:c:data:`Py_FileSystemDefaultEncoding` (the locale encoding read at
|
||||||
Python startup).
|
Python startup).
|
||||||
|
|
||||||
|
This function ignores the Python UTF-8 mode.
|
||||||
|
|
||||||
.. seealso::
|
.. seealso::
|
||||||
|
|
||||||
The :c:func:`Py_EncodeLocale` function.
|
The :c:func:`Py_EncodeLocale` function.
|
||||||
|
|
||||||
.. versionadded:: 3.3
|
.. versionadded:: 3.3
|
||||||
|
|
||||||
|
.. versionchanged:: 3.7
|
||||||
|
The function now also uses the current locale encoding for the
|
||||||
|
``surrogateescape`` error handler. Previously, :c:func:`Py_EncodeLocale`
|
||||||
|
was used for the ``surrogateescape``, and the current locale encoding was
|
||||||
|
used for ``strict``.
|
||||||
|
|
||||||
|
|
||||||
File System Encoding
|
File System Encoding
|
||||||
""""""""""""""""""""
|
""""""""""""""""""""
|
||||||
|
|
|
@ -20,18 +20,41 @@ PyAPI_FUNC(char*) _Py_EncodeLocaleRaw(
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef Py_BUILD_CORE
|
#ifdef Py_BUILD_CORE
|
||||||
PyAPI_FUNC(wchar_t*) _Py_DecodeUTF8_surrogateescape(
|
PyAPI_FUNC(int) _Py_DecodeUTF8Ex(
|
||||||
const char *s,
|
|
||||||
Py_ssize_t size,
|
|
||||||
size_t *p_wlen);
|
|
||||||
|
|
||||||
PyAPI_FUNC(wchar_t *) _Py_DecodeCurrentLocale(
|
|
||||||
const char *arg,
|
const char *arg,
|
||||||
size_t *size);
|
Py_ssize_t arglen,
|
||||||
|
wchar_t **wstr,
|
||||||
|
size_t *wlen,
|
||||||
|
const char **reason,
|
||||||
|
int surrogateescape);
|
||||||
|
|
||||||
PyAPI_FUNC(char*) _Py_EncodeCurrentLocale(
|
PyAPI_FUNC(int) _Py_EncodeUTF8Ex(
|
||||||
const wchar_t *text,
|
const wchar_t *text,
|
||||||
size_t *error_pos);
|
char **str,
|
||||||
|
size_t *error_pos,
|
||||||
|
const char **reason,
|
||||||
|
int raw_malloc,
|
||||||
|
int surrogateescape);
|
||||||
|
|
||||||
|
PyAPI_FUNC(wchar_t*) _Py_DecodeUTF8_surrogateescape(
|
||||||
|
const char *arg,
|
||||||
|
Py_ssize_t arglen);
|
||||||
|
|
||||||
|
PyAPI_FUNC(int) _Py_DecodeLocaleEx(
|
||||||
|
const char *arg,
|
||||||
|
wchar_t **wstr,
|
||||||
|
size_t *wlen,
|
||||||
|
const char **reason,
|
||||||
|
int current_locale,
|
||||||
|
int surrogateescape);
|
||||||
|
|
||||||
|
PyAPI_FUNC(int) _Py_EncodeLocaleEx(
|
||||||
|
const wchar_t *text,
|
||||||
|
char **str,
|
||||||
|
size_t *error_pos,
|
||||||
|
const char **reason,
|
||||||
|
int current_locale,
|
||||||
|
int surrogateescape);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef Py_LIMITED_API
|
#ifndef Py_LIMITED_API
|
||||||
|
|
|
@ -1810,20 +1810,6 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
|
||||||
PyObject *unicode,
|
PyObject *unicode,
|
||||||
const char *errors
|
const char *errors
|
||||||
);
|
);
|
||||||
|
|
||||||
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeCurrentLocale(
|
|
||||||
const char *str,
|
|
||||||
const char *errors);
|
|
||||||
|
|
||||||
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeCurrentLocaleAndSize(
|
|
||||||
const char *str,
|
|
||||||
Py_ssize_t len,
|
|
||||||
const char *errors);
|
|
||||||
|
|
||||||
PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCurrentLocale(
|
|
||||||
PyObject *unicode,
|
|
||||||
const char *errors
|
|
||||||
);
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* --- File system encoding ---------------------------------------------- */
|
/* --- File system encoding ---------------------------------------------- */
|
||||||
|
|
|
@ -696,7 +696,7 @@ static int parse_isoformat_date(const char *dtstr,
|
||||||
if (NULL == p) {
|
if (NULL == p) {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (*(p++) != '-') {
|
if (*(p++) != '-') {
|
||||||
return -2;
|
return -2;
|
||||||
}
|
}
|
||||||
|
|
|
@ -572,8 +572,9 @@ PyIntl_bind_textdomain_codeset(PyObject* self,PyObject*args)
|
||||||
if (!PyArg_ParseTuple(args, "sz", &domain, &codeset))
|
if (!PyArg_ParseTuple(args, "sz", &domain, &codeset))
|
||||||
return NULL;
|
return NULL;
|
||||||
codeset = bind_textdomain_codeset(domain, codeset);
|
codeset = bind_textdomain_codeset(domain, codeset);
|
||||||
if (codeset)
|
if (codeset) {
|
||||||
return PyUnicode_DecodeLocale(codeset, NULL);
|
return PyUnicode_DecodeLocale(codeset, NULL);
|
||||||
|
}
|
||||||
Py_RETURN_NONE;
|
Py_RETURN_NONE;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -449,8 +449,8 @@ search_for_exec_prefix(const _PyCoreConfig *core_config,
|
||||||
n = fread(buf, 1, MAXPATHLEN, f);
|
n = fread(buf, 1, MAXPATHLEN, f);
|
||||||
buf[n] = '\0';
|
buf[n] = '\0';
|
||||||
fclose(f);
|
fclose(f);
|
||||||
rel_builddir_path = _Py_DecodeUTF8_surrogateescape(buf, n, NULL);
|
rel_builddir_path = _Py_DecodeUTF8_surrogateescape(buf, n);
|
||||||
if (rel_builddir_path != NULL) {
|
if (rel_builddir_path) {
|
||||||
wcsncpy(exec_prefix, calculate->argv0_path, MAXPATHLEN);
|
wcsncpy(exec_prefix, calculate->argv0_path, MAXPATHLEN);
|
||||||
exec_prefix[MAXPATHLEN] = L'\0';
|
exec_prefix[MAXPATHLEN] = L'\0';
|
||||||
joinpath(exec_prefix, rel_builddir_path);
|
joinpath(exec_prefix, rel_builddir_path);
|
||||||
|
|
|
@ -132,13 +132,13 @@ static PyModuleDef readlinemodule;
|
||||||
static PyObject *
|
static PyObject *
|
||||||
encode(PyObject *b)
|
encode(PyObject *b)
|
||||||
{
|
{
|
||||||
return _PyUnicode_EncodeCurrentLocale(b, "surrogateescape");
|
return PyUnicode_EncodeLocale(b, "surrogateescape");
|
||||||
}
|
}
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
decode(const char *s)
|
decode(const char *s)
|
||||||
{
|
{
|
||||||
return _PyUnicode_DecodeCurrentLocale(s, "surrogateescape");
|
return PyUnicode_DecodeLocale(s, "surrogateescape");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -418,11 +418,11 @@ tmtotuple(struct tm *p
|
||||||
SET(8, p->tm_isdst);
|
SET(8, p->tm_isdst);
|
||||||
#ifdef HAVE_STRUCT_TM_TM_ZONE
|
#ifdef HAVE_STRUCT_TM_TM_ZONE
|
||||||
PyStructSequence_SET_ITEM(v, 9,
|
PyStructSequence_SET_ITEM(v, 9,
|
||||||
_PyUnicode_DecodeCurrentLocale(p->tm_zone, "surrogateescape"));
|
PyUnicode_DecodeLocale(p->tm_zone, "surrogateescape"));
|
||||||
SET(10, p->tm_gmtoff);
|
SET(10, p->tm_gmtoff);
|
||||||
#else
|
#else
|
||||||
PyStructSequence_SET_ITEM(v, 9,
|
PyStructSequence_SET_ITEM(v, 9,
|
||||||
_PyUnicode_DecodeCurrentLocale(zone, "surrogateescape"));
|
PyUnicode_DecodeLocale(zone, "surrogateescape"));
|
||||||
PyStructSequence_SET_ITEM(v, 10, _PyLong_FromTime_t(gmtoff));
|
PyStructSequence_SET_ITEM(v, 10, _PyLong_FromTime_t(gmtoff));
|
||||||
#endif /* HAVE_STRUCT_TM_TM_ZONE */
|
#endif /* HAVE_STRUCT_TM_TM_ZONE */
|
||||||
#undef SET
|
#undef SET
|
||||||
|
@ -809,8 +809,7 @@ time_strftime(PyObject *self, PyObject *args)
|
||||||
#ifdef HAVE_WCSFTIME
|
#ifdef HAVE_WCSFTIME
|
||||||
ret = PyUnicode_FromWideChar(outbuf, buflen);
|
ret = PyUnicode_FromWideChar(outbuf, buflen);
|
||||||
#else
|
#else
|
||||||
ret = _PyUnicode_DecodeCurrentLocaleAndSize(outbuf, buflen,
|
ret = PyUnicode_DecodeLocaleAndSize(outbuf, buflen, "surrogateescape");
|
||||||
"surrogateescape");
|
|
||||||
#endif
|
#endif
|
||||||
PyMem_Free(outbuf);
|
PyMem_Free(outbuf);
|
||||||
break;
|
break;
|
||||||
|
@ -1541,8 +1540,8 @@ PyInit_timezone(PyObject *m) {
|
||||||
PyModule_AddIntConstant(m, "altzone", timezone-3600);
|
PyModule_AddIntConstant(m, "altzone", timezone-3600);
|
||||||
#endif
|
#endif
|
||||||
PyModule_AddIntConstant(m, "daylight", daylight);
|
PyModule_AddIntConstant(m, "daylight", daylight);
|
||||||
otz0 = _PyUnicode_DecodeCurrentLocale(tzname[0], "surrogateescape");
|
otz0 = PyUnicode_DecodeLocale(tzname[0], "surrogateescape");
|
||||||
otz1 = _PyUnicode_DecodeCurrentLocale(tzname[1], "surrogateescape");
|
otz1 = PyUnicode_DecodeLocale(tzname[1], "surrogateescape");
|
||||||
PyModule_AddObject(m, "tzname", Py_BuildValue("(NN)", otz0, otz1));
|
PyModule_AddObject(m, "tzname", Py_BuildValue("(NN)", otz0, otz1));
|
||||||
#else /* !HAVE_TZNAME || __GLIBC__ || __CYGWIN__*/
|
#else /* !HAVE_TZNAME || __GLIBC__ || __CYGWIN__*/
|
||||||
{
|
{
|
||||||
|
|
|
@ -3327,53 +3327,6 @@ PyUnicode_AsEncodedObject(PyObject *unicode,
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
static size_t
|
|
||||||
wcstombs_errorpos(const wchar_t *wstr)
|
|
||||||
{
|
|
||||||
size_t len;
|
|
||||||
#if SIZEOF_WCHAR_T == 2
|
|
||||||
wchar_t buf[3];
|
|
||||||
#else
|
|
||||||
wchar_t buf[2];
|
|
||||||
#endif
|
|
||||||
char outbuf[MB_LEN_MAX];
|
|
||||||
const wchar_t *start, *previous;
|
|
||||||
|
|
||||||
#if SIZEOF_WCHAR_T == 2
|
|
||||||
buf[2] = 0;
|
|
||||||
#else
|
|
||||||
buf[1] = 0;
|
|
||||||
#endif
|
|
||||||
start = wstr;
|
|
||||||
while (*wstr != L'\0')
|
|
||||||
{
|
|
||||||
previous = wstr;
|
|
||||||
#if SIZEOF_WCHAR_T == 2
|
|
||||||
if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[0])
|
|
||||||
&& Py_UNICODE_IS_LOW_SURROGATE(wstr[1]))
|
|
||||||
{
|
|
||||||
buf[0] = wstr[0];
|
|
||||||
buf[1] = wstr[1];
|
|
||||||
wstr += 2;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
buf[0] = *wstr;
|
|
||||||
buf[1] = 0;
|
|
||||||
wstr++;
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
buf[0] = *wstr;
|
|
||||||
wstr++;
|
|
||||||
#endif
|
|
||||||
len = wcstombs(outbuf, buf, sizeof(outbuf));
|
|
||||||
if (len == (size_t)-1)
|
|
||||||
return previous - start;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* failed to find the unencodable character */
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int
|
static int
|
||||||
locale_error_handler(const char *errors, int *surrogateescape)
|
locale_error_handler(const char *errors, int *surrogateescape)
|
||||||
{
|
{
|
||||||
|
@ -3396,130 +3349,60 @@ locale_error_handler(const char *errors, int *surrogateescape)
|
||||||
}
|
}
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
unicode_encode_locale(PyObject *unicode, const char *errors, int current_locale)
|
unicode_encode_locale(PyObject *unicode, const char *errors,
|
||||||
|
int current_locale)
|
||||||
{
|
{
|
||||||
Py_ssize_t wlen, wlen2;
|
|
||||||
wchar_t *wstr;
|
|
||||||
char *errmsg;
|
|
||||||
PyObject *bytes, *reason, *exc;
|
|
||||||
size_t error_pos, errlen;
|
|
||||||
int surrogateescape;
|
int surrogateescape;
|
||||||
|
|
||||||
if (locale_error_handler(errors, &surrogateescape) < 0)
|
if (locale_error_handler(errors, &surrogateescape) < 0)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
wstr = PyUnicode_AsWideCharString(unicode, &wlen);
|
Py_ssize_t wlen;
|
||||||
if (wstr == NULL)
|
wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
|
||||||
|
if (wstr == NULL) {
|
||||||
return NULL;
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
wlen2 = wcslen(wstr);
|
Py_ssize_t wlen2 = wcslen(wstr);
|
||||||
if (wlen2 != wlen) {
|
if (wlen2 != wlen) {
|
||||||
PyMem_Free(wstr);
|
PyMem_Free(wstr);
|
||||||
PyErr_SetString(PyExc_ValueError, "embedded null character");
|
PyErr_SetString(PyExc_ValueError, "embedded null character");
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (surrogateescape) {
|
char *str;
|
||||||
/* "surrogateescape" error handler */
|
size_t error_pos;
|
||||||
char *str;
|
const char *reason;
|
||||||
|
int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
|
||||||
if (current_locale) {
|
current_locale, surrogateescape);
|
||||||
str = _Py_EncodeCurrentLocale(wstr, &error_pos);
|
if (res != 0) {
|
||||||
|
if (res == -2) {
|
||||||
|
PyObject *exc;
|
||||||
|
exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
|
||||||
|
"locale", unicode,
|
||||||
|
(Py_ssize_t)error_pos,
|
||||||
|
(Py_ssize_t)(error_pos+1),
|
||||||
|
reason);
|
||||||
|
if (exc != NULL) {
|
||||||
|
PyCodec_StrictErrors(exc);
|
||||||
|
Py_DECREF(exc);
|
||||||
|
}
|
||||||
|
return NULL;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
str = Py_EncodeLocale(wstr, &error_pos);
|
PyErr_NoMemory();
|
||||||
}
|
|
||||||
if (str == NULL) {
|
|
||||||
if (error_pos == (size_t)-1) {
|
|
||||||
PyErr_NoMemory();
|
|
||||||
PyMem_Free(wstr);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
goto encode_error;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
PyMem_Free(wstr);
|
|
||||||
|
|
||||||
bytes = PyBytes_FromString(str);
|
|
||||||
if (current_locale) {
|
|
||||||
PyMem_RawFree(str);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
PyMem_Free(str);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
/* strict mode */
|
|
||||||
size_t len, len2;
|
|
||||||
|
|
||||||
len = wcstombs(NULL, wstr, 0);
|
|
||||||
if (len == (size_t)-1) {
|
|
||||||
error_pos = (size_t)-1;
|
|
||||||
goto encode_error;
|
|
||||||
}
|
|
||||||
|
|
||||||
bytes = PyBytes_FromStringAndSize(NULL, len);
|
|
||||||
if (bytes == NULL) {
|
|
||||||
PyMem_Free(wstr);
|
PyMem_Free(wstr);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
|
|
||||||
if (len2 == (size_t)-1 || len2 > len) {
|
|
||||||
Py_DECREF(bytes);
|
|
||||||
error_pos = (size_t)-1;
|
|
||||||
goto encode_error;
|
|
||||||
}
|
|
||||||
PyMem_Free(wstr);
|
|
||||||
}
|
}
|
||||||
return bytes;
|
|
||||||
|
|
||||||
encode_error:
|
|
||||||
errmsg = strerror(errno);
|
|
||||||
assert(errmsg != NULL);
|
|
||||||
|
|
||||||
if (error_pos == (size_t)-1)
|
|
||||||
error_pos = wcstombs_errorpos(wstr);
|
|
||||||
|
|
||||||
PyMem_Free(wstr);
|
PyMem_Free(wstr);
|
||||||
|
|
||||||
wstr = Py_DecodeLocale(errmsg, &errlen);
|
PyObject *bytes = PyBytes_FromString(str);
|
||||||
if (wstr != NULL) {
|
PyMem_RawFree(str);
|
||||||
reason = PyUnicode_FromWideChar(wstr, errlen);
|
return bytes;
|
||||||
PyMem_RawFree(wstr);
|
|
||||||
} else {
|
|
||||||
errmsg = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (errmsg == NULL)
|
|
||||||
reason = PyUnicode_FromString(
|
|
||||||
"wcstombs() encountered an unencodable "
|
|
||||||
"wide character");
|
|
||||||
if (reason == NULL)
|
|
||||||
return NULL;
|
|
||||||
|
|
||||||
exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
|
|
||||||
"locale", unicode,
|
|
||||||
(Py_ssize_t)error_pos,
|
|
||||||
(Py_ssize_t)(error_pos+1),
|
|
||||||
reason);
|
|
||||||
Py_DECREF(reason);
|
|
||||||
if (exc != NULL) {
|
|
||||||
PyCodec_StrictErrors(exc);
|
|
||||||
Py_DECREF(exc);
|
|
||||||
}
|
|
||||||
return NULL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
PyObject *
|
PyObject *
|
||||||
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
|
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
|
||||||
{
|
|
||||||
return unicode_encode_locale(unicode, errors, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
PyObject *
|
|
||||||
_PyUnicode_EncodeCurrentLocale(PyObject *unicode, const char *errors)
|
|
||||||
{
|
{
|
||||||
return unicode_encode_locale(unicode, errors, 1);
|
return unicode_encode_locale(unicode, errors, 1);
|
||||||
}
|
}
|
||||||
|
@ -3687,51 +3570,11 @@ PyUnicode_AsEncodedUnicode(PyObject *unicode,
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
static size_t
|
|
||||||
mbstowcs_errorpos(const char *str, size_t len)
|
|
||||||
{
|
|
||||||
#ifdef HAVE_MBRTOWC
|
|
||||||
const char *start = str;
|
|
||||||
mbstate_t mbs;
|
|
||||||
size_t converted;
|
|
||||||
wchar_t ch;
|
|
||||||
|
|
||||||
memset(&mbs, 0, sizeof mbs);
|
|
||||||
while (len)
|
|
||||||
{
|
|
||||||
converted = mbrtowc(&ch, str, len, &mbs);
|
|
||||||
if (converted == 0)
|
|
||||||
/* Reached end of string */
|
|
||||||
break;
|
|
||||||
if (converted == (size_t)-1 || converted == (size_t)-2) {
|
|
||||||
/* Conversion error or incomplete character */
|
|
||||||
return str - start;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
str += converted;
|
|
||||||
len -= converted;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
/* failed to find the undecodable byte sequence */
|
|
||||||
return 0;
|
|
||||||
#endif
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static PyObject*
|
static PyObject*
|
||||||
unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
|
unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
|
||||||
int current_locale)
|
int current_locale)
|
||||||
{
|
{
|
||||||
wchar_t smallbuf[256];
|
|
||||||
size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
|
|
||||||
wchar_t *wstr;
|
|
||||||
size_t wlen, wlen2;
|
|
||||||
PyObject *unicode;
|
|
||||||
int surrogateescape;
|
int surrogateescape;
|
||||||
size_t error_pos, errlen;
|
|
||||||
char *errmsg;
|
|
||||||
PyObject *exc, *reason = NULL; /* initialize to prevent gcc warning */
|
|
||||||
|
|
||||||
if (locale_error_handler(errors, &surrogateescape) < 0)
|
if (locale_error_handler(errors, &surrogateescape) < 0)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
|
@ -3740,113 +3583,47 @@ unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (surrogateescape) {
|
wchar_t *wstr;
|
||||||
/* "surrogateescape" error handler */
|
size_t wlen;
|
||||||
if (current_locale) {
|
const char *reason;
|
||||||
wstr = _Py_DecodeCurrentLocale(str, &wlen);
|
int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
|
||||||
|
current_locale, surrogateescape);
|
||||||
|
if (res != 0) {
|
||||||
|
if (res == -2) {
|
||||||
|
PyObject *exc;
|
||||||
|
exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
|
||||||
|
"locale", str, len,
|
||||||
|
(Py_ssize_t)wlen,
|
||||||
|
(Py_ssize_t)(wlen + 1),
|
||||||
|
reason);
|
||||||
|
if (exc != NULL) {
|
||||||
|
PyCodec_StrictErrors(exc);
|
||||||
|
Py_DECREF(exc);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
wstr = Py_DecodeLocale(str, &wlen);
|
PyErr_NoMemory();
|
||||||
}
|
}
|
||||||
if (wstr == NULL) {
|
|
||||||
if (wlen == (size_t)-1)
|
|
||||||
PyErr_NoMemory();
|
|
||||||
else
|
|
||||||
PyErr_SetFromErrno(PyExc_OSError);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
unicode = PyUnicode_FromWideChar(wstr, wlen);
|
|
||||||
PyMem_RawFree(wstr);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
/* strict mode */
|
|
||||||
#ifndef HAVE_BROKEN_MBSTOWCS
|
|
||||||
wlen = mbstowcs(NULL, str, 0);
|
|
||||||
#else
|
|
||||||
wlen = len;
|
|
||||||
#endif
|
|
||||||
if (wlen == (size_t)-1)
|
|
||||||
goto decode_error;
|
|
||||||
if (wlen+1 <= smallbuf_len) {
|
|
||||||
wstr = smallbuf;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
wstr = PyMem_New(wchar_t, wlen+1);
|
|
||||||
if (!wstr)
|
|
||||||
return PyErr_NoMemory();
|
|
||||||
}
|
|
||||||
|
|
||||||
wlen2 = mbstowcs(wstr, str, wlen+1);
|
|
||||||
if (wlen2 == (size_t)-1) {
|
|
||||||
if (wstr != smallbuf)
|
|
||||||
PyMem_Free(wstr);
|
|
||||||
goto decode_error;
|
|
||||||
}
|
|
||||||
#ifdef HAVE_BROKEN_MBSTOWCS
|
|
||||||
assert(wlen2 == wlen);
|
|
||||||
#endif
|
|
||||||
unicode = PyUnicode_FromWideChar(wstr, wlen2);
|
|
||||||
if (wstr != smallbuf)
|
|
||||||
PyMem_Free(wstr);
|
|
||||||
}
|
|
||||||
return unicode;
|
|
||||||
|
|
||||||
decode_error:
|
|
||||||
errmsg = strerror(errno);
|
|
||||||
assert(errmsg != NULL);
|
|
||||||
|
|
||||||
error_pos = mbstowcs_errorpos(str, len);
|
|
||||||
wstr = Py_DecodeLocale(errmsg, &errlen);
|
|
||||||
if (wstr != NULL) {
|
|
||||||
reason = PyUnicode_FromWideChar(wstr, errlen);
|
|
||||||
PyMem_RawFree(wstr);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (reason == NULL)
|
|
||||||
reason = PyUnicode_FromString(
|
|
||||||
"mbstowcs() encountered an invalid multibyte sequence");
|
|
||||||
if (reason == NULL)
|
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
|
|
||||||
"locale", str, len,
|
|
||||||
(Py_ssize_t)error_pos,
|
|
||||||
(Py_ssize_t)(error_pos+1),
|
|
||||||
reason);
|
|
||||||
Py_DECREF(reason);
|
|
||||||
if (exc != NULL) {
|
|
||||||
PyCodec_StrictErrors(exc);
|
|
||||||
Py_DECREF(exc);
|
|
||||||
}
|
}
|
||||||
return NULL;
|
|
||||||
|
PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
|
||||||
|
PyMem_RawFree(wstr);
|
||||||
|
return unicode;
|
||||||
}
|
}
|
||||||
|
|
||||||
PyObject*
|
PyObject*
|
||||||
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
|
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
|
||||||
const char *errors)
|
const char *errors)
|
||||||
{
|
|
||||||
return unicode_decode_locale(str, len, errors, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
PyObject*
|
|
||||||
_PyUnicode_DecodeCurrentLocaleAndSize(const char *str, Py_ssize_t len,
|
|
||||||
const char *errors)
|
|
||||||
{
|
{
|
||||||
return unicode_decode_locale(str, len, errors, 1);
|
return unicode_decode_locale(str, len, errors, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
PyObject*
|
|
||||||
_PyUnicode_DecodeCurrentLocale(const char *str, const char *errors)
|
|
||||||
{
|
|
||||||
return unicode_decode_locale(str, (Py_ssize_t)strlen(str), errors, 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
PyObject*
|
PyObject*
|
||||||
PyUnicode_DecodeLocale(const char *str, const char *errors)
|
PyUnicode_DecodeLocale(const char *str, const char *errors)
|
||||||
{
|
{
|
||||||
Py_ssize_t size = (Py_ssize_t)strlen(str);
|
Py_ssize_t size = (Py_ssize_t)strlen(str);
|
||||||
return unicode_decode_locale(str, size, errors, 0);
|
return unicode_decode_locale(str, size, errors, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -3878,7 +3655,8 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
|
||||||
Py_FileSystemDefaultEncodeErrors);
|
Py_FileSystemDefaultEncodeErrors);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
|
return unicode_decode_locale(s, size,
|
||||||
|
Py_FileSystemDefaultEncodeErrors, 0);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
@ -5128,17 +4906,23 @@ onError:
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* UTF-8 decoder using the surrogateescape error handler .
|
/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
|
||||||
|
non-zero, use strict error handler otherwise.
|
||||||
|
|
||||||
On success, return a pointer to a newly allocated wide character string (use
|
On success, write a pointer to a newly allocated wide character string into
|
||||||
PyMem_RawFree() to free the memory) and write the output length (in number
|
*wstr (use PyMem_RawFree() to free the memory) and write the output length
|
||||||
of wchar_t units) into *p_wlen (if p_wlen is set).
|
(in number of wchar_t units) into *wlen (if wlen is set).
|
||||||
|
|
||||||
On memory allocation failure, return -1 and write (size_t)-1 into *p_wlen
|
On memory allocation failure, return -1.
|
||||||
(if p_wlen is set). */
|
|
||||||
wchar_t*
|
On decoding error (if surrogateescape is zero), return -2. If wlen is
|
||||||
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
|
non-NULL, write the start of the illegal byte sequence into *wlen. If reason
|
||||||
|
is not NULL, write the decoding error message into *reason. */
|
||||||
|
int
|
||||||
|
_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
|
||||||
|
const char **reason, int surrogateescape)
|
||||||
{
|
{
|
||||||
|
const char *orig_s = s;
|
||||||
const char *e;
|
const char *e;
|
||||||
wchar_t *unicode;
|
wchar_t *unicode;
|
||||||
Py_ssize_t outpos;
|
Py_ssize_t outpos;
|
||||||
|
@ -5146,18 +4930,12 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
|
||||||
/* Note: size will always be longer than the resulting Unicode
|
/* Note: size will always be longer than the resulting Unicode
|
||||||
character count */
|
character count */
|
||||||
if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
|
if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
|
||||||
if (p_wlen) {
|
return -1;
|
||||||
*p_wlen = (size_t)-1;
|
|
||||||
}
|
|
||||||
return NULL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
|
unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
|
||||||
if (!unicode) {
|
if (!unicode) {
|
||||||
if (p_wlen) {
|
return -1;
|
||||||
*p_wlen = (size_t)-1;
|
|
||||||
}
|
|
||||||
return NULL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Unpack UTF-8 encoded data */
|
/* Unpack UTF-8 encoded data */
|
||||||
|
@ -5175,7 +4953,7 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
|
||||||
Py_UNREACHABLE();
|
Py_UNREACHABLE();
|
||||||
#else
|
#else
|
||||||
assert(ch > 0xFFFF && ch <= MAX_UNICODE);
|
assert(ch > 0xFFFF && ch <= MAX_UNICODE);
|
||||||
/* compute and append the two surrogates: */
|
/* write a surrogate pair */
|
||||||
unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
|
unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
|
||||||
unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
|
unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
|
||||||
#endif
|
#endif
|
||||||
|
@ -5183,60 +4961,88 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
|
||||||
else {
|
else {
|
||||||
if (!ch && s == e)
|
if (!ch && s == e)
|
||||||
break;
|
break;
|
||||||
|
if (!surrogateescape) {
|
||||||
|
PyMem_RawFree(unicode );
|
||||||
|
if (reason != NULL) {
|
||||||
|
switch (ch) {
|
||||||
|
case 0:
|
||||||
|
*reason = "unexpected end of data";
|
||||||
|
break;
|
||||||
|
case 1:
|
||||||
|
*reason = "invalid start byte";
|
||||||
|
break;
|
||||||
|
/* 2, 3, 4 */
|
||||||
|
default:
|
||||||
|
*reason = "invalid continuation byte";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (wlen != NULL) {
|
||||||
|
*wlen = s - orig_s;
|
||||||
|
}
|
||||||
|
return -2;
|
||||||
|
}
|
||||||
/* surrogateescape */
|
/* surrogateescape */
|
||||||
unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
|
unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
unicode[outpos] = L'\0';
|
unicode[outpos] = L'\0';
|
||||||
if (p_wlen) {
|
if (wlen) {
|
||||||
*p_wlen = outpos;
|
*wlen = outpos;
|
||||||
}
|
}
|
||||||
return unicode;
|
*wstr = unicode;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
wchar_t*
|
||||||
|
_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen)
|
||||||
|
{
|
||||||
|
wchar_t *wstr;
|
||||||
|
int res = _Py_DecodeUTF8Ex(arg, arglen, &wstr, NULL, NULL, 1);
|
||||||
|
if (res != 0) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
return wstr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* UTF-8 encoder using the surrogateescape error handler .
|
/* UTF-8 encoder using the surrogateescape error handler .
|
||||||
|
|
||||||
On success, return a pointer to a newly allocated character string (use
|
On success, return 0 and write the newly allocated character string (use
|
||||||
PyMem_Free() to free the memory).
|
PyMem_Free() to free the memory) into *str.
|
||||||
|
|
||||||
On encoding failure, return NULL and write the position of the invalid
|
On encoding failure, return -2 and write the position of the invalid
|
||||||
surrogate character into *error_pos (if error_pos is set).
|
surrogate character into *error_pos (if error_pos is set) and the encoding
|
||||||
|
error message into *reason (if reason is set).
|
||||||
|
|
||||||
On memory allocation failure, return NULL and write (size_t)-1 into
|
On memory allocation failure, return -1. */
|
||||||
*error_pos (if error_pos is set). */
|
int
|
||||||
char*
|
_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
|
||||||
_Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos,
|
const char **reason, int raw_malloc, int surrogateescape)
|
||||||
int raw_malloc)
|
|
||||||
{
|
{
|
||||||
const Py_ssize_t max_char_size = 4;
|
const Py_ssize_t max_char_size = 4;
|
||||||
Py_ssize_t len = wcslen(text);
|
Py_ssize_t len = wcslen(text);
|
||||||
|
|
||||||
assert(len >= 0);
|
assert(len >= 0);
|
||||||
|
|
||||||
|
if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
char *bytes;
|
char *bytes;
|
||||||
if (len <= PY_SSIZE_T_MAX / max_char_size - 1) {
|
if (raw_malloc) {
|
||||||
if (raw_malloc) {
|
bytes = PyMem_RawMalloc((len + 1) * max_char_size);
|
||||||
bytes = PyMem_RawMalloc((len + 1) * max_char_size);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
bytes = PyMem_Malloc((len + 1) * max_char_size);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
bytes = NULL;
|
bytes = PyMem_Malloc((len + 1) * max_char_size);
|
||||||
}
|
}
|
||||||
if (bytes == NULL) {
|
if (bytes == NULL) {
|
||||||
if (error_pos != NULL) {
|
return -1;
|
||||||
*error_pos = (size_t)-1;
|
|
||||||
}
|
|
||||||
return NULL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
char *p = bytes;
|
char *p = bytes;
|
||||||
Py_ssize_t i;
|
Py_ssize_t i;
|
||||||
for (i = 0; i < len;) {
|
for (i = 0; i < len; i++) {
|
||||||
Py_UCS4 ch = text[i++];
|
Py_UCS4 ch = text[i];
|
||||||
|
|
||||||
if (ch < 0x80) {
|
if (ch < 0x80) {
|
||||||
/* Encode ASCII */
|
/* Encode ASCII */
|
||||||
|
@ -5250,11 +5056,20 @@ _Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos,
|
||||||
}
|
}
|
||||||
else if (Py_UNICODE_IS_SURROGATE(ch)) {
|
else if (Py_UNICODE_IS_SURROGATE(ch)) {
|
||||||
/* surrogateescape error handler */
|
/* surrogateescape error handler */
|
||||||
if (!(0xDC80 <= ch && ch <= 0xDCFF)) {
|
if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
|
||||||
if (error_pos != NULL) {
|
if (error_pos != NULL) {
|
||||||
*error_pos = (size_t)i - 1;
|
*error_pos = (size_t)i;
|
||||||
}
|
}
|
||||||
goto error;
|
if (reason != NULL) {
|
||||||
|
*reason = "encoding error";
|
||||||
|
}
|
||||||
|
if (raw_malloc) {
|
||||||
|
PyMem_RawFree(bytes);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
PyMem_Free(bytes);
|
||||||
|
}
|
||||||
|
return -2;
|
||||||
}
|
}
|
||||||
*p++ = (char)(ch & 0xff);
|
*p++ = (char)(ch & 0xff);
|
||||||
}
|
}
|
||||||
|
@ -5286,18 +5101,16 @@ _Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos,
|
||||||
if (error_pos != NULL) {
|
if (error_pos != NULL) {
|
||||||
*error_pos = (size_t)-1;
|
*error_pos = (size_t)-1;
|
||||||
}
|
}
|
||||||
goto error;
|
if (raw_malloc) {
|
||||||
|
PyMem_RawFree(bytes);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
PyMem_Free(bytes);
|
||||||
|
}
|
||||||
|
return -1;
|
||||||
}
|
}
|
||||||
return bytes2;
|
*str = bytes2;
|
||||||
|
return 0;
|
||||||
error:
|
|
||||||
if (raw_malloc) {
|
|
||||||
PyMem_RawFree(bytes);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
PyMem_Free(bytes);
|
|
||||||
}
|
|
||||||
return NULL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -20,9 +20,6 @@ extern int winerror_to_errno(int);
|
||||||
#include <fcntl.h>
|
#include <fcntl.h>
|
||||||
#endif /* HAVE_FCNTL_H */
|
#endif /* HAVE_FCNTL_H */
|
||||||
|
|
||||||
extern char* _Py_EncodeUTF8_surrogateescape(const wchar_t *text,
|
|
||||||
size_t *error_pos, int raw_malloc);
|
|
||||||
|
|
||||||
#ifdef O_CLOEXEC
|
#ifdef O_CLOEXEC
|
||||||
/* Does open() support the O_CLOEXEC flag? Possible values:
|
/* Does open() support the O_CLOEXEC flag? Possible values:
|
||||||
|
|
||||||
|
@ -69,7 +66,10 @@ _Py_device_encoding(int fd)
|
||||||
Py_RETURN_NONE;
|
Py_RETURN_NONE;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if !defined(__APPLE__) && !defined(MS_WINDOWS)
|
#if !defined(__APPLE__) && !defined(__ANDROID__) && !defined(MS_WINDOWS)
|
||||||
|
|
||||||
|
#define USE_FORCE_ASCII
|
||||||
|
|
||||||
extern int _Py_normalize_encoding(const char *, char *, size_t);
|
extern int _Py_normalize_encoding(const char *, char *, size_t);
|
||||||
|
|
||||||
/* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale.
|
/* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale.
|
||||||
|
@ -90,7 +90,7 @@ extern int _Py_normalize_encoding(const char *, char *, size_t);
|
||||||
|
|
||||||
1: the workaround is used: Py_EncodeLocale() uses
|
1: the workaround is used: Py_EncodeLocale() uses
|
||||||
encode_ascii_surrogateescape() and Py_DecodeLocale() uses
|
encode_ascii() and Py_DecodeLocale() uses
|
||||||
decode_ascii_surrogateescape()
|
decode_ascii()
|
||||||
0: the workaround is not used: Py_EncodeLocale() uses wcstombs() and
|
0: the workaround is not used: Py_EncodeLocale() uses wcstombs() and
|
||||||
Py_DecodeLocale() uses mbstowcs()
|
Py_DecodeLocale() uses mbstowcs()
|
||||||
-1: unknown, need to call check_force_ascii() to get the value
|
-1: unknown, need to call check_force_ascii() to get the value
|
||||||
|
@ -180,16 +180,15 @@ error:
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
static char*
|
static int
|
||||||
encode_ascii_surrogateescape(const wchar_t *text, size_t *error_pos, int raw_malloc)
|
encode_ascii(const wchar_t *text, char **str,
|
||||||
|
size_t *error_pos, const char **reason,
|
||||||
|
int raw_malloc, int surrogateescape)
|
||||||
{
|
{
|
||||||
char *result = NULL, *out;
|
char *result = NULL, *out;
|
||||||
size_t len, i;
|
size_t len, i;
|
||||||
wchar_t ch;
|
wchar_t ch;
|
||||||
|
|
||||||
if (error_pos != NULL)
|
|
||||||
*error_pos = (size_t)-1;
|
|
||||||
|
|
||||||
len = wcslen(text);
|
len = wcslen(text);
|
||||||
|
|
||||||
/* +1 for NULL byte */
|
/* +1 for NULL byte */
|
||||||
|
@ -199,8 +198,9 @@ encode_ascii_surrogateescape(const wchar_t *text, size_t *error_pos, int raw_mal
|
||||||
else {
|
else {
|
||||||
result = PyMem_Malloc(len + 1);
|
result = PyMem_Malloc(len + 1);
|
||||||
}
|
}
|
||||||
if (result == NULL)
|
if (result == NULL) {
|
||||||
return NULL;
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
out = result;
|
out = result;
|
||||||
for (i=0; i<len; i++) {
|
for (i=0; i<len; i++) {
|
||||||
|
@ -210,60 +210,84 @@ encode_ascii_surrogateescape(const wchar_t *text, size_t *error_pos, int raw_mal
|
||||||
/* ASCII character */
|
/* ASCII character */
|
||||||
*out++ = (char)ch;
|
*out++ = (char)ch;
|
||||||
}
|
}
|
||||||
else if (0xdc80 <= ch && ch <= 0xdcff) {
|
else if (surrogateescape && 0xdc80 <= ch && ch <= 0xdcff) {
|
||||||
/* UTF-8b surrogate */
|
/* UTF-8b surrogate */
|
||||||
*out++ = (char)(ch - 0xdc00);
|
*out++ = (char)(ch - 0xdc00);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
if (error_pos != NULL) {
|
|
||||||
*error_pos = i;
|
|
||||||
}
|
|
||||||
if (raw_malloc) {
|
if (raw_malloc) {
|
||||||
PyMem_RawFree(result);
|
PyMem_RawFree(result);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
PyMem_Free(result);
|
PyMem_Free(result);
|
||||||
}
|
}
|
||||||
return NULL;
|
if (error_pos != NULL) {
|
||||||
|
*error_pos = i;
|
||||||
|
}
|
||||||
|
if (reason) {
|
||||||
|
*reason = "encoding error";
|
||||||
|
}
|
||||||
|
return -2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
*out = '\0';
|
*out = '\0';
|
||||||
return result;
|
*str = result;
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
#endif /* !defined(__APPLE__) && !defined(MS_WINDOWS) */
|
#endif /* !defined(__APPLE__) && !defined(__ANDROID__) && !defined(MS_WINDOWS) */
|
||||||
|
|
||||||
#if !defined(__APPLE__) && (!defined(MS_WINDOWS) || !defined(HAVE_MBRTOWC))
|
|
||||||
static wchar_t*
|
#if !defined(HAVE_MBRTOWC) || defined(USE_FORCE_ASCII)
|
||||||
decode_ascii_surrogateescape(const char *arg, size_t *size)
|
static int
|
||||||
|
decode_ascii(const char *arg, wchar_t **wstr, size_t *wlen,
|
||||||
|
const char **reason, int surrogateescape)
|
||||||
{
|
{
|
||||||
wchar_t *res;
|
wchar_t *res;
|
||||||
unsigned char *in;
|
unsigned char *in;
|
||||||
wchar_t *out;
|
wchar_t *out;
|
||||||
size_t argsize = strlen(arg) + 1;
|
size_t argsize = strlen(arg) + 1;
|
||||||
|
|
||||||
if (argsize > PY_SSIZE_T_MAX/sizeof(wchar_t))
|
if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t)) {
|
||||||
return NULL;
|
return -1;
|
||||||
res = PyMem_RawMalloc(argsize*sizeof(wchar_t));
|
}
|
||||||
if (!res)
|
res = PyMem_RawMalloc(argsize * sizeof(wchar_t));
|
||||||
return NULL;
|
if (!res) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
in = (unsigned char*)arg;
|
|
||||||
out = res;
|
out = res;
|
||||||
while(*in)
|
for (in = (unsigned char*)arg; *in; in++) {
|
||||||
if(*in < 128)
|
unsigned char ch = *in;
|
||||||
*out++ = *in++;
|
if (ch < 128) {
|
||||||
else
|
*out++ = ch;
|
||||||
*out++ = 0xdc00 + *in++;
|
}
|
||||||
|
else {
|
||||||
|
if (!surrogateescape) {
|
||||||
|
PyMem_RawFree(res);
|
||||||
|
if (wlen) {
|
||||||
|
*wlen = in - (unsigned char*)arg;
|
||||||
|
}
|
||||||
|
if (reason) {
|
||||||
|
*reason = "decoding error";
|
||||||
|
}
|
||||||
|
return -2;
|
||||||
|
}
|
||||||
|
*out++ = 0xdc00 + ch;
|
||||||
|
}
|
||||||
|
}
|
||||||
*out = 0;
|
*out = 0;
|
||||||
if (size != NULL)
|
|
||||||
*size = out - res;
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if !defined(__APPLE__) && !defined(__ANDROID__)
|
if (wlen != NULL) {
|
||||||
static wchar_t*
|
*wlen = out - res;
|
||||||
decode_current_locale(const char* arg, size_t *size)
|
}
|
||||||
|
*wstr = res;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
#endif /* !HAVE_MBRTOWC || USE_FORCE_ASCII */
|
||||||
|
|
||||||
|
static int
|
||||||
|
decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
|
||||||
|
const char **reason, int surrogateescape)
|
||||||
{
|
{
|
||||||
wchar_t *res;
|
wchar_t *res;
|
||||||
size_t argsize;
|
size_t argsize;
|
||||||
|
@ -284,15 +308,15 @@ decode_current_locale(const char* arg, size_t *size)
|
||||||
argsize = mbstowcs(NULL, arg, 0);
|
argsize = mbstowcs(NULL, arg, 0);
|
||||||
#endif
|
#endif
|
||||||
if (argsize != (size_t)-1) {
|
if (argsize != (size_t)-1) {
|
||||||
if (argsize == PY_SSIZE_T_MAX)
|
if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
|
||||||
goto oom;
|
return -1;
|
||||||
argsize += 1;
|
}
|
||||||
if (argsize > PY_SSIZE_T_MAX/sizeof(wchar_t))
|
res = (wchar_t *)PyMem_RawMalloc((argsize + 1) * sizeof(wchar_t));
|
||||||
goto oom;
|
if (!res) {
|
||||||
res = (wchar_t *)PyMem_RawMalloc(argsize*sizeof(wchar_t));
|
return -1;
|
||||||
if (!res)
|
}
|
||||||
goto oom;
|
|
||||||
count = mbstowcs(res, arg, argsize);
|
count = mbstowcs(res, arg, argsize + 1);
|
||||||
if (count != (size_t)-1) {
|
if (count != (size_t)-1) {
|
||||||
wchar_t *tmp;
|
wchar_t *tmp;
|
||||||
/* Only use the result if it contains no
|
/* Only use the result if it contains no
|
||||||
|
@ -301,13 +325,16 @@ decode_current_locale(const char* arg, size_t *size)
|
||||||
!Py_UNICODE_IS_SURROGATE(*tmp); tmp++)
|
!Py_UNICODE_IS_SURROGATE(*tmp); tmp++)
|
||||||
;
|
;
|
||||||
if (*tmp == 0) {
|
if (*tmp == 0) {
|
||||||
if (size != NULL)
|
if (wlen != NULL) {
|
||||||
*size = count;
|
*wlen = count;
|
||||||
return res;
|
}
|
||||||
|
*wstr = res;
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
PyMem_RawFree(res);
|
PyMem_RawFree(res);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Conversion failed. Fall back to escaping with surrogateescape. */
|
/* Conversion failed. Fall back to escaping with surrogateescape. */
|
||||||
#ifdef HAVE_MBRTOWC
|
#ifdef HAVE_MBRTOWC
|
||||||
/* Try conversion with mbrtowc (C99), and escape non-decodable bytes. */
|
/* Try conversion with mbrtowc (C99), and escape non-decodable bytes. */
|
||||||
|
@ -315,30 +342,37 @@ decode_current_locale(const char* arg, size_t *size)
|
||||||
/* Overallocate; as multi-byte characters are in the argument, the
|
/* Overallocate; as multi-byte characters are in the argument, the
|
||||||
actual output could use less memory. */
|
actual output could use less memory. */
|
||||||
argsize = strlen(arg) + 1;
|
argsize = strlen(arg) + 1;
|
||||||
if (argsize > PY_SSIZE_T_MAX/sizeof(wchar_t))
|
if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t)) {
|
||||||
goto oom;
|
return -1;
|
||||||
res = (wchar_t*)PyMem_RawMalloc(argsize*sizeof(wchar_t));
|
}
|
||||||
if (!res)
|
res = (wchar_t*)PyMem_RawMalloc(argsize * sizeof(wchar_t));
|
||||||
goto oom;
|
if (!res) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
in = (unsigned char*)arg;
|
in = (unsigned char*)arg;
|
||||||
out = res;
|
out = res;
|
||||||
memset(&mbs, 0, sizeof mbs);
|
memset(&mbs, 0, sizeof mbs);
|
||||||
while (argsize) {
|
while (argsize) {
|
||||||
size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
|
size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
|
||||||
if (converted == 0)
|
if (converted == 0) {
|
||||||
/* Reached end of string; null char stored. */
|
/* Reached end of string; null char stored. */
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
if (converted == (size_t)-2) {
|
if (converted == (size_t)-2) {
|
||||||
/* Incomplete character. This should never happen,
|
/* Incomplete character. This should never happen,
|
||||||
since we provide everything that we have -
|
since we provide everything that we have -
|
||||||
unless there is a bug in the C library, or I
|
unless there is a bug in the C library, or I
|
||||||
misunderstood how mbrtowc works. */
|
misunderstood how mbrtowc works. */
|
||||||
PyMem_RawFree(res);
|
goto decode_error;
|
||||||
if (size != NULL)
|
|
||||||
*size = (size_t)-2;
|
|
||||||
return NULL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (converted == (size_t)-1) {
|
if (converted == (size_t)-1) {
|
||||||
|
if (!surrogateescape) {
|
||||||
|
goto decode_error;
|
||||||
|
}
|
||||||
|
|
||||||
/* Conversion error. Escape as UTF-8b, and start over
|
/* Conversion error. Escape as UTF-8b, and start over
|
||||||
in the initial shift state. */
|
in the initial shift state. */
|
||||||
*out++ = 0xdc00 + *in++;
|
*out++ = 0xdc00 + *in++;
|
||||||
|
@ -346,12 +380,18 @@ decode_current_locale(const char* arg, size_t *size)
|
||||||
memset(&mbs, 0, sizeof mbs);
|
memset(&mbs, 0, sizeof mbs);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (Py_UNICODE_IS_SURROGATE(*out)) {
|
if (Py_UNICODE_IS_SURROGATE(*out)) {
|
||||||
|
if (!surrogateescape) {
|
||||||
|
goto decode_error;
|
||||||
|
}
|
||||||
|
|
||||||
/* Surrogate character. Escape the original
|
/* Surrogate character. Escape the original
|
||||||
byte sequence with surrogateescape. */
|
byte sequence with surrogateescape. */
|
||||||
argsize -= converted;
|
argsize -= converted;
|
||||||
while (converted--)
|
while (converted--) {
|
||||||
*out++ = 0xdc00 + *in++;
|
*out++ = 0xdc00 + *in++;
|
||||||
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
/* successfully converted some bytes */
|
/* successfully converted some bytes */
|
||||||
|
@ -359,55 +399,80 @@ decode_current_locale(const char* arg, size_t *size)
|
||||||
argsize -= converted;
|
argsize -= converted;
|
||||||
out++;
|
out++;
|
||||||
}
|
}
|
||||||
if (size != NULL)
|
if (wlen != NULL) {
|
||||||
*size = out - res;
|
*wlen = out - res;
|
||||||
|
}
|
||||||
|
*wstr = res;
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
decode_error:
|
||||||
|
PyMem_RawFree(res);
|
||||||
|
if (wlen) {
|
||||||
|
*wlen = in - (unsigned char*)arg;
|
||||||
|
}
|
||||||
|
if (reason) {
|
||||||
|
*reason = "decoding error";
|
||||||
|
}
|
||||||
|
return -2;
|
||||||
#else /* HAVE_MBRTOWC */
|
#else /* HAVE_MBRTOWC */
|
||||||
/* Cannot use C locale for escaping; manually escape as if charset
|
/* Cannot use C locale for escaping; manually escape as if charset
|
||||||
is ASCII (i.e. escape all bytes >= 128). This will still roundtrip
|
is ASCII (i.e. escape all bytes >= 128). This will still roundtrip
|
||||||
correctly in the locale's charset, which must be an ASCII superset. */
|
correctly in the locale's charset, which must be an ASCII superset. */
|
||||||
res = decode_ascii_surrogateescape(arg, size);
|
return decode_ascii(arg, wstr, wlen, reason, surrogateescape);
|
||||||
if (res == NULL)
|
|
||||||
goto oom;
|
|
||||||
#endif /* HAVE_MBRTOWC */
|
#endif /* HAVE_MBRTOWC */
|
||||||
return res;
|
|
||||||
|
|
||||||
oom:
|
|
||||||
if (size != NULL) {
|
|
||||||
*size = (size_t)-1;
|
|
||||||
}
|
|
||||||
return NULL;
|
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
static wchar_t*
|
/* Decode a byte string from the locale encoding.
|
||||||
decode_locale(const char* arg, size_t *size, int ignore_utf8_mode)
|
|
||||||
|
Use the strict error handler if 'surrogateescape' is zero. Use the
|
||||||
|
surrogateescape error handler if 'surrogateescape' is non-zero: undecodable
|
||||||
|
bytes are decoded as characters in range U+DC80..U+DCFF. If a byte sequence
|
||||||
|
can be decoded as a surrogate character, escape the bytes using the
|
||||||
|
surrogateescape error handler instead of decoding them.
|
||||||
|
|
||||||
|
On success, return 0 and write the newly allocated wide character string into
|
||||||
|
*wstr (use PyMem_RawFree() to free the memory). If wlen is not NULL, write
|
||||||
|
the number of wide characters excluding the null character into *wlen.
|
||||||
|
|
||||||
|
On memory allocation failure, return -1.
|
||||||
|
|
||||||
|
On decoding error, return -2. If wlen is not NULL, write the start of
|
||||||
|
invalid byte sequence in the input string into *wlen. If reason is not NULL,
|
||||||
|
write the decoding error message into *reason.
|
||||||
|
|
||||||
|
Use the _Py_EncodeLocaleEx() function to encode the character string back to
|
||||||
|
a byte string. */
|
||||||
|
int
|
||||||
|
_Py_DecodeLocaleEx(const char* arg, wchar_t **wstr, size_t *wlen,
|
||||||
|
const char **reason,
|
||||||
|
int current_locale, int surrogateescape)
|
||||||
{
|
{
|
||||||
#if defined(__APPLE__) || defined(__ANDROID__)
|
if (current_locale) {
|
||||||
return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size);
|
return decode_current_locale(arg, wstr, wlen, reason, surrogateescape);
|
||||||
#else
|
|
||||||
if (!ignore_utf8_mode && Py_UTF8Mode == 1) {
|
|
||||||
return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef MS_WINDOWS
|
#if defined(__APPLE__) || defined(__ANDROID__)
|
||||||
if (force_ascii == -1)
|
return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason,
|
||||||
|
surrogateescape);
|
||||||
|
#else
|
||||||
|
if (Py_UTF8Mode == 1) {
|
||||||
|
return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason,
|
||||||
|
surrogateescape);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef USE_FORCE_ASCII
|
||||||
|
if (force_ascii == -1) {
|
||||||
force_ascii = check_force_ascii();
|
force_ascii = check_force_ascii();
|
||||||
|
}
|
||||||
|
|
||||||
if (force_ascii) {
|
if (force_ascii) {
|
||||||
/* force ASCII encoding to workaround mbstowcs() issue */
|
/* force ASCII encoding to workaround mbstowcs() issue */
|
||||||
wchar_t *wstr = decode_ascii_surrogateescape(arg, size);
|
return decode_ascii(arg, wstr, wlen, reason, surrogateescape);
|
||||||
if (wstr == NULL) {
|
|
||||||
if (size != NULL) {
|
|
||||||
*size = (size_t)-1;
|
|
||||||
}
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
return wstr;
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return decode_current_locale(arg, size);
|
return decode_current_locale(arg, wstr, wlen, reason, surrogateescape);
|
||||||
#endif /* __APPLE__ or __ANDROID__ */
|
#endif /* __APPLE__ or __ANDROID__ */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -432,23 +497,24 @@ decode_locale(const char* arg, size_t *size, int ignore_utf8_mode)
|
||||||
Use the Py_EncodeLocale() function to encode the character string back to a
|
Use the Py_EncodeLocale() function to encode the character string back to a
|
||||||
byte string. */
|
byte string. */
|
||||||
wchar_t*
|
wchar_t*
|
||||||
Py_DecodeLocale(const char* arg, size_t *size)
|
Py_DecodeLocale(const char* arg, size_t *wlen)
|
||||||
{
|
{
|
||||||
return decode_locale(arg, size, 0);
|
wchar_t *wstr;
|
||||||
|
int res = _Py_DecodeLocaleEx(arg, &wstr, wlen, NULL, 0, 1);
|
||||||
|
if (res != 0) {
|
||||||
|
if (wlen != NULL) {
|
||||||
|
*wlen = (size_t)res;
|
||||||
|
}
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
return wstr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* Similar to Py_DecodeLocale() but ignore the UTF-8 mode */
|
static int
|
||||||
wchar_t*
|
encode_current_locale(const wchar_t *text, char **str,
|
||||||
_Py_DecodeCurrentLocale(const char* arg, size_t *size)
|
size_t *error_pos, const char **reason,
|
||||||
{
|
int raw_malloc, int surrogateescape)
|
||||||
return decode_locale(arg, size, 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
#if !defined(__APPLE__) && !defined(__ANDROID__)
|
|
||||||
static char*
|
|
||||||
encode_current_locale(const wchar_t *text, size_t *error_pos, int raw_malloc)
|
|
||||||
{
|
{
|
||||||
const size_t len = wcslen(text);
|
const size_t len = wcslen(text);
|
||||||
char *result = NULL, *bytes = NULL;
|
char *result = NULL, *bytes = NULL;
|
||||||
|
@ -464,38 +530,37 @@ encode_current_locale(const wchar_t *text, size_t *error_pos, int raw_malloc)
|
||||||
for (i=0; i < len; i++) {
|
for (i=0; i < len; i++) {
|
||||||
c = text[i];
|
c = text[i];
|
||||||
if (c >= 0xdc80 && c <= 0xdcff) {
|
if (c >= 0xdc80 && c <= 0xdcff) {
|
||||||
|
if (!surrogateescape) {
|
||||||
|
goto encode_error;
|
||||||
|
}
|
||||||
/* UTF-8b surrogate */
|
/* UTF-8b surrogate */
|
||||||
if (bytes != NULL) {
|
if (bytes != NULL) {
|
||||||
*bytes++ = c - 0xdc00;
|
*bytes++ = c - 0xdc00;
|
||||||
size--;
|
size--;
|
||||||
}
|
}
|
||||||
else
|
else {
|
||||||
size++;
|
size++;
|
||||||
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
buf[0] = c;
|
buf[0] = c;
|
||||||
if (bytes != NULL)
|
if (bytes != NULL) {
|
||||||
converted = wcstombs(bytes, buf, size);
|
converted = wcstombs(bytes, buf, size);
|
||||||
else
|
}
|
||||||
|
else {
|
||||||
converted = wcstombs(NULL, buf, 0);
|
converted = wcstombs(NULL, buf, 0);
|
||||||
|
}
|
||||||
if (converted == (size_t)-1) {
|
if (converted == (size_t)-1) {
|
||||||
if (raw_malloc) {
|
goto encode_error;
|
||||||
PyMem_RawFree(result);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
PyMem_Free(result);
|
|
||||||
}
|
|
||||||
if (error_pos != NULL)
|
|
||||||
*error_pos = i;
|
|
||||||
return NULL;
|
|
||||||
}
|
}
|
||||||
if (bytes != NULL) {
|
if (bytes != NULL) {
|
||||||
bytes += converted;
|
bytes += converted;
|
||||||
size -= converted;
|
size -= converted;
|
||||||
}
|
}
|
||||||
else
|
else {
|
||||||
size += converted;
|
size += converted;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (result != NULL) {
|
if (result != NULL) {
|
||||||
|
@ -511,38 +576,78 @@ encode_current_locale(const wchar_t *text, size_t *error_pos, int raw_malloc)
|
||||||
result = PyMem_Malloc(size);
|
result = PyMem_Malloc(size);
|
||||||
}
|
}
|
||||||
if (result == NULL) {
|
if (result == NULL) {
|
||||||
if (error_pos != NULL) {
|
return -1;
|
||||||
*error_pos = (size_t)-1;
|
|
||||||
}
|
|
||||||
return NULL;
|
|
||||||
}
|
}
|
||||||
bytes = result;
|
bytes = result;
|
||||||
}
|
}
|
||||||
return result;
|
*str = result;
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
encode_error:
|
||||||
|
if (raw_malloc) {
|
||||||
|
PyMem_RawFree(result);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
PyMem_Free(result);
|
||||||
|
}
|
||||||
|
if (error_pos != NULL) {
|
||||||
|
*error_pos = i;
|
||||||
|
}
|
||||||
|
if (reason) {
|
||||||
|
*reason = "encoding error";
|
||||||
|
}
|
||||||
|
return -2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int
|
||||||
|
encode_locale_ex(const wchar_t *text, char **str, size_t *error_pos,
|
||||||
|
const char **reason,
|
||||||
|
int raw_malloc, int current_locale, int surrogateescape)
|
||||||
|
{
|
||||||
|
if (current_locale) {
|
||||||
|
return encode_current_locale(text, str, error_pos, reason,
|
||||||
|
raw_malloc, surrogateescape);
|
||||||
|
}
|
||||||
|
|
||||||
|
#if defined(__APPLE__) || defined(__ANDROID__)
|
||||||
|
return _Py_EncodeUTF8Ex(text, str, error_pos, reason,
|
||||||
|
raw_malloc, surrogateescape);
|
||||||
|
#else /* __APPLE__ */
|
||||||
|
if (Py_UTF8Mode == 1) {
|
||||||
|
return _Py_EncodeUTF8Ex(text, str, error_pos, reason,
|
||||||
|
raw_malloc, surrogateescape);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef USE_FORCE_ASCII
|
||||||
|
if (force_ascii == -1) {
|
||||||
|
force_ascii = check_force_ascii();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (force_ascii) {
|
||||||
|
return encode_ascii(text, str, error_pos, reason,
|
||||||
|
raw_malloc, surrogateescape);
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
return encode_current_locale(text, str, error_pos, reason,
|
||||||
|
raw_malloc, surrogateescape);
|
||||||
|
#endif /* __APPLE__ or __ANDROID__ */
|
||||||
|
}
|
||||||
|
|
||||||
static char*
|
static char*
|
||||||
encode_locale(const wchar_t *text, size_t *error_pos,
|
encode_locale(const wchar_t *text, size_t *error_pos,
|
||||||
int raw_malloc, int ignore_utf8_mode)
|
int raw_malloc, int current_locale)
|
||||||
{
|
{
|
||||||
#if defined(__APPLE__) || defined(__ANDROID__)
|
char *str;
|
||||||
return _Py_EncodeUTF8_surrogateescape(text, error_pos, raw_malloc);
|
int res = encode_locale_ex(text, &str, error_pos, NULL,
|
||||||
#else /* __APPLE__ */
|
raw_malloc, current_locale, 1);
|
||||||
if (!ignore_utf8_mode && Py_UTF8Mode == 1) {
|
if (res != -2 && error_pos) {
|
||||||
return _Py_EncodeUTF8_surrogateescape(text, error_pos, raw_malloc);
|
*error_pos = (size_t)-1;
|
||||||
}
|
}
|
||||||
|
if (res != 0) {
|
||||||
#ifndef MS_WINDOWS
|
return NULL;
|
||||||
if (force_ascii == -1)
|
}
|
||||||
force_ascii = check_force_ascii();
|
return str;
|
||||||
|
|
||||||
if (force_ascii)
|
|
||||||
return encode_ascii_surrogateescape(text, error_pos, raw_malloc);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
return encode_current_locale(text, error_pos, raw_malloc);
|
|
||||||
#endif /* __APPLE__ or __ANDROID__ */
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Encode a wide character string to the locale encoding with the
|
/* Encode a wide character string to the locale encoding with the
|
||||||
|
@ -573,11 +678,13 @@ _Py_EncodeLocaleRaw(const wchar_t *text, size_t *error_pos)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* Similar to _Py_EncodeLocaleRaw() but ignore the UTF-8 Mode */
|
int
|
||||||
char*
|
_Py_EncodeLocaleEx(const wchar_t *text, char **str,
|
||||||
_Py_EncodeCurrentLocale(const wchar_t *text, size_t *error_pos)
|
size_t *error_pos, const char **reason,
|
||||||
|
int current_locale, int surrogateescape)
|
||||||
{
|
{
|
||||||
return encode_locale(text, error_pos, 1, 1);
|
return encode_locale_ex(text, str, error_pos, reason, 1,
|
||||||
|
current_locale, surrogateescape);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -382,8 +382,8 @@ _Py_FindEnvConfigValue(FILE *env_file, const wchar_t *key,
|
||||||
/* Comment - skip */
|
/* Comment - skip */
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
tmpbuffer = _Py_DecodeUTF8_surrogateescape(buffer, n, NULL);
|
tmpbuffer = _Py_DecodeUTF8_surrogateescape(buffer, n);
|
||||||
if (tmpbuffer != NULL) {
|
if (tmpbuffer) {
|
||||||
wchar_t * state;
|
wchar_t * state;
|
||||||
wchar_t * tok = wcstok(tmpbuffer, L" \t\r\n", &state);
|
wchar_t * tok = wcstok(tmpbuffer, L" \t\r\n", &state);
|
||||||
if ((tok != NULL) && !wcscmp(tok, key)) {
|
if ((tok != NULL) && !wcscmp(tok, key)) {
|
||||||
|
|
Loading…
Reference in New Issue