mirror of https://github.com/python/cpython
bpo-43667: Fix broken Unicode encoding in non-UTF locales on Solaris (GH-25096)
This commit is contained in:
parent
4908fae3d5
commit
9032cf5cb1
|
@ -53,6 +53,18 @@ PyAPI_FUNC(void) _Py_closerange(int first, int last);
|
|||
PyAPI_FUNC(wchar_t*) _Py_GetLocaleEncoding(void);
|
||||
PyAPI_FUNC(PyObject*) _Py_GetLocaleEncodingObject(void);
|
||||
|
||||
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
/* Helpers for platforms where wchar_t values are not Unicode code points
   in non-Unicode locales. */

/* Return non-zero when the current locale codeset is neither "UTF-8" nor
   "646", i.e. when wchar_t strings have to be converted. */
extern int _Py_LocaleUsesNonUnicodeWchar(void);

/* Convert `size` native wchar_t units to UCS-4.  Return a new buffer to be
   released with PyMem_Free(), or NULL with an exception set. */
extern wchar_t *_Py_DecodeNonUnicodeWchar(const wchar_t *native,
                                          Py_ssize_t size);

/* Convert `size` UCS-4 units stored in `unicode` to the native wchar_t
   form, in place.  Return 0 on success, -1 with an exception set. */
extern int _Py_EncodeNonUnicodeWchar_InPlace(wchar_t *unicode,
                                             Py_ssize_t size);
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -57,6 +57,10 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
|||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
|
||||
#include "pycore_fileutils.h" // _Py_LocaleUsesNonUnicodeWchar()
|
||||
#endif
|
||||
|
||||
/* Uncomment to display statistics on interned strings at exit
|
||||
in _PyUnicode_ClearInterned(). */
|
||||
/* #define INTERNED_STATS 1 */
|
||||
|
@ -2217,6 +2221,20 @@ PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
|
|||
if (size == 0)
|
||||
_Py_RETURN_UNICODE_EMPTY();
|
||||
|
||||
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
|
||||
/* Oracle Solaris uses non-Unicode internal wchar_t form for
|
||||
non-Unicode locales and hence needs conversion to UCS-4 first. */
|
||||
if (_Py_LocaleUsesNonUnicodeWchar()) {
|
||||
wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
|
||||
if (!converted) {
|
||||
return NULL;
|
||||
}
|
||||
PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
|
||||
PyMem_Free(converted);
|
||||
return unicode;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Single character Unicode objects in the Latin-1 range are
|
||||
shared when using this constructor */
|
||||
if (size == 1 && (Py_UCS4)*u < 256)
|
||||
|
@ -3295,6 +3313,17 @@ PyUnicode_AsWideChar(PyObject *unicode,
|
|||
res = size;
|
||||
}
|
||||
unicode_copy_as_widechar(unicode, w, size);
|
||||
|
||||
#if HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
|
||||
/* Oracle Solaris uses non-Unicode internal wchar_t form for
|
||||
non-Unicode locales and hence needs conversion first. */
|
||||
if (_Py_LocaleUsesNonUnicodeWchar()) {
|
||||
if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
|
@ -3321,6 +3350,17 @@ PyUnicode_AsWideCharString(PyObject *unicode,
|
|||
return NULL;
|
||||
}
|
||||
unicode_copy_as_widechar(unicode, buffer, buflen + 1);
|
||||
|
||||
#if HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
|
||||
/* Oracle Solaris uses non-Unicode internal wchar_t form for
|
||||
non-Unicode locales and hence needs conversion first. */
|
||||
if (_Py_LocaleUsesNonUnicodeWchar()) {
|
||||
if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (size != NULL) {
|
||||
*size = buflen;
|
||||
}
|
||||
|
|
|
@ -18,6 +18,10 @@ extern int winerror_to_errno(int);
|
|||
#include <sys/ioctl.h>
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
|
||||
#include <iconv.h>
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_FCNTL_H
|
||||
#include <fcntl.h>
|
||||
#endif /* HAVE_FCNTL_H */
|
||||
|
@ -93,6 +97,12 @@ _Py_device_encoding(int fd)
|
|||
static size_t
|
||||
is_valid_wide_char(wchar_t ch)
|
||||
{
|
||||
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
|
||||
/* Oracle Solaris doesn't use Unicode code points as wchar_t encoding
|
||||
for non-Unicode locales, which makes values higher than MAX_UNICODE
|
||||
possibly valid. */
|
||||
return 1;
|
||||
#endif
|
||||
if (Py_UNICODE_IS_SURROGATE(ch)) {
|
||||
// Reject lone surrogate characters
|
||||
return 0;
|
||||
|
@ -922,6 +932,102 @@ _Py_GetLocaleEncodingObject(void)
|
|||
return str;
|
||||
}
|
||||
|
||||
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
|
||||
|
||||
/* Tell whether wchar_t strings in the current locale need conversion
   because their internal form is not Unicode.

   Return 1 when the locale codeset is neither "UTF-8" nor "646", and 0
   otherwise (including when the codeset cannot be queried). */
int
_Py_LocaleUsesNonUnicodeWchar(void)
{
    const char *charset = nl_langinfo(CODESET);
    if (charset == NULL) {
        return 0;
    }

    /* Oracle Solaris uses a non-Unicode internal wchar_t form for
       non-Unicode locales.  Only the "UTF-8" and "646" codesets are
       treated as Unicode here; 646 is ISO/IEC 646, i.e. ASCII. */
    if (strcmp(charset, "UTF-8") == 0) {
        return 0;
    }
    if (strcmp(charset, "646") == 0) {
        return 0;
    }
    return 1;
}
|
||||
|
||||
static wchar_t *
|
||||
_Py_ConvertWCharForm(const wchar_t *source, Py_ssize_t size,
|
||||
const char *tocode, const char *fromcode)
|
||||
{
|
||||
Py_BUILD_ASSERT(sizeof(wchar_t) == 4);
|
||||
|
||||
/* Ensure we won't overflow the size. */
|
||||
if (size > (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t))) {
|
||||
PyErr_NoMemory();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* the string doesn't have to be NULL terminated */
|
||||
wchar_t* target = PyMem_Malloc(size * sizeof(wchar_t));
|
||||
if (target == NULL) {
|
||||
PyErr_NoMemory();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
iconv_t cd = iconv_open(tocode, fromcode);
|
||||
if (cd == (iconv_t)-1) {
|
||||
PyErr_Format(PyExc_ValueError, "iconv_open() failed");
|
||||
PyMem_Free(target);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
char *inbuf = (char *) source;
|
||||
char *outbuf = (char *) target;
|
||||
size_t inbytesleft = sizeof(wchar_t) * size;
|
||||
size_t outbytesleft = inbytesleft;
|
||||
|
||||
size_t ret = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
|
||||
if (ret == DECODE_ERROR) {
|
||||
PyErr_Format(PyExc_ValueError, "iconv() failed");
|
||||
PyMem_Free(target);
|
||||
iconv_close(cd);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
iconv_close(cd);
|
||||
return target;
|
||||
}
|
||||
|
||||
/* Convert a wide character string to the UCS-4 encoded string. This
|
||||
is necessary on systems where internal form of wchar_t are not Unicode
|
||||
code points (e.g. Oracle Solaris).
|
||||
|
||||
Return a pointer to a newly allocated string, use PyMem_Free() to free
|
||||
the memory. Return NULL and raise exception on conversion or memory
|
||||
allocation error. */
|
||||
wchar_t *
|
||||
_Py_DecodeNonUnicodeWchar(const wchar_t *native, Py_ssize_t size)
|
||||
{
|
||||
return _Py_ConvertWCharForm(native, size, "UCS-4-INTERNAL", "wchar_t");
|
||||
}
|
||||
|
||||
/* Convert a UCS-4 encoded string to native wide character string. This
|
||||
is necessary on systems where internal form of wchar_t are not Unicode
|
||||
code points (e.g. Oracle Solaris).
|
||||
|
||||
The conversion is done in place. This can be done because both wchar_t
|
||||
and UCS-4 use 4-byte encoding, and one wchar_t symbol always correspond
|
||||
to a single UCS-4 symbol and vice versa. (This is true for Oracle Solaris,
|
||||
which is currently the only system using these functions; it doesn't have
|
||||
to be for other systems).
|
||||
|
||||
Return 0 on success. Return -1 and raise exception on conversion
|
||||
or memory allocation error. */
|
||||
int
|
||||
_Py_EncodeNonUnicodeWchar_InPlace(wchar_t *unicode, Py_ssize_t size)
|
||||
{
|
||||
wchar_t* result = _Py_ConvertWCharForm(unicode, size, "wchar_t", "UCS-4-INTERNAL");
|
||||
if (!result) {
|
||||
return -1;
|
||||
}
|
||||
memcpy(unicode, result, size * sizeof(wchar_t));
|
||||
PyMem_Free(result);
|
||||
return 0;
|
||||
}
|
||||
#endif /* HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION */
|
||||
|
||||
#ifdef MS_WINDOWS
|
||||
static __int64 secs_between_epochs = 11644473600; /* Seconds between 1.1.1601 and 1.1.1970 */
|
||||
|
|
|
@ -15264,6 +15264,22 @@ else
|
|||
$as_echo "no" >&6; }
|
||||
fi
|
||||
|
||||
case $ac_sys_system/$ac_sys_release in
SunOS/*)
  if test -f /etc/os-release; then
    # Take the NAME= value from os-release without its first and last
    # character - presumably the surrounding quotes.
    OS_NAME=$(awk -F= '/^NAME=/ {print substr($2,2,length($2)-2)}' /etc/os-release)
    if test "x$OS_NAME" = "xOracle Solaris"; then
      # bpo-43667: In Oracle Solaris, the internal form of wchar_t in
      # non-Unicode locales is not Unicode and hence cannot be used directly.
      # https://docs.oracle.com/cd/E37838_01/html/E61053/gmwke.html

$as_echo "#define HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION 1" >>confdefs.h

    fi
  fi
  ;;
esac
|
||||
|
||||
# check for endianness
|
||||
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether byte ordering is bigendian" >&5
|
||||
$as_echo_n "checking whether byte ordering is bigendian... " >&6; }
|
||||
|
|
16
configure.ac
16
configure.ac
|
@ -4765,6 +4765,22 @@ else
|
|||
AC_MSG_RESULT(no)
|
||||
fi
|
||||
|
||||
case $ac_sys_system/$ac_sys_release in
SunOS/*)
  if test -f /etc/os-release; then
    # Take the NAME= value from os-release without its first and last
    # character - presumably the surrounding quotes.
    OS_NAME=$(awk -F= '/^NAME=/ {print substr($2,2,length($2)-2)}' /etc/os-release)
    if test "x$OS_NAME" = "xOracle Solaris"; then
      # bpo-43667: In Oracle Solaris, the internal form of wchar_t in
      # non-Unicode locales is not Unicode and hence cannot be used directly.
      # https://docs.oracle.com/cd/E37838_01/html/E61053/gmwke.html
      AC_DEFINE(HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION, 1,
      [Define if the internal form of wchar_t in non-Unicode locales
       is not Unicode.])
    fi
  fi
  ;;
esac
|
||||
|
||||
# check for endianness
|
||||
AC_C_BIGENDIAN
|
||||
|
||||
|
|
|
@ -748,6 +748,10 @@
|
|||
/* Define to 1 if you have the `nice' function. */
|
||||
#undef HAVE_NICE
|
||||
|
||||
/* Define if the internal form of wchar_t in non-Unicode locales is not
|
||||
Unicode. */
|
||||
#undef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
|
||||
|
||||
/* Define to 1 if you have the `openat' function. */
|
||||
#undef HAVE_OPENAT
|
||||
|
||||
|
|
Loading…
Reference in New Issue