From 9032cf5cb1e33c0349089cfb0f6bf11ed3c30e86 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Kul=C3=ADk?=
Date: Fri, 30 Apr 2021 15:21:42 +0200
Subject: [PATCH] bpo-43667: Fix broken Unicode encoding in non-UTF locales on
 Solaris (GH-25096)

---
Local adjustments on top of the upstream commit:
 * PyUnicode_AsWideCharString(): release the freshly allocated buffer when
   the conversion to the native wchar_t form fails (it used to be leaked).
 * Test HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION with #ifdef everywhere
   (two places used #if, which trips -Wundef on other platforms).
 * _Py_LocaleUsesNonUnicodeWchar(): the header comment now states the
   actual meaning of the return value.
 * _Py_ConvertWCharForm(): document the contract of the helper.

 Include/internal/pycore_fileutils.h |  12 ++++
 Objects/unicodeobject.c             |  41 +++++++++++
 Python/fileutils.c                  | 112 ++++++++++++++++++++++++++++
 configure                           |  16 +++++
 configure.ac                        |  16 +++++
 pyconfig.h.in                       |   4 ++
 6 files changed, 201 insertions(+)

diff --git a/Include/internal/pycore_fileutils.h b/Include/internal/pycore_fileutils.h
index 9281f4eeb8b..c1c9244a1bc 100644
--- a/Include/internal/pycore_fileutils.h
+++ b/Include/internal/pycore_fileutils.h
@@ -53,6 +53,18 @@ PyAPI_FUNC(void) _Py_closerange(int first, int last);
 PyAPI_FUNC(wchar_t*) _Py_GetLocaleEncoding(void);
 PyAPI_FUNC(PyObject*) _Py_GetLocaleEncodingObject(void);
 
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+extern int _Py_LocaleUsesNonUnicodeWchar(void);
+
+extern wchar_t* _Py_DecodeNonUnicodeWchar(
+    const wchar_t* native,
+    Py_ssize_t size);
+
+extern int _Py_EncodeNonUnicodeWchar_InPlace(
+    wchar_t* unicode,
+    Py_ssize_t size);
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 74c5888d13b..bfd5c881215 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -57,6 +57,10 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 #include <windows.h>
 #endif
 
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+#include "pycore_fileutils.h"     // _Py_LocaleUsesNonUnicodeWchar()
+#endif
+
 /* Uncomment to display statistics on interned strings at exit
    in _PyUnicode_ClearInterned(). */
 /* #define INTERNED_STATS 1 */
@@ -2217,6 +2221,20 @@ PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
     if (size == 0)
         _Py_RETURN_UNICODE_EMPTY();
 
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+    /* Oracle Solaris uses non-Unicode internal wchar_t form for
+       non-Unicode locales and hence needs conversion to UCS-4 first. */
+    if (_Py_LocaleUsesNonUnicodeWchar()) {
+        wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
+        if (!converted) {
+            return NULL;
+        }
+        PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
+        PyMem_Free(converted);
+        return unicode;
+    }
+#endif
+
     /* Single character Unicode objects in the Latin-1 range are
        shared when using this constructor */
     if (size == 1 && (Py_UCS4)*u < 256)
@@ -3295,6 +3313,17 @@ PyUnicode_AsWideChar(PyObject *unicode,
         res = size;
     }
     unicode_copy_as_widechar(unicode, w, size);
+
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+    /* Oracle Solaris uses non-Unicode internal wchar_t form for
+       non-Unicode locales and hence needs conversion first. */
+    if (_Py_LocaleUsesNonUnicodeWchar()) {
+        if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
+            return -1;
+        }
+    }
+#endif
+
     return res;
 }
 
@@ -3321,6 +3350,18 @@ PyUnicode_AsWideCharString(PyObject *unicode,
         return NULL;
     }
     unicode_copy_as_widechar(unicode, buffer, buflen + 1);
+
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+    /* Oracle Solaris uses non-Unicode internal wchar_t form for
+       non-Unicode locales and hence needs conversion first. */
+    if (_Py_LocaleUsesNonUnicodeWchar()) {
+        if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
+            PyMem_Free(buffer);
+            return NULL;
+        }
+    }
+#endif
+
     if (size != NULL) {
         *size = buflen;
     }
diff --git a/Python/fileutils.c b/Python/fileutils.c
index 2a079bbadcc..a8fab00629d 100644
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@@ -18,6 +18,10 @@ extern int winerror_to_errno(int);
 #include <sys/ioctl.h>
 #endif
 
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+#include <iconv.h>
+#endif
+
 #ifdef HAVE_FCNTL_H
 #include <fcntl.h>
 #endif /* HAVE_FCNTL_H */
@@ -93,6 +97,12 @@ _Py_device_encoding(int fd)
 static size_t
 is_valid_wide_char(wchar_t ch)
 {
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+    /* Oracle Solaris doesn't use Unicode code points as wchar_t encoding
+       for non-Unicode locales, which makes values higher than MAX_UNICODE
+       possibly valid. */
+    return 1;
+#endif
     if (Py_UNICODE_IS_SURROGATE(ch)) {
         // Reject lone surrogate characters
         return 0;
@@ -922,6 +932,108 @@ _Py_GetLocaleEncodingObject(void)
     return str;
 }
 
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+
+/* Return 1 if the current locale uses a non-Unicode wchar_t form, else 0. */
+int
+_Py_LocaleUsesNonUnicodeWchar(void)
+{
+    /* Oracle Solaris uses non-Unicode internal wchar_t form for
+       non-Unicode locales and hence needs conversion to UTF first. */
+    char* codeset = nl_langinfo(CODESET);
+    if (!codeset) {
+        return 0;
+    }
+    /* 646 refers to ISO/IEC 646 standard that corresponds to ASCII encoding */
+    return (strcmp(codeset, "UTF-8") != 0 && strcmp(codeset, "646") != 0);
+}
+
+/* Convert 'size' wide characters of 'source' from the iconv encoding
+   'fromcode' to 'tocode'.  The output buffer is allocated with the same
+   byte size as the input and is not NUL-terminated unless the input is.
+
+   Return a newly allocated buffer (release it with PyMem_Free()), or NULL
+   with an exception set on overflow, allocation or conversion failure. */
+static wchar_t *
+_Py_ConvertWCharForm(const wchar_t *source, Py_ssize_t size,
+                     const char *tocode, const char *fromcode)
+{
+    Py_BUILD_ASSERT(sizeof(wchar_t) == 4);
+
+    /* Ensure we won't overflow the size. */
+    if (size > (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t))) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    /* the string doesn't have to be NULL terminated */
+    wchar_t* target = PyMem_Malloc(size * sizeof(wchar_t));
+    if (target == NULL) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    iconv_t cd = iconv_open(tocode, fromcode);
+    if (cd == (iconv_t)-1) {
+        PyErr_Format(PyExc_ValueError, "iconv_open() failed");
+        PyMem_Free(target);
+        return NULL;
+    }
+
+    char *inbuf = (char *) source;
+    char *outbuf = (char *) target;
+    size_t inbytesleft = sizeof(wchar_t) * size;
+    size_t outbytesleft = inbytesleft;
+
+    size_t ret = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
+    if (ret == DECODE_ERROR) {
+        PyErr_Format(PyExc_ValueError, "iconv() failed");
+        PyMem_Free(target);
+        iconv_close(cd);
+        return NULL;
+    }
+
+    iconv_close(cd);
+    return target;
+}
+
+/* Convert a wide character string to the UCS-4 encoded string. This
+   is necessary on systems where internal form of wchar_t are not Unicode
+   code points (e.g. Oracle Solaris).
+
+   Return a pointer to a newly allocated string, use PyMem_Free() to free
+   the memory. Return NULL and raise exception on conversion or memory
+   allocation error. */
+wchar_t *
+_Py_DecodeNonUnicodeWchar(const wchar_t *native, Py_ssize_t size)
+{
+    return _Py_ConvertWCharForm(native, size, "UCS-4-INTERNAL", "wchar_t");
+}
+
+/* Convert a UCS-4 encoded string to native wide character string. This
+   is necessary on systems where internal form of wchar_t are not Unicode
+   code points (e.g. Oracle Solaris).
+
+   The conversion is done in place. This can be done because both wchar_t
+   and UCS-4 use 4-byte encoding, and one wchar_t symbol always correspond
+   to a single UCS-4 symbol and vice versa. (This is true for Oracle Solaris,
+   which is currently the only system using these functions; it doesn't have
+   to be for other systems).
+
+   Return 0 on success. Return -1 and raise exception on conversion
+   or memory allocation error. */
+int
+_Py_EncodeNonUnicodeWchar_InPlace(wchar_t *unicode, Py_ssize_t size)
+{
+    wchar_t* result = _Py_ConvertWCharForm(unicode, size, "wchar_t", "UCS-4-INTERNAL");
+    if (!result) {
+        return -1;
+    }
+    memcpy(unicode, result, size * sizeof(wchar_t));
+    PyMem_Free(result);
+    return 0;
+}
+#endif /* HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION */
 
 #ifdef MS_WINDOWS
 static __int64 secs_between_epochs = 11644473600; /* Seconds between 1.1.1601 and 1.1.1970 */
diff --git a/configure b/configure
index ad0367fe0e2..08a88aa46d5 100755
--- a/configure
+++ b/configure
@@ -15264,6 +15264,22 @@ else
 $as_echo "no" >&6; }
 fi
 
+case $ac_sys_system/$ac_sys_release in
+SunOS/*)
+  if test -f /etc/os-release; then
+    OS_NAME=$(awk -F= '/^NAME=/ {print substr($2,2,length($2)-2)}' /etc/os-release)
+    if test "x$OS_NAME" = "xOracle Solaris"; then
+      # bpo-43667: In Oracle Solaris, the internal form of wchar_t in
+      # non-Unicode locales is not Unicode and hence cannot be used directly.
+      # https://docs.oracle.com/cd/E37838_01/html/E61053/gmwke.html
+
+$as_echo "#define HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION 1" >>confdefs.h
+
+    fi
+  fi
+  ;;
+esac
+
 # check for endianness
  { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether byte ordering is bigendian" >&5
 $as_echo_n "checking whether byte ordering is bigendian... " >&6; }
diff --git a/configure.ac b/configure.ac
index 3df9bd0acec..b2643292b2c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -4765,6 +4765,22 @@ else
   AC_MSG_RESULT(no)
 fi
 
+case $ac_sys_system/$ac_sys_release in
+SunOS/*)
+  if test -f /etc/os-release; then
+    OS_NAME=$(awk -F= '/^NAME=/ {print substr($2,2,length($2)-2)}' /etc/os-release)
+    if test "x$OS_NAME" = "xOracle Solaris"; then
+      # bpo-43667: In Oracle Solaris, the internal form of wchar_t in
+      # non-Unicode locales is not Unicode and hence cannot be used directly.
+      # https://docs.oracle.com/cd/E37838_01/html/E61053/gmwke.html
+      AC_DEFINE(HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION, 1,
+        [Define if the internal form of wchar_t in non-Unicode locales
+         is not Unicode.])
+    fi
+  fi
+  ;;
+esac
+
 # check for endianness
 AC_C_BIGENDIAN
 
diff --git a/pyconfig.h.in b/pyconfig.h.in
index 6e54d553b77..63438d857a0 100644
--- a/pyconfig.h.in
+++ b/pyconfig.h.in
@@ -748,6 +748,10 @@
 /* Define to 1 if you have the `nice' function. */
 #undef HAVE_NICE
 
+/* Define if the internal form of wchar_t in non-Unicode locales is not
+   Unicode. */
+#undef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+
 /* Define to 1 if you have the `openat' function. */
 #undef HAVE_OPENAT
 