From 27b1ca29ccf523e736a47c02f554de5374e241fc Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 3 Dec 2012 12:47:59 +0100 Subject: [PATCH] Issue #16416: On Mac OS X, operating system data are now always encoded/decoded to/from UTF-8/surrogateescape, instead of the locale encoding (which may be ASCII if no locale environment variable is set), to avoid inconsistencies with os.fsencode() and os.fsdecode() functions which are already using UTF-8/surrogateescape. --- Misc/NEWS | 6 +++++ Modules/python.c | 8 ------ Objects/unicodeobject.c | 9 ++++--- Python/fileutils.c | 60 ++++++++++++++++++++++++++++++++++++----- 4 files changed, 65 insertions(+), 18 deletions(-) diff --git a/Misc/NEWS b/Misc/NEWS index 9d8db7513db..fbcfe90c0b7 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -10,6 +10,12 @@ What's New in Python 3.2.4 Core and Builtins ----------------- +- Issue #16416: On Mac OS X, operating system data are now always + encoded/decoded to/from UTF-8/surrogateescape, instead of the locale encoding + (which may be ASCII if no locale environment variable is set), to avoid + inconsistencies with os.fsencode() and os.fsdecode() functions which are + already using UTF-8/surrogateescape. + - Issue #16588: Silence unused-but-set warnings in Python/thread_pthread.h - Issue #16306: Fix multiple error messages when unknown command line diff --git a/Modules/python.c b/Modules/python.c index cf9383f444e..2be69f1f545 100644 --- a/Modules/python.c +++ b/Modules/python.c @@ -15,10 +15,6 @@ wmain(int argc, wchar_t **argv) } #else -#ifdef __APPLE__ -extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size); -#endif - int main(int argc, char **argv) { @@ -45,11 +41,7 @@ main(int argc, char **argv) oldloc = strdup(setlocale(LC_ALL, NULL)); setlocale(LC_ALL, ""); for (i = 0; i < argc; i++) { -#ifdef __APPLE__ - argv_copy[i] = _Py_DecodeUTF8_surrogateescape(argv[i], strlen(argv[i])); -#else argv_copy[i] = _Py_char2wchar(argv[i], NULL); -#endif if (!argv_copy[i]) { fprintf(stderr, "Fatal Python error: " "unable to decode the command line argument #%i\n", diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 35b424e33ab..565d2982708 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2792,7 +2792,10 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s, #ifdef __APPLE__ /* Simplified UTF-8 decoder using surrogateescape error handler, - used to decode the command line arguments on Mac OS X. */ + used to decode the command line arguments on Mac OS X. + + Return a pointer to a newly allocated wide character string (use + PyMem_Free() to free the memory), or NULL on memory allocation error. */ wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) @@ -2803,10 +2806,8 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) /* Note: size will always be longer than the resulting Unicode character count */ - if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) { - PyErr_NoMemory(); + if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) return NULL; - } unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t)); if (!unicode) return NULL; diff --git a/Python/fileutils.c b/Python/fileutils.c index c563eaa5fbb..cba6696695c 100644 --- a/Python/fileutils.c +++ b/Python/fileutils.c @@ -3,6 +3,10 @@ # include #endif +#ifdef __APPLE__ +extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size); +#endif + #ifdef HAVE_STAT /* Decode a byte string from the locale encoding with the @@ -23,6 +27,17 @@ wchar_t* _Py_char2wchar(const char* arg, size_t *size) { +#ifdef __APPLE__ + wchar_t *wstr; + wstr = _Py_DecodeUTF8_surrogateescape(arg, strlen(arg)); + if (size != NULL) { + if (wstr != NULL) + *size = wcslen(wstr); + else + *size = (size_t)-1; + } + return wstr; +#else wchar_t *res; #ifdef HAVE_BROKEN_MBSTOWCS /* Some platforms have a broken implementation of @@ -107,7 +122,7 @@ _Py_char2wchar(const char* arg, size_t *size) argsize -= converted; out++; } -#else +#else /* HAVE_MBRTOWC */ /* Cannot use C locale for escaping; manually escape as if charset is ASCII (i.e. escape all bytes > 128. This will still roundtrip correctly in the locale's charset, which must be an ASCII superset. */ @@ -121,13 +136,14 @@ _Py_char2wchar(const char* arg, size_t *size) else *out++ = 0xdc00 + *in++; *out = 0; -#endif +#endif /* HAVE_MBRTOWC */ if (size != NULL) *size = out - res; return res; oom: fprintf(stderr, "out of memory\n"); return NULL; +#endif /* __APPLE__ */ } /* Encode a (wide) character string to the locale encoding with the @@ -144,14 +160,42 @@ oom: char* _Py_wchar2char(const wchar_t *text, size_t *error_pos) { +#ifdef __APPLE__ + Py_ssize_t len; + PyObject *unicode, *bytes = NULL; + char *cpath; + + unicode = PyUnicode_FromWideChar(text, wcslen(text)); + if (unicode == NULL) + return NULL; + + bytes = _PyUnicode_AsUTF8String(unicode, "surrogateescape"); + Py_DECREF(unicode); + if (bytes == NULL) { + PyErr_Clear(); + if (error_pos != NULL) + *error_pos = (size_t)-1; + return NULL; + } + + len = PyBytes_GET_SIZE(bytes); + cpath = PyMem_Malloc(len+1); + if (cpath == NULL) { + PyErr_Clear(); + Py_DECREF(bytes); + if (error_pos != NULL) + *error_pos = (size_t)-1; + return NULL; + } + memcpy(cpath, PyBytes_AsString(bytes), len + 1); + Py_DECREF(bytes); + return cpath; +#else /* __APPLE__ */ const size_t len = wcslen(text); char *result = NULL, *bytes = NULL; size_t i, size, converted; wchar_t c, buf[2]; - if (error_pos != NULL) - *error_pos = (size_t)-1; - /* The function works in two steps: 1. compute the length of the output buffer in bytes (size) 2. outputs the bytes */ @@ -198,11 +242,15 @@ _Py_wchar2char(const wchar_t *text, size_t *error_pos) size += 1; /* nul byte at the end */ result = PyMem_Malloc(size); - if (result == NULL) + if (result == NULL) { + if (error_pos != NULL) + *error_pos = (size_t)-1; return NULL; + } bytes = result; } return result; +#endif /* __APPLE__ */ } /* In principle, this should use HAVE__WSTAT, and _wstat