From 081dfee4f154f4dfd11a3cf14516340f385049bd Mon Sep 17 00:00:00 2001 From: Mark Dickinson Date: Wed, 18 Mar 2009 14:47:41 +0000 Subject: [PATCH] Issue 4474: On platforms with sizeof(wchar_t) == 4 and sizeof(Py_UNICODE) == 2, PyUnicode_FromWideChar now converts each character outside the BMP to the appropriate surrogate pair. Thanks Victor Stinner for the patch. --- Misc/NEWS | 4 +++ Modules/_testcapimodule.c | 45 +++++++++++++++++++++++++++ Objects/unicodeobject.c | 64 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 113 insertions(+) diff --git a/Misc/NEWS b/Misc/NEWS index e289c18f1ce..90b6091b116 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -12,6 +12,10 @@ What's New in Python 3.1 alpha 2? Core and Builtins ----------------- +- Issue #4474: PyUnicode_FromWideChar now converts characters outside + the BMP to surrogate pairs, on systems with sizeof(wchar_t) == 4 + and sizeof(Py_UNICODE) == 2. + - Issue #5237: Allow auto-numbered fields in str.format(). For example: '{} {}'.format(1, 2) == '1 2'. 
diff --git a/Modules/_testcapimodule.c b/Modules/_testcapimodule.c
index 438d5a2a449..4ba489875eb 100644
--- a/Modules/_testcapimodule.c
+++ b/Modules/_testcapimodule.c
@@ -707,6 +707,50 @@ test_Z_code(PyObject *self)
     Py_RETURN_NONE;
 }
 
+static PyObject *
+test_widechar(PyObject *self)
+{
+    /* U+10ABCD as wchar_t units: a single unit when wchar_t is 4 bytes
+       wide, otherwise the surrogate pair DBEA DFCD. */
+#if defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
+    const wchar_t wtext[2] = {(wchar_t)0x10ABCDu};
+    size_t wtextlen = 1;
+#else
+    const wchar_t wtext[3] = {(wchar_t)0xDBEAu, (wchar_t)0xDFCDu};
+    size_t wtextlen = 2;
+#endif
+    PyObject *from_wide, *from_utf8;
+
+    from_wide = PyUnicode_FromWideChar(wtext, wtextlen);
+    if (from_wide == NULL)
+        return NULL;
+    /* Reference object: the same code point spelled as UTF-8 bytes. */
+    from_utf8 = PyUnicode_FromString("\xf4\x8a\xaf\x8d");
+    if (from_utf8 == NULL) {
+        Py_DECREF(from_wide);
+        return NULL;
+    }
+
+    if (PyUnicode_GET_SIZE(from_wide) == PyUnicode_GET_SIZE(from_utf8)) {
+        int differ = PyUnicode_Compare(from_wide, from_utf8);
+
+        Py_DECREF(from_wide);
+        Py_DECREF(from_utf8);
+        if (differ) {
+            /* An exception that is already set takes precedence. */
+            if (PyErr_Occurred())
+                return NULL;
+            return raiseTestError("test_widechar",
+                                  "wide string and utf8 string are different");
+        }
+        Py_RETURN_NONE;
+    }
+    Py_DECREF(from_wide);
+    Py_DECREF(from_utf8);
+    return raiseTestError("test_widechar",
+                          "wide string and utf8 string have different length");
+}
+
 static PyObject *
 test_empty_argparse(PyObject *self)
 {
@@ -1206,6 +1250,7 @@ static PyMethodDef TestMethods[] = {
     {"test_s_code", (PyCFunction)test_s_code, METH_NOARGS},
     {"test_u_code", (PyCFunction)test_u_code, METH_NOARGS},
     {"test_Z_code", (PyCFunction)test_Z_code, METH_NOARGS},
+    {"test_widechar", (PyCFunction)test_widechar, METH_NOARGS},
 #ifdef WITH_THREAD
     {"_test_thread_state", test_thread_state, METH_VARARGS},
     {"_pending_threadfunc", pending_threadfunc, METH_VARARGS},
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index e88c8c10e8c..03c65e34b6e 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -561,6 +561,66 @@ PyObject *PyUnicode_FromString(const char *u)
 
 #ifdef HAVE_WCHAR_H
 
+#if (Py_UNICODE_SIZE == 2) &&
defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
+# define CONVERT_WCHAR_TO_SURROGATES
+#endif
+
+#ifdef CONVERT_WCHAR_TO_SURROGATES
+
+/* sizeof(wchar_t) is 4 here while Py_UNICODE_SIZE is 2, so each wchar_t
+   above 0xFFFF has to be written out as a UTF-16 surrogate pair.
+   Returns the new object, or NULL when it cannot be created. */
+
+PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
+                                 Py_ssize_t size)
+{
+    PyUnicodeObject *result;
+    Py_UNICODE *dst;
+    Py_ssize_t units;   /* Py_UNICODE code units needed for the result */
+    Py_ssize_t pos;
+
+    if (w == NULL) {
+        /* A NULL buffer is only accepted together with size 0. */
+        if (size != 0) {
+            PyErr_BadInternalCall();
+            return NULL;
+        }
+        return PyUnicode_FromStringAndSize(NULL, 0);
+    }
+
+    /* size == -1 asks for the length to be taken from wcslen(). */
+    if (size == -1)
+        size = wcslen(w);
+
+    /* First pass: one unit per wchar_t, plus one more for every value
+       above 0xFFFF, which occupies two units once split. */
+    units = size;
+    for (pos = 0; pos < size; pos++) {
+        if (w[pos] > 0xFFFF)
+            units++;
+    }
+    result = _PyUnicode_New(units);
+    if (result == NULL)
+        return NULL;
+
+    /* Second pass: copy, splitting values above 0xFFFF as we go. */
+    dst = PyUnicode_AS_UNICODE(result);
+    for (pos = 0; pos < size; pos++) {
+        wchar_t ch = w[pos];
+        if (ch <= 0xFFFF) {
+            *dst++ = ch;
+            continue;
+        }
+        ch -= 0x10000;
+        *dst++ = 0xD800 | (ch >> 10);     /* high surrogate */
+        *dst++ = 0xDC00 | (ch & 0x3FF);   /* low surrogate */
+    }
+
+    return (PyObject *)result;
+}
+
+#else
+
 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
                                  Py_ssize_t size)
 {
@@ -597,6 +657,10 @@ PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
     return (PyObject *)unicode;
 }
 
+#endif /* CONVERT_WCHAR_TO_SURROGATES */
+
+#undef CONVERT_WCHAR_TO_SURROGATES
+
 static void
 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
 {