Issue #4474: On platforms with sizeof(wchar_t) == 4 and sizeof(Py_UNICODE) == 2,
PyUnicode_FromWideChar now converts each character outside the BMP to the
appropriate surrogate pair. Thanks to Victor Stinner for the patch.
2009-03-18 14:47:41 +00:00 · 2009-03-18 14:47:41 +00:00 · 081dfee4f1
parent ecdfd513a2
commit 081dfee4f1
3 changed files with 113 additions and 0 deletions
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -12,6 +12,10 @@ What's New in Python 3.1 alpha 2?
 Core and Builtins
 -----------------
 - Issue #4474: PyUnicode_FromWideChar now converts characters outside
  the BMP to surrogate pairs, on systems with sizeof(wchar_t) == 4
  and sizeof(Py_UNICODE) == 2.
 - Issue #5237: Allow auto-numbered fields in str.format(). For
  example: '{} {}'.format(1, 2) == '1 2'.
--- a/Modules/_testcapimodule.c
+++ b/Modules/_testcapimodule.c
@ -707,6 +707,50 @@ test_Z_code(PyObject *self)
 	Py_RETURN_NONE;
 }
 /* Check that PyUnicode_FromWideChar() and PyUnicode_FromString() produce
    equal strings for the single code point U+10ABCD.

    The wide input is one 32-bit unit when SIZEOF_WCHAR_T == 4, otherwise
    the surrogate pair 0xDBEA 0xDFCD; the byte input is the UTF-8 encoding
    of the same code point.

    Returns None on success.  Returns NULL if either string could not be
    created, if the comparison left an exception pending, or (through
    raiseTestError) if the two strings differ in size or content. */
 static PyObject *
 test_widechar(PyObject *self)
 {
 #if defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
 	const wchar_t wtext[2] = {(wchar_t)0x10ABCDu};
 	size_t wtextlen = 1;
 #else
 	const wchar_t wtext[3] = {(wchar_t)0xDBEAu, (wchar_t)0xDFCDu};
 	size_t wtextlen = 2;
 #endif
 	PyObject *from_wide;
 	PyObject *from_utf8;
 	const char *failure = NULL;
 	/* Only a failed comparison may leave an exception behind. */
 	int compare_failed = 0;

 	from_wide = PyUnicode_FromWideChar(wtext, wtextlen);
 	if (from_wide == NULL)
 		return NULL;

 	from_utf8 = PyUnicode_FromString("\xf4\x8a\xaf\x8d");
 	if (from_utf8 == NULL) {
 		Py_DECREF(from_wide);
 		return NULL;
 	}

 	if (PyUnicode_GET_SIZE(from_wide) != PyUnicode_GET_SIZE(from_utf8)) {
 		failure = "wide string and utf8 string "
 			  "have different length";
 	}
 	else if (PyUnicode_Compare(from_wide, from_utf8)) {
 		compare_failed = 1;
 		failure = "wide string and utf8 string "
 			  "are different";
 	}

 	/* Both temporaries are released before any result is reported. */
 	Py_DECREF(from_wide);
 	Py_DECREF(from_utf8);

 	if (failure == NULL)
 		Py_RETURN_NONE;
 	if (compare_failed && PyErr_Occurred())
 		return NULL;
 	return raiseTestError("test_widechar", failure);
 }
 static PyObject *
 test_empty_argparse(PyObject *self)
 {
@ -1206,6 +1250,7 @@ static PyMethodDef TestMethods[] = {
 	{"test_s_code",		(PyCFunction)test_s_code,	 METH_NOARGS},
 	{"test_u_code",		(PyCFunction)test_u_code,	 METH_NOARGS},
 	{"test_Z_code",		(PyCFunction)test_Z_code,	 METH_NOARGS},
 	{"test_widechar",	(PyCFunction)test_widechar,	 METH_NOARGS},
 #ifdef WITH_THREAD
 	{"_test_thread_state",  test_thread_state, 		 METH_VARARGS},
 	{"_pending_threadfunc",	pending_threadfunc,		 METH_VARARGS},
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -561,6 +561,66 @@ PyObject *PyUnicode_FromString(const char *u)
 #ifdef HAVE_WCHAR_H
 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
 # define CONVERT_WCHAR_TO_SURROGATES
 #endif
 #ifdef CONVERT_WCHAR_TO_SURROGATES
 /* Variant used when sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2: the
    input is read as UTF-32 and stored as UTF-16.

    w    - wide character buffer; may be NULL only when size is 0, in
           which case the result of PyUnicode_FromStringAndSize(NULL, 0)
           is returned.
    size - number of wchar_t units to convert, or -1 to use wcslen(w).

    Every unit greater than 0xFFFF is written as a surrogate pair, so the
    result may hold more Py_UNICODE units than `size`.  Returns a new
    unicode object, or NULL if w is NULL with a non-zero size (after
    PyErr_BadInternalCall) or if _PyUnicode_New fails.

    NOTE(review): units above 0x10FFFF are not range-checked here; confirm
    whether callers can ever pass such values. */
 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
                                  Py_ssize_t size)
 {
    PyUnicodeObject *result;
    Py_UNICODE *out;
    Py_ssize_t needed;
    Py_ssize_t k;

    if (w == NULL) {
        if (size != 0) {
            PyErr_BadInternalCall();
            return NULL;
        }
        return PyUnicode_FromStringAndSize(NULL, 0);
    }

    if (size == -1)
        size = wcslen(w);

    /* First pass: one output unit per input unit, plus one extra for
       each unit that has to be split into a surrogate pair. */
    needed = size;
    for (k = 0; k < size; k++) {
        if (w[k] > 0xFFFF)
            needed++;
    }

    result = _PyUnicode_New(needed);
    if (result == NULL)
        return NULL;

    /* Second pass: copy the wchar_t data, splitting where required. */
    out = PyUnicode_AS_UNICODE(result);
    for (k = 0; k < size; k++) {
        wchar_t ch = w[k];

        if (ch > 0xFFFF) {
            /* High surrogate takes the top bits of the offset from
               0x10000, low surrogate the bottom ten. */
            ch -= 0x10000;
            *out++ = 0xD800 | (ch >> 10);
            *out++ = 0xDC00 | (ch & 0x3FF);
        }
        else {
            *out++ = ch;
        }
    }

    return (PyObject *)result;
 }
 #else
 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
                                 Py_ssize_t size)
 {
@ -597,6 +657,10 @@ PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
    return (PyObject *)unicode;
 }
 #endif /* CONVERT_WCHAR_TO_SURROGATES */
 #undef CONVERT_WCHAR_TO_SURROGATES
 static void
 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
 {