Issue #4474: On platforms with sizeof(wchar_t) == 4 and
sizeof(Py_UNICODE) == 2, PyUnicode_FromWideChar now converts
each character outside the BMP to the appropriate surrogate pair.

Thanks Victor Stinner for the patch.

(backport of r70452 from py3k to trunk)
This commit is contained in:
Mark Dickinson 2009-03-18 16:07:26 +00:00
parent eb15863a97
commit 6b265f1bf8
3 changed files with 105 additions and 0 deletions

View File

@ -12,6 +12,10 @@ What's New in Python 2.7 alpha 1
Core and Builtins
-----------------
- Issue #4474: PyUnicode_FromWideChar now converts characters outside
the BMP to surrogate pairs, on systems with sizeof(wchar_t) == 4
and sizeof(Py_UNICODE) == 2.
- Issue #5237: Allow auto-numbered fields in str.format(). For
example: '{} {}'.format(1, 2) == '1 2'.

View File

@ -620,6 +620,48 @@ test_u_code(PyObject *self)
return Py_None;
}
/* Build the character U+10ABCD twice -- once from a wchar_t buffer with
   PyUnicode_FromWideChar() and once from its UTF-8 encoding with
   PyUnicode_FromString() -- and check that both objects have the same
   size and compare equal.

   The wchar_t input is a single 32-bit value when SIZEOF_WCHAR_T == 4,
   and the surrogate pair 0xDBEA 0xDFCD otherwise.

   Returns None on success.  On failure returns whatever raiseTestError()
   returns, or NULL if one of the unicode calls failed. */
static PyObject *
test_widechar(PyObject *self)
{
#if defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
    const wchar_t wtext[2] = {(wchar_t)0x10ABCDu};
    size_t wtextlen = 1;
#else
    const wchar_t wtext[3] = {(wchar_t)0xDBEAu, (wchar_t)0xDFCDu};
    size_t wtextlen = 2;
#endif
    PyObject *wide = NULL;
    PyObject *utf8 = NULL;
    PyObject *result = NULL;

    wide = PyUnicode_FromWideChar(wtext, wtextlen);
    if (wide == NULL)
        goto done;

    /* UTF-8 encoding of U+10ABCD. */
    utf8 = PyUnicode_FromString("\xf4\x8a\xaf\x8d");
    if (utf8 == NULL)
        goto done;

    if (PyUnicode_GET_SIZE(wide) != PyUnicode_GET_SIZE(utf8)) {
        result = raiseTestError("test_widechar",
                                "wide string and utf8 string have different length");
        goto done;
    }

    if (PyUnicode_Compare(wide, utf8) != 0) {
        /* A non-zero result is either a real mismatch or an error raised
           by the comparison itself; in the latter case keep that error
           and just return NULL. */
        if (!PyErr_Occurred())
            result = raiseTestError("test_widechar",
                                    "wide string and utf8 string are different");
        goto done;
    }

    Py_INCREF(Py_None);
    result = Py_None;

done:
    /* Single exit path: release whichever temporaries were created. */
    Py_XDECREF(wide);
    Py_XDECREF(utf8);
    return result;
}
static PyObject *
test_empty_argparse(PyObject *self)
{
@ -975,6 +1017,7 @@ static PyMethodDef TestMethods[] = {
#endif
#ifdef Py_USING_UNICODE
{"test_u_code", (PyCFunction)test_u_code, METH_NOARGS},
{"test_widechar", (PyCFunction)test_widechar, METH_NOARGS},
#endif
#ifdef WITH_THREAD
{"_test_thread_state", test_thread_state, METH_VARARGS},

View File

@ -529,6 +529,60 @@ PyObject *PyUnicode_FromString(const char *u)
#ifdef HAVE_WCHAR_H
/* CONVERT_WCHAR_TO_SURROGATES is defined only when Py_UNICODE is 2 bytes
   wide and wchar_t is known to be 4 bytes wide.  It selects which
   PyUnicode_FromWideChar() implementation below gets compiled, and is
   #undef'd again right after that function. */
#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
# define CONVERT_WCHAR_TO_SURROGATES
#endif
#ifdef CONVERT_WCHAR_TO_SURROGATES
/* Variant used when sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so the
   input has to be converted from UTF-32 to UTF-16.

   w    -- source buffer; if NULL, PyErr_BadInternalCall() is invoked and
           NULL is returned.
   size -- number of wchar_t elements read from w.

   Every element greater than 0xFFFF is stored as two Py_UNICODE units
   (a surrogate pair); every other element is stored as a single unit.

   Returns the new unicode object, or NULL if w is NULL or
   _PyUnicode_New() fails.

   NOTE(review): elements above 0x10FFFF are not rejected here; the first
   unit computed for them falls outside 0xD800-0xDBFF.  Confirm that
   callers never pass such values. */
PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
                                 Py_ssize_t size)
{
    PyUnicodeObject *result;
    Py_UNICODE *dst;
    Py_ssize_t idx;
    Py_ssize_t units;

    if (w == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    /* First pass: one output unit per input element, plus one extra for
       each element that needs a surrogate pair. */
    units = size;
    for (idx = 0; idx < size; idx++) {
        if (w[idx] > 0xFFFF)
            units++;
    }

    result = _PyUnicode_New(units);
    if (result == NULL)
        return NULL;

    /* Second pass: write the UTF-16 code units into the new object. */
    dst = PyUnicode_AS_UNICODE(result);
    for (idx = 0; idx < size; idx++) {
        wchar_t ch = w[idx];

        if (ch > 0xFFFF) {
            /* Split into a pair: upper bits go into the first unit,
               the low 10 bits into the second. */
            ch -= 0x10000;
            *dst++ = 0xD800 | (ch >> 10);
            *dst++ = 0xDC00 | (ch & 0x3FF);
        }
        else {
            *dst++ = ch;
        }
    }

    return (PyObject *)result;
}
#else
PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Py_ssize_t size)
{
@ -559,6 +613,10 @@ PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
return (PyObject *)unicode;
}
#endif /* CONVERT_WCHAR_TO_SURROGATES */
#undef CONVERT_WCHAR_TO_SURROGATES
static void
makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
{