Issue 4474: On platforms with sizeof(wchar_t) == 4 and
sizeof(Py_UNICODE) == 2, PyUnicode_FromWideChar now converts each character outside the BMP to the appropriate surrogate pair. Thanks Victor Stinner for the patch.
This commit is contained in:
parent
ecdfd513a2
commit
081dfee4f1
|
@ -12,6 +12,10 @@ What's New in Python 3.1 alpha 2?
|
||||||
Core and Builtins
|
Core and Builtins
|
||||||
-----------------
|
-----------------
|
||||||
|
|
||||||
|
- Issue #4474: PyUnicode_FromWideChar now converts characters outside
|
||||||
|
the BMP to surrogate pairs, on systems with sizeof(wchar_t) == 4
|
||||||
|
and sizeof(Py_UNICODE) == 2.
|
||||||
|
|
||||||
- Issue #5237: Allow auto-numbered fields in str.format(). For
|
- Issue #5237: Allow auto-numbered fields in str.format(). For
|
||||||
example: '{} {}'.format(1, 2) == '1 2'.
|
example: '{} {}'.format(1, 2) == '1 2'.
|
||||||
|
|
||||||
|
|
|
@ -707,6 +707,50 @@ test_Z_code(PyObject *self)
|
||||||
Py_RETURN_NONE;
|
Py_RETURN_NONE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Check that PyUnicode_FromWideChar() and PyUnicode_FromString() agree on
   the non-BMP character U+10ABCD.

   The wide input is the single code point 0x10ABCD when SIZEOF_WCHAR_T is 4,
   and the surrogate pair 0xDBEA 0xDFCD otherwise.  The reference string is
   built from the UTF-8 bytes F4 8A AF 8D.

   Returns None when both strings have the same size and compare equal.
   Returns NULL when an object could not be created or the comparison left
   an exception set; otherwise returns whatever raiseTestError() yields for
   the mismatch. */
static PyObject *
test_widechar(PyObject *self)
{
#if defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
    const wchar_t wtext[2] = {(wchar_t)0x10ABCDu};
    size_t wtextlen = 1;
#else
    const wchar_t wtext[3] = {(wchar_t)0xDBEAu, (wchar_t)0xDFCDu};
    size_t wtextlen = 2;
#endif
    PyObject *from_wchar;
    PyObject *from_utf8;
    int same_size;
    int differ = 0;

    from_wchar = PyUnicode_FromWideChar(wtext, wtextlen);
    if (!from_wchar)
        return NULL;

    from_utf8 = PyUnicode_FromString("\xf4\x8a\xaf\x8d");
    if (!from_utf8) {
        Py_DECREF(from_wchar);
        return NULL;
    }

    /* Only compare contents when the sizes already match. */
    same_size = (PyUnicode_GET_SIZE(from_wchar) ==
                 PyUnicode_GET_SIZE(from_utf8));
    if (same_size)
        differ = PyUnicode_Compare(from_wchar, from_utf8);

    /* Both objects are no longer needed, whatever the outcome. */
    Py_DECREF(from_wchar);
    Py_DECREF(from_utf8);

    if (!same_size)
        return raiseTestError("test_widechar",
                              "wide string and utf8 string "
                              "have different length");

    if (differ) {
        /* A non-zero result may also mean the comparison itself failed. */
        if (PyErr_Occurred())
            return NULL;
        return raiseTestError("test_widechar",
                              "wide string and utf8 string "
                              "are different");
    }

    Py_RETURN_NONE;
}
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
test_empty_argparse(PyObject *self)
|
test_empty_argparse(PyObject *self)
|
||||||
{
|
{
|
||||||
|
@ -1206,6 +1250,7 @@ static PyMethodDef TestMethods[] = {
|
||||||
{"test_s_code", (PyCFunction)test_s_code, METH_NOARGS},
|
{"test_s_code", (PyCFunction)test_s_code, METH_NOARGS},
|
||||||
{"test_u_code", (PyCFunction)test_u_code, METH_NOARGS},
|
{"test_u_code", (PyCFunction)test_u_code, METH_NOARGS},
|
||||||
{"test_Z_code", (PyCFunction)test_Z_code, METH_NOARGS},
|
{"test_Z_code", (PyCFunction)test_Z_code, METH_NOARGS},
|
||||||
|
{"test_widechar", (PyCFunction)test_widechar, METH_NOARGS},
|
||||||
#ifdef WITH_THREAD
|
#ifdef WITH_THREAD
|
||||||
{"_test_thread_state", test_thread_state, METH_VARARGS},
|
{"_test_thread_state", test_thread_state, METH_VARARGS},
|
||||||
{"_pending_threadfunc", pending_threadfunc, METH_VARARGS},
|
{"_pending_threadfunc", pending_threadfunc, METH_VARARGS},
|
||||||
|
|
|
@ -561,6 +561,66 @@ PyObject *PyUnicode_FromString(const char *u)
|
||||||
|
|
||||||
#ifdef HAVE_WCHAR_H
|
#ifdef HAVE_WCHAR_H
|
||||||
|
|
||||||
|
#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
|
||||||
|
# define CONVERT_WCHAR_TO_SURROGATES
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef CONVERT_WCHAR_TO_SURROGATES
|
||||||
|
|
||||||
|
/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
|
||||||
|
to convert from UTF32 to UTF16. */
|
||||||
|
|
||||||
|
PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
|
||||||
|
Py_ssize_t size)
|
||||||
|
{
|
||||||
|
PyUnicodeObject *unicode;
|
||||||
|
register Py_ssize_t i;
|
||||||
|
Py_ssize_t alloc;
|
||||||
|
const wchar_t *orig_w;
|
||||||
|
|
||||||
|
if (w == NULL) {
|
||||||
|
if (size == 0)
|
||||||
|
return PyUnicode_FromStringAndSize(NULL, 0);
|
||||||
|
PyErr_BadInternalCall();
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (size == -1) {
|
||||||
|
size = wcslen(w);
|
||||||
|
}
|
||||||
|
|
||||||
|
alloc = size;
|
||||||
|
orig_w = w;
|
||||||
|
for (i = size; i > 0; i--) {
|
||||||
|
if (*w > 0xFFFF)
|
||||||
|
alloc++;
|
||||||
|
w++;
|
||||||
|
}
|
||||||
|
w = orig_w;
|
||||||
|
unicode = _PyUnicode_New(alloc);
|
||||||
|
if (!unicode)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
/* Copy the wchar_t data into the new object */
|
||||||
|
{
|
||||||
|
register Py_UNICODE *u;
|
||||||
|
u = PyUnicode_AS_UNICODE(unicode);
|
||||||
|
for (i = size; i > 0; i--) {
|
||||||
|
if (*w > 0xFFFF) {
|
||||||
|
wchar_t ordinal = *w++;
|
||||||
|
ordinal -= 0x10000;
|
||||||
|
*u++ = 0xD800 | (ordinal >> 10);
|
||||||
|
*u++ = 0xDC00 | (ordinal & 0x3FF);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
*u++ = *w++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return (PyObject *)unicode;
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
|
PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
|
||||||
Py_ssize_t size)
|
Py_ssize_t size)
|
||||||
{
|
{
|
||||||
|
@ -597,6 +657,10 @@ PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
|
||||||
return (PyObject *)unicode;
|
return (PyObject *)unicode;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif /* CONVERT_WCHAR_TO_SURROGATES */
|
||||||
|
|
||||||
|
#undef CONVERT_WCHAR_TO_SURROGATES
|
||||||
|
|
||||||
static void
|
static void
|
||||||
makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
|
makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
|
||||||
{
|
{
|
||||||
|
|
Loading…
Reference in New Issue