mirror of https://github.com/python/cpython
gh-119182: Add PyUnicodeWriter_DecodeUTF8Stateful() (#120639)
Add PyUnicodeWriter_WriteWideChar() and PyUnicodeWriter_DecodeUTF8Stateful() functions. Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
This commit is contained in:
parent
aed31beca9
commit
4123226bbd
|
@ -1551,9 +1551,17 @@ object.
|
||||||
On success, return ``0``.
|
On success, return ``0``.
|
||||||
On error, set an exception, leave the writer unchanged, and return ``-1``.
|
On error, set an exception, leave the writer unchanged, and return ``-1``.
|
||||||
|
|
||||||
To use a different error handler than ``strict``,
|
See also :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`.
|
||||||
:c:func:`PyUnicode_DecodeUTF8` can be used with
|
|
||||||
:c:func:`PyUnicodeWriter_WriteStr`.
|
.. c:function:: int PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer, const wchar_t *str, Py_ssize_t size)
|
||||||
|
|
||||||
|
Write the wide string *str* into *writer*.
|
||||||
|
|
||||||
|
*size* is the number of wide characters. If *size* is equal to ``-1``, call
|
||||||
|
``wcslen(str)`` to get the string length.
|
||||||
|
|
||||||
|
On success, return ``0``.
|
||||||
|
On error, set an exception, leave the writer unchanged, and return ``-1``.
|
||||||
|
|
||||||
.. c:function:: int PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj)
|
.. c:function:: int PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj)
|
||||||
|
|
||||||
|
@ -1586,3 +1594,24 @@ object.
|
||||||
|
|
||||||
On success, return ``0``.
|
On success, return ``0``.
|
||||||
On error, set an exception, leave the writer unchanged, and return ``-1``.
|
On error, set an exception, leave the writer unchanged, and return ``-1``.
|
||||||
|
|
||||||
|
.. c:function:: int PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer, const char *string, Py_ssize_t length, const char *errors, Py_ssize_t *consumed)
|
||||||
|
|
||||||
|
Decode the string *string* from UTF-8 with *errors* error handler and write the
|
||||||
|
output into *writer*.
|
||||||
|
|
||||||
|
*length* is the string length in bytes. If *length* is equal to ``-1``, call
|
||||||
|
``strlen(string)`` to get the string length.
|
||||||
|
|
||||||
|
*errors* is an error handler name, such as ``"replace"``. If *errors* is
|
||||||
|
``NULL``, use the strict error handler.
|
||||||
|
|
||||||
|
If *consumed* is not ``NULL``, set *\*consumed* to the number of decoded
|
||||||
|
bytes on success.
|
||||||
|
If *consumed* is ``NULL``, treat trailing incomplete UTF-8 byte sequences
|
||||||
|
as an error.
|
||||||
|
|
||||||
|
On success, return ``0``.
|
||||||
|
On error, set an exception, leave the writer unchanged, and return ``-1``.
|
||||||
|
|
||||||
|
See also :c:func:`PyUnicodeWriter_WriteUTF8`.
|
||||||
|
|
|
@ -291,10 +291,12 @@ New Features
|
||||||
* :c:func:`PyUnicodeWriter_Finish`.
|
* :c:func:`PyUnicodeWriter_Finish`.
|
||||||
* :c:func:`PyUnicodeWriter_WriteChar`.
|
* :c:func:`PyUnicodeWriter_WriteChar`.
|
||||||
* :c:func:`PyUnicodeWriter_WriteUTF8`.
|
* :c:func:`PyUnicodeWriter_WriteUTF8`.
|
||||||
|
* :c:func:`PyUnicodeWriter_WriteWideChar`.
|
||||||
* :c:func:`PyUnicodeWriter_WriteStr`.
|
* :c:func:`PyUnicodeWriter_WriteStr`.
|
||||||
* :c:func:`PyUnicodeWriter_WriteRepr`.
|
* :c:func:`PyUnicodeWriter_WriteRepr`.
|
||||||
* :c:func:`PyUnicodeWriter_WriteSubstring`.
|
* :c:func:`PyUnicodeWriter_WriteSubstring`.
|
||||||
* :c:func:`PyUnicodeWriter_Format`.
|
* :c:func:`PyUnicodeWriter_Format`.
|
||||||
|
* :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`.
|
||||||
|
|
||||||
(Contributed by Victor Stinner in :gh:`119182`.)
|
(Contributed by Victor Stinner in :gh:`119182`.)
|
||||||
|
|
||||||
|
|
|
@ -459,6 +459,10 @@ PyAPI_FUNC(int) PyUnicodeWriter_WriteUTF8(
|
||||||
PyUnicodeWriter *writer,
|
PyUnicodeWriter *writer,
|
||||||
const char *str,
|
const char *str,
|
||||||
Py_ssize_t size);
|
Py_ssize_t size);
|
||||||
|
/* Write the wide string str into the writer.
   size is the number of wide characters; if it is negative, wcslen(str)
   is used to compute the length.
   Return 0 on success, or -1 on error. */
PyAPI_FUNC(int) PyUnicodeWriter_WriteWideChar(
|
||||||
|
PyUnicodeWriter *writer,
|
||||||
|
const wchar_t *str,
|
||||||
|
Py_ssize_t size);
|
||||||
|
|
||||||
PyAPI_FUNC(int) PyUnicodeWriter_WriteStr(
|
PyAPI_FUNC(int) PyUnicodeWriter_WriteStr(
|
||||||
PyUnicodeWriter *writer,
|
PyUnicodeWriter *writer,
|
||||||
|
@ -475,6 +479,12 @@ PyAPI_FUNC(int) PyUnicodeWriter_Format(
|
||||||
PyUnicodeWriter *writer,
|
PyUnicodeWriter *writer,
|
||||||
const char *format,
|
const char *format,
|
||||||
...);
|
...);
|
||||||
|
/* Decode a UTF-8 byte string with the given error handler and append the
   decoded text to the writer.
   Return 0 on success, or -1 on error; on error the writer position is
   restored and *consumed (if given) is set to 0. */
PyAPI_FUNC(int) PyUnicodeWriter_DecodeUTF8Stateful(
|
||||||
|
PyUnicodeWriter *writer,
|
||||||
|
const char *string, /* UTF-8 encoded input bytes */
|
||||||
|
Py_ssize_t length, /* input size in bytes; negative means use strlen(string) */
|
||||||
|
const char *errors, /* error handler name, e.g. "replace" */
|
||||||
|
Py_ssize_t *consumed); /* out: number of bytes decoded; may be NULL */
|
||||||
|
|
||||||
|
|
||||||
/* --- Private _PyUnicodeWriter API --------------------------------------- */
|
/* --- Private _PyUnicodeWriter API --------------------------------------- */
|
||||||
|
|
|
@ -374,6 +374,119 @@ test_unicodewriter_recover_error(PyObject *self, PyObject *Py_UNUSED(args))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Test PyUnicodeWriter_DecodeUTF8Stateful() with consumed == NULL.
//
// Feeds three byte strings containing invalid or truncated UTF-8 through the
// "ignore" and "replace" error handlers, separated by '-', and checks the
// final string: "ignore" drops the bad byte, "replace" emits U+FFFD
// (UTF-8: EF BF BD), including for the trailing incomplete sequence.
static PyObject *
test_unicodewriter_decode_utf8(PyObject *self, PyObject *Py_UNUSED(args))
{
    PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
    if (writer == NULL) {
        return NULL;
    }

    // The writes run strictly in order; || stops at the first failing call.
    int failed =
        PyUnicodeWriter_DecodeUTF8Stateful(writer, "ign\xFFore", -1,
                                           "ignore", NULL) < 0
        || PyUnicodeWriter_WriteChar(writer, '-') < 0
        || PyUnicodeWriter_DecodeUTF8Stateful(writer, "replace\xFF", -1,
                                              "replace", NULL) < 0
        || PyUnicodeWriter_WriteChar(writer, '-') < 0
        // incomplete trailing UTF-8 sequence
        || PyUnicodeWriter_DecodeUTF8Stateful(writer, "incomplete\xC3", -1,
                                              "replace", NULL) < 0;
    if (failed) {
        // The writer is still owned here and must be released explicitly.
        PyUnicodeWriter_Discard(writer);
        return NULL;
    }

    PyObject *result = PyUnicodeWriter_Finish(writer);
    if (result == NULL) {
        return NULL;
    }
    assert(PyUnicode_EqualToUTF8(result,
                                 "ignore-replace\xef\xbf\xbd"
                                 "-incomplete\xef\xbf\xbd"));
    Py_DECREF(result);

    Py_RETURN_NONE;
}
|
||||||
|
|
||||||
|
|
||||||
|
// Test PyUnicodeWriter_DecodeUTF8Stateful() with a non-NULL consumed pointer.
//
// Before every call `consumed` is set to a sentinel (12345) so that each
// assertion proves the function actually wrote the output value. Covers:
// plain ASCII, multi-byte input with an explicit length, a strict-mode
// failure (consumed reset to 0), the "ignore" handler, and a trailing
// incomplete sequence (left unconsumed rather than reported as an error).
static PyObject *
test_unicodewriter_decode_utf8_consumed(PyObject *self, PyObject *Py_UNUSED(args))
{
    PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
    if (writer == NULL) {
        return NULL;
    }
    Py_ssize_t consumed;

    // valid string
    consumed = 12345;
    if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "text", -1, NULL, &consumed) < 0) {
        goto error;
    }
    assert(consumed == 4);
    if (PyUnicodeWriter_WriteChar(writer, '-') < 0) {
        goto error;
    }

    // non-ASCII
    consumed = 12345;
    if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "\xC3\xA9-\xE2\x82\xAC", 6, NULL, &consumed) < 0) {
        goto error;
    }
    assert(consumed == 6);
    if (PyUnicodeWriter_WriteChar(writer, '-') < 0) {
        goto error;
    }

    // consumed is 0 if write fails
    consumed = 12345;
    // Keep the call outside assert(): an expression inside assert() is not
    // evaluated when NDEBUG is defined, which would skip the call entirely.
    int res = PyUnicodeWriter_DecodeUTF8Stateful(writer, "invalid\xFF", -1, NULL, &consumed);
    assert(res < 0);
    (void)res;  // silence "unused variable" when asserts are compiled out
    PyErr_Clear();
    assert(consumed == 0);

    // ignore error handler
    consumed = 12345;
    if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "more\xFF", -1, "ignore", &consumed) < 0) {
        goto error;
    }
    assert(consumed == 5);
    if (PyUnicodeWriter_WriteChar(writer, '-') < 0) {
        goto error;
    }

    // incomplete trailing UTF-8 sequence: the lone lead byte is not consumed
    consumed = 12345;
    if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "incomplete\xC3", -1, "ignore", &consumed) < 0) {
        goto error;
    }
    assert(consumed == 10);

    PyObject *result = PyUnicodeWriter_Finish(writer);
    if (result == NULL) {
        return NULL;
    }
    // The failed strict-mode write must have left no trace in the output.
    assert(PyUnicode_EqualToUTF8(result,
                                 "text-\xC3\xA9-\xE2\x82\xAC-"
                                 "more-incomplete"));
    Py_DECREF(result);

    Py_RETURN_NONE;

error:
    PyUnicodeWriter_Discard(writer);
    return NULL;
}
|
||||||
|
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
test_unicodewriter_format(PyObject *self, PyObject *Py_UNUSED(args))
|
test_unicodewriter_format(PyObject *self, PyObject *Py_UNUSED(args))
|
||||||
{
|
{
|
||||||
|
@ -436,6 +549,42 @@ test_unicodewriter_format_recover_error(PyObject *self, PyObject *Py_UNUSED(args
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Test PyUnicodeWriter_WriteWideChar().
//
// Writes a wide string with an explicit length (the " IGNORED" tail lies
// past the 8 requested characters), a one-character wide string, and a
// wide string whose length is computed by passing -1, then checks the
// result against its UTF-8 encoding.
static PyObject *
test_unicodewriter_widechar(PyObject *self, PyObject *Py_UNUSED(args))
{
    PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
    if (writer == NULL) {
        return NULL;
    }

    // The writes run strictly in order; || stops at the first failing call.
    int failed =
        PyUnicodeWriter_WriteWideChar(writer, L"latin1=\xE9 IGNORED", 8) < 0
        || PyUnicodeWriter_WriteWideChar(writer, L"-", 1) < 0
        || PyUnicodeWriter_WriteWideChar(writer, L"euro=\u20AC", -1) < 0
        || PyUnicodeWriter_WriteChar(writer, '.') < 0;
    if (failed) {
        // The writer is still owned here and must be released explicitly.
        PyUnicodeWriter_Discard(writer);
        return NULL;
    }

    PyObject *result = PyUnicodeWriter_Finish(writer);
    if (result == NULL) {
        return NULL;
    }
    assert(PyUnicode_EqualToUTF8(result,
                                 "latin1=\xC3\xA9-euro=\xE2\x82\xAC."));
    Py_DECREF(result);

    Py_RETURN_NONE;
}
|
||||||
|
|
||||||
|
|
||||||
static PyMethodDef TestMethods[] = {
|
static PyMethodDef TestMethods[] = {
|
||||||
{"unicode_new", unicode_new, METH_VARARGS},
|
{"unicode_new", unicode_new, METH_VARARGS},
|
||||||
{"unicode_fill", unicode_fill, METH_VARARGS},
|
{"unicode_fill", unicode_fill, METH_VARARGS},
|
||||||
|
@ -448,8 +597,11 @@ static PyMethodDef TestMethods[] = {
|
||||||
{"test_unicodewriter_utf8", test_unicodewriter_utf8, METH_NOARGS},
|
{"test_unicodewriter_utf8", test_unicodewriter_utf8, METH_NOARGS},
|
||||||
{"test_unicodewriter_invalid_utf8", test_unicodewriter_invalid_utf8, METH_NOARGS},
|
{"test_unicodewriter_invalid_utf8", test_unicodewriter_invalid_utf8, METH_NOARGS},
|
||||||
{"test_unicodewriter_recover_error", test_unicodewriter_recover_error, METH_NOARGS},
|
{"test_unicodewriter_recover_error", test_unicodewriter_recover_error, METH_NOARGS},
|
||||||
|
{"test_unicodewriter_decode_utf8", test_unicodewriter_decode_utf8, METH_NOARGS},
|
||||||
|
{"test_unicodewriter_decode_utf8_consumed", test_unicodewriter_decode_utf8_consumed, METH_NOARGS},
|
||||||
{"test_unicodewriter_format", test_unicodewriter_format, METH_NOARGS},
|
{"test_unicodewriter_format", test_unicodewriter_format, METH_NOARGS},
|
||||||
{"test_unicodewriter_format_recover_error", test_unicodewriter_format_recover_error, METH_NOARGS},
|
{"test_unicodewriter_format_recover_error", test_unicodewriter_format_recover_error, METH_NOARGS},
|
||||||
|
{"test_unicodewriter_widechar", test_unicodewriter_widechar, METH_NOARGS},
|
||||||
{NULL},
|
{NULL},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -1374,46 +1374,6 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
|
||||||
return obj;
|
return obj;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if SIZEOF_WCHAR_T == 2
|
|
||||||
/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
|
|
||||||
will decode surrogate pairs, the other conversions are implemented as macros
|
|
||||||
for efficiency.
|
|
||||||
|
|
||||||
This function assumes that unicode can hold one more code point than wstr
|
|
||||||
characters for a terminating null character. */
|
|
||||||
static void
|
|
||||||
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
|
|
||||||
PyObject *unicode)
|
|
||||||
{
|
|
||||||
const wchar_t *iter;
|
|
||||||
Py_UCS4 *ucs4_out;
|
|
||||||
|
|
||||||
assert(unicode != NULL);
|
|
||||||
assert(_PyUnicode_CHECK(unicode));
|
|
||||||
assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
|
|
||||||
ucs4_out = PyUnicode_4BYTE_DATA(unicode);
|
|
||||||
|
|
||||||
for (iter = begin; iter < end; ) {
|
|
||||||
assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
|
|
||||||
_PyUnicode_GET_LENGTH(unicode)));
|
|
||||||
if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
|
|
||||||
&& (iter+1) < end
|
|
||||||
&& Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
|
|
||||||
{
|
|
||||||
*ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
|
|
||||||
iter += 2;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
*ucs4_out++ = *iter;
|
|
||||||
iter++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
|
|
||||||
_PyUnicode_GET_LENGTH(unicode)));
|
|
||||||
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static int
|
static int
|
||||||
unicode_check_modifiable(PyObject *unicode)
|
unicode_check_modifiable(PyObject *unicode)
|
||||||
{
|
{
|
||||||
|
@ -1937,6 +1897,62 @@ unicode_char(Py_UCS4 ch)
|
||||||
return unicode;
|
return unicode;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static inline void
|
||||||
|
unicode_write_widechar(int kind, void *data,
|
||||||
|
const wchar_t *u, Py_ssize_t size,
|
||||||
|
Py_ssize_t num_surrogates)
|
||||||
|
{
|
||||||
|
switch (kind) {
|
||||||
|
case PyUnicode_1BYTE_KIND:
|
||||||
|
_PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, u, u + size, data);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case PyUnicode_2BYTE_KIND:
|
||||||
|
#if SIZEOF_WCHAR_T == 2
|
||||||
|
memcpy(data, u, size * 2);
|
||||||
|
#else
|
||||||
|
_PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, u, u + size, data);
|
||||||
|
#endif
|
||||||
|
break;
|
||||||
|
|
||||||
|
case PyUnicode_4BYTE_KIND:
|
||||||
|
{
|
||||||
|
#if SIZEOF_WCHAR_T == 2
|
||||||
|
// Convert a 16-bits wchar_t representation to UCS4, this will decode
|
||||||
|
// surrogate pairs.
|
||||||
|
const wchar_t *end = u + size;
|
||||||
|
Py_UCS4 *ucs4_out = (Py_UCS4*)data;
|
||||||
|
# ifndef NDEBUG
|
||||||
|
Py_UCS4 *ucs4_end = (Py_UCS4*)data + (size - num_surrogates);
|
||||||
|
# endif
|
||||||
|
for (const wchar_t *iter = u; iter < end; ) {
|
||||||
|
assert(ucs4_out < ucs4_end);
|
||||||
|
if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
|
||||||
|
&& (iter+1) < end
|
||||||
|
&& Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
|
||||||
|
{
|
||||||
|
*ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
|
||||||
|
iter += 2;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
*ucs4_out++ = *iter;
|
||||||
|
iter++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assert(ucs4_out == ucs4_end);
|
||||||
|
#else
|
||||||
|
assert(num_surrogates == 0);
|
||||||
|
memcpy(data, u, size * 4);
|
||||||
|
#endif
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
Py_UNREACHABLE();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
PyObject *
|
PyObject *
|
||||||
PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
|
PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
|
||||||
{
|
{
|
||||||
|
@ -1989,36 +2005,65 @@ PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
|
||||||
if (!unicode)
|
if (!unicode)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
switch (PyUnicode_KIND(unicode)) {
|
unicode_write_widechar(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
|
||||||
case PyUnicode_1BYTE_KIND:
|
u, size, num_surrogates);
|
||||||
_PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
|
|
||||||
u, u + size, PyUnicode_1BYTE_DATA(unicode));
|
|
||||||
break;
|
|
||||||
case PyUnicode_2BYTE_KIND:
|
|
||||||
#if Py_UNICODE_SIZE == 2
|
|
||||||
memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
|
|
||||||
#else
|
|
||||||
_PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
|
|
||||||
u, u + size, PyUnicode_2BYTE_DATA(unicode));
|
|
||||||
#endif
|
|
||||||
break;
|
|
||||||
case PyUnicode_4BYTE_KIND:
|
|
||||||
#if SIZEOF_WCHAR_T == 2
|
|
||||||
/* This is the only case which has to process surrogates, thus
|
|
||||||
a simple copy loop is not enough and we need a function. */
|
|
||||||
unicode_convert_wchar_to_ucs4(u, u + size, unicode);
|
|
||||||
#else
|
|
||||||
assert(num_surrogates == 0);
|
|
||||||
memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
|
|
||||||
#endif
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
Py_UNREACHABLE();
|
|
||||||
}
|
|
||||||
|
|
||||||
return unicode_result(unicode);
|
return unicode_result(unicode);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer,
|
||||||
|
const wchar_t *str,
|
||||||
|
Py_ssize_t size)
|
||||||
|
{
|
||||||
|
_PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
|
||||||
|
|
||||||
|
if (size < 0) {
|
||||||
|
size = wcslen(str);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (size == 0) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
|
||||||
|
/* Oracle Solaris uses non-Unicode internal wchar_t form for
|
||||||
|
non-Unicode locales and hence needs conversion to UCS-4 first. */
|
||||||
|
if (_Py_LocaleUsesNonUnicodeWchar()) {
|
||||||
|
wchar_t* converted = _Py_DecodeNonUnicodeWchar(str, size);
|
||||||
|
if (!converted) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
|
||||||
|
PyMem_Free(converted);
|
||||||
|
|
||||||
|
int res = _PyUnicodeWriter_WriteStr(writer, unicode);
|
||||||
|
Py_DECREF(unicode);
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
Py_UCS4 maxchar = 0;
|
||||||
|
Py_ssize_t num_surrogates;
|
||||||
|
if (find_maxchar_surrogates(str, str + size,
|
||||||
|
&maxchar, &num_surrogates) == -1) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (_PyUnicodeWriter_Prepare(writer, size - num_surrogates, maxchar) < 0) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int kind = writer->kind;
|
||||||
|
void *data = (Py_UCS1*)writer->data + writer->pos * kind;
|
||||||
|
unicode_write_widechar(kind, data, str, size, num_surrogates);
|
||||||
|
|
||||||
|
writer->pos += size - num_surrogates;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
PyObject *
|
PyObject *
|
||||||
PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
|
PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
|
||||||
{
|
{
|
||||||
|
@ -13649,6 +13694,32 @@ PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer,
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer,
|
||||||
|
const char *string,
|
||||||
|
Py_ssize_t length,
|
||||||
|
const char *errors,
|
||||||
|
Py_ssize_t *consumed)
|
||||||
|
{
|
||||||
|
if (length < 0) {
|
||||||
|
length = strlen(string);
|
||||||
|
}
|
||||||
|
|
||||||
|
_PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
|
||||||
|
Py_ssize_t old_pos = _writer->pos;
|
||||||
|
int res = unicode_decode_utf8_writer(_writer, string, length,
|
||||||
|
_Py_ERROR_UNKNOWN, errors, consumed);
|
||||||
|
if (res < 0) {
|
||||||
|
_writer->pos = old_pos;
|
||||||
|
if (consumed) {
|
||||||
|
*consumed = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
int
|
int
|
||||||
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
|
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
|
||||||
const char *str, Py_ssize_t len)
|
const char *str, Py_ssize_t len)
|
||||||
|
|
Loading…
Reference in New Issue