Change PyUnicode_FromString[AndSize] to expect UTF-8.
This commit is contained in:
parent
64ce5052e1
commit
9c121069d3
|
@ -996,10 +996,11 @@ use these APIs:
|
|||
\var{u} is \NULL{}.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_FromString}{const char *u}
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_FromStringAndSize}{const char *u,
|
||||
Py_ssize_t size}
|
||||
Create a Unicode Object from the char buffer \var{u}.
|
||||
\var{u} must be 0-terminated, the bytes will be interpreted as
|
||||
being latin-1 encoded. \var{u} may also be \NULL{} which causes the
|
||||
The bytes will be interpreted as being UTF-8 encoded.
|
||||
\var{u} may also be \NULL{} which causes the
|
||||
contents to be undefined. It is the user's responsibility to fill
|
||||
in the needed data. The buffer is copied into the new object.
|
||||
If the buffer is not \NULL{}, the return value might be a shared object.
|
||||
|
@ -1008,6 +1009,12 @@ use these APIs:
|
|||
\versionadded{3.0}
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_FromString}{const char *u}
|
||||
Create a Unicode object from a UTF-8 encoded null-terminated
|
||||
char buffer \var{u}.
|
||||
\versionadded{3.0}
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_FromFormat}{const char *format, ...}
|
||||
Take a C \cfunction{printf()}-style \var{format} string and a
|
||||
variable number of arguments, calculate the size of the resulting
|
||||
|
|
|
@ -2724,11 +2724,13 @@ PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
|
|||
static PyObject *
|
||||
bytes_reduce(PyBytesObject *self)
|
||||
{
|
||||
return Py_BuildValue("(O(s#s))",
|
||||
Py_Type(self),
|
||||
self->ob_bytes == NULL ? "" : self->ob_bytes,
|
||||
Py_Size(self),
|
||||
"latin-1");
|
||||
PyObject *latin1;
|
||||
if (self->ob_bytes)
|
||||
latin1 = PyUnicode_DecodeLatin1(self->ob_bytes,
|
||||
Py_Size(self), NULL);
|
||||
else
|
||||
latin1 = PyUnicode_FromString("");
|
||||
return Py_BuildValue("(O(Ns))", Py_Type(self), latin1, "latin-1");
|
||||
}
|
||||
|
||||
static PySequenceMethods bytes_as_sequence = {
|
||||
|
|
|
@ -427,7 +427,9 @@ PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
|
|||
{
|
||||
PyUnicodeObject *unicode;
|
||||
/* If the Unicode data is known at construction time, we can apply
|
||||
some optimizations which share commonly used objects. */
|
||||
some optimizations which share commonly used objects.
|
||||
Also, this means the input must be UTF-8, so fall back to the
|
||||
UTF-8 decoder at the end. */
|
||||
if (u != NULL) {
|
||||
|
||||
/* Optimization for empty strings */
|
||||
|
@ -436,8 +438,9 @@ PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
|
|||
return (PyObject *)unicode_empty;
|
||||
}
|
||||
|
||||
/* Single characters are shared when using this constructor */
|
||||
if (size == 1) {
|
||||
/* Single characters are shared when using this constructor.
|
||||
Restrict to ASCII, since the input must be UTF-8. */
|
||||
if (size == 1 && Py_CHARMASK(*u) < 128) {
|
||||
unicode = unicode_latin1[Py_CHARMASK(*u)];
|
||||
if (!unicode) {
|
||||
unicode = _PyUnicode_New(1);
|
||||
|
@ -449,21 +452,14 @@ PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
|
|||
Py_INCREF(unicode);
|
||||
return (PyObject *)unicode;
|
||||
}
|
||||
|
||||
return PyUnicode_DecodeUTF8(u, size, NULL);
|
||||
}
|
||||
|
||||
unicode = _PyUnicode_New(size);
|
||||
if (!unicode)
|
||||
return NULL;
|
||||
|
||||
/* Copy the Unicode data into the new object */
|
||||
if (u != NULL) {
|
||||
Py_UNICODE *p = unicode->str;
|
||||
while (size--)
|
||||
*p++ = Py_CHARMASK(*u++);
|
||||
/* Don't need to write trailing 0 because
|
||||
that's already done by _PyUnicode_New */
|
||||
}
|
||||
|
||||
return (PyObject *)unicode;
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue