Move the codec decode type checks to bytes/bytearray.decode().
Use faster PyUnicode_FromEncodedObject() for bytes/bytearray.decode(). Add new PyCodec_KnownEncoding() API. Add new PyUnicode_AsDecodedUnicode() and PyUnicode_AsEncodedUnicode() APIs. Add missing PyUnicode_AsDecodedObject() to unicodeobject.h. Fix punycode codec to also work on memoryviews.
This commit is contained in:
parent
4efb518185
commit
b2750b5d33
|
@ -27,7 +27,7 @@ PyAPI_FUNC(int) PyCodec_Register(
|
||||||
PyObject *search_function
|
PyObject *search_function
|
||||||
);
|
);
|
||||||
|
|
||||||
/* Codec register lookup API.
|
/* Codec registry lookup API.
|
||||||
|
|
||||||
Looks up the given encoding and returns a CodecInfo object with
|
Looks up the given encoding and returns a CodecInfo object with
|
||||||
function attributes which implement the different aspects of
|
function attributes which implement the different aspects of
|
||||||
|
@ -49,6 +49,17 @@ PyAPI_FUNC(PyObject *) _PyCodec_Lookup(
|
||||||
const char *encoding
|
const char *encoding
|
||||||
);
|
);
|
||||||
|
|
||||||
|
/* Codec registry encoding check API.
|
||||||
|
|
||||||
|
Returns 1/0 depending on whether there is a registered codec for
|
||||||
|
the given encoding.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
PyAPI_FUNC(int) PyCodec_KnownEncoding(
|
||||||
|
const char *encoding
|
||||||
|
);
|
||||||
|
|
||||||
/* Generic codec based encoding API.
|
/* Generic codec based encoding API.
|
||||||
|
|
||||||
object is passed through the encoder function found for the given
|
object is passed through the encoder function found for the given
|
||||||
|
|
|
@ -139,8 +139,11 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
||||||
|
|
||||||
# define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString
|
# define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString
|
||||||
# define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString
|
# define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString
|
||||||
|
# define PyUnicode_AsDecodedObject PyUnicodeUCS2_AsDecodedObject
|
||||||
|
# define PyUnicode_AsDecodedUnicode PyUnicodeUCS2_AsDecodedUnicode
|
||||||
# define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject
|
# define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject
|
||||||
# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
|
# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
|
||||||
|
# define PyUnicode_AsEncodedUnicode PyUnicodeUCS2_AsEncodedUnicode
|
||||||
# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
|
# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
|
||||||
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
|
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
|
||||||
# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String
|
# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String
|
||||||
|
@ -233,8 +236,11 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
||||||
|
|
||||||
# define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString
|
# define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString
|
||||||
# define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString
|
# define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString
|
||||||
|
# define PyUnicode_AsDecodedObject PyUnicodeUCS4_AsDecodedObject
|
||||||
|
# define PyUnicode_AsDecodedUnicode PyUnicodeUCS4_AsDecodedUnicode
|
||||||
# define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject
|
# define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject
|
||||||
# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
|
# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
|
||||||
|
# define PyUnicode_AsEncodedUnicode PyUnicodeUCS4_AsEncodedUnicode
|
||||||
# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
|
# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
|
||||||
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
|
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
|
||||||
# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String
|
# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String
|
||||||
|
@ -744,6 +750,24 @@ PyAPI_FUNC(PyObject*) PyUnicode_Decode(
|
||||||
const char *errors /* error handling */
|
const char *errors /* error handling */
|
||||||
);
|
);
|
||||||
|
|
||||||
|
/* Decode a Unicode object unicode and return the result as Python
|
||||||
|
object. */
|
||||||
|
|
||||||
|
PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
|
||||||
|
PyObject *unicode, /* Unicode object */
|
||||||
|
const char *encoding, /* encoding */
|
||||||
|
const char *errors /* error handling */
|
||||||
|
);
|
||||||
|
|
||||||
|
/* Decode a Unicode object unicode and return the result as Unicode
|
||||||
|
object. */
|
||||||
|
|
||||||
|
PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
|
||||||
|
PyObject *unicode, /* Unicode object */
|
||||||
|
const char *encoding, /* encoding */
|
||||||
|
const char *errors /* error handling */
|
||||||
|
);
|
||||||
|
|
||||||
/* Encodes a Py_UNICODE buffer of the given size and returns a
|
/* Encodes a Py_UNICODE buffer of the given size and returns a
|
||||||
Python string object. */
|
Python string object. */
|
||||||
|
|
||||||
|
@ -772,11 +796,21 @@ PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
|
||||||
const char *errors /* error handling */
|
const char *errors /* error handling */
|
||||||
);
|
);
|
||||||
|
|
||||||
|
/* Encodes a Unicode object and returns the result as Unicode
|
||||||
|
object. */
|
||||||
|
|
||||||
|
PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
|
||||||
|
PyObject *unicode, /* Unicode object */
|
||||||
|
const char *encoding, /* encoding */
|
||||||
|
const char *errors /* error handling */
|
||||||
|
);
|
||||||
|
|
||||||
|
/* Build an encoding map. */
|
||||||
|
|
||||||
PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
|
PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
|
||||||
PyObject* string /* 256 character map */
|
PyObject* string /* 256 character map */
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
||||||
/* --- UTF-7 Codecs ------------------------------------------------------- */
|
/* --- UTF-7 Codecs ------------------------------------------------------- */
|
||||||
|
|
||||||
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
|
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
|
||||||
|
|
|
@ -183,6 +183,8 @@ def insertion_sort(base, extended, errors):
|
||||||
def punycode_decode(text, errors):
|
def punycode_decode(text, errors):
|
||||||
if isinstance(text, str):
|
if isinstance(text, str):
|
||||||
text = text.encode("ascii")
|
text = text.encode("ascii")
|
||||||
|
if isinstance(text, memoryview):
|
||||||
|
text = bytes(text)
|
||||||
pos = text.rfind(b"-")
|
pos = text.rfind(b"-")
|
||||||
if pos == -1:
|
if pos == -1:
|
||||||
base = ""
|
base = ""
|
||||||
|
|
|
@ -725,7 +725,7 @@ bytes_init(PyByteArrayObject *self, PyObject *args, PyObject *kwds)
|
||||||
"string argument without an encoding");
|
"string argument without an encoding");
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
encoded = PyCodec_Encode(arg, encoding, errors);
|
encoded = PyUnicode_AsEncodedString(arg, encoding, errors);
|
||||||
if (encoded == NULL)
|
if (encoded == NULL)
|
||||||
return -1;
|
return -1;
|
||||||
assert(PyBytes_Check(encoded));
|
assert(PyBytes_Check(encoded));
|
||||||
|
@ -2854,7 +2854,7 @@ bytes_decode(PyObject *self, PyObject *args)
|
||||||
return NULL;
|
return NULL;
|
||||||
if (encoding == NULL)
|
if (encoding == NULL)
|
||||||
encoding = PyUnicode_GetDefaultEncoding();
|
encoding = PyUnicode_GetDefaultEncoding();
|
||||||
return PyCodec_Decode(self, encoding, errors);
|
return PyUnicode_FromEncodedObject(self, encoding, errors);
|
||||||
}
|
}
|
||||||
|
|
||||||
PyDoc_STRVAR(alloc_doc,
|
PyDoc_STRVAR(alloc_doc,
|
||||||
|
|
|
@ -2713,7 +2713,7 @@ string_decode(PyObject *self, PyObject *args)
|
||||||
return NULL;
|
return NULL;
|
||||||
if (encoding == NULL)
|
if (encoding == NULL)
|
||||||
encoding = PyUnicode_GetDefaultEncoding();
|
encoding = PyUnicode_GetDefaultEncoding();
|
||||||
return PyCodec_Decode(self, encoding, errors);
|
return PyUnicode_FromEncodedObject(self, encoding, errors);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -2899,7 +2899,7 @@ string_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
|
||||||
"string argument without an encoding");
|
"string argument without an encoding");
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
new = PyCodec_Encode(x, encoding, errors);
|
new = PyUnicode_AsEncodedString(x, encoding, errors);
|
||||||
if (new == NULL)
|
if (new == NULL)
|
||||||
return NULL;
|
return NULL;
|
||||||
assert(PyBytes_Check(new));
|
assert(PyBytes_Check(new));
|
||||||
|
|
|
@ -1099,14 +1099,18 @@ PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
|
||||||
|
|
||||||
/* Coerce object */
|
/* Coerce object */
|
||||||
if (PyBytes_Check(obj)) {
|
if (PyBytes_Check(obj)) {
|
||||||
s = PyBytes_AS_STRING(obj);
|
s = PyBytes_AS_STRING(obj);
|
||||||
len = PyBytes_GET_SIZE(obj);
|
len = PyBytes_GET_SIZE(obj);
|
||||||
}
|
}
|
||||||
|
else if (PyByteArray_Check(obj)) {
|
||||||
|
s = PyByteArray_AS_STRING(obj);
|
||||||
|
len = PyByteArray_GET_SIZE(obj);
|
||||||
|
}
|
||||||
else if (PyObject_AsCharBuffer(obj, &s, &len)) {
|
else if (PyObject_AsCharBuffer(obj, &s, &len)) {
|
||||||
/* Overwrite the error message with something more useful in
|
/* Overwrite the error message with something more useful in
|
||||||
case of a TypeError. */
|
case of a TypeError. */
|
||||||
if (PyErr_ExceptionMatches(PyExc_TypeError))
|
if (PyErr_ExceptionMatches(PyExc_TypeError))
|
||||||
PyErr_Format(PyExc_TypeError,
|
PyErr_Format(PyExc_TypeError,
|
||||||
"coercing to Unicode: need string or buffer, "
|
"coercing to Unicode: need string or buffer, "
|
||||||
"%.80s found",
|
"%.80s found",
|
||||||
Py_TYPE(obj)->tp_name);
|
Py_TYPE(obj)->tp_name);
|
||||||
|
@ -1188,7 +1192,7 @@ PyObject *PyUnicode_Decode(const char *s,
|
||||||
goto onError;
|
goto onError;
|
||||||
if (!PyUnicode_Check(unicode)) {
|
if (!PyUnicode_Check(unicode)) {
|
||||||
PyErr_Format(PyExc_TypeError,
|
PyErr_Format(PyExc_TypeError,
|
||||||
"decoder did not return an unicode object (type=%.400s)",
|
"decoder did not return a unicode object (type=%.400s)",
|
||||||
Py_TYPE(unicode)->tp_name);
|
Py_TYPE(unicode)->tp_name);
|
||||||
Py_DECREF(unicode);
|
Py_DECREF(unicode);
|
||||||
goto onError;
|
goto onError;
|
||||||
|
@ -1225,6 +1229,37 @@ PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
|
||||||
|
const char *encoding,
|
||||||
|
const char *errors)
|
||||||
|
{
|
||||||
|
PyObject *v;
|
||||||
|
|
||||||
|
if (!PyUnicode_Check(unicode)) {
|
||||||
|
PyErr_BadArgument();
|
||||||
|
goto onError;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (encoding == NULL)
|
||||||
|
encoding = PyUnicode_GetDefaultEncoding();
|
||||||
|
|
||||||
|
/* Decode via the codec registry */
|
||||||
|
v = PyCodec_Decode(unicode, encoding, errors);
|
||||||
|
if (v == NULL)
|
||||||
|
goto onError;
|
||||||
|
if (!PyUnicode_Check(v)) {
|
||||||
|
PyErr_Format(PyExc_TypeError,
|
||||||
|
"decoder did not return a unicode object (type=%.400s)",
|
||||||
|
Py_TYPE(v)->tp_name);
|
||||||
|
Py_DECREF(v);
|
||||||
|
goto onError;
|
||||||
|
}
|
||||||
|
return v;
|
||||||
|
|
||||||
|
onError:
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
PyObject *PyUnicode_Encode(const Py_UNICODE *s,
|
PyObject *PyUnicode_Encode(const Py_UNICODE *s,
|
||||||
Py_ssize_t size,
|
Py_ssize_t size,
|
||||||
const char *encoding,
|
const char *encoding,
|
||||||
|
@ -1296,7 +1331,54 @@ PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
|
||||||
v = PyCodec_Encode(unicode, encoding, errors);
|
v = PyCodec_Encode(unicode, encoding, errors);
|
||||||
if (v == NULL)
|
if (v == NULL)
|
||||||
goto onError;
|
goto onError;
|
||||||
assert(PyBytes_Check(v));
|
if (PyByteArray_Check(v)) {
|
||||||
|
char msg[100];
|
||||||
|
PyOS_snprintf(msg, sizeof(msg),
|
||||||
|
"encoder %s returned buffer instead of bytes",
|
||||||
|
encoding);
|
||||||
|
if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
|
||||||
|
v = NULL;
|
||||||
|
goto onError;
|
||||||
|
}
|
||||||
|
v = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
|
||||||
|
}
|
||||||
|
else if (!PyBytes_Check(v)) {
|
||||||
|
PyErr_Format(PyExc_TypeError,
|
||||||
|
"encoder did not return a bytes object (type=%.400s)",
|
||||||
|
Py_TYPE(v)->tp_name);
|
||||||
|
v = NULL;
|
||||||
|
}
|
||||||
|
return v;
|
||||||
|
|
||||||
|
onError:
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
|
||||||
|
const char *encoding,
|
||||||
|
const char *errors)
|
||||||
|
{
|
||||||
|
PyObject *v;
|
||||||
|
|
||||||
|
if (!PyUnicode_Check(unicode)) {
|
||||||
|
PyErr_BadArgument();
|
||||||
|
goto onError;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (encoding == NULL)
|
||||||
|
encoding = PyUnicode_GetDefaultEncoding();
|
||||||
|
|
||||||
|
/* Encode via the codec registry */
|
||||||
|
v = PyCodec_Encode(unicode, encoding, errors);
|
||||||
|
if (v == NULL)
|
||||||
|
goto onError;
|
||||||
|
if (!PyUnicode_Check(v)) {
|
||||||
|
PyErr_Format(PyExc_TypeError,
|
||||||
|
"encoder did not return an unicode object (type=%.400s)",
|
||||||
|
Py_TYPE(v)->tp_name);
|
||||||
|
Py_DECREF(v);
|
||||||
|
goto onError;
|
||||||
|
}
|
||||||
return v;
|
return v;
|
||||||
|
|
||||||
onError:
|
onError:
|
||||||
|
@ -6617,7 +6699,7 @@ unicode_encode(PyUnicodeObject *self, PyObject *args)
|
||||||
|
|
||||||
if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
|
if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
|
||||||
return NULL;
|
return NULL;
|
||||||
v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
|
v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
|
||||||
if (v == NULL)
|
if (v == NULL)
|
||||||
goto onError;
|
goto onError;
|
||||||
if (!PyBytes_Check(v)) {
|
if (!PyBytes_Check(v)) {
|
||||||
|
|
|
@ -183,6 +183,23 @@ PyObject *_PyCodec_Lookup(const char *encoding)
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Codec registry encoding check API. */
|
||||||
|
|
||||||
|
int PyCodec_KnownEncoding(const char *encoding)
|
||||||
|
{
|
||||||
|
PyObject *codecs;
|
||||||
|
|
||||||
|
codecs = _PyCodec_Lookup(encoding);
|
||||||
|
if (!codecs) {
|
||||||
|
PyErr_Clear();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
Py_DECREF(codecs);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static
|
static
|
||||||
PyObject *args_tuple(PyObject *object,
|
PyObject *args_tuple(PyObject *object,
|
||||||
const char *errors)
|
const char *errors)
|
||||||
|
@ -344,32 +361,20 @@ PyObject *PyCodec_Encode(PyObject *object,
|
||||||
"encoder must return a tuple (object, integer)");
|
"encoder must return a tuple (object, integer)");
|
||||||
goto onError;
|
goto onError;
|
||||||
}
|
}
|
||||||
v = PyTuple_GET_ITEM(result, 0);
|
v = PyTuple_GET_ITEM(result,0);
|
||||||
if (PyByteArray_Check(v)) {
|
Py_INCREF(v);
|
||||||
char msg[100];
|
|
||||||
PyOS_snprintf(msg, sizeof(msg),
|
|
||||||
"encoder %s returned buffer instead of bytes",
|
|
||||||
encoding);
|
|
||||||
if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
|
|
||||||
v = NULL;
|
|
||||||
goto onError;
|
|
||||||
}
|
|
||||||
v = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
|
|
||||||
}
|
|
||||||
else if (PyBytes_Check(v))
|
|
||||||
Py_INCREF(v);
|
|
||||||
else {
|
|
||||||
PyErr_SetString(PyExc_TypeError,
|
|
||||||
"encoding must return a tuple(bytes, integer)");
|
|
||||||
v = NULL;
|
|
||||||
}
|
|
||||||
/* We don't check or use the second (integer) entry. */
|
/* We don't check or use the second (integer) entry. */
|
||||||
|
|
||||||
|
Py_DECREF(args);
|
||||||
|
Py_DECREF(encoder);
|
||||||
|
Py_DECREF(result);
|
||||||
|
return v;
|
||||||
|
|
||||||
onError:
|
onError:
|
||||||
Py_XDECREF(result);
|
Py_XDECREF(result);
|
||||||
Py_XDECREF(args);
|
Py_XDECREF(args);
|
||||||
Py_XDECREF(encoder);
|
Py_XDECREF(encoder);
|
||||||
return v;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Decode an object (usually a Python string) using the given encoding
|
/* Decode an object (usually a Python string) using the given encoding
|
||||||
|
|
|
@ -261,14 +261,10 @@ Py_InitializeEx(int install_sigs)
|
||||||
|
|
||||||
codeset = nl_langinfo(CODESET);
|
codeset = nl_langinfo(CODESET);
|
||||||
if (codeset && *codeset) {
|
if (codeset && *codeset) {
|
||||||
PyObject *enc = PyCodec_Encoder(codeset);
|
if (PyCodec_KnownEncoding(codeset))
|
||||||
if (enc) {
|
codeset = strdup(codeset);
|
||||||
codeset = strdup(codeset);
|
else
|
||||||
Py_DECREF(enc);
|
codeset = NULL;
|
||||||
} else {
|
|
||||||
codeset = NULL;
|
|
||||||
PyErr_Clear();
|
|
||||||
}
|
|
||||||
} else
|
} else
|
||||||
codeset = NULL;
|
codeset = NULL;
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue