Move the codec decode type checks to bytes/bytearray.decode().

Use faster PyUnicode_FromEncodedObject() for bytes/bytearray.decode().

Add new PyCodec_KnownEncoding() API.

Add new PyUnicode_AsDecodedUnicode() and PyUnicode_AsEncodedUnicode() APIs.

Add the missing PyUnicode_AsDecodedObject() declaration to unicodeobject.h.

Fix punycode codec to also work on memoryviews.
This commit is contained in:
Marc-André Lemburg 2008-06-06 12:18:17 +00:00
parent 4efb518185
commit b2750b5d33
8 changed files with 171 additions and 41 deletions

View File

@ -27,7 +27,7 @@ PyAPI_FUNC(int) PyCodec_Register(
PyObject *search_function PyObject *search_function
); );
/* Codec register lookup API. /* Codec registry lookup API.
Looks up the given encoding and returns a CodecInfo object with Looks up the given encoding and returns a CodecInfo object with
function attributes which implement the different aspects of function attributes which implement the different aspects of
@ -49,6 +49,17 @@ PyAPI_FUNC(PyObject *) _PyCodec_Lookup(
const char *encoding const char *encoding
); );
/* Codec registry encoding check API.
Returns 1 if a codec is registered for the given encoding and 0
otherwise.
The check is done by looking the encoding up in the codec registry;
any error raised by a failed lookup is cleared, so a 0 result never
leaves an exception set by this function.
*/
PyAPI_FUNC(int) PyCodec_KnownEncoding(
const char *encoding
);
/* Generic codec based encoding API. /* Generic codec based encoding API.
object is passed through the encoder function found for the given object is passed through the encoder function found for the given

View File

@ -139,8 +139,11 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString # define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString
# define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString # define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString
# define PyUnicode_AsDecodedObject PyUnicodeUCS2_AsDecodedObject
# define PyUnicode_AsDecodedUnicode PyUnicodeUCS2_AsDecodedUnicode
# define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject # define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject
# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString # define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
# define PyUnicode_AsEncodedUnicode PyUnicodeUCS2_AsEncodedUnicode
# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String # define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String # define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String
@ -233,8 +236,11 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString # define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString
# define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString # define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString
# define PyUnicode_AsDecodedObject PyUnicodeUCS4_AsDecodedObject
# define PyUnicode_AsDecodedUnicode PyUnicodeUCS4_AsDecodedUnicode
# define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject # define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject
# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString # define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
# define PyUnicode_AsEncodedUnicode PyUnicodeUCS4_AsEncodedUnicode
# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String # define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String # define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String
@ -744,6 +750,24 @@ PyAPI_FUNC(PyObject*) PyUnicode_Decode(
const char *errors /* error handling */ const char *errors /* error handling */
); );
/* Decode the Unicode object passed as `unicode` and return the result
as a Python object. */
PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
PyObject *unicode, /* Unicode object */
const char *encoding, /* encoding */
const char *errors /* error handling */
);
/* Decode the Unicode object passed as `unicode` via the codec registry
and return the result as a Unicode object.
If encoding is NULL the default encoding is used.
Returns NULL on failure: when `unicode` is not a Unicode object, when
the codec call fails, or (with a TypeError) when the decoder returns
something other than a Unicode object. */
PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
PyObject *unicode, /* Unicode object */
const char *encoding, /* encoding */
const char *errors /* error handling */
);
/* Encodes a Py_UNICODE buffer of the given size and returns a /* Encodes a Py_UNICODE buffer of the given size and returns a
Python string object. */ Python string object. */
@ -772,11 +796,21 @@ PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
const char *errors /* error handling */ const char *errors /* error handling */
); );
/* Encodes a Unicode object via the codec registry and returns the
result as a Unicode object.
If encoding is NULL the default encoding is used.
A non-Unicode `unicode` argument is rejected, and a TypeError is raised
when the encoder returns something other than a Unicode object. */
PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
PyObject *unicode, /* Unicode object */
const char *encoding, /* encoding */
const char *errors /* error handling */
);
/* Build an encoding map. */
PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap( PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
PyObject* string /* 256 character map */ PyObject* string /* 256 character map */
); );
/* --- UTF-7 Codecs ------------------------------------------------------- */ /* --- UTF-7 Codecs ------------------------------------------------------- */
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7( PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(

View File

@ -183,6 +183,8 @@ def insertion_sort(base, extended, errors):
def punycode_decode(text, errors): def punycode_decode(text, errors):
if isinstance(text, str): if isinstance(text, str):
text = text.encode("ascii") text = text.encode("ascii")
if isinstance(text, memoryview):
text = bytes(text)
pos = text.rfind(b"-") pos = text.rfind(b"-")
if pos == -1: if pos == -1:
base = "" base = ""

View File

@ -725,7 +725,7 @@ bytes_init(PyByteArrayObject *self, PyObject *args, PyObject *kwds)
"string argument without an encoding"); "string argument without an encoding");
return -1; return -1;
} }
encoded = PyCodec_Encode(arg, encoding, errors); encoded = PyUnicode_AsEncodedString(arg, encoding, errors);
if (encoded == NULL) if (encoded == NULL)
return -1; return -1;
assert(PyBytes_Check(encoded)); assert(PyBytes_Check(encoded));
@ -2854,7 +2854,7 @@ bytes_decode(PyObject *self, PyObject *args)
return NULL; return NULL;
if (encoding == NULL) if (encoding == NULL)
encoding = PyUnicode_GetDefaultEncoding(); encoding = PyUnicode_GetDefaultEncoding();
return PyCodec_Decode(self, encoding, errors); return PyUnicode_FromEncodedObject(self, encoding, errors);
} }
PyDoc_STRVAR(alloc_doc, PyDoc_STRVAR(alloc_doc,

View File

@ -2713,7 +2713,7 @@ string_decode(PyObject *self, PyObject *args)
return NULL; return NULL;
if (encoding == NULL) if (encoding == NULL)
encoding = PyUnicode_GetDefaultEncoding(); encoding = PyUnicode_GetDefaultEncoding();
return PyCodec_Decode(self, encoding, errors); return PyUnicode_FromEncodedObject(self, encoding, errors);
} }
@ -2899,7 +2899,7 @@ string_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
"string argument without an encoding"); "string argument without an encoding");
return NULL; return NULL;
} }
new = PyCodec_Encode(x, encoding, errors); new = PyUnicode_AsEncodedString(x, encoding, errors);
if (new == NULL) if (new == NULL)
return NULL; return NULL;
assert(PyBytes_Check(new)); assert(PyBytes_Check(new));

View File

@ -1099,14 +1099,18 @@ PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
/* Coerce object */ /* Coerce object */
if (PyBytes_Check(obj)) { if (PyBytes_Check(obj)) {
s = PyBytes_AS_STRING(obj); s = PyBytes_AS_STRING(obj);
len = PyBytes_GET_SIZE(obj); len = PyBytes_GET_SIZE(obj);
} }
else if (PyByteArray_Check(obj)) {
s = PyByteArray_AS_STRING(obj);
len = PyByteArray_GET_SIZE(obj);
}
else if (PyObject_AsCharBuffer(obj, &s, &len)) { else if (PyObject_AsCharBuffer(obj, &s, &len)) {
/* Overwrite the error message with something more useful in /* Overwrite the error message with something more useful in
case of a TypeError. */ case of a TypeError. */
if (PyErr_ExceptionMatches(PyExc_TypeError)) if (PyErr_ExceptionMatches(PyExc_TypeError))
PyErr_Format(PyExc_TypeError, PyErr_Format(PyExc_TypeError,
"coercing to Unicode: need string or buffer, " "coercing to Unicode: need string or buffer, "
"%.80s found", "%.80s found",
Py_TYPE(obj)->tp_name); Py_TYPE(obj)->tp_name);
@ -1188,7 +1192,7 @@ PyObject *PyUnicode_Decode(const char *s,
goto onError; goto onError;
if (!PyUnicode_Check(unicode)) { if (!PyUnicode_Check(unicode)) {
PyErr_Format(PyExc_TypeError, PyErr_Format(PyExc_TypeError,
"decoder did not return an unicode object (type=%.400s)", "decoder did not return a unicode object (type=%.400s)",
Py_TYPE(unicode)->tp_name); Py_TYPE(unicode)->tp_name);
Py_DECREF(unicode); Py_DECREF(unicode);
goto onError; goto onError;
@ -1225,6 +1229,37 @@ PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
return NULL; return NULL;
} }
/* Decode the Unicode object `unicode` through the codec registry and
   return the result, which must itself be a Unicode object.

   A NULL encoding selects the default encoding.  Returns NULL when
   `unicode` is not a Unicode object, when the codec call fails, or
   (after raising TypeError) when the decoder hands back anything other
   than a Unicode object. */
PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
                                     const char *encoding,
                                     const char *errors)
{
    PyObject *decoded;

    /* Only Unicode input is accepted by this entry point. */
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    /* No codec named: fall back to the default encoding. */
    if (!encoding)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Decode via the codec registry */
    decoded = PyCodec_Decode(unicode, encoding, errors);
    if (!decoded)
        return NULL;

    if (PyUnicode_Check(decoded))
        return decoded;

    /* The codec produced a non-Unicode result: report it and drop the
       reference we were given before bailing out. */
    PyErr_Format(PyExc_TypeError,
                 "decoder did not return a unicode object (type=%.400s)",
                 Py_TYPE(decoded)->tp_name);
    Py_DECREF(decoded);
    return NULL;
}
PyObject *PyUnicode_Encode(const Py_UNICODE *s, PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Py_ssize_t size, Py_ssize_t size,
const char *encoding, const char *encoding,
@ -1296,7 +1331,54 @@ PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
v = PyCodec_Encode(unicode, encoding, errors); v = PyCodec_Encode(unicode, encoding, errors);
if (v == NULL) if (v == NULL)
goto onError; goto onError;
assert(PyBytes_Check(v)); if (PyByteArray_Check(v)) {
char msg[100];
PyOS_snprintf(msg, sizeof(msg),
"encoder %s returned buffer instead of bytes",
encoding);
if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
v = NULL;
goto onError;
}
v = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
}
else if (!PyBytes_Check(v)) {
PyErr_Format(PyExc_TypeError,
"encoder did not return a bytes object (type=%.400s)",
Py_TYPE(v)->tp_name);
v = NULL;
}
return v;
onError:
return NULL;
}
PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
const char *encoding,
const char *errors)
{
PyObject *v;
if (!PyUnicode_Check(unicode)) {
PyErr_BadArgument();
goto onError;
}
if (encoding == NULL)
encoding = PyUnicode_GetDefaultEncoding();
/* Encode via the codec registry */
v = PyCodec_Encode(unicode, encoding, errors);
if (v == NULL)
goto onError;
if (!PyUnicode_Check(v)) {
PyErr_Format(PyExc_TypeError,
"encoder did not return an unicode object (type=%.400s)",
Py_TYPE(v)->tp_name);
Py_DECREF(v);
goto onError;
}
return v; return v;
onError: onError:
@ -6617,7 +6699,7 @@ unicode_encode(PyUnicodeObject *self, PyObject *args)
if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors)) if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
return NULL; return NULL;
v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors); v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
if (v == NULL) if (v == NULL)
goto onError; goto onError;
if (!PyBytes_Check(v)) { if (!PyBytes_Check(v)) {

View File

@ -183,6 +183,23 @@ PyObject *_PyCodec_Lookup(const char *encoding)
return NULL; return NULL;
} }
/* Codec registry encoding check API. */
int PyCodec_KnownEncoding(const char *encoding)
{
PyObject *codecs;
codecs = _PyCodec_Lookup(encoding);
if (!codecs) {
PyErr_Clear();
return 0;
}
else {
Py_DECREF(codecs);
return 1;
}
}
static static
PyObject *args_tuple(PyObject *object, PyObject *args_tuple(PyObject *object,
const char *errors) const char *errors)
@ -344,32 +361,20 @@ PyObject *PyCodec_Encode(PyObject *object,
"encoder must return a tuple (object, integer)"); "encoder must return a tuple (object, integer)");
goto onError; goto onError;
} }
v = PyTuple_GET_ITEM(result, 0); v = PyTuple_GET_ITEM(result,0);
if (PyByteArray_Check(v)) { Py_INCREF(v);
char msg[100];
PyOS_snprintf(msg, sizeof(msg),
"encoder %s returned buffer instead of bytes",
encoding);
if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
v = NULL;
goto onError;
}
v = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
}
else if (PyBytes_Check(v))
Py_INCREF(v);
else {
PyErr_SetString(PyExc_TypeError,
"encoding must return a tuple(bytes, integer)");
v = NULL;
}
/* We don't check or use the second (integer) entry. */ /* We don't check or use the second (integer) entry. */
Py_DECREF(args);
Py_DECREF(encoder);
Py_DECREF(result);
return v;
onError: onError:
Py_XDECREF(result); Py_XDECREF(result);
Py_XDECREF(args); Py_XDECREF(args);
Py_XDECREF(encoder); Py_XDECREF(encoder);
return v; return NULL;
} }
/* Decode an object (usually a Python string) using the given encoding /* Decode an object (usually a Python string) using the given encoding

View File

@ -261,14 +261,10 @@ Py_InitializeEx(int install_sigs)
codeset = nl_langinfo(CODESET); codeset = nl_langinfo(CODESET);
if (codeset && *codeset) { if (codeset && *codeset) {
PyObject *enc = PyCodec_Encoder(codeset); if (PyCodec_KnownEncoding(codeset))
if (enc) { codeset = strdup(codeset);
codeset = strdup(codeset); else
Py_DECREF(enc); codeset = NULL;
} else {
codeset = NULL;
PyErr_Clear();
}
} else } else
codeset = NULL; codeset = NULL;