1249 lines
35 KiB
C
1249 lines
35 KiB
C
/* ------------------------------------------------------------------------

   _codecs -- Provides access to the codec registry and the builtin
              codecs.

   This module should never be imported directly. The standard library
   module "codecs" wraps this builtin module for use within Python.

   The codec registry is accessible via:

     register(search_function) -> None

     lookup(encoding) -> CodecInfo object

   The builtin Unicode codecs use the following interface:

     <encoding>_encode(Unicode_object[,errors='strict']) ->
        (string object, bytes consumed)

     <encoding>_decode(char_buffer_obj[,errors='strict']) ->
        (Unicode object, bytes consumed)

   <encoding>_encode() interfaces also accept non-Unicode objects as
   input. The objects are then converted to Unicode using
   PyUnicode_FromObject() prior to applying the conversion.

   These <encoding>s are available: utf_8, unicode_escape,
   raw_unicode_escape, unicode_internal, latin_1, ascii (7-bit),
   mbcs (on win32).


Written by Marc-Andre Lemburg (mal@lemburg.com).

Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
|
|
|
|
#define PY_SSIZE_T_CLEAN
|
|
#include "Python.h"
|
|
|
|
#ifdef MS_WINDOWS
|
|
#include <windows.h>
|
|
#endif
|
|
|
|
/*[clinic input]
|
|
module _codecs
|
|
[clinic start generated code]*/
|
|
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=e1390e3da3cb9deb]*/
|
|
|
|
|
|
/* --- Registry ----------------------------------------------------------- */
|
|
|
|
PyDoc_STRVAR(register__doc__,
|
|
"register(search_function)\n\
|
|
\n\
|
|
Register a codec search function. Search functions are expected to take\n\
|
|
one argument, the encoding name in all lower case letters, and return\n\
|
|
a tuple of functions (encoder, decoder, stream_reader, stream_writer)\n\
|
|
(or a CodecInfo object).");
|
|
|
|
static
|
|
PyObject *codec_register(PyObject *self, PyObject *search_function)
|
|
{
|
|
if (PyCodec_Register(search_function))
|
|
return NULL;
|
|
|
|
Py_RETURN_NONE;
|
|
}
|
|
|
|
PyDoc_STRVAR(lookup__doc__,
|
|
"lookup(encoding) -> CodecInfo\n\
|
|
\n\
|
|
Looks up a codec tuple in the Python codec registry and returns\n\
|
|
a CodecInfo object.");
|
|
|
|
static
|
|
PyObject *codec_lookup(PyObject *self, PyObject *args)
|
|
{
|
|
char *encoding;
|
|
|
|
if (!PyArg_ParseTuple(args, "s:lookup", &encoding))
|
|
return NULL;
|
|
|
|
return _PyCodec_Lookup(encoding);
|
|
}
|
|
|
|
PyDoc_STRVAR(encode__doc__,
|
|
"encode(obj, [encoding[,errors]]) -> object\n\
|
|
\n\
|
|
Encodes obj using the codec registered for encoding. encoding defaults\n\
|
|
to the default encoding. errors may be given to set a different error\n\
|
|
handling scheme. Default is 'strict' meaning that encoding errors raise\n\
|
|
a ValueError. Other possible values are 'ignore', 'replace' and\n\
|
|
'xmlcharrefreplace' as well as any other name registered with\n\
|
|
codecs.register_error that can handle ValueErrors.");
|
|
|
|
static PyObject *
|
|
codec_encode(PyObject *self, PyObject *args, PyObject *kwargs)
|
|
{
|
|
static char *kwlist[] = {"obj", "encoding", "errors", NULL};
|
|
const char *encoding = NULL;
|
|
const char *errors = NULL;
|
|
PyObject *v;
|
|
|
|
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|ss:encode", kwlist,
|
|
&v, &encoding, &errors))
|
|
return NULL;
|
|
|
|
if (encoding == NULL)
|
|
encoding = PyUnicode_GetDefaultEncoding();
|
|
|
|
/* Encode via the codec registry */
|
|
return PyCodec_Encode(v, encoding, errors);
|
|
}
|
|
|
|
PyDoc_STRVAR(decode__doc__,
|
|
"decode(obj, [encoding[,errors]]) -> object\n\
|
|
\n\
|
|
Decodes obj using the codec registered for encoding. encoding defaults\n\
|
|
to the default encoding. errors may be given to set a different error\n\
|
|
handling scheme. Default is 'strict' meaning that encoding errors raise\n\
|
|
a ValueError. Other possible values are 'ignore' and 'replace'\n\
|
|
as well as any other name registered with codecs.register_error that is\n\
|
|
able to handle ValueErrors.");
|
|
|
|
static PyObject *
|
|
codec_decode(PyObject *self, PyObject *args, PyObject *kwargs)
|
|
{
|
|
static char *kwlist[] = {"obj", "encoding", "errors", NULL};
|
|
const char *encoding = NULL;
|
|
const char *errors = NULL;
|
|
PyObject *v;
|
|
|
|
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|ss:decode", kwlist,
|
|
&v, &encoding, &errors))
|
|
return NULL;
|
|
|
|
if (encoding == NULL)
|
|
encoding = PyUnicode_GetDefaultEncoding();
|
|
|
|
/* Decode via the codec registry */
|
|
return PyCodec_Decode(v, encoding, errors);
|
|
}
|
|
|
|
/* --- Helpers ------------------------------------------------------------ */
|
|
|
|
/*[clinic input]
|
|
_codecs._forget_codec
|
|
|
|
encoding: str
|
|
/
|
|
|
|
Purge the named codec from the internal codec lookup cache
|
|
[clinic start generated code]*/
|
|
|
|
PyDoc_STRVAR(_codecs__forget_codec__doc__,
|
|
"_forget_codec($module, encoding, /)\n"
|
|
"--\n"
|
|
"\n"
|
|
"Purge the named codec from the internal codec lookup cache");
|
|
|
|
#define _CODECS__FORGET_CODEC_METHODDEF \
|
|
{"_forget_codec", (PyCFunction)_codecs__forget_codec, METH_VARARGS, _codecs__forget_codec__doc__},
|
|
|
|
static PyObject *
|
|
_codecs__forget_codec_impl(PyModuleDef *module, const char *encoding);
|
|
|
|
static PyObject *
|
|
_codecs__forget_codec(PyModuleDef *module, PyObject *args)
|
|
{
|
|
PyObject *return_value = NULL;
|
|
const char *encoding;
|
|
|
|
if (!PyArg_ParseTuple(args,
|
|
"s:_forget_codec",
|
|
&encoding))
|
|
goto exit;
|
|
return_value = _codecs__forget_codec_impl(module, encoding);
|
|
|
|
exit:
|
|
return return_value;
|
|
}
|
|
|
|
static PyObject *
|
|
_codecs__forget_codec_impl(PyModuleDef *module, const char *encoding)
|
|
/*[clinic end generated code: output=a75e631591702a5c input=18d5d92d0e386c38]*/
|
|
{
|
|
if (_PyCodec_Forget(encoding) < 0) {
|
|
return NULL;
|
|
};
|
|
Py_RETURN_NONE;
|
|
}
|
|
|
|
static
|
|
PyObject *codec_tuple(PyObject *unicode,
|
|
Py_ssize_t len)
|
|
{
|
|
PyObject *v;
|
|
if (unicode == NULL)
|
|
return NULL;
|
|
v = Py_BuildValue("On", unicode, len);
|
|
Py_DECREF(unicode);
|
|
return v;
|
|
}
|
|
|
|
/* --- String codecs ------------------------------------------------------ */
|
|
/* escape_decode(data[, errors]): run PyBytes_DecodeEscape() over the
   input and return (result, len(data)). */
static PyObject *
escape_decode(PyObject *self, PyObject *args)
{
    const char *data;
    Py_ssize_t size;
    const char *errors = NULL;
    PyObject *decoded;

    if (!PyArg_ParseTuple(args, "s#|z:escape_decode",
                          &data, &size, &errors)) {
        return NULL;
    }

    decoded = PyBytes_DecodeEscape(data, size, errors, 0, NULL);
    return codec_tuple(decoded, size);
}
|
|
|
|
/* escape_encode(bytes[, errors]): return (escaped_bytes, len(bytes)).

   Quote and backslash are backslash-escaped, tab/newline/carriage return
   become \t, \n and \r, every other byte below ' ' or at/above 0x7f
   becomes \xNN, and the remaining bytes are copied unchanged.  `errors`
   is parsed but not used. */
static PyObject *
escape_encode(PyObject *self, PyObject *args)
{
    PyObject *str;
    PyObject *v;
    const char *errors = NULL;
    const char *src;
    char *start;
    char *p;
    Py_ssize_t size;
    Py_ssize_t newsize;
    Py_ssize_t i;

    if (!PyArg_ParseTuple(args, "O!|z:escape_encode",
                          &PyBytes_Type, &str, &errors)) {
        return NULL;
    }

    size = PyBytes_GET_SIZE(str);
    if (size > PY_SSIZE_T_MAX / 4) {
        PyErr_SetString(PyExc_OverflowError,
                        "string is too large to encode");
        return NULL;
    }

    /* Worst case: every input byte expands to a four-byte hex escape. */
    newsize = 4 * size;
    v = PyBytes_FromStringAndSize(NULL, newsize);
    if (v == NULL) {
        return NULL;
    }

    src = PyBytes_AS_STRING(str);
    start = PyBytes_AS_STRING(v);
    p = start;

    for (i = 0; i < size; i++) {
        unsigned char ch = (unsigned char)src[i];

        /* There's at least enough room for a hex escape */
        assert(newsize - (p - start) >= 4);

        switch (ch) {
        case '\'':
        case '\\':
            *p++ = '\\';
            *p++ = (char)ch;
            break;
        case '\t':
            *p++ = '\\';
            *p++ = 't';
            break;
        case '\n':
            *p++ = '\\';
            *p++ = 'n';
            break;
        case '\r':
            *p++ = '\\';
            *p++ = 'r';
            break;
        default:
            if (ch < ' ' || ch >= 0x7f) {
                *p++ = '\\';
                *p++ = 'x';
                *p++ = Py_hexdigits[ch >> 4];
                *p++ = Py_hexdigits[ch & 0x0f];
            }
            else {
                *p++ = (char)ch;
            }
            break;
        }
    }
    *p = '\0';

    /* Shrink the over-allocated buffer to what was actually written. */
    if (_PyBytes_Resize(&v, p - start)) {
        return NULL;
    }

    return codec_tuple(v, size);
}
|
|
|
|
/* --- Decoder ------------------------------------------------------------ */
|
|
|
|
/* unicode_internal_decode(obj[, errors]).

   A str argument is returned unchanged together with its length.  Any
   other object is read through PyObject_AsReadBuffer() and decoded with
   _PyUnicode_DecodeUnicodeInternal(); the second tuple item is then the
   buffer size in bytes. */
static PyObject *
unicode_internal_decode(PyObject *self, PyObject *args)
{
    PyObject *obj;
    const char *errors = NULL;
    const char *data;
    Py_ssize_t size;
    PyObject *decoded;

    if (!PyArg_ParseTuple(args, "O|z:unicode_internal_decode",
                          &obj, &errors)) {
        return NULL;
    }

    if (!PyUnicode_Check(obj)) {
        if (PyObject_AsReadBuffer(obj, (const void **)&data, &size) != 0) {
            return NULL;
        }
        decoded = _PyUnicode_DecodeUnicodeInternal(data, size, errors);
        return codec_tuple(decoded, size);
    }

    if (PyUnicode_READY(obj) < 0) {
        return NULL;
    }
    /* codec_tuple() releases one reference, so take one first. */
    Py_INCREF(obj);
    return codec_tuple(obj, PyUnicode_GET_LENGTH(obj));
}
|
|
|
|
/* utf_7_decode(data[, errors[, final]]): return (str, consumed).
   `consumed` starts as the full buffer length; the decoder only gets a
   chance to overwrite it when `final` is false. */
static PyObject *
utf_7_decode(PyObject *self, PyObject *args)
{
    Py_buffer view;
    const char *errors = NULL;
    int final = 0;
    Py_ssize_t consumed;
    PyObject *result;

    if (!PyArg_ParseTuple(args, "y*|zi:utf_7_decode",
                          &view, &errors, &final)) {
        return NULL;
    }

    consumed = view.len;
    result = PyUnicode_DecodeUTF7Stateful(view.buf, view.len, errors,
                                          final ? NULL : &consumed);
    PyBuffer_Release(&view);

    /* codec_tuple() passes a NULL result through as NULL. */
    return codec_tuple(result, consumed);
}
|
|
|
|
/* utf_8_decode(data[, errors[, final]]): return (str, consumed).
   `consumed` starts as the full buffer length; the decoder only gets a
   chance to overwrite it when `final` is false. */
static PyObject *
utf_8_decode(PyObject *self, PyObject *args)
{
    Py_buffer view;
    const char *errors = NULL;
    int final = 0;
    Py_ssize_t consumed;
    PyObject *result;

    if (!PyArg_ParseTuple(args, "y*|zi:utf_8_decode",
                          &view, &errors, &final)) {
        return NULL;
    }

    consumed = view.len;
    result = PyUnicode_DecodeUTF8Stateful(view.buf, view.len, errors,
                                          final ? NULL : &consumed);
    PyBuffer_Release(&view);

    /* codec_tuple() passes a NULL result through as NULL. */
    return codec_tuple(result, consumed);
}
|
|
|
|
/* utf_16_decode(data[, errors[, final]]): return (str, consumed).
   The decoder is started with byteorder 0; the byte order it ends up
   with is not reported by this variant. */
static PyObject *
utf_16_decode(PyObject *self, PyObject *args)
{
    Py_buffer view;
    const char *errors = NULL;
    int final = 0;
    int byteorder = 0;
    Py_ssize_t consumed;
    PyObject *result;

    if (!PyArg_ParseTuple(args, "y*|zi:utf_16_decode",
                          &view, &errors, &final)) {
        return NULL;
    }

    /* Kept as-is when final is true, overwritten by the decoder otherwise. */
    consumed = view.len;
    result = PyUnicode_DecodeUTF16Stateful(view.buf, view.len, errors,
                                           &byteorder,
                                           final ? NULL : &consumed);
    PyBuffer_Release(&view);

    /* codec_tuple() passes a NULL result through as NULL. */
    return codec_tuple(result, consumed);
}
|
|
|
|
/* utf_16_le_decode(data[, errors[, final]]): return (str, consumed).
   Same as utf_16_decode but the decoder is started with byteorder -1. */
static PyObject *
utf_16_le_decode(PyObject *self, PyObject *args)
{
    Py_buffer view;
    const char *errors = NULL;
    int final = 0;
    int byteorder = -1;
    Py_ssize_t consumed;
    PyObject *result;

    if (!PyArg_ParseTuple(args, "y*|zi:utf_16_le_decode",
                          &view, &errors, &final)) {
        return NULL;
    }

    /* Kept as-is when final is true, overwritten by the decoder otherwise. */
    consumed = view.len;
    result = PyUnicode_DecodeUTF16Stateful(view.buf, view.len, errors,
                                           &byteorder,
                                           final ? NULL : &consumed);
    PyBuffer_Release(&view);

    /* codec_tuple() passes a NULL result through as NULL. */
    return codec_tuple(result, consumed);
}
|
|
|
|
/* utf_16_be_decode(data[, errors[, final]]): return (str, consumed).
   Same as utf_16_decode but the decoder is started with byteorder 1. */
static PyObject *
utf_16_be_decode(PyObject *self, PyObject *args)
{
    Py_buffer view;
    const char *errors = NULL;
    int final = 0;
    int byteorder = 1;
    Py_ssize_t consumed;
    PyObject *result;

    if (!PyArg_ParseTuple(args, "y*|zi:utf_16_be_decode",
                          &view, &errors, &final)) {
        return NULL;
    }

    /* Kept as-is when final is true, overwritten by the decoder otherwise. */
    consumed = view.len;
    result = PyUnicode_DecodeUTF16Stateful(view.buf, view.len, errors,
                                           &byteorder,
                                           final ? NULL : &consumed);
    PyBuffer_Release(&view);

    /* codec_tuple() passes a NULL result through as NULL. */
    return codec_tuple(result, consumed);
}
|
|
|
|
/* This non-standard version also provides access to the byteorder
|
|
parameter of the builtin UTF-16 codec.
|
|
|
|
It returns a tuple (unicode, bytesread, byteorder) with byteorder
|
|
being the value in effect at the end of data.
|
|
|
|
*/
|
|
|
|
static PyObject *
|
|
utf_16_ex_decode(PyObject *self,
|
|
PyObject *args)
|
|
{
|
|
Py_buffer pbuf;
|
|
const char *errors = NULL;
|
|
int byteorder = 0;
|
|
PyObject *unicode, *tuple;
|
|
int final = 0;
|
|
Py_ssize_t consumed;
|
|
|
|
if (!PyArg_ParseTuple(args, "y*|zii:utf_16_ex_decode",
|
|
&pbuf, &errors, &byteorder, &final))
|
|
return NULL;
|
|
consumed = pbuf.len; /* This is overwritten unless final is true. */
|
|
unicode = PyUnicode_DecodeUTF16Stateful(pbuf.buf, pbuf.len, errors,
|
|
&byteorder, final ? NULL : &consumed);
|
|
PyBuffer_Release(&pbuf);
|
|
if (unicode == NULL)
|
|
return NULL;
|
|
tuple = Py_BuildValue("Oni", unicode, consumed, byteorder);
|
|
Py_DECREF(unicode);
|
|
return tuple;
|
|
}
|
|
|
|
/* utf_32_decode(data[, errors[, final]]): return (str, consumed).
   The decoder is started with byteorder 0; the byte order it ends up
   with is not reported by this variant. */
static PyObject *
utf_32_decode(PyObject *self, PyObject *args)
{
    Py_buffer view;
    const char *errors = NULL;
    int final = 0;
    int byteorder = 0;
    Py_ssize_t consumed;
    PyObject *result;

    if (!PyArg_ParseTuple(args, "y*|zi:utf_32_decode",
                          &view, &errors, &final)) {
        return NULL;
    }

    /* Kept as-is when final is true, overwritten by the decoder otherwise. */
    consumed = view.len;
    result = PyUnicode_DecodeUTF32Stateful(view.buf, view.len, errors,
                                           &byteorder,
                                           final ? NULL : &consumed);
    PyBuffer_Release(&view);

    /* codec_tuple() passes a NULL result through as NULL. */
    return codec_tuple(result, consumed);
}
|
|
|
|
/* utf_32_le_decode(data[, errors[, final]]): return (str, consumed).
   Same as utf_32_decode but the decoder is started with byteorder -1. */
static PyObject *
utf_32_le_decode(PyObject *self, PyObject *args)
{
    Py_buffer view;
    const char *errors = NULL;
    int final = 0;
    int byteorder = -1;
    Py_ssize_t consumed;
    PyObject *result;

    if (!PyArg_ParseTuple(args, "y*|zi:utf_32_le_decode",
                          &view, &errors, &final)) {
        return NULL;
    }

    /* Kept as-is when final is true, overwritten by the decoder otherwise. */
    consumed = view.len;
    result = PyUnicode_DecodeUTF32Stateful(view.buf, view.len, errors,
                                           &byteorder,
                                           final ? NULL : &consumed);
    PyBuffer_Release(&view);

    /* codec_tuple() passes a NULL result through as NULL. */
    return codec_tuple(result, consumed);
}
|
|
|
|
/* utf_32_be_decode(data[, errors[, final]]): return (str, consumed).
   Same as utf_32_decode but the decoder is started with byteorder 1. */
static PyObject *
utf_32_be_decode(PyObject *self, PyObject *args)
{
    Py_buffer view;
    const char *errors = NULL;
    int final = 0;
    int byteorder = 1;
    Py_ssize_t consumed;
    PyObject *result;

    if (!PyArg_ParseTuple(args, "y*|zi:utf_32_be_decode",
                          &view, &errors, &final)) {
        return NULL;
    }

    /* Kept as-is when final is true, overwritten by the decoder otherwise. */
    consumed = view.len;
    result = PyUnicode_DecodeUTF32Stateful(view.buf, view.len, errors,
                                           &byteorder,
                                           final ? NULL : &consumed);
    PyBuffer_Release(&view);

    /* codec_tuple() passes a NULL result through as NULL. */
    return codec_tuple(result, consumed);
}
|
|
|
|
/* This non-standard version also provides access to the byteorder
|
|
parameter of the builtin UTF-32 codec.
|
|
|
|
It returns a tuple (unicode, bytesread, byteorder) with byteorder
|
|
being the value in effect at the end of data.
|
|
|
|
*/
|
|
|
|
static PyObject *
|
|
utf_32_ex_decode(PyObject *self,
|
|
PyObject *args)
|
|
{
|
|
Py_buffer pbuf;
|
|
const char *errors = NULL;
|
|
int byteorder = 0;
|
|
PyObject *unicode, *tuple;
|
|
int final = 0;
|
|
Py_ssize_t consumed;
|
|
|
|
if (!PyArg_ParseTuple(args, "y*|zii:utf_32_ex_decode",
|
|
&pbuf, &errors, &byteorder, &final))
|
|
return NULL;
|
|
consumed = pbuf.len; /* This is overwritten unless final is true. */
|
|
unicode = PyUnicode_DecodeUTF32Stateful(pbuf.buf, pbuf.len, errors,
|
|
&byteorder, final ? NULL : &consumed);
|
|
PyBuffer_Release(&pbuf);
|
|
if (unicode == NULL)
|
|
return NULL;
|
|
tuple = Py_BuildValue("Oni", unicode, consumed, byteorder);
|
|
Py_DECREF(unicode);
|
|
return tuple;
|
|
}
|
|
|
|
/* unicode_escape_decode(data[, errors]): decode with
   PyUnicode_DecodeUnicodeEscape() and return (str, len(data)). */
static PyObject *
unicode_escape_decode(PyObject *self, PyObject *args)
{
    Py_buffer pbuf;
    const char *errors = NULL;
    PyObject *unicode;
    Py_ssize_t consumed;

    if (!PyArg_ParseTuple(args, "s*|z:unicode_escape_decode",
                          &pbuf, &errors)) {
        return NULL;
    }

    /* Copy the length out before releasing the view, as the stateful
       decoders in this file do, instead of reading a released Py_buffer. */
    consumed = pbuf.len;
    unicode = PyUnicode_DecodeUnicodeEscape(pbuf.buf, consumed, errors);
    PyBuffer_Release(&pbuf);

    /* codec_tuple() passes a NULL result through as NULL. */
    return codec_tuple(unicode, consumed);
}
|
|
|
|
/* raw_unicode_escape_decode(data[, errors]): decode with
   PyUnicode_DecodeRawUnicodeEscape() and return (str, len(data)). */
static PyObject *
raw_unicode_escape_decode(PyObject *self, PyObject *args)
{
    Py_buffer pbuf;
    const char *errors = NULL;
    PyObject *unicode;
    Py_ssize_t consumed;

    if (!PyArg_ParseTuple(args, "s*|z:raw_unicode_escape_decode",
                          &pbuf, &errors)) {
        return NULL;
    }

    /* Copy the length out before releasing the view, as the stateful
       decoders in this file do, instead of reading a released Py_buffer. */
    consumed = pbuf.len;
    unicode = PyUnicode_DecodeRawUnicodeEscape(pbuf.buf, consumed, errors);
    PyBuffer_Release(&pbuf);

    /* codec_tuple() passes a NULL result through as NULL. */
    return codec_tuple(unicode, consumed);
}
|
|
|
|
/* latin_1_decode(data[, errors]): decode with PyUnicode_DecodeLatin1()
   and return (str, len(data)). */
static PyObject *
latin_1_decode(PyObject *self, PyObject *args)
{
    Py_buffer pbuf;
    const char *errors = NULL;
    PyObject *unicode;
    Py_ssize_t consumed;

    if (!PyArg_ParseTuple(args, "y*|z:latin_1_decode",
                          &pbuf, &errors)) {
        return NULL;
    }

    /* Copy the length out before releasing the view, as the stateful
       decoders in this file do, instead of reading a released Py_buffer. */
    consumed = pbuf.len;
    unicode = PyUnicode_DecodeLatin1(pbuf.buf, consumed, errors);
    PyBuffer_Release(&pbuf);

    /* codec_tuple() passes a NULL result through as NULL. */
    return codec_tuple(unicode, consumed);
}
|
|
|
|
/* ascii_decode(data[, errors]): decode with PyUnicode_DecodeASCII() and
   return (str, len(data)). */
static PyObject *
ascii_decode(PyObject *self, PyObject *args)
{
    Py_buffer pbuf;
    const char *errors = NULL;
    PyObject *unicode;
    Py_ssize_t consumed;

    if (!PyArg_ParseTuple(args, "y*|z:ascii_decode",
                          &pbuf, &errors)) {
        return NULL;
    }

    /* Copy the length out before releasing the view, as the stateful
       decoders in this file do, instead of reading a released Py_buffer. */
    consumed = pbuf.len;
    unicode = PyUnicode_DecodeASCII(pbuf.buf, consumed, errors);
    PyBuffer_Release(&pbuf);

    /* codec_tuple() passes a NULL result through as NULL. */
    return codec_tuple(unicode, consumed);
}
|
|
|
|
/* charmap_decode(data[, errors[, mapping]]): decode with
   PyUnicode_DecodeCharmap() and return (str, len(data)).  A mapping of
   None is passed to the decoder as NULL. */
static PyObject *
charmap_decode(PyObject *self, PyObject *args)
{
    Py_buffer pbuf;
    const char *errors = NULL;
    PyObject *mapping = NULL;
    PyObject *unicode;
    Py_ssize_t consumed;

    if (!PyArg_ParseTuple(args, "y*|zO:charmap_decode",
                          &pbuf, &errors, &mapping)) {
        return NULL;
    }
    if (mapping == Py_None) {
        mapping = NULL;
    }

    /* Copy the length out before releasing the view, as the stateful
       decoders in this file do, instead of reading a released Py_buffer. */
    consumed = pbuf.len;
    unicode = PyUnicode_DecodeCharmap(pbuf.buf, consumed, mapping, errors);
    PyBuffer_Release(&pbuf);

    /* codec_tuple() passes a NULL result through as NULL. */
    return codec_tuple(unicode, consumed);
}
|
|
|
|
#ifdef HAVE_MBCS
|
|
|
|
/* mbcs_decode(data[, errors[, final]]): return (str, consumed).
   `consumed` starts as the full buffer length; the decoder only gets a
   chance to overwrite it when `final` is false. */
static PyObject *
mbcs_decode(PyObject *self, PyObject *args)
{
    Py_buffer view;
    const char *errors = NULL;
    int final = 0;
    Py_ssize_t consumed;
    PyObject *result;

    if (!PyArg_ParseTuple(args, "y*|zi:mbcs_decode",
                          &view, &errors, &final)) {
        return NULL;
    }

    consumed = view.len;
    result = PyUnicode_DecodeMBCSStateful(view.buf, view.len, errors,
                                          final ? NULL : &consumed);
    PyBuffer_Release(&view);

    /* codec_tuple() passes a NULL result through as NULL. */
    return codec_tuple(result, consumed);
}
|
|
|
|
/* code_page_decode(code_page, data[, errors[, final]]): return
   (str, consumed), decoding with the given code page.  `consumed` starts
   as the full buffer length; the decoder only gets a chance to overwrite
   it when `final` is false. */
static PyObject *
code_page_decode(PyObject *self, PyObject *args)
{
    Py_buffer view;
    const char *errors = NULL;
    int code_page;
    int final = 0;
    Py_ssize_t consumed;
    PyObject *result;

    if (!PyArg_ParseTuple(args, "iy*|zi:code_page_decode",
                          &code_page, &view, &errors, &final)) {
        return NULL;
    }

    consumed = view.len;
    result = PyUnicode_DecodeCodePageStateful(code_page,
                                              view.buf, view.len, errors,
                                              final ? NULL : &consumed);
    PyBuffer_Release(&view);

    /* codec_tuple() passes a NULL result through as NULL. */
    return codec_tuple(result, consumed);
}
|
|
|
|
#endif /* HAVE_MBCS */
|
|
|
|
/* --- Encoder ------------------------------------------------------------ */
|
|
|
|
/* readbuffer_encode(data[, errors]): copy the buffer contents into a new
   bytes object and return (bytes, len(data)).  `errors` is parsed but
   not used. */
static PyObject *
readbuffer_encode(PyObject *self, PyObject *args)
{
    Py_buffer view;
    const char *errors = NULL;
    Py_ssize_t size;
    PyObject *copy;

    if (!PyArg_ParseTuple(args, "s*|z:readbuffer_encode",
                          &view, &errors)) {
        return NULL;
    }

    /* Remember the length; the view is released before it is reported. */
    size = view.len;
    copy = PyBytes_FromStringAndSize((const char *)view.buf, size);
    PyBuffer_Release(&view);

    return codec_tuple(copy, size);
}
|
|
|
|
/* unicode_internal_encode(obj[, errors]).

   Always issues a DeprecationWarning first.  For a str argument the
   Py_UNICODE representation is copied into a bytes object and the tuple
   reports the string length; for any other object the read buffer is
   copied and the tuple reports the buffer size.  `errors` is parsed but
   not used. */
static PyObject *
unicode_internal_encode(PyObject *self, PyObject *args)
{
    PyObject *obj;
    const char *errors = NULL;
    const char *data;
    Py_UNICODE *u;
    Py_ssize_t len;
    Py_ssize_t size;

    if (PyErr_WarnEx(PyExc_DeprecationWarning,
                     "unicode_internal codec has been deprecated",
                     1)) {
        return NULL;
    }

    if (!PyArg_ParseTuple(args, "O|z:unicode_internal_encode",
                          &obj, &errors)) {
        return NULL;
    }

    if (!PyUnicode_Check(obj)) {
        if (PyObject_AsReadBuffer(obj, (const void **)&data, &size)) {
            return NULL;
        }
        return codec_tuple(PyBytes_FromStringAndSize(data, size), size);
    }

    if (PyUnicode_READY(obj) < 0) {
        return NULL;
    }

    u = PyUnicode_AsUnicodeAndSize(obj, &len);
    if (u == NULL) {
        return NULL;
    }
    /* Guard the byte-count multiplication below against overflow. */
    if ((size_t)len > (size_t)PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) {
        return PyErr_NoMemory();
    }
    size = len * sizeof(Py_UNICODE);

    return codec_tuple(PyBytes_FromStringAndSize((const char *)u, size),
                       PyUnicode_GET_LENGTH(obj));
}
|
|
|
|
/* utf_7_encode(obj[, errors]): convert obj with PyUnicode_FromObject(),
   encode it with _PyUnicode_EncodeUTF7() and return
   (encoded, length of the converted string). */
static PyObject *
utf_7_encode(PyObject *self, PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *result;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "O|z:utf_7_encode",
                          &input, &errors)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }
    if (PyUnicode_READY(text) < 0) {
        Py_DECREF(text);
        return NULL;
    }

    result = codec_tuple(_PyUnicode_EncodeUTF7(text, 0, 0, errors),
                         PyUnicode_GET_LENGTH(text));
    Py_DECREF(text);
    return result;
}
|
|
|
|
/* utf_8_encode(obj[, errors]): convert obj with PyUnicode_FromObject(),
   encode it via PyUnicode_AsEncodedString(..., "utf-8", errors) and
   return (encoded, length of the converted string). */
static PyObject *
utf_8_encode(PyObject *self, PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *result;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "O|z:utf_8_encode",
                          &input, &errors)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }
    if (PyUnicode_READY(text) < 0) {
        Py_DECREF(text);
        return NULL;
    }

    result = codec_tuple(PyUnicode_AsEncodedString(text, "utf-8", errors),
                         PyUnicode_GET_LENGTH(text));
    Py_DECREF(text);
    return result;
}
|
|
|
|
/* This version provides access to the byteorder parameter of the
|
|
builtin UTF-16 codecs as optional third argument. It defaults to 0
|
|
which means: use the native byte order and prepend the data with a
|
|
BOM mark.
|
|
|
|
*/
|
|
|
|
static PyObject *
|
|
utf_16_encode(PyObject *self,
|
|
PyObject *args)
|
|
{
|
|
PyObject *str, *v;
|
|
const char *errors = NULL;
|
|
int byteorder = 0;
|
|
|
|
if (!PyArg_ParseTuple(args, "O|zi:utf_16_encode",
|
|
&str, &errors, &byteorder))
|
|
return NULL;
|
|
|
|
str = PyUnicode_FromObject(str);
|
|
if (str == NULL || PyUnicode_READY(str) < 0) {
|
|
Py_XDECREF(str);
|
|
return NULL;
|
|
}
|
|
v = codec_tuple(_PyUnicode_EncodeUTF16(str, errors, byteorder),
|
|
PyUnicode_GET_LENGTH(str));
|
|
Py_DECREF(str);
|
|
return v;
|
|
}
|
|
|
|
/* utf_16_le_encode(obj[, errors]): like utf_16_encode with the byteorder
   argument fixed to -1.  Returns (encoded, length of the converted
   string). */
static PyObject *
utf_16_le_encode(PyObject *self, PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *result;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "O|z:utf_16_le_encode",
                          &input, &errors)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }
    if (PyUnicode_READY(text) < 0) {
        Py_DECREF(text);
        return NULL;
    }

    result = codec_tuple(_PyUnicode_EncodeUTF16(text, errors, -1),
                         PyUnicode_GET_LENGTH(text));
    Py_DECREF(text);
    return result;
}
|
|
|
|
/* utf_16_be_encode(obj[, errors]): like utf_16_encode with the byteorder
   argument fixed to +1.  Returns (encoded, length of the converted
   string). */
static PyObject *
utf_16_be_encode(PyObject *self, PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *result;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "O|z:utf_16_be_encode",
                          &input, &errors)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }
    if (PyUnicode_READY(text) < 0) {
        Py_DECREF(text);
        return NULL;
    }

    result = codec_tuple(_PyUnicode_EncodeUTF16(text, errors, +1),
                         PyUnicode_GET_LENGTH(text));
    Py_DECREF(text);
    return result;
}
|
|
|
|
/* This version provides access to the byteorder parameter of the
|
|
builtin UTF-32 codecs as optional third argument. It defaults to 0
|
|
which means: use the native byte order and prepend the data with a
|
|
BOM mark.
|
|
|
|
*/
|
|
|
|
static PyObject *
|
|
utf_32_encode(PyObject *self,
|
|
PyObject *args)
|
|
{
|
|
PyObject *str, *v;
|
|
const char *errors = NULL;
|
|
int byteorder = 0;
|
|
|
|
if (!PyArg_ParseTuple(args, "O|zi:utf_32_encode",
|
|
&str, &errors, &byteorder))
|
|
return NULL;
|
|
|
|
str = PyUnicode_FromObject(str);
|
|
if (str == NULL || PyUnicode_READY(str) < 0) {
|
|
Py_XDECREF(str);
|
|
return NULL;
|
|
}
|
|
v = codec_tuple(_PyUnicode_EncodeUTF32(str, errors, byteorder),
|
|
PyUnicode_GET_LENGTH(str));
|
|
Py_DECREF(str);
|
|
return v;
|
|
}
|
|
|
|
/* utf_32_le_encode(obj[, errors]): like utf_32_encode with the byteorder
   argument fixed to -1.  Returns (encoded, length of the converted
   string). */
static PyObject *
utf_32_le_encode(PyObject *self, PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *result;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "O|z:utf_32_le_encode",
                          &input, &errors)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }
    if (PyUnicode_READY(text) < 0) {
        Py_DECREF(text);
        return NULL;
    }

    result = codec_tuple(_PyUnicode_EncodeUTF32(text, errors, -1),
                         PyUnicode_GET_LENGTH(text));
    Py_DECREF(text);
    return result;
}
|
|
|
|
/* utf_32_be_encode(obj[, errors]): like utf_32_encode with the byteorder
   argument fixed to +1.  Returns (encoded, length of the converted
   string). */
static PyObject *
utf_32_be_encode(PyObject *self, PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *result;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "O|z:utf_32_be_encode",
                          &input, &errors)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }
    if (PyUnicode_READY(text) < 0) {
        Py_DECREF(text);
        return NULL;
    }

    result = codec_tuple(_PyUnicode_EncodeUTF32(text, errors, +1),
                         PyUnicode_GET_LENGTH(text));
    Py_DECREF(text);
    return result;
}
|
|
|
|
/* unicode_escape_encode(obj[, errors]): convert obj with
   PyUnicode_FromObject(), encode it with
   PyUnicode_AsUnicodeEscapeString() and return
   (encoded, length of the converted string).  `errors` is parsed but not
   passed to the encoder. */
static PyObject *
unicode_escape_encode(PyObject *self, PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *result;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "O|z:unicode_escape_encode",
                          &input, &errors)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }
    if (PyUnicode_READY(text) < 0) {
        Py_DECREF(text);
        return NULL;
    }

    result = codec_tuple(PyUnicode_AsUnicodeEscapeString(text),
                         PyUnicode_GET_LENGTH(text));
    Py_DECREF(text);
    return result;
}
|
|
|
|
/* raw_unicode_escape_encode(obj[, errors]): convert obj with
   PyUnicode_FromObject(), encode it with
   PyUnicode_AsRawUnicodeEscapeString() and return
   (encoded, length of the converted string).  `errors` is parsed but not
   passed to the encoder. */
static PyObject *
raw_unicode_escape_encode(PyObject *self, PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *result;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "O|z:raw_unicode_escape_encode",
                          &input, &errors)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }
    if (PyUnicode_READY(text) < 0) {
        Py_DECREF(text);
        return NULL;
    }

    result = codec_tuple(PyUnicode_AsRawUnicodeEscapeString(text),
                         PyUnicode_GET_LENGTH(text));
    Py_DECREF(text);
    return result;
}
|
|
|
|
/* latin_1_encode(obj[, errors]): convert obj with PyUnicode_FromObject(),
   encode it with _PyUnicode_AsLatin1String() and return
   (encoded, length of the converted string). */
static PyObject *
latin_1_encode(PyObject *self, PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *result;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "O|z:latin_1_encode",
                          &input, &errors)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }
    if (PyUnicode_READY(text) < 0) {
        Py_DECREF(text);
        return NULL;
    }

    result = codec_tuple(_PyUnicode_AsLatin1String(text, errors),
                         PyUnicode_GET_LENGTH(text));
    Py_DECREF(text);
    return result;
}
|
|
|
|
/* ascii_encode(obj[, errors]): convert obj with PyUnicode_FromObject(),
   encode it with _PyUnicode_AsASCIIString() and return
   (encoded, length of the converted string). */
static PyObject *
ascii_encode(PyObject *self, PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *result;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "O|z:ascii_encode",
                          &input, &errors)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }
    if (PyUnicode_READY(text) < 0) {
        Py_DECREF(text);
        return NULL;
    }

    result = codec_tuple(_PyUnicode_AsASCIIString(text, errors),
                         PyUnicode_GET_LENGTH(text));
    Py_DECREF(text);
    return result;
}
|
|
|
|
/* charmap_encode(obj[, errors[, mapping]]): convert obj with
   PyUnicode_FromObject(), encode it with _PyUnicode_EncodeCharmap() and
   return (encoded, length of the converted string).  A mapping of None
   is passed to the encoder as NULL. */
static PyObject *
charmap_encode(PyObject *self, PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *result;
    PyObject *mapping = NULL;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "O|zO:charmap_encode",
                          &input, &errors, &mapping)) {
        return NULL;
    }
    if (mapping == Py_None) {
        mapping = NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }
    if (PyUnicode_READY(text) < 0) {
        Py_DECREF(text);
        return NULL;
    }

    result = codec_tuple(_PyUnicode_EncodeCharmap(text, mapping, errors),
                         PyUnicode_GET_LENGTH(text));
    Py_DECREF(text);
    return result;
}
|
|
|
|
/* charmap_build(map): require a str argument ("U" format) and return the
   result of PyUnicode_BuildEncodingMap() for it. */
static PyObject *
charmap_build(PyObject *self, PyObject *args)
{
    PyObject *map;

    if (!PyArg_ParseTuple(args, "U:charmap_build", &map)) {
        return NULL;
    }
    return PyUnicode_BuildEncodingMap(map);
}
|
|
|
|
#ifdef HAVE_MBCS
|
|
|
|
/* mbcs_encode(obj[, errors]): convert obj with PyUnicode_FromObject(),
   encode it with PyUnicode_EncodeCodePage() using CP_ACP and return
   (encoded, length of the converted string). */
static PyObject *
mbcs_encode(PyObject *self, PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *result;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "O|z:mbcs_encode",
                          &input, &errors)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }
    if (PyUnicode_READY(text) < 0) {
        Py_DECREF(text);
        return NULL;
    }

    result = codec_tuple(PyUnicode_EncodeCodePage(CP_ACP, text, errors),
                         PyUnicode_GET_LENGTH(text));
    Py_DECREF(text);
    return result;
}
|
|
|
|
/* code_page_encode(code_page, obj[, errors]): convert obj with
   PyUnicode_FromObject(), encode it with PyUnicode_EncodeCodePage() for
   the given code page and return
   (encoded, length of the converted string). */
static PyObject *
code_page_encode(PyObject *self, PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *result;
    const char *errors = NULL;
    int code_page;

    if (!PyArg_ParseTuple(args, "iO|z:code_page_encode",
                          &code_page, &input, &errors)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }
    if (PyUnicode_READY(text) < 0) {
        Py_DECREF(text);
        return NULL;
    }

    result = codec_tuple(PyUnicode_EncodeCodePage(code_page, text, errors),
                         PyUnicode_GET_LENGTH(text));
    Py_DECREF(text);
    return result;
}
|
|
|
|
#endif /* HAVE_MBCS */
|
|
|
|
/* --- Error handler registry --------------------------------------------- */
|
|
|
|
PyDoc_STRVAR(register_error__doc__,
|
|
"register_error(errors, handler)\n\
|
|
\n\
|
|
Register the specified error handler under the name\n\
|
|
errors. handler must be a callable object, that\n\
|
|
will be called with an exception instance containing\n\
|
|
information about the location of the encoding/decoding\n\
|
|
error and must return a (replacement, new position) tuple.");
|
|
|
|
static PyObject *register_error(PyObject *self, PyObject *args)
|
|
{
|
|
const char *name;
|
|
PyObject *handler;
|
|
|
|
if (!PyArg_ParseTuple(args, "sO:register_error",
|
|
&name, &handler))
|
|
return NULL;
|
|
if (PyCodec_RegisterError(name, handler))
|
|
return NULL;
|
|
Py_RETURN_NONE;
|
|
}
|
|
|
|
PyDoc_STRVAR(lookup_error__doc__,
|
|
"lookup_error(errors) -> handler\n\
|
|
\n\
|
|
Return the error handler for the specified error handling name\n\
|
|
or raise a LookupError, if no handler exists under this name.");
|
|
|
|
static PyObject *lookup_error(PyObject *self, PyObject *args)
|
|
{
|
|
const char *name;
|
|
|
|
if (!PyArg_ParseTuple(args, "s:lookup_error",
|
|
&name))
|
|
return NULL;
|
|
return PyCodec_LookupError(name);
|
|
}
|
|
|
|
/* --- Module API --------------------------------------------------------- */
|
|
|
|
static PyMethodDef _codecs_functions[] = {
|
|
{"register", codec_register, METH_O,
|
|
register__doc__},
|
|
{"lookup", codec_lookup, METH_VARARGS,
|
|
lookup__doc__},
|
|
{"encode", (PyCFunction)codec_encode, METH_VARARGS|METH_KEYWORDS,
|
|
encode__doc__},
|
|
{"decode", (PyCFunction)codec_decode, METH_VARARGS|METH_KEYWORDS,
|
|
decode__doc__},
|
|
{"escape_encode", escape_encode, METH_VARARGS},
|
|
{"escape_decode", escape_decode, METH_VARARGS},
|
|
{"utf_8_encode", utf_8_encode, METH_VARARGS},
|
|
{"utf_8_decode", utf_8_decode, METH_VARARGS},
|
|
{"utf_7_encode", utf_7_encode, METH_VARARGS},
|
|
{"utf_7_decode", utf_7_decode, METH_VARARGS},
|
|
{"utf_16_encode", utf_16_encode, METH_VARARGS},
|
|
{"utf_16_le_encode", utf_16_le_encode, METH_VARARGS},
|
|
{"utf_16_be_encode", utf_16_be_encode, METH_VARARGS},
|
|
{"utf_16_decode", utf_16_decode, METH_VARARGS},
|
|
{"utf_16_le_decode", utf_16_le_decode, METH_VARARGS},
|
|
{"utf_16_be_decode", utf_16_be_decode, METH_VARARGS},
|
|
{"utf_16_ex_decode", utf_16_ex_decode, METH_VARARGS},
|
|
{"utf_32_encode", utf_32_encode, METH_VARARGS},
|
|
{"utf_32_le_encode", utf_32_le_encode, METH_VARARGS},
|
|
{"utf_32_be_encode", utf_32_be_encode, METH_VARARGS},
|
|
{"utf_32_decode", utf_32_decode, METH_VARARGS},
|
|
{"utf_32_le_decode", utf_32_le_decode, METH_VARARGS},
|
|
{"utf_32_be_decode", utf_32_be_decode, METH_VARARGS},
|
|
{"utf_32_ex_decode", utf_32_ex_decode, METH_VARARGS},
|
|
{"unicode_escape_encode", unicode_escape_encode, METH_VARARGS},
|
|
{"unicode_escape_decode", unicode_escape_decode, METH_VARARGS},
|
|
{"unicode_internal_encode", unicode_internal_encode, METH_VARARGS},
|
|
{"unicode_internal_decode", unicode_internal_decode, METH_VARARGS},
|
|
{"raw_unicode_escape_encode", raw_unicode_escape_encode, METH_VARARGS},
|
|
{"raw_unicode_escape_decode", raw_unicode_escape_decode, METH_VARARGS},
|
|
{"latin_1_encode", latin_1_encode, METH_VARARGS},
|
|
{"latin_1_decode", latin_1_decode, METH_VARARGS},
|
|
{"ascii_encode", ascii_encode, METH_VARARGS},
|
|
{"ascii_decode", ascii_decode, METH_VARARGS},
|
|
{"charmap_encode", charmap_encode, METH_VARARGS},
|
|
{"charmap_decode", charmap_decode, METH_VARARGS},
|
|
{"charmap_build", charmap_build, METH_VARARGS},
|
|
{"readbuffer_encode", readbuffer_encode, METH_VARARGS},
|
|
#ifdef HAVE_MBCS
|
|
{"mbcs_encode", mbcs_encode, METH_VARARGS},
|
|
{"mbcs_decode", mbcs_decode, METH_VARARGS},
|
|
{"code_page_encode", code_page_encode, METH_VARARGS},
|
|
{"code_page_decode", code_page_decode, METH_VARARGS},
|
|
#endif
|
|
{"register_error", register_error, METH_VARARGS,
|
|
register_error__doc__},
|
|
{"lookup_error", lookup_error, METH_VARARGS,
|
|
lookup_error__doc__},
|
|
_CODECS__FORGET_CODEC_METHODDEF
|
|
{NULL, NULL} /* sentinel */
|
|
};
|
|
|
|
static struct PyModuleDef codecsmodule = {
|
|
PyModuleDef_HEAD_INIT,
|
|
"_codecs",
|
|
NULL,
|
|
-1,
|
|
_codecs_functions,
|
|
NULL,
|
|
NULL,
|
|
NULL,
|
|
NULL
|
|
};
|
|
|
|
PyMODINIT_FUNC
|
|
PyInit__codecs(void)
|
|
{
|
|
return PyModule_Create(&codecsmodule);
|
|
}
|