cpython/Modules/_iconv_codec.c

724 lines
24 KiB
C

/*
* _iconv_codec.c
*
* libiconv adaptor for Python iconvcodec
*
* Author : Hye-Shik Chang <perky@FreeBSD.org>
* Created : 17 January 2003
*/
#include "Python.h"
#include <string.h>
#include <iconv.h>
/* Module version string; init_iconv_codec() exports it to Python as
 * the module attribute `__version__'.  "$Revision$" looks like a
 * version-control keyword placeholder -- presumably expanded on checkout. */
static const char *__version__ = "$Revision$";
/*
 * Pick the iconv encoding name that corresponds to Python's in-memory
 * Py_UNICODE representation, based on the width of Py_UNICODE:
 *
 *   UNICODE_ENCODING      name handed to iconv_open() as the Unicode side
 *                         of every conversion in this file.
 *   MBENCODED_LENGTH_MAX  bytes reserved per input character when the
 *                         encoder sizes its initial output buffer.
 *
 * The name differs between glibc (__GNU_LIBRARY__) and other iconv
 * implementations.  Whether the chosen encoding matches the native byte
 * order is not assumed here; init_iconv_codec() probes it at import time
 * and records the answer in `byteswap'.
 *
 * NOTE: if Py_UNICODE_SIZE is neither 2 nor 4, neither macro is defined
 * and the build fails later at the first use.
 */
#if Py_USING_UNICODE
# if Py_UNICODE_SIZE == 2
# ifdef __GNU_LIBRARY__
# define UNICODE_ENCODING "ucs-2"
# else
# define UNICODE_ENCODING "ucs-2-internal"
# endif
# define MBENCODED_LENGTH_MAX 4
# elif Py_UNICODE_SIZE == 4
# ifdef __GNU_LIBRARY__
# define UNICODE_ENCODING "ucs-4"
# else
# define UNICODE_ENCODING "ucs-4-internal"
# endif
# define MBENCODED_LENGTH_MAX 6
# endif
#else
/* This module is meaningless without Unicode support. */
# error "Unicode is not available"
#endif
/*
 * Instance layout of an `iconvcodec' object.
 *
 * iconvcodec_new() initialises both handles to (iconv_t)-1 and `encoding'
 * to NULL before opening anything; iconvcodec_dealloc() relies on those
 * sentinels to decide what has to be released.
 */
typedef struct {
PyObject_HEAD
/* enchdl converts UNICODE_ENCODING -> encoding (used by encode());
 * dechdl converts encoding -> UNICODE_ENCODING (used by decode()). */
iconv_t enchdl, dechdl;
/* NUL-terminated encoding name, allocated with PyMem_Malloc(). */
char *encoding;
} iconvcodecObject;
/* Docstring installed as tp_doc of the iconvcodec type. */
PyDoc_STRVAR(iconvcodec_doc, "iconvcodec object");
/* Does the chosen internal encoding (UNICODE_ENCODING) require
 * byte-swapping to get native endianness?
 * 0=no, 1=yes, -1=unknown (not probed yet).
 * Set once by init_iconv_codec(); read by encode() and decode(). */
static int byteswap = -1;
/*
 * Built-in error handling schemes are represented by small integer
 * sentinels cast to PyObject *, so they are never dereferenced or
 * reference-counted.  Anything that compares greater than ERROR_MAX is
 * treated as a real callable obtained from PyCodec_LookupError() and is
 * Py_DECREF'ed by the codec functions when they are done with it.
 */
#define ERROR_STRICT (PyObject *)(1)
#define ERROR_IGNORE (PyObject *)(2)
#define ERROR_REPLACE (PyObject *)(3)
#define ERROR_MAX ERROR_REPLACE
/* Substitutes written by the 'replace' scheme: U+FFFD when decoding,
 * a single '?' byte when encoding. */
#define REPLACEMENT_CHAR_DECODE 0xFFFD
#define REPLACEMENT_CHAR_ENCODE '?'
/* Encoding used by iconvcodec_new() when the object has no `encoding'
 * attribute. */
#define DEFAULT_ENCODING "utf-8"
/*
 * Translate an error-handling scheme name into a handler.
 *
 * The three built-in schemes map to the ERROR_STRICT / ERROR_IGNORE /
 * ERROR_REPLACE sentinels (not real objects).  Any other name is resolved
 * through PyCodec_LookupError(), whose result is returned as-is; that call
 * yields NULL with an exception set when the name is unknown.
 *
 * errors: scheme name, or NULL for the default ("strict").
 */
static PyObject *
get_errorcallback(const char *errors)
{
    /* No name at all means the default scheme. */
    if (errors == NULL)
        return ERROR_STRICT;

    if (!strcmp(errors, "strict"))
        return ERROR_STRICT;
    if (!strcmp(errors, "ignore"))
        return ERROR_IGNORE;
    if (!strcmp(errors, "replace"))
        return ERROR_REPLACE;

    /* Not built in: defer to the codecs error-handler registry. */
    return PyCodec_LookupError(errors);
}
/* Docstring for the `encode' method; referenced from iconvcodec_methods. */
PyDoc_STRVAR(iconvcodec_encode__doc__,
"I.encode(unicode, [,errors]) -> (string, length consumed)\n\
\n\
Return an encoded string version of `unicode'. errors may be given to\n\
set a different error handling scheme. Default is 'strict' meaning that\n\
encoding errors raise a UnicodeEncodeError. Other possible values are\n\
'ignore', 'replace' and 'xmlcharrefreplace' as well as any other name\n\
registered with codecs.register_error that can handle UnicodeEncodeErrors.");
/*
 * iconvcodec.encode(input [, errors]) -> (str, length consumed)
 *
 * Converts the Py_UNICODE buffer `input' into self->encoding by feeding it
 * through the iconv handle self->enchdl.
 *
 * Arguments (parsed with "u#|s"):
 *   input   unicode data to encode
 *   errors  optional error-scheme name, resolved by get_errorcallback()
 *
 * Returns a new 2-tuple (encoded str, len(input)); the second item is
 * always the full input length.  Returns NULL with an exception set on
 * failure.
 *
 * When iconv() fails:
 *   E2BIG         the output string is grown by `estep' bytes and the call
 *                 is retried;
 *   'ignore'      one Py_UNICODE unit of input is skipped;
 *   'replace'     as 'ignore', plus REPLACEMENT_CHAR_ENCODE is emitted;
 *   'strict'      a UnicodeEncodeError is raised;
 *   other         the registered callback is called with a
 *                 UnicodeEncodeError and must return (unicode, int): the
 *                 unicode is encoded with self->encoding and appended, the
 *                 int is the input position to resume from (negative
 *                 values count from the end).
 * For 'ignore' and 'replace' the loop stops after handling an EINVAL
 * failure instead of retrying.
 *
 * If the module-level `byteswap' flag is set, a byte-swapped copy of the
 * input is what gets handed to iconv().
 */
static PyObject *
iconvcodec_encode(iconvcodecObject *self, PyObject *args, PyObject *kwargs)
{
    static char *kwlist[] = { "input", "errors", NULL };
    Py_UNICODE *input;
    int inputlen;
    char *errors = NULL/*strict*/, *out, *out_top;
    const char *inp, *inp_top;
    size_t inplen, inplen_total, outlen, outlen_total, estep;
    PyObject *outputobj = NULL, *errorcb = NULL,
             *exceptionobj = NULL;
    Py_UNICODE *swappedinput = NULL;
    int swapi;

    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "u#|s:encode",
                                     kwlist, &input, &inputlen, &errors))
        return NULL; /* TypeError */

    errorcb = get_errorcallback(errors);
    if (errorcb == NULL)
        return NULL; /* LookupError or something else from error handler */

    inp = inp_top = (char *)input;
    inplen = inplen_total = (size_t)(inputlen * Py_UNICODE_SIZE);

    outlen = inputlen * MBENCODED_LENGTH_MAX;
    if (outlen < 16)
        outlen = 16; /* for iso-2022 codecs */

    outputobj = PyString_FromStringAndSize(NULL, outlen);
    if (outputobj == NULL)
        goto errorexit; /* a plain return would leak a custom errorcb */
    out = out_top = PyString_AS_STRING(outputobj);
    outlen_total = outlen;

    /* Amount by which the output string grows each time iconv() reports
     * that it ran out of room. */
    estep = inputlen * Py_UNICODE_SIZE / 2;

/* Grow the output string by `size' bytes and re-base out/out_top on the
 * (possibly moved) buffer.  The write offset is captured before the
 * resize so no arithmetic is done on stale pointers.  Jumps to
 * `errorexit' on failure. */
#define RESIZE_OUTBUFFER(size) do {                                     \
        size_t toadd = (size);                                          \
        size_t used = (size_t)(out - out_top);                          \
        outlen_total += toadd;                                          \
        outlen += toadd;                                                \
        if (_PyString_Resize(&outputobj, outlen_total) == -1)           \
            goto errorexit;                                             \
        out_top = PyString_AS_STRING(outputobj);                        \
        out = out_top + used;                                           \
    } while (0)

    if (byteswap) {
        swappedinput = PyMem_Malloc(inplen);
        if (swappedinput == NULL) {
            PyErr_NoMemory();
            goto errorexit;
        }

        /* Reverse the bytes of every code unit.  Working on unsigned
         * bytes avoids the sign extension that a shift-and-or over plain
         * `char' suffers for byte values >= 0x80. */
        for (swapi = 0; swapi < inputlen; ++swapi) {
            const unsigned char *src = (const unsigned char *)&input[swapi];
            unsigned char *dst = (unsigned char *)&swappedinput[swapi];
            int k;

            for (k = 0; k < Py_UNICODE_SIZE; k++)
                dst[k] = src[Py_UNICODE_SIZE - 1 - k];
        }

        inp = inp_top = (char *)swappedinput;
    }

    while (inplen > 0) {
        if (iconv(self->enchdl, (char**)&inp, &inplen, &out, &outlen)
            == (size_t)-1)
        {
            char reason[128];
            int errpos;
            /* Capture errno now: the allocator and the Python calls
             * below are free to overwrite it. */
            int iconv_errno = errno;

            if (iconv_errno == E2BIG) {
                RESIZE_OUTBUFFER(estep);
                continue;
            }

            if (errorcb == ERROR_IGNORE || errorcb == ERROR_REPLACE) {
                /* Skip the offending code unit. */
                inplen -= Py_UNICODE_SIZE;
                inp += Py_UNICODE_SIZE;
                if (errorcb == ERROR_REPLACE) {
                    if (outlen < 1)
                        RESIZE_OUTBUFFER(iconv_errno == EINVAL ? 1 : estep);
                    outlen--;
                    *out++ = REPLACEMENT_CHAR_ENCODE;
                }
                if (iconv_errno == EINVAL)
                    break;
                continue;
            }

            errpos = (int)(inp - inp_top) / Py_UNICODE_SIZE;
            /* Report the character from the caller's buffer: `inp' may
             * point into the byte-swapped copy. */
            sprintf(reason, "Undefined character map from "
#if Py_UNICODE_SIZE == 2
                    "\\u%04x"
#elif Py_UNICODE_SIZE == 4
                    "\\u%08x"
#endif
                    , (unsigned int)input[errpos]);

            if (exceptionobj == NULL) {
                if ((exceptionobj = PyUnicodeEncodeError_Create(
                                        self->encoding, input, inputlen,
                                        errpos, errpos + 1, reason)) == NULL)
                    goto errorexit;
            } else {
                /* Reuse the exception object from an earlier error. */
                if (PyUnicodeEncodeError_SetStart(exceptionobj, errpos) != 0)
                    goto errorexit;
                if (PyUnicodeEncodeError_SetEnd(exceptionobj, errpos + 1) != 0)
                    goto errorexit;
                if (PyUnicodeEncodeError_SetReason(exceptionobj, reason) != 0)
                    goto errorexit;
            }

            if (errorcb == ERROR_STRICT) {
                PyCodec_StrictErrors(exceptionobj);
                goto errorexit;
            } else {
                PyObject *argsobj, *retobj, *retuni;
                long newpos;

                argsobj = PyTuple_New(1);
                if (argsobj == NULL)
                    goto errorexit;
                /* The tuple steals a reference, so take an extra one. */
                PyTuple_SET_ITEM(argsobj, 0, exceptionobj);
                Py_INCREF(exceptionobj);
                retobj = PyObject_CallObject(errorcb, argsobj);
                Py_DECREF(argsobj);
                if (retobj == NULL)
                    goto errorexit;

                if (!PyTuple_Check(retobj) || PyTuple_GET_SIZE(retobj) != 2 ||
                    !PyUnicode_Check((retuni = PyTuple_GET_ITEM(retobj, 0))) ||
                    !PyInt_Check(PyTuple_GET_ITEM(retobj, 1))) {
                    Py_DECREF(retobj);
                    PyErr_SetString(PyExc_ValueError, "encoding error handler "
                                    "must return (unicode, int) tuple");
                    goto errorexit;
                }

                if (PyUnicode_GET_SIZE(retuni) > 0) {
/* Within this block every `goto errorexit' (including the one inside
 * RESIZE_OUTBUFFER) is redirected to a local pad that first drops the
 * references held here. */
#define errorexit errorexit_cbpad
                    PyObject *retstr = NULL;
                    size_t retstrsize;

                    retstr = PyUnicode_AsEncodedString(
                                    retuni, self->encoding, NULL);
                    if (retstr == NULL || !PyString_Check(retstr))
                        goto errorexit;

                    retstrsize = (size_t)PyString_GET_SIZE(retstr);
                    if (outlen < retstrsize)
                        RESIZE_OUTBUFFER(
                            iconv_errno == EINVAL || retstrsize > estep
                            ? retstrsize - outlen : estep);

                    memcpy(out, PyString_AS_STRING(retstr), retstrsize);
                    out += retstrsize;
                    outlen -= retstrsize;
#undef errorexit
                    if (0) {
errorexit_cbpad:        Py_XDECREF(retobj);
                        Py_XDECREF(retstr);
                        goto errorexit;
                    }
                    Py_DECREF(retstr);
                }

                newpos = PyInt_AS_LONG(PyTuple_GET_ITEM(retobj, 1));
                Py_DECREF(retobj);

                if (newpos < 0)
                    newpos = inputlen + newpos;
                if (newpos < 0 || newpos > inputlen) {
                    PyErr_Format(PyExc_IndexError,
                                 "position %ld from error handler out of bounds",
                                 newpos);
                    goto errorexit;
                }
                if (newpos == inputlen)
                    break;
                inp = inp_top + Py_UNICODE_SIZE * newpos;
                inplen = inplen_total - Py_UNICODE_SIZE * newpos;
            }
        } else
            break;
    }
#undef RESIZE_OUTBUFFER

    {
        PyObject *rettup, *consumed;
        int finalsize;

        /* Trim the output string to what was actually written. */
        finalsize = (int)(out - out_top);
        if ((size_t)finalsize != outlen_total) {
            if (_PyString_Resize(&outputobj, finalsize) == -1)
                goto errorexit;
        }

        consumed = PyInt_FromLong(inputlen);
        if (consumed == NULL)
            goto errorexit;
        rettup = PyTuple_New(2);
        if (rettup == NULL) {
            Py_DECREF(consumed);
            goto errorexit;
        }
        /* The tuple takes over both references. */
        PyTuple_SET_ITEM(rettup, 0, outputobj);
        PyTuple_SET_ITEM(rettup, 1, consumed);

        if (errorcb > ERROR_MAX) {
            Py_DECREF(errorcb);
        }
        Py_XDECREF(exceptionobj);
        /* Release the swapped copy on success as well; PyMem_Free(NULL)
         * is a no-op when no swapping took place. */
        PyMem_Free(swappedinput);
        return rettup;
    }

errorexit:
    Py_XDECREF(outputobj);
    if (errorcb > ERROR_MAX) {
        Py_DECREF(errorcb);
    }
    Py_XDECREF(exceptionobj);
    PyMem_Free(swappedinput);

    return NULL;
}
/* Docstring for the `decode' method; referenced from iconvcodec_methods.
 * (Spelling of "registered" corrected in the user-visible help text.) */
PyDoc_STRVAR(iconvcodec_decode__doc__,
"I.decode(string, [,errors]) -> (unicodeobject, length consumed)\n\
\n\
Decodes `string' using I, an iconvcodec instance. errors may be given\n\
to set a different error handling scheme. Default is 'strict' meaning\n\
that encoding errors raise a UnicodeDecodeError. Other possible values\n\
are 'ignore' and 'replace' as well as any other name registered with\n\
codecs.register_error that is able to handle UnicodeDecodeErrors.");
/*
 * iconvcodec.decode(input [, errors]) -> (unicode, length consumed)
 *
 * Converts the byte string `input' from self->encoding into a unicode
 * object by feeding it through the iconv handle self->dechdl.
 *
 * Arguments (parsed with "s#|s"):
 *   input   encoded bytes
 *   errors  optional error-scheme name, resolved by get_errorcallback()
 *
 * Returns a new 2-tuple (unicode, len(input)); the second item is always
 * the full input length.  Returns NULL with an exception set on failure.
 *
 * When iconv() fails:
 *   E2BIG         the output buffer is grown by `estep' bytes and the call
 *                 is retried;
 *   'ignore'      one input byte is skipped;
 *   'replace'     as 'ignore', plus REPLACEMENT_CHAR_DECODE is emitted;
 *   'strict'      a UnicodeDecodeError is raised;
 *   other         the registered callback is called with a
 *                 UnicodeDecodeError and must return (unicode, int): the
 *                 unicode is appended verbatim, the int is the input
 *                 position to resume from (negative values count from the
 *                 end).
 * For 'ignore' and 'replace' the loop stops after handling an EINVAL
 * failure instead of retrying.
 *
 * If the module-level `byteswap' flag is set, whatever iconv() wrote is
 * byte-swapped in place after each call.
 */
static PyObject *
iconvcodec_decode(iconvcodecObject *self, PyObject *args, PyObject *kwargs)
{
    static char *kwlist[] = { "input", "errors", NULL };
    char *errors = NULL/*strict*/, *out, *out_top;
    const char *inp, *inp_top;
    int inplen_int;
    size_t inplen, inplen_total, outlen, outlen_total, estep;
    PyObject *outputobj = NULL, *errorcb = NULL,
             *exceptionobj = NULL;

    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|s:decode",
                                     kwlist, &inp, &inplen_int, &errors))
        return NULL; /* TypeError */

    errorcb = get_errorcallback(errors);
    if (errorcb == NULL)
        return NULL; /* LookupError or something else from error handler */

    inp_top = inp;
    inplen_total = inplen = (size_t)inplen_int;

    /* Start with one output character per input byte. */
    outputobj = PyUnicode_FromUnicode(NULL, inplen);
    if (outputobj == NULL)
        goto errorexit; /* a plain return would leak a custom errorcb */
    outlen_total = outlen = PyUnicode_GET_DATA_SIZE(outputobj);
    out = out_top = (char *)PyUnicode_AS_UNICODE(outputobj);

    /* Amount (in bytes) by which the output grows each time iconv()
     * reports that it ran out of room. */
    estep = outlen / 2;

/* Grow the output by `size' bytes and re-base out/out_top on the
 * (possibly moved) buffer.  The write offset is captured before the
 * resize so no arithmetic is done on stale pointers.  Jumps to
 * `errorexit' on failure. */
#define RESIZE_OUTBUFFER(size) do {                                         \
        size_t toadd = (size);                                              \
        size_t used = (size_t)(out - out_top);                              \
        outlen_total += toadd;                                              \
        outlen += toadd;                                                    \
        if (PyUnicode_Resize(&outputobj, outlen_total/Py_UNICODE_SIZE) == -1) \
            goto errorexit;                                                 \
        out_top = (char *)PyUnicode_AS_UNICODE(outputobj);                  \
        out = out_top + used;                                               \
    } while (0)

    while (inplen > 0) {
        char *oldout = out;
        size_t res = iconv(self->dechdl, (char**)&inp, &inplen, &out, &outlen);
        /* Capture errno now: the allocator and the Python calls below are
         * free to overwrite it. */
        int iconv_errno = errno;

        if (byteswap) {
            /* Swap whatever this call produced into native byte order. */
            while (oldout < out)
            {
                char c0 = oldout[0];
#if Py_UNICODE_SIZE == 2
                oldout[0] = oldout[1];
                oldout[1] = c0;
#else
                char c1 = oldout[1];
                oldout[0] = oldout[3];
                oldout[1] = oldout[2];
                oldout[2] = c1;
                oldout[3] = c0;
#endif
                oldout += sizeof(Py_UNICODE);
            }
        }

        if (res == (size_t)-1) {
            char reason[128], *reasonpos = (char *)reason;
            int errpos;

            if (iconv_errno == E2BIG) {
                RESIZE_OUTBUFFER(estep);
                continue;
            }

            if (errorcb == ERROR_IGNORE || errorcb == ERROR_REPLACE) {
                /* Skip the offending byte. */
                inplen--; inp++;
                if (errorcb == ERROR_REPLACE) {
                    Py_UNICODE *replp;

                    if (outlen < Py_UNICODE_SIZE)
                        RESIZE_OUTBUFFER(
                            iconv_errno == EINVAL || Py_UNICODE_SIZE > estep
                            ? Py_UNICODE_SIZE : estep);

                    /* some compilers hate casted lvalue */
                    replp = (Py_UNICODE *)out;
                    assert((long)replp % Py_UNICODE_SIZE == 0);/* aligned? */
                    *replp = REPLACEMENT_CHAR_DECODE;

                    out += Py_UNICODE_SIZE;
                    outlen -= Py_UNICODE_SIZE;
                }
                if (iconv_errno == EINVAL)
                    break;
                continue;
            }

            errpos = (int)(inp - inp_top);
            /* Quote up to three bytes starting at the failure position. */
            reasonpos += sprintf(reason, "Invalid multibyte sequence \\x%02x",
                                 (unsigned char)*inp);
            if (inplen > 1) {
                reasonpos += sprintf(reasonpos,
                                     "\\x%02x", (unsigned char)*(inp+1));
                if (inplen > 2)
                    sprintf(reasonpos, "\\x%02x", (unsigned char)*(inp+2));
            }

            if (exceptionobj == NULL) {
                exceptionobj = PyUnicodeDecodeError_Create(
                                    self->encoding, inp_top, inplen_total,
                                    errpos, errpos + 1, reason);
                if (exceptionobj == NULL)
                    goto errorexit;
            } else {
                /* Reuse the exception object from an earlier error. */
                if (PyUnicodeDecodeError_SetStart(exceptionobj, errpos) != 0)
                    goto errorexit;
                if (PyUnicodeDecodeError_SetEnd(exceptionobj, errpos + 1) != 0)
                    goto errorexit;
                if (PyUnicodeDecodeError_SetReason(exceptionobj, reason) != 0)
                    goto errorexit;
            }

            if (errorcb == ERROR_STRICT) {
                PyCodec_StrictErrors(exceptionobj);
                goto errorexit;
            } else {
                PyObject *argsobj, *retobj, *retuni;
                long newpos;

                argsobj = PyTuple_New(1);
                if (argsobj == NULL)
                    goto errorexit;
                /* The tuple steals a reference, so take an extra one. */
                PyTuple_SET_ITEM(argsobj, 0, exceptionobj);
                Py_INCREF(exceptionobj);
                retobj = PyObject_CallObject(errorcb, argsobj);
                Py_DECREF(argsobj);
                if (retobj == NULL)
                    goto errorexit;

                if (!PyTuple_Check(retobj) || PyTuple_GET_SIZE(retobj) != 2 ||
                    !PyUnicode_Check((retuni = PyTuple_GET_ITEM(retobj, 0))) ||
                    !PyInt_Check(PyTuple_GET_ITEM(retobj, 1))) {
                    Py_DECREF(retobj);
                    PyErr_SetString(PyExc_ValueError, "decoding error handler "
                                    "must return (unicode, int) tuple");
                    goto errorexit;
                }

                if (PyUnicode_GET_SIZE(retuni) > 0) {
/* Within this block the `goto errorexit' inside RESIZE_OUTBUFFER is
 * redirected to a local pad that first drops the reference to retobj. */
#define errorexit errorexit_cbpad
                    size_t retunisize;

                    retunisize = PyUnicode_GET_DATA_SIZE(retuni);
                    if (outlen < retunisize)
                        RESIZE_OUTBUFFER(
                            iconv_errno == EINVAL || retunisize > estep
                            ? retunisize - outlen : estep);

                    memcpy(out, PyUnicode_AS_DATA(retuni), retunisize);
                    out += retunisize;
                    outlen -= retunisize;
#undef errorexit
                    if (0) {
errorexit_cbpad:        Py_DECREF(retobj);
                        goto errorexit;
                    }
                }

                newpos = PyInt_AS_LONG(PyTuple_GET_ITEM(retobj, 1));
                Py_DECREF(retobj);

                if (newpos < 0)
                    newpos += (long)inplen_total;
                if (newpos < 0 || (size_t)newpos > inplen_total) {
                    PyErr_Format(PyExc_IndexError,
                                 "position %ld from error handler out of bounds",
                                 newpos);
                    goto errorexit;
                }
                if ((size_t)newpos == inplen_total)
                    break;
                inp = inp_top + newpos;
                inplen = inplen_total - newpos;
            }
        } else
            break;
    }
#undef RESIZE_OUTBUFFER

    {
        PyObject *rettup, *consumed;
        int finalsize;

        /* Trim the unicode object to what was actually written. */
        finalsize = (int)(out - out_top);
        if ((size_t)finalsize != outlen_total) {
            if (PyUnicode_Resize(&outputobj, finalsize / Py_UNICODE_SIZE)
                == -1)
                goto errorexit;
        }

        consumed = PyInt_FromLong((long)inplen_total);
        if (consumed == NULL)
            goto errorexit;
        rettup = PyTuple_New(2);
        if (rettup == NULL) {
            Py_DECREF(consumed);
            goto errorexit;
        }
        /* The tuple takes over both references. */
        PyTuple_SET_ITEM(rettup, 0, outputobj);
        PyTuple_SET_ITEM(rettup, 1, consumed);

        if (errorcb > ERROR_MAX) {
            Py_DECREF(errorcb);
        }
        Py_XDECREF(exceptionobj);
        return rettup;
    }

errorexit:
    Py_XDECREF(outputobj);
    if (errorcb > ERROR_MAX) {
        Py_DECREF(errorcb);
    }
    Py_XDECREF(exceptionobj);

    return NULL;
}
/* Method table of the iconvcodec type (installed as tp_methods).  Both
 * methods accept positional and keyword arguments. */
static struct PyMethodDef iconvcodec_methods[] = {
{"encode", (PyCFunction)iconvcodec_encode,
METH_VARARGS | METH_KEYWORDS,
iconvcodec_encode__doc__},
{"decode", (PyCFunction)iconvcodec_decode,
METH_VARARGS | METH_KEYWORDS,
iconvcodec_decode__doc__},
{NULL, NULL}, /* sentinel */
};
/*
 * tp_new for iconvcodec.
 *
 * Allocates the object, determines the encoding name and opens one iconv
 * handle per direction.  The name comes from the `encoding' attribute
 * looked up on the freshly allocated instance (so a subclass can supply
 * it as a class attribute); when the lookup fails the error is cleared
 * and DEFAULT_ENCODING is used.
 *
 * `args' and `kwargs' are not used.
 *
 * Returns the new object, or NULL with one of these set:
 *   TypeError    `encoding' exists but is not a str
 *   MemoryError  the name could not be copied
 *   ValueError   iconv_open() rejected the encoding in either direction
 */
static PyObject *
iconvcodec_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
{
    PyObject *encobj = NULL;
    iconvcodecObject *new = NULL;
    const char *encname;

    new = (iconvcodecObject *)type->tp_alloc(type, 0);
    if (new == NULL)
        return NULL;

    /* Put the object in a state iconvcodec_dealloc() can always handle. */
    new->encoding = NULL;
    new->enchdl = new->dechdl = (iconv_t)(-1);

    encobj = PyObject_GetAttrString((PyObject *)new, "encoding");
    if (encobj == NULL) {
        PyErr_Clear();
        encname = DEFAULT_ENCODING;
    } else if (!PyString_Check(encobj)) {
        Py_DECREF(encobj);
        PyErr_SetString(PyExc_TypeError,
                        "`encoding' attribute must be a string.");
        goto errorexit;
    } else
        encname = PyString_AS_STRING(encobj);

    /* Keep a private copy of the name.  The allocation result must be
     * checked before strcpy() writes through it. */
    new->encoding = PyMem_Malloc(strlen(encname) + 1);
    if (new->encoding == NULL) {
        Py_XDECREF(encobj);
        PyErr_NoMemory();
        goto errorexit;
    }
    strcpy(new->encoding, encname);
    Py_XDECREF(encobj);

    new->dechdl = iconv_open(UNICODE_ENCODING, new->encoding);
    if (new->dechdl == (iconv_t)(-1)) {
        PyErr_SetString(PyExc_ValueError, "unsupported decoding");
        goto errorexit;
    }

    new->enchdl = iconv_open(new->encoding, UNICODE_ENCODING);
    if (new->enchdl == (iconv_t)(-1)) {
        PyErr_SetString(PyExc_ValueError, "unsupported encoding");
        iconv_close(new->dechdl);
        new->dechdl = (iconv_t)(-1);
        goto errorexit;
    }

    return (PyObject *)new;

errorexit:
    /* Dropping the only reference runs iconvcodec_dealloc(), which
     * releases whatever was set up so far. */
    Py_XDECREF(new);

    return NULL;
}
/*
 * tp_dealloc for iconvcodec: closes whichever iconv handles are open,
 * frees the stored encoding name if there is one, then returns the
 * object's memory through its type's tp_free slot.
 */
static void
iconvcodec_dealloc(iconvcodecObject *self)
{
    /* Value a handle holds when it was never opened (or already closed). */
    const iconv_t unopened = (iconv_t)-1;

    if (self->enchdl != unopened) {
        iconv_close(self->enchdl);
    }
    if (self->dechdl != unopened) {
        iconv_close(self->dechdl);
    }

    if (self->encoding) {
        PyMem_Free(self->encoding);
    }

    self->ob_type->tp_free((PyObject *)self);
}
/*
 * tp_repr for iconvcodec: produces "<iconvcodec encoding='NAME'>", where
 * NAME is the encoding string stored on the object.
 */
static PyObject *
iconvcodec_repr(PyObject *self)
{
    const iconvcodecObject *codec = (const iconvcodecObject *)self;

    return PyString_FromFormat("<iconvcodec encoding='%s'>",
                               codec->encoding);
}
/*
 * Type object for iconvcodec.  The initializer is positional, so the
 * order of the entries below must match the PyTypeObject slot layout.
 * ob_type is left NULL here and filled in by init_iconv_codec().
 * Py_TPFLAGS_BASETYPE allows the type to be subclassed from Python.
 */
static PyTypeObject iconvcodec_Type = {
PyObject_HEAD_INIT(NULL)
0, /* ob_size: number of items for varobject */
"iconvcodec", /* tp_name: name of this type */
sizeof(iconvcodecObject), /* tp_basicsize: basic object size */
0, /* tp_itemsize: item size for varobject */
(destructor)iconvcodec_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_compare */
iconvcodec_repr, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
PyObject_GenericGetAttr, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
iconvcodec_doc, /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
iconvcodec_methods, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
0, /* tp_init */
PyType_GenericAlloc, /* tp_alloc */
iconvcodec_new, /* tp_new */
PyObject_Del, /* tp_free */
};
/* Module-level function table passed to Py_InitModule().  It is empty:
 * the module only exposes the iconvcodec type and a few constants. */
static struct PyMethodDef _iconv_codec_methods[] = {
{NULL, NULL}, /* sentinel */
};
/*
 * Module initialisation for `_iconv_codec'.
 *
 * 1. Probes the byte order iconv uses for UNICODE_ENCODING by converting
 *    the single ISO-8859-1 character '0' (0x30) and inspecting the
 *    resulting Py_UNICODE value; the outcome is stored in `byteswap'.
 * 2. Creates the module and adds `__version__', the `iconvcodec' type and
 *    `internal_encoding' to it.
 *
 * On failure a RuntimeError is set and the function returns early; if
 * Py_InitModule() itself fails, its exception is left in place.
 */
void
init_iconv_codec(void)
{
    PyObject *m;

    char in = '0';
    char *inptr = &in;
    size_t insize = 1;
    Py_UNICODE out = 0;
    char *outptr = (char *)&out;
    size_t outsize = sizeof(out);
    size_t res;

    iconv_t hdl = iconv_open(UNICODE_ENCODING, "ISO-8859-1");

    if (hdl == (iconv_t)-1) {
        PyErr_SetString(PyExc_RuntimeError,
            "can't initialize the _iconv_codec module: iconv_open() failed");
        return;
    }

    res = iconv(hdl, &inptr, &insize, &outptr, &outsize);
    /* The probe handle is needed for this one call only; closing it here
     * guarantees it is released on every path below. */
    iconv_close(hdl);
    if (res == (size_t)-1) {
        PyErr_SetString(PyExc_RuntimeError,
            "can't initialize the _iconv_codec module: iconv() failed");
        return;
    }

    /* Check whether iconv() returned native endianness or not for the
       chosen encoding */
    if (out == 0x30)
        byteswap = 0;
#if Py_UNICODE_SIZE == 2
    else if (out == 0x3000)
#else
    else if (out == 0x30000000)
#endif
        byteswap = 1;
    else {
        PyErr_SetString(PyExc_RuntimeError,
            "can't initialize the _iconv_codec module: mixed endianess");
        return;
    }

    iconvcodec_Type.ob_type = &PyType_Type;
    m = Py_InitModule("_iconv_codec", _iconv_codec_methods);
    if (m == NULL)
        return; /* nothing to add attributes to */

    PyModule_AddStringConstant(m, "__version__", (char*)__version__);
    /* PyModule_AddObject() steals a reference to the static type. */
    Py_INCREF(&iconvcodec_Type);
    PyModule_AddObject(m, "iconvcodec", (PyObject *)(&iconvcodec_Type));
    PyModule_AddStringConstant(m, "internal_encoding", UNICODE_ENCODING);

    if (PyErr_Occurred())
        PyErr_SetString(PyExc_RuntimeError,
                        "can't initialize the _iconv_codec module");
}
/*
* ex: ts=8 sts=4 et
* $Id$
*/