/*
 * _iconv_codec.c
 *
 * libiconv adaptor for Python iconvcodec
 *
 * Author  : Hye-Shik Chang
 * Created : 17 January 2003
 */

#include "Python.h"
#include <errno.h>
#include <iconv.h>
#include <string.h>

static const char *__version__ = "$Revision$";

/*
 * Pick the iconv name of the encoding that matches Python's internal
 * Py_UNICODE representation, and the worst-case number of output bytes
 * reserved per input character when encoding.
 */
#if Py_USING_UNICODE
# if Py_UNICODE_SIZE == 2
#  ifdef __GNU_LIBRARY__
#   define UNICODE_ENCODING "ucs-2"
#  else
#   define UNICODE_ENCODING "ucs-2-internal"
#  endif
#  define MBENCODED_LENGTH_MAX 4
# elif Py_UNICODE_SIZE == 4
#  ifdef __GNU_LIBRARY__
#   define UNICODE_ENCODING "ucs-4"
#  else
#   define UNICODE_ENCODING "ucs-4-internal"
#  endif
#  define MBENCODED_LENGTH_MAX 6
# else
#  error "unsupported Py_UNICODE_SIZE"
# endif
#else
# error "Unicode is not available"
#endif

/* Codec instance: one iconv descriptor per direction plus the name of
 * the external encoding (owned, allocated with PyMem_Malloc). */
typedef struct {
    PyObject_HEAD
    iconv_t enchdl, dechdl;
    char *encoding;
} iconvcodecObject;
PyDoc_STRVAR(iconvcodec_doc, "iconvcodec object");

/* does the chosen internal encoding require
 * byteswapping to get native endianness?
 * 0=no, 1=yes, -1=unknown */
static int byteswap = -1;

/* Built-in error policies are encoded as small fake pointer values so
 * they can share a variable with a real callback object returned by
 * PyCodec_LookupError(); anything above ERROR_MAX is a real object. */
#define ERROR_STRICT        (PyObject *)(1)
#define ERROR_IGNORE        (PyObject *)(2)
#define ERROR_REPLACE       (PyObject *)(3)
#define ERROR_MAX           ERROR_REPLACE

#define REPLACEMENT_CHAR_DECODE     0xFFFD
#define REPLACEMENT_CHAR_ENCODE     '?'

#define DEFAULT_ENCODING            "utf-8"

/*
 * Map an `errors' argument to an error policy.
 *
 * Returns one of the ERROR_* sentinels for NULL/"strict", "ignore" and
 * "replace"; otherwise the result of PyCodec_LookupError(), which is
 * NULL with an exception set when the name is unknown.
 */
static PyObject *
get_errorcallback(const char *errors)
{
    if (errors == NULL || strcmp(errors, "strict") == 0)
        return ERROR_STRICT;
    else if (strcmp(errors, "ignore") == 0)
        return ERROR_IGNORE;
    else if (strcmp(errors, "replace") == 0)
        return ERROR_REPLACE;
    else
        return PyCodec_LookupError(errors);
}

/* Drop the reference held on a looked-up error callback; the ERROR_*
 * sentinels are not objects and must not be touched. */
static void
release_errorcallback(PyObject *errorcb)
{
    if (errorcb != NULL && errorcb > ERROR_MAX) {
        Py_DECREF(errorcb);
    }
}

/* Copy `count' Py_UNICODE units from src to dst with the bytes of each
 * unit reversed.  dst and src must not overlap. */
static void
byteswap_copy(Py_UNICODE *dst, const Py_UNICODE *src, int count)
{
    int i;

    for (i = 0; i < count; ++i) {
        const unsigned char *s = (const unsigned char *)&src[i];
        unsigned char *d = (unsigned char *)&dst[i];
#if Py_UNICODE_SIZE == 2
        d[0] = s[1];
        d[1] = s[0];
#else
        d[0] = s[3];
        d[1] = s[2];
        d[2] = s[1];
        d[3] = s[0];
#endif
    }
}

/* Reverse, in place, the bytes of every Py_UNICODE unit in [pos, end). */
static void
byteswap_inplace(char *pos, char *end)
{
    while (pos < end) {
        char c0 = pos[0];
#if Py_UNICODE_SIZE == 2
        pos[0] = pos[1];
        pos[1] = c0;
#else
        char c1 = pos[1];
        pos[0] = pos[3];
        pos[1] = pos[2];
        pos[2] = c1;
        pos[3] = c0;
#endif
        pos += sizeof(Py_UNICODE);
    }
}

PyDoc_STRVAR(iconvcodec_encode__doc__,
"I.encode(unicode, [,errors]) -> (string, length consumed)\n\
\n\
Return an encoded string version of `unicode'. errors may be given to\n\
set a different error handling scheme. Default is 'strict' meaning that\n\
encoding errors raise a UnicodeEncodeError. Other possible values are\n\
'ignore', 'replace' and 'xmlcharrefreplace' as well as any other name\n\
registered with codecs.register_error that can handle UnicodeEncodeErrors.");

/*
 * encode(input[, errors]) -> (str, len(input))
 *
 * Feeds the unicode buffer to iconv through self->enchdl, growing the
 * output string on E2BIG and applying the selected error policy on any
 * other iconv failure.  Returns a new 2-tuple, or NULL with an
 * exception set.
 */
static PyObject *
iconvcodec_encode(iconvcodecObject *self, PyObject *args, PyObject *kwargs)
{
    static char *kwlist[] = { "input", "errors", NULL };
    Py_UNICODE *input;
    int inputlen;
    char *errors = NULL/*strict*/, *out, *out_top;
    const char *inp, *inp_top;
    size_t inplen, inplen_total, outlen, outlen_total, estep;
    PyObject *outputobj = NULL, *errorcb = NULL, *exceptionobj = NULL;
    PyObject *retobj = NULL, *retstr = NULL;  /* error-handler temporaries */
    PyObject *rettup, *consumed;
    Py_UNICODE *swappedinput = NULL;
    int finalsize;

    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "u#|s:encode",
                        kwlist, &input, &inputlen, &errors))
        return NULL; /* TypeError */

    errorcb = get_errorcallback(errors);
    if (errorcb == NULL)
        return NULL; /* LookupError or something else from error handler */

    inp = inp_top = (char *)input;
    inplen = inplen_total = (size_t)inputlen * Py_UNICODE_SIZE;

    outlen = (size_t)inputlen * MBENCODED_LENGTH_MAX;
    if (outlen < 16)
        outlen = 16; /* for iso-2022 codecs */

    outputobj = PyString_FromStringAndSize(NULL, outlen);
    if (outputobj == NULL)
        goto errorexit;
    out = out_top = PyString_AS_STRING(outputobj);
    outlen_total = outlen;

    /* amount by which the output buffer grows on E2BIG */
    estep = (size_t)inputlen * Py_UNICODE_SIZE / 2;

/* Grow the output string by `size' bytes.  The write offset is taken
 * before the resize because the buffer may move. */
#define RESIZE_OUTBUFFER(size) do {                                 \
    size_t toadd = (size);                                          \
    size_t used = (size_t)(out - out_top);                          \
    outlen_total += toadd;                                          \
    outlen += toadd;                                                \
    if (_PyString_Resize(&outputobj, outlen_total) == -1)           \
        goto errorexit;                                             \
    out_top = PyString_AS_STRING(outputobj);                        \
    out = out_top + used;                                           \
} while (0)

    if (byteswap) {
        /* iconv expects the opposite byte order from Py_UNICODE:
         * convert from a byte-swapped private copy of the input. */
        swappedinput = PyMem_Malloc(inplen);
        if (swappedinput == NULL) {
            PyErr_NoMemory();
            goto errorexit;
        }
        byteswap_copy(swappedinput, input, inputlen);
        inp = inp_top = (char *)swappedinput;
    }

    while (inplen > 0) {
        char reason[128];
        int errpos, err;
        PyObject *argsobj, *retuni;
        long newpos;

        if (iconv(self->enchdl, (char **)&inp, &inplen, &out, &outlen)
                != (size_t)-1)
            break;
        /* later calls (resizes, Python callbacks) may clobber errno */
        err = errno;

        if (err == E2BIG) {
            RESIZE_OUTBUFFER(estep);
            continue;
        }

        if (errorcb == ERROR_IGNORE || errorcb == ERROR_REPLACE) {
            /* skip the offending character */
            inplen -= Py_UNICODE_SIZE;
            inp += Py_UNICODE_SIZE;
            if (errorcb == ERROR_REPLACE) {
                if (outlen < 1)
                    RESIZE_OUTBUFFER(err == EINVAL ? 1 : estep);
                outlen--;
                *out++ = REPLACEMENT_CHAR_ENCODE;
            }
            if (err == EINVAL)
                break;
            else
                continue;
        }

        errpos = (int)(inp - inp_top) / Py_UNICODE_SIZE;
        /* report the caller's character, not the byte-swapped copy */
        PyOS_snprintf(reason, sizeof(reason),
                      "Undefined character map from "
#if Py_UNICODE_SIZE == 2
                      "\\u%04x"
#elif Py_UNICODE_SIZE == 4
                      "\\u%08x"
#endif
                      , (unsigned int)input[errpos]);

        if (exceptionobj == NULL) {
            exceptionobj = PyUnicodeEncodeError_Create(
                                self->encoding, input, inputlen,
                                errpos, errpos + 1, reason);
            if (exceptionobj == NULL)
                goto errorexit;
        }
        else {
            /* reuse the exception object from a previous error */
            if (PyUnicodeEncodeError_SetStart(exceptionobj, errpos) != 0)
                goto errorexit;
            if (PyUnicodeEncodeError_SetEnd(exceptionobj, errpos + 1) != 0)
                goto errorexit;
            if (PyUnicodeEncodeError_SetReason(exceptionobj, reason) != 0)
                goto errorexit;
        }

        if (errorcb == ERROR_STRICT) {
            PyCodec_StrictErrors(exceptionobj);
            goto errorexit;
        }

        /* user-registered handler: call it with the exception */
        argsobj = PyTuple_New(1);
        if (argsobj == NULL)
            goto errorexit;
        PyTuple_SET_ITEM(argsobj, 0, exceptionobj);
        Py_INCREF(exceptionobj);
        retobj = PyObject_CallObject(errorcb, argsobj);
        Py_DECREF(argsobj);
        if (retobj == NULL)
            goto errorexit;

        if (!PyTuple_Check(retobj) || PyTuple_GET_SIZE(retobj) != 2 ||
            !PyUnicode_Check((retuni = PyTuple_GET_ITEM(retobj, 0))) ||
            !PyInt_Check(PyTuple_GET_ITEM(retobj, 1))) {
            PyErr_SetString(PyExc_ValueError, "encoding error handler "
                            "must return (unicode, int) tuple");
            goto errorexit;
        }

        if (PyUnicode_GET_SIZE(retuni) > 0) {
            size_t retstrsize;

            /* encode the replacement text and append it to the output */
            retstr = PyUnicode_AsEncodedString(retuni, self->encoding, NULL);
            if (retstr == NULL || !PyString_Check(retstr))
                goto errorexit;

            retstrsize = (size_t)PyString_GET_SIZE(retstr);
            if (outlen < retstrsize)
                RESIZE_OUTBUFFER(err == EINVAL || retstrsize > estep
                                 ? retstrsize - outlen : estep);

            memcpy(out, PyString_AS_STRING(retstr), retstrsize);
            out += retstrsize;
            outlen -= retstrsize;

            Py_DECREF(retstr);
            retstr = NULL;
        }

        newpos = PyInt_AS_LONG(PyTuple_GET_ITEM(retobj, 1));
        Py_DECREF(retobj);
        retobj = NULL;

        /* a negative position counts from the end of the input */
        if (newpos < 0)
            newpos = inputlen + newpos;
        if (newpos < 0 || newpos > inputlen) {
            PyErr_Format(PyExc_IndexError,
                "position %ld from error handler out of bounds", newpos);
            goto errorexit;
        }
        if (newpos == inputlen)
            break;
        inp = inp_top + Py_UNICODE_SIZE * newpos;
        inplen = inplen_total - Py_UNICODE_SIZE * newpos;
    }
#undef RESIZE_OUTBUFFER

    /* trim the output string to the bytes actually written */
    finalsize = (int)(out - out_top);
    if ((size_t)finalsize != outlen_total) {
        if (_PyString_Resize(&outputobj, finalsize) == -1)
            goto errorexit;
    }

    consumed = PyInt_FromLong(inputlen);
    if (consumed == NULL)
        goto errorexit;
    rettup = PyTuple_New(2);
    if (rettup == NULL) {
        Py_DECREF(consumed);
        goto errorexit;
    }
    PyTuple_SET_ITEM(rettup, 0, outputobj);   /* steals the references */
    PyTuple_SET_ITEM(rettup, 1, consumed);

    release_errorcallback(errorcb);
    Py_XDECREF(exceptionobj);
    PyMem_Free(swappedinput);
    return rettup;

errorexit:
    Py_XDECREF(retstr);
    Py_XDECREF(retobj);
    Py_XDECREF(outputobj);
    release_errorcallback(errorcb);
    Py_XDECREF(exceptionobj);
    PyMem_Free(swappedinput);

    return NULL;
}

PyDoc_STRVAR(iconvcodec_decode__doc__,
"I.decode(string, [,errors]) -> (unicodeobject, length consumed)\n\
\n\
Decodes `string' using I, an iconvcodec instance. errors may be given\n\
to set a different error handling scheme. Default is 'strict' meaning\n\
that encoding errors raise a UnicodeDecodeError. Other possible values\n\
are 'ignore' and 'replace' as well as any other name registerd with\n\
codecs.register_error that is able to handle UnicodeDecodeErrors.");

/*
 * decode(input[, errors]) -> (unicode, len(input))
 *
 * Feeds the byte string to iconv through self->dechdl, writing straight
 * into a unicode object's buffer, byte-swapping freshly written units
 * when `byteswap' is set and applying the selected error policy on
 * iconv failures.  Returns a new 2-tuple, or NULL with an exception set.
 */
static PyObject *
iconvcodec_decode(iconvcodecObject *self, PyObject *args, PyObject *kwargs)
{
    static char *kwlist[] = { "input", "errors", NULL };
    char *errors = NULL/*strict*/, *out, *out_top;
    const char *inp, *inp_top;
    int inplen_int;
    size_t inplen, inplen_total, outlen, outlen_total, estep;
    PyObject *outputobj = NULL, *errorcb = NULL, *exceptionobj = NULL;
    PyObject *retobj = NULL;  /* error-handler result, released on exit */
    PyObject *rettup, *consumed;
    int finalsize;

    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|s:decode",
                        kwlist, &inp, &inplen_int, &errors))
        return NULL; /* TypeError */

    errorcb = get_errorcallback(errors);
    if (errorcb == NULL)
        return NULL; /* LookupError or something else from error handler */

    inp_top = inp;
    inplen_total = inplen = (size_t)inplen_int;

    /* start with one output code unit per input byte */
    outputobj = PyUnicode_FromUnicode(NULL, inplen);
    if (outputobj == NULL)
        goto errorexit;
    outlen_total = outlen = PyUnicode_GET_DATA_SIZE(outputobj);
    out = out_top = (char *)PyUnicode_AS_UNICODE(outputobj);

    /* growth step: about half the input, always whole code units so the
     * byte counters never overstate the capacity of the unicode buffer */
    estep = (inplen / 2 + 1) * Py_UNICODE_SIZE;

/* Grow the output buffer by `size' bytes.  The write offset is taken
 * before the resize because the buffer may move. */
#define RESIZE_OUTBUFFER(size) do {                                 \
    size_t toadd = (size);                                          \
    size_t used = (size_t)(out - out_top);                          \
    outlen_total += toadd;                                          \
    outlen += toadd;                                                \
    if (PyUnicode_Resize(&outputobj,                                \
                         outlen_total / Py_UNICODE_SIZE) == -1)     \
        goto errorexit;                                             \
    out_top = (char *)PyUnicode_AS_UNICODE(outputobj);              \
    out = out_top + used;                                           \
} while (0)

    while (inplen > 0) {
        char reason[128];
        char *oldout = out;
        int errpos, err, reasonlen;
        PyObject *argsobj, *retuni;
        long newpos;
        size_t res;

        res = iconv(self->dechdl, (char **)&inp, &inplen, &out, &outlen);
        /* only meaningful when res == -1; saved before anything else
         * can overwrite errno */
        err = errno;

        /* fix up whatever this call produced, even if it then failed */
        if (byteswap)
            byteswap_inplace(oldout, out);

        if (res != (size_t)-1)
            break;

        if (err == E2BIG) {
            RESIZE_OUTBUFFER(estep);
            continue;
        }

        if (errorcb == ERROR_IGNORE || errorcb == ERROR_REPLACE) {
            /* skip one offending byte */
            inplen--;
            inp++;
            if (errorcb == ERROR_REPLACE) {
                Py_UNICODE *replp;

                if (outlen < Py_UNICODE_SIZE)
                    RESIZE_OUTBUFFER(err == EINVAL
                                     ? Py_UNICODE_SIZE : estep);

                /* some compilers hate casted lvalue */
                replp = (Py_UNICODE *)out;
                assert((long)replp % Py_UNICODE_SIZE == 0);/* aligned? */
                *replp = REPLACEMENT_CHAR_DECODE;

                out += Py_UNICODE_SIZE;
                outlen -= Py_UNICODE_SIZE;
            }
            if (err == EINVAL)
                break;
            else
                continue;
        }

        /* describe the error with up to three bytes of the bad sequence */
        errpos = (int)(inp - inp_top);
        reasonlen = PyOS_snprintf(reason, sizeof(reason),
                                  "Invalid multibyte sequence \\x%02x",
                                  (unsigned char)inp[0]);
        if (inplen > 1) {
            reasonlen += PyOS_snprintf(reason + reasonlen,
                                       sizeof(reason) - reasonlen,
                                       "\\x%02x", (unsigned char)inp[1]);
            if (inplen > 2)
                PyOS_snprintf(reason + reasonlen,
                              sizeof(reason) - reasonlen,
                              "\\x%02x", (unsigned char)inp[2]);
        }

        if (exceptionobj == NULL) {
            exceptionobj = PyUnicodeDecodeError_Create(
                                self->encoding, inp_top, inplen_int,
                                errpos, errpos + 1, reason);
            if (exceptionobj == NULL)
                goto errorexit;
        }
        else {
            /* reuse the exception object from a previous error */
            if (PyUnicodeDecodeError_SetStart(exceptionobj, errpos) != 0)
                goto errorexit;
            if (PyUnicodeDecodeError_SetEnd(exceptionobj, errpos + 1) != 0)
                goto errorexit;
            if (PyUnicodeDecodeError_SetReason(exceptionobj, reason) != 0)
                goto errorexit;
        }

        if (errorcb == ERROR_STRICT) {
            PyCodec_StrictErrors(exceptionobj);
            goto errorexit;
        }

        /* user-registered handler: call it with the exception */
        argsobj = PyTuple_New(1);
        if (argsobj == NULL)
            goto errorexit;
        PyTuple_SET_ITEM(argsobj, 0, exceptionobj);
        Py_INCREF(exceptionobj);
        retobj = PyObject_CallObject(errorcb, argsobj);
        Py_DECREF(argsobj);
        if (retobj == NULL)
            goto errorexit;

        if (!PyTuple_Check(retobj) || PyTuple_GET_SIZE(retobj) != 2 ||
            !PyUnicode_Check((retuni = PyTuple_GET_ITEM(retobj, 0))) ||
            !PyInt_Check(PyTuple_GET_ITEM(retobj, 1))) {
            PyErr_SetString(PyExc_ValueError, "decoding error handler "
                            "must return (unicode, int) tuple");
            goto errorexit;
        }

        if (PyUnicode_GET_SIZE(retuni) > 0) {
            /* append the handler's replacement text to the output */
            size_t retunisize = PyUnicode_GET_DATA_SIZE(retuni);

            if (outlen < retunisize)
                RESIZE_OUTBUFFER(err == EINVAL || retunisize > estep
                                 ? retunisize - outlen : estep);

            memcpy(out, PyUnicode_AS_DATA(retuni), retunisize);
            out += retunisize;
            outlen -= retunisize;
        }

        newpos = PyInt_AS_LONG(PyTuple_GET_ITEM(retobj, 1));
        Py_DECREF(retobj);
        retobj = NULL;

        /* a negative position counts from the end of the input; use the
         * signed length so the arithmetic is not done in size_t */
        if (newpos < 0)
            newpos = inplen_int + newpos;
        if (newpos < 0 || newpos > inplen_int) {
            PyErr_Format(PyExc_IndexError,
                "position %ld from error handler out of bounds", newpos);
            goto errorexit;
        }
        if (newpos == inplen_int)
            break;
        inp = inp_top + newpos;
        inplen = inplen_total - (size_t)newpos;
    }
#undef RESIZE_OUTBUFFER

    /* trim the unicode object to the code units actually written */
    finalsize = (int)(out - out_top);
    if ((size_t)finalsize != outlen_total) {
        if (PyUnicode_Resize(&outputobj, finalsize / Py_UNICODE_SIZE) == -1)
            goto errorexit;
    }

    consumed = PyInt_FromLong(inplen_int);
    if (consumed == NULL)
        goto errorexit;
    rettup = PyTuple_New(2);
    if (rettup == NULL) {
        Py_DECREF(consumed);
        goto errorexit;
    }
    PyTuple_SET_ITEM(rettup, 0, outputobj);   /* steals the references */
    PyTuple_SET_ITEM(rettup, 1, consumed);

    release_errorcallback(errorcb);
    Py_XDECREF(exceptionobj);
    return rettup;

errorexit:
    Py_XDECREF(retobj);
    Py_XDECREF(outputobj);
    release_errorcallback(errorcb);
    Py_XDECREF(exceptionobj);

    return NULL;
}

static struct PyMethodDef iconvcodec_methods[] = {
    {"encode",  (PyCFunction)iconvcodec_encode,
                METH_VARARGS | METH_KEYWORDS,
                iconvcodec_encode__doc__},
    {"decode",  (PyCFunction)iconvcodec_decode,
                METH_VARARGS | METH_KEYWORDS,
                iconvcodec_decode__doc__},
    {NULL,      NULL},
};

/*
 * tp_new: allocate a codec and open both iconv descriptors.
 *
 * The external encoding name is read from the instance's `encoding'
 * attribute (looked up through the type, so a subclass can supply it);
 * DEFAULT_ENCODING is used when the attribute does not exist.
 */
static PyObject *
iconvcodec_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
{
    PyObject *encobj = NULL;
    iconvcodecObject *self = NULL;

    self = (iconvcodecObject *)type->tp_alloc(type, 0);
    if (self == NULL)
        return NULL;

    /* put the object in a state iconvcodec_dealloc can always handle */
    self->encoding = NULL;
    self->enchdl = self->dechdl = (iconv_t)(-1);

    encobj = PyObject_GetAttrString((PyObject *)self, "encoding");
    if (encobj == NULL) {
        PyErr_Clear();
        self->encoding = PyMem_Malloc(sizeof(DEFAULT_ENCODING));
        if (self->encoding == NULL) {
            PyErr_NoMemory();
            goto errorexit;
        }
        strcpy(self->encoding, DEFAULT_ENCODING);
    }
    else if (!PyString_Check(encobj)) {
        Py_DECREF(encobj);
        PyErr_SetString(PyExc_TypeError,
                        "`encoding' attribute must be a string.");
        goto errorexit;
    }
    else {
        self->encoding = PyMem_Malloc(PyString_GET_SIZE(encobj) + 1);
        if (self->encoding == NULL) {
            Py_DECREF(encobj);
            PyErr_NoMemory();
            goto errorexit;
        }
        strcpy(self->encoding, PyString_AS_STRING(encobj));
        Py_DECREF(encobj);
    }

    self->dechdl = iconv_open(UNICODE_ENCODING, self->encoding);
    if (self->dechdl == (iconv_t)(-1)) {
        PyErr_SetString(PyExc_ValueError, "unsupported decoding");
        goto errorexit;
    }

    self->enchdl = iconv_open(self->encoding, UNICODE_ENCODING);
    if (self->enchdl == (iconv_t)(-1)) {
        PyErr_SetString(PyExc_ValueError, "unsupported encoding");
        iconv_close(self->dechdl);
        self->dechdl = (iconv_t)(-1);
        goto errorexit;
    }

    return (PyObject *)self;

errorexit:
    /* runs iconvcodec_dealloc, which releases whatever was acquired */
    Py_XDECREF(self);

    return NULL;
}

/* tp_dealloc: close any open iconv descriptors and free the name. */
static void
iconvcodec_dealloc(iconvcodecObject *self)
{
    if (self->enchdl != (iconv_t)-1)
        iconv_close(self->enchdl);
    if (self->dechdl != (iconv_t)-1)
        iconv_close(self->dechdl);
    if (self->encoding != NULL)
        PyMem_Free(self->encoding);

    self->ob_type->tp_free((PyObject *)self);
}

/* tp_repr: string representation naming the external encoding. */
static PyObject *
iconvcodec_repr(PyObject *self)
{
    /* NOTE(review): the format text was empty in the reviewed copy
     * although an argument is passed; restored to include the encoding
     * name -- confirm the exact wording against upstream. */
    return PyString_FromFormat("<iconvcodec encoding='%s'>",
                    ((iconvcodecObject *)self)->encoding);
}

static PyTypeObject iconvcodec_Type = {
    PyObject_HEAD_INIT(NULL)
    0,                              /* Number of items for varobject */
    "iconvcodec",                   /* Name of this type */
    sizeof(iconvcodecObject),       /* Basic object size */
    0,                              /* Item size for varobject */
    (destructor)iconvcodec_dealloc, /* tp_dealloc */
    0,                              /* tp_print */
    0,                              /* tp_getattr */
    0,                              /* tp_setattr */
    0,                              /* tp_compare */
    iconvcodec_repr,                /* tp_repr */
    0,                              /* tp_as_number */
    0,                              /* tp_as_sequence */
    0,                              /* tp_as_mapping */
    0,                              /* tp_hash */
    0,                              /* tp_call */
    0,                              /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                              /* tp_setattro */
    0,                              /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
    iconvcodec_doc,                 /* tp_doc */
    0,                              /* tp_traverse */
    0,                              /* tp_clear */
    0,                              /* tp_richcompare */
    0,                              /* tp_weaklistoffset */
    0,                              /* tp_iter */
    0,                              /* tp_iternext */
    iconvcodec_methods,             /* tp_methods */
    0,                              /* tp_members */
    0,                              /* tp_getset */
    0,                              /* tp_base */
    0,                              /* tp_dict */
    0,                              /* tp_descr_get */
    0,                              /* tp_descr_set */
    0,                              /* tp_dictoffset */
    0,                              /* tp_init */
    PyType_GenericAlloc,            /* tp_alloc */
    iconvcodec_new,                 /* tp_new */
    PyObject_Del,                   /* tp_free */
};

static struct PyMethodDef _iconv_codec_methods[] = {
    {NULL, NULL},
};

/*
 * Module initialisation.
 *
 * Converts the single character '0' to UNICODE_ENCODING to find out
 * whether iconv produces native byte order (sets `byteswap'), then
 * creates the module and registers the iconvcodec type and constants.
 * On failure a RuntimeError is set and the function returns early.
 */
void
init_iconv_codec(void)
{
    PyObject *m;

    char in = '0';
    char *inptr = &in;
    size_t insize = 1;
    Py_UNICODE out = 0;
    char *outptr = (char *)&out;
    size_t outsize = sizeof(out);
    size_t res;

    iconv_t hdl = iconv_open(UNICODE_ENCODING, "ISO-8859-1");

    if (hdl == (iconv_t)-1) {
        PyErr_SetString(PyExc_RuntimeError,
          "can't initialize the _iconv_codec module: iconv_open() failed");
        return;
    }

    res = iconv(hdl, &inptr, &insize, &outptr, &outsize);
    /* the probe descriptor is not needed past this point on any path */
    iconv_close(hdl);
    if (res == (size_t)-1) {
        PyErr_SetString(PyExc_RuntimeError,
          "can't initialize the _iconv_codec module: iconv() failed");
        return;
    }

    /* Check whether conv() returned native endianess or not for the chosen
       encoding */
    if (out == 0x30)
        byteswap = 0;
#if Py_UNICODE_SIZE == 2
    else if (out == 0x3000)
#else
    else if (out == 0x30000000)
#endif
        byteswap = 1;
    else {
        PyErr_SetString(PyExc_RuntimeError,
          "can't initialize the _iconv_codec module: mixed endianess");
        return;
    }

    iconvcodec_Type.ob_type = &PyType_Type;
    m = Py_InitModule("_iconv_codec", _iconv_codec_methods);
    if (m == NULL)
        return; /* exception already set by Py_InitModule */

    PyModule_AddStringConstant(m, "__version__", (char*)__version__);
    Py_INCREF(&iconvcodec_Type);
    PyModule_AddObject(m, "iconvcodec", (PyObject *)(&iconvcodec_Type));
    PyModule_AddStringConstant(m, "internal_encoding", UNICODE_ENCODING);

    if (PyErr_Occurred())
        PyErr_SetString(PyExc_RuntimeError,
                        "can't initialize the _iconv_codec module");
}

/*
 * ex: ts=8 sts=4 et
 * $Id$
 */