/* File: cpython/Modules/_codecsmodule.c */

/* ------------------------------------------------------------------------

   _codecs -- Provides access to the codec registry and the builtin
              codecs.

   This module should never be imported directly. The standard library
   module "codecs" wraps this builtin module for use within Python.

   The codec registry is accessible via:

     register(search_function) -> None

     lookup(encoding) -> CodecInfo object

   The builtin Unicode codecs use the following interface:

     <encoding>_encode(Unicode_object[,errors='strict']) ->
         (string object, bytes consumed)

     <encoding>_decode(char_buffer_obj[,errors='strict']) ->
         (Unicode object, bytes consumed)

   <encoding>_encode() interfaces also accept non-Unicode objects as
   input. The objects are then converted to Unicode using
   PyUnicode_FromObject() prior to applying the conversion.

   These <encoding>s are available: utf_8, unicode_escape,
   raw_unicode_escape, unicode_internal, latin_1, ascii (7-bit),
   mbcs (on win32).

   Written by Marc-Andre Lemburg (mal@lemburg.com).

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */

#define PY_SSIZE_T_CLEAN
#include "Python.h"
/* --- Registry ----------------------------------------------------------- */
PyDoc_STRVAR(register__doc__,
"register(search_function)\n"
"\n"
"Register a codec search function. Search functions are expected to take\n"
"one argument, the encoding name in all lower case letters, and return\n"
"a tuple of functions (encoder, decoder, stream_reader, stream_writer)\n"
"(or a CodecInfo object).");

/* register(search_function) -> None

   Hands the single argument to PyCodec_Register().  A non-zero result from
   that call is reported as NULL (the exception is expected to be set by the
   callee); otherwise None is returned. */
static PyObject *
codec_register(PyObject *self, PyObject *search_function)
{
    if (PyCodec_Register(search_function) != 0) {
        return NULL;
    }
    Py_RETURN_NONE;
}
PyDoc_STRVAR(lookup__doc__,
"lookup(encoding) -> CodecInfo\n\
\n\
Looks up a codec tuple in the Python codec registry and returns\n\
Merged revisions 69578-69580,69901,69907,69994,70022-70023,70025-70026,70166,70273,70275,70342,70386-70387,70389-70390,70392-70393,70395,70397,70400,70418 via svnmerge ........ r69578 | georg.brandl | 2009-02-13 12:03:59 +0100 (Fr, 13 Feb 2009) | 1 line #3694: add test for fix committed in r66693. ........ r69579 | georg.brandl | 2009-02-13 12:06:59 +0100 (Fr, 13 Feb 2009) | 2 lines Fix warnings GCC emits where the argument of PyErr_Format is a single variable. ........ r69580 | georg.brandl | 2009-02-13 12:10:04 +0100 (Fr, 13 Feb 2009) | 2 lines Fix warnings GCC emits where the argument of PyErr_Format is a single variable. ........ r69901 | georg.brandl | 2009-02-23 12:24:46 +0100 (Mo, 23 Feb 2009) | 2 lines #5349: C++ pure virtuals can also have an implementation. ........ r69907 | georg.brandl | 2009-02-23 19:33:48 +0100 (Mo, 23 Feb 2009) | 1 line Fix grammar. ........ r69994 | georg.brandl | 2009-02-26 18:36:26 +0100 (Do, 26 Feb 2009) | 1 line Document that setting sys.py3kwarning wont do anything. ........ r70022 | georg.brandl | 2009-02-27 17:23:18 +0100 (Fr, 27 Feb 2009) | 1 line #5361: fix typo. ........ r70023 | georg.brandl | 2009-02-27 17:39:26 +0100 (Fr, 27 Feb 2009) | 1 line #5363: fix cmpfiles() docs. Another instance where a prose description is twice as long as the code. ........ r70025 | georg.brandl | 2009-02-27 17:52:55 +0100 (Fr, 27 Feb 2009) | 1 line #5344: fix punctuation. ........ r70026 | georg.brandl | 2009-02-27 17:59:03 +0100 (Fr, 27 Feb 2009) | 1 line #5365: add quick look conversion table for different time representations. ........ r70166 | georg.brandl | 2009-03-04 19:24:41 +0100 (Mi, 04 Mär 2009) | 2 lines Remove obsolete stuff from string module docs. ........ r70273 | georg.brandl | 2009-03-09 15:25:07 +0100 (Mo, 09 Mär 2009) | 2 lines #5458: add a note when we started to raise RuntimeErrors. ........ r70275 | georg.brandl | 2009-03-09 17:35:48 +0100 (Mo, 09 Mär 2009) | 2 lines Add missing space. ........ 
r70342 | georg.brandl | 2009-03-13 20:03:58 +0100 (Fr, 13 Mär 2009) | 1 line #5486: typos. ........ r70386 | georg.brandl | 2009-03-15 22:32:06 +0100 (So, 15 Mär 2009) | 1 line #5496: fix docstring of lookup(). ........ r70387 | georg.brandl | 2009-03-15 22:37:16 +0100 (So, 15 Mär 2009) | 1 line #5493: clarify __nonzero__ docs. ........ r70389 | georg.brandl | 2009-03-15 22:43:38 +0100 (So, 15 Mär 2009) | 1 line Fix a small nit in the error message if bool() falls back on __len__ and it returns the wrong type: it would tell the user that __nonzero__ should return bool or int. ........ r70390 | georg.brandl | 2009-03-15 22:44:43 +0100 (So, 15 Mär 2009) | 1 line #5491: clarify nested() semantics. ........ r70392 | georg.brandl | 2009-03-15 22:46:00 +0100 (So, 15 Mär 2009) | 1 line #5488: add missing struct member. ........ r70393 | georg.brandl | 2009-03-15 22:47:42 +0100 (So, 15 Mär 2009) | 1 line #5478: fix copy-paste oversight in function signature. ........ r70395 | georg.brandl | 2009-03-15 22:51:48 +0100 (So, 15 Mär 2009) | 1 line #5276: document IDLESTARTUP and .Idle.py. ........ r70397 | georg.brandl | 2009-03-15 22:53:56 +0100 (So, 15 Mär 2009) | 1 line #5469: add with statement to list of name-binding constructs. ........ r70400 | georg.brandl | 2009-03-15 22:59:37 +0100 (So, 15 Mär 2009) | 3 lines Fix markup in re docs and give a mail address in regex howto, so that the recommendation to send suggestions to the author can be followed. ........ r70418 | georg.brandl | 2009-03-16 20:42:03 +0100 (Mo, 16 Mär 2009) | 1 line Add token markup. ........
2009-04-05 18:48:06 -03:00
a CodecInfo object.");
static
PyObject *codec_lookup(PyObject *self, PyObject *args)
{
char *encoding;
if (!PyArg_ParseTuple(args, "s:lookup", &encoding))
return NULL;
return _PyCodec_Lookup(encoding);
}
PyDoc_STRVAR(encode__doc__,
"encode(obj, [encoding[,errors]]) -> object\n"
"\n"
"Encodes obj using the codec registered for encoding. encoding defaults\n"
"to the default encoding. errors may be given to set a different error\n"
"handling scheme. Default is 'strict' meaning that encoding errors raise\n"
"a ValueError. Other possible values are 'ignore', 'replace' and\n"
"'xmlcharrefreplace' as well as any other name registered with\n"
"codecs.register_error that can handle ValueErrors.");

/* encode(obj[, encoding[, errors]]) -> object

   Parses the arguments and forwards them to PyCodec_Encode().

   encoding: optional codec name.  When omitted, Unicode-enabled builds fall
             back to PyUnicode_GetDefaultEncoding(); builds without
             Py_USING_UNICODE raise ValueError("no encoding specified").
   errors:   optional error handler name, passed through unchanged (NULL
             when omitted).

   Returns the result of PyCodec_Encode(), or NULL on a parse failure or a
   missing encoding. */
static PyObject *
codec_encode(PyObject *self, PyObject *args)
{
    const char *encoding = NULL;
    const char *errors = NULL;
    PyObject *v;

    if (!PyArg_ParseTuple(args, "O|ss:encode", &v, &encoding, &errors))
        return NULL;

#ifdef Py_USING_UNICODE
    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();
#else
    if (encoding == NULL) {
        PyErr_SetString(PyExc_ValueError, "no encoding specified");
        return NULL;
    }
#endif

    /* Encode via the codec registry */
    return PyCodec_Encode(v, encoding, errors);
}
PyDoc_STRVAR(decode__doc__,
"decode(obj, [encoding[,errors]]) -> object\n\
\n\
Decodes obj using the codec registered for encoding. encoding defaults\n\
to the default encoding. errors may be given to set a different error\n\
handling scheme. Default is 'strict' meaning that encoding errors raise\n\
a ValueError. Other possible values are 'ignore' and 'replace'\n\
Merged revisions 66766-66767,66771-66772,66774,66776,66783-66787,66790,66793,66797 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ................ r66766 | benjamin.peterson | 2008-10-03 06:52:06 -0500 (Fri, 03 Oct 2008) | 1 line update the mac installer script ................ r66767 | andrew.kuchling | 2008-10-03 07:26:42 -0500 (Fri, 03 Oct 2008) | 1 line Docstring typo. ................ r66771 | hirokazu.yamamoto | 2008-10-03 11:18:42 -0500 (Fri, 03 Oct 2008) | 2 lines Fixed following error when DocXMLRPCServer failed. UnboundLocalError: local variable 'serv' referenced before assignment ................ r66772 | andrew.kuchling | 2008-10-03 11:29:19 -0500 (Fri, 03 Oct 2008) | 1 line Mention exception in docstring ................ r66774 | andrew.kuchling | 2008-10-03 11:42:52 -0500 (Fri, 03 Oct 2008) | 1 line Typo fix ................ r66776 | hirokazu.yamamoto | 2008-10-03 12:34:49 -0500 (Fri, 03 Oct 2008) | 2 lines Issue #1706863: Fixed "'NoneType' object has no attribute 'rfind'" error when sqlite libfile not found. ................ r66783 | andrew.kuchling | 2008-10-03 20:02:29 -0500 (Fri, 03 Oct 2008) | 1 line Use correct capitalization of NaN ................ r66784 | andrew.kuchling | 2008-10-03 20:03:42 -0500 (Fri, 03 Oct 2008) | 1 line Docstring change: Specify exception raised ................ r66785 | andrew.kuchling | 2008-10-03 20:04:24 -0500 (Fri, 03 Oct 2008) | 1 line Docstring changes: Specify exceptions raised ................ r66786 | andrew.kuchling | 2008-10-03 20:05:56 -0500 (Fri, 03 Oct 2008) | 3 lines Docstring change for *partition: use same tense as other docstrings. Hyphenate left- and right-justified. Fix 'registerd' typo ................ r66787 | andrew.kuchling | 2008-10-03 22:08:56 -0500 (Fri, 03 Oct 2008) | 1 line two corrections ................ r66790 | andrew.kuchling | 2008-10-04 11:52:01 -0500 (Sat, 04 Oct 2008) | 1 line Set svn:keywords ................ 
r66793 | georg.brandl | 2008-10-04 13:26:01 -0500 (Sat, 04 Oct 2008) | 2 lines #4041: don't refer to removed and outdated modules. ................ r66797 | benjamin.peterson | 2008-10-04 15:55:50 -0500 (Sat, 04 Oct 2008) | 19 lines Merged revisions 66707,66775,66782 via svnmerge from svn+ssh://pythondev@svn.python.org/sandbox/trunk/2to3/lib2to3 ........ r66707 | benjamin.peterson | 2008-09-30 18:27:10 -0500 (Tue, 30 Sep 2008) | 1 line fix #4001: fix_imports didn't check for __init__.py before converting to relative imports ........ r66775 | collin.winter | 2008-10-03 12:08:26 -0500 (Fri, 03 Oct 2008) | 4 lines Add an alternative iterative pattern matching system that, while slower, correctly parses files that cause the faster recursive pattern matcher to fail with a recursion error. lib2to3 falls back to the iterative matcher if the recursive one fails. Fixes http://bugs.python.org/issue2532. Thanks to Nick Edds. ........ r66782 | benjamin.peterson | 2008-10-03 17:51:36 -0500 (Fri, 03 Oct 2008) | 1 line add Victor Stinner's fixer for os.getcwdu -> os.getcwd #4023 ........ ................
2008-10-04 18:33:08 -03:00
as well as any other name registered with codecs.register_error that is\n\
able to handle ValueErrors.");
static PyObject *
codec_decode(PyObject *self, PyObject *args)
{
const char *encoding = NULL;
const char *errors = NULL;
PyObject *v;
2005-11-02 04:30:08 -04:00
if (!PyArg_ParseTuple(args, "O|ss:decode", &v, &encoding, &errors))
return NULL;
#ifdef Py_USING_UNICODE
if (encoding == NULL)
encoding = PyUnicode_GetDefaultEncoding();
#else
if (encoding == NULL) {
PyErr_SetString(PyExc_ValueError, "no encoding specified");
return NULL;
}
#endif
/* Decode via the codec registry */
return PyCodec_Decode(v, encoding, errors);
}
/* --- Helpers ------------------------------------------------------------ */
/* Build the (object, length) pair that every codec entry point returns.

   unicode: result object produced by a codec call, or NULL if that call
            failed.  NULL is passed straight through as NULL.
   len:     count reported as the second tuple item.

   The reference the caller holds on `unicode` is dropped here in every
   non-NULL case, whether or not Py_BuildValue() succeeds. */
static PyObject *
codec_tuple(PyObject *unicode,
            Py_ssize_t len)
{
    PyObject *pair = NULL;

    if (unicode != NULL) {
        pair = Py_BuildValue("On", unicode, len);
        Py_DECREF(unicode);
    }
    return pair;
}
/* --- String codecs ------------------------------------------------------ */
/* escape_decode(data[, errors]) -> (string, length)

   Runs PyString_DecodeEscape() over the input bytes and pairs the result
   with the input size.  `errors` may be omitted or None ("z" format).
   Returns NULL on a parse failure or if decoding returns NULL. */
static PyObject *
escape_decode(PyObject *self,
              PyObject *args)
{
    const char *errors = NULL;
    const char *data;
    Py_ssize_t size;

    if (!PyArg_ParseTuple(args, "s#|z:escape_decode",
                          &data, &size, &errors))
        return NULL;

    return codec_tuple(PyString_DecodeEscape(data, size, errors, 0, NULL),
                       size);
}
/* escape_encode(str[, errors]) -> (escaped string, length consumed)

   Produces the repr()-style escaped form of a str object with the
   surrounding quotes stripped.

   The second tuple item is the number of INPUT bytes consumed, in line with
   the (output, consumed) convention used by the other encoders in this
   module.  It is therefore captured before `str` is rebound to the repr
   result; reporting the size of the escaped output would over-count
   whenever any byte needs escaping.

   Returns NULL if parsing fails, if PyString_Repr() fails, or if the
   resize fails. */
static PyObject *
escape_encode(PyObject *self,
              PyObject *args)
{
    PyObject *str;
    const char *errors = NULL;
    char *buf;
    Py_ssize_t consumed;
    Py_ssize_t len;

    if (!PyArg_ParseTuple(args, "O!|z:escape_encode",
                          &PyString_Type, &str, &errors))
        return NULL;

    /* Remember the input length before str is replaced by its repr. */
    consumed = PyString_GET_SIZE(str);

    str = PyString_Repr(str, 0);
    if (!str)
        return NULL;

    /* The string will be quoted. Unquote, similar to unicode-escape. */
    buf = PyString_AS_STRING(str);
    len = PyString_GET_SIZE(str);
    /* Source and destination overlap, hence memmove rather than memcpy. */
    memmove(buf, buf + 1, len - 2);
    if (_PyString_Resize(&str, len - 2) < 0)
        return NULL;

    return codec_tuple(str, consumed);
}
#ifdef Py_USING_UNICODE
/* --- Decoder ------------------------------------------------------------ */
/* unicode_internal_decode(obj[, errors]) -> (unicode, length)

   A unicode argument is returned as-is, paired with its character count.
   Any other argument is read through PyObject_AsReadBuffer() and decoded
   with _PyUnicode_DecodeUnicodeInternal(); the reported length is then the
   buffer size.  Returns NULL on a parse failure, a buffer failure, or a
   decode failure. */
static PyObject *
unicode_internal_decode(PyObject *self,
                        PyObject *args)
{
    PyObject *obj;
    const char *errors = NULL;
    const char *data;
    Py_ssize_t size;

    if (!PyArg_ParseTuple(args, "O|z:unicode_internal_decode",
                          &obj, &errors))
        return NULL;

    if (PyUnicode_Check(obj)) {
        /* codec_tuple() drops one reference, so take one first. */
        Py_INCREF(obj);
        return codec_tuple(obj, PyUnicode_GET_SIZE(obj));
    }
    else {
        if (PyObject_AsReadBuffer(obj, (const void **)&data, &size))
            return NULL;

        return codec_tuple(_PyUnicode_DecodeUnicodeInternal(data, size, errors),
                           size);
    }
}
/* utf_7_decode(data[, errors[, final]]) -> (unicode, bytes consumed)

   Decodes via PyUnicode_DecodeUTF7Stateful().  When `final` is false the
   decoder receives a pointer through which it can report how many bytes it
   consumed; when `final` is true it receives NULL and the full buffer
   length is reported.  The buffer is released before returning on every
   path after a successful parse. */
static PyObject *
utf_7_decode(PyObject *self,
             PyObject *args)
{
    Py_buffer pbuf;
    const char *errors = NULL;
    int final = 0;
    Py_ssize_t consumed;
    PyObject *decoded = NULL;

    if (!PyArg_ParseTuple(args, "s*|zi:utf_7_decode",
                          &pbuf, &errors, &final))
        return NULL;

    consumed = pbuf.len;
    decoded = PyUnicode_DecodeUTF7Stateful(pbuf.buf, pbuf.len, errors,
                                           final ? NULL : &consumed);
    PyBuffer_Release(&pbuf);
    if (decoded == NULL)
        return NULL;
    return codec_tuple(decoded, consumed);
}
/* utf_8_decode(data[, errors[, final]]) -> (unicode, bytes consumed)

   Decodes via PyUnicode_DecodeUTF8Stateful().  When `final` is false the
   decoder receives a pointer through which it can report how many bytes it
   consumed; when `final` is true it receives NULL and the full buffer
   length is reported.  The buffer is released before returning on every
   path after a successful parse. */
static PyObject *
utf_8_decode(PyObject *self,
             PyObject *args)
{
    Py_buffer pbuf;
    const char *errors = NULL;
    int final = 0;
    Py_ssize_t consumed;
    PyObject *decoded = NULL;

    if (!PyArg_ParseTuple(args, "s*|zi:utf_8_decode",
                          &pbuf, &errors, &final))
        return NULL;

    consumed = pbuf.len;
    decoded = PyUnicode_DecodeUTF8Stateful(pbuf.buf, pbuf.len, errors,
                                           final ? NULL : &consumed);
    PyBuffer_Release(&pbuf);
    if (decoded == NULL)
        return NULL;
    return codec_tuple(decoded, consumed);
}
/* utf_16_decode(data[, errors[, final]]) -> (unicode, bytes consumed)

   Decodes via PyUnicode_DecodeUTF16Stateful() with byteorder starting at 0.
   The byteorder value written back by the decoder is not exposed here (see
   utf_16_ex_decode for that).  When `final` is true the decoder gets a NULL
   consumed pointer and the full buffer length is reported. */
static PyObject *
utf_16_decode(PyObject *self,
              PyObject *args)
{
    Py_buffer pbuf;
    const char *errors = NULL;
    int byteorder = 0;
    int final = 0;
    Py_ssize_t consumed;
    PyObject *decoded;

    if (!PyArg_ParseTuple(args, "s*|zi:utf_16_decode",
                          &pbuf, &errors, &final))
        return NULL;

    consumed = pbuf.len; /* This is overwritten unless final is true. */
    decoded = PyUnicode_DecodeUTF16Stateful(pbuf.buf, pbuf.len, errors,
                                            &byteorder,
                                            final ? NULL : &consumed);
    PyBuffer_Release(&pbuf);
    if (decoded == NULL)
        return NULL;
    return codec_tuple(decoded, consumed);
}
/* utf_16_le_decode(data[, errors[, final]]) -> (unicode, bytes consumed)

   Same flow as utf_16_decode, but byteorder is pinned to -1, the value this
   module uses for its *_le variants.  When `final` is true the decoder gets
   a NULL consumed pointer and the full buffer length is reported. */
static PyObject *
utf_16_le_decode(PyObject *self,
                 PyObject *args)
{
    Py_buffer pbuf;
    const char *errors = NULL;
    int byteorder = -1;
    int final = 0;
    Py_ssize_t consumed;
    PyObject *decoded = NULL;

    if (!PyArg_ParseTuple(args, "s*|zi:utf_16_le_decode",
                          &pbuf, &errors, &final))
        return NULL;

    consumed = pbuf.len; /* This is overwritten unless final is true. */
    decoded = PyUnicode_DecodeUTF16Stateful(pbuf.buf, pbuf.len, errors,
                                            &byteorder,
                                            final ? NULL : &consumed);
    PyBuffer_Release(&pbuf);
    if (decoded == NULL)
        return NULL;
    return codec_tuple(decoded, consumed);
}
/* utf_16_be_decode(data[, errors[, final]]) -> (unicode, bytes consumed)

   Same flow as utf_16_decode, but byteorder is pinned to 1, the value this
   module uses for its *_be variants.  When `final` is true the decoder gets
   a NULL consumed pointer and the full buffer length is reported. */
static PyObject *
utf_16_be_decode(PyObject *self,
                 PyObject *args)
{
    Py_buffer pbuf;
    const char *errors = NULL;
    int byteorder = 1;
    int final = 0;
    Py_ssize_t consumed;
    PyObject *decoded = NULL;

    if (!PyArg_ParseTuple(args, "s*|zi:utf_16_be_decode",
                          &pbuf, &errors, &final))
        return NULL;

    consumed = pbuf.len; /* This is overwritten unless final is true. */
    decoded = PyUnicode_DecodeUTF16Stateful(pbuf.buf, pbuf.len, errors,
                                            &byteorder,
                                            final ? NULL : &consumed);
    PyBuffer_Release(&pbuf);
    if (decoded == NULL)
        return NULL;
    return codec_tuple(decoded, consumed);
}
/* This non-standard version also provides access to the byteorder
   parameter of the builtin UTF-16 codec.

   It returns a tuple (unicode, bytesread, byteorder) with byteorder
   being the value in effect at the end of data.

   Arguments: data[, errors[, byteorder[, final]]].  byteorder defaults to 0
   and is passed to the decoder by address, so the value placed in the
   result tuple is whatever the decoder left there.  Returns NULL on a parse
   failure or a decode failure. */
static PyObject *
utf_16_ex_decode(PyObject *self,
                 PyObject *args)
{
    Py_buffer pbuf;
    const char *errors = NULL;
    int byteorder = 0;
    PyObject *unicode, *tuple;
    int final = 0;
    Py_ssize_t consumed;

    if (!PyArg_ParseTuple(args, "s*|zii:utf_16_ex_decode",
                          &pbuf, &errors, &byteorder, &final))
        return NULL;

    consumed = pbuf.len; /* This is overwritten unless final is true. */
    unicode = PyUnicode_DecodeUTF16Stateful(pbuf.buf, pbuf.len, errors,
                                            &byteorder,
                                            final ? NULL : &consumed);
    PyBuffer_Release(&pbuf);
    if (unicode == NULL)
        return NULL;

    /* "O" takes its own reference, so ours is dropped afterwards. */
    tuple = Py_BuildValue("Oni", unicode, consumed, byteorder);
    Py_DECREF(unicode);
    return tuple;
}
/* utf_32_decode(data[, errors[, final]]) -> (unicode, bytes consumed)

   Decodes through PyUnicode_DecodeUTF32Stateful() starting from byteorder 0.
   With `final` false the decoder is given a slot to report the consumed
   byte count; with `final` true it is given NULL and the whole buffer
   length is reported. */
static PyObject *
utf_32_decode(PyObject *self,
              PyObject *args)
{
    Py_buffer view;
    const char *errors = NULL;
    int order = 0;
    int final = 0;
    Py_ssize_t nbytes;
    Py_ssize_t *nbytes_slot;
    PyObject *text;

    if (!PyArg_ParseTuple(args, "s*|zi:utf_32_decode",
                          &view, &errors, &final)) {
        return NULL;
    }

    /* Default report: everything consumed.  Only replaced via the slot. */
    nbytes = view.len;
    nbytes_slot = final ? NULL : &nbytes;

    text = PyUnicode_DecodeUTF32Stateful(view.buf, view.len, errors,
                                         &order, nbytes_slot);
    PyBuffer_Release(&view);

    if (text == NULL) {
        return NULL;
    }
    return codec_tuple(text, nbytes);
}
/* utf_32_le_decode(data[, errors[, final]]) -> (unicode, bytes consumed)

   Like utf_32_decode, with byteorder preset to -1 (the value this module
   uses for its *_le variants). */
static PyObject *
utf_32_le_decode(PyObject *self,
                 PyObject *args)
{
    Py_buffer view;
    const char *errors = NULL;
    int order = -1;
    int final = 0;
    Py_ssize_t nbytes;
    Py_ssize_t *nbytes_slot;
    PyObject *text;

    if (!PyArg_ParseTuple(args, "s*|zi:utf_32_le_decode",
                          &view, &errors, &final)) {
        return NULL;
    }

    /* Default report: everything consumed.  Only replaced via the slot. */
    nbytes = view.len;
    nbytes_slot = final ? NULL : &nbytes;

    text = PyUnicode_DecodeUTF32Stateful(view.buf, view.len, errors,
                                         &order, nbytes_slot);
    PyBuffer_Release(&view);

    if (text == NULL) {
        return NULL;
    }
    return codec_tuple(text, nbytes);
}
/* utf_32_be_decode(data[, errors[, final]]) -> (unicode, bytes consumed)

   Like utf_32_decode, with byteorder preset to 1 (the value this module
   uses for its *_be variants). */
static PyObject *
utf_32_be_decode(PyObject *self,
                 PyObject *args)
{
    Py_buffer view;
    const char *errors = NULL;
    int order = 1;
    int final = 0;
    Py_ssize_t nbytes;
    Py_ssize_t *nbytes_slot;
    PyObject *text;

    if (!PyArg_ParseTuple(args, "s*|zi:utf_32_be_decode",
                          &view, &errors, &final)) {
        return NULL;
    }

    /* Default report: everything consumed.  Only replaced via the slot. */
    nbytes = view.len;
    nbytes_slot = final ? NULL : &nbytes;

    text = PyUnicode_DecodeUTF32Stateful(view.buf, view.len, errors,
                                         &order, nbytes_slot);
    PyBuffer_Release(&view);

    if (text == NULL) {
        return NULL;
    }
    return codec_tuple(text, nbytes);
}
/* Extended UTF-32 decoder exposing the codec's byteorder parameter.

   Arguments: data[, errors[, byteorder[, final]]]; byteorder defaults to 0.
   The result is a 3-tuple (unicode, bytesread, byteorder), where byteorder
   is the value in effect at the end of data. */
static PyObject *
utf_32_ex_decode(PyObject *self,
                 PyObject *args)
{
    Py_buffer view;
    const char *errors = NULL;
    int order = 0;
    int final = 0;
    Py_ssize_t nbytes;
    Py_ssize_t *nbytes_slot;
    PyObject *text;
    PyObject *triple;

    if (!PyArg_ParseTuple(args, "s*|zii:utf_32_ex_decode",
                          &view, &errors, &order, &final)) {
        return NULL;
    }

    /* Default report: everything consumed.  Only replaced via the slot. */
    nbytes = view.len;
    nbytes_slot = final ? NULL : &nbytes;

    text = PyUnicode_DecodeUTF32Stateful(view.buf, view.len, errors,
                                         &order, nbytes_slot);
    PyBuffer_Release(&view);
    if (text == NULL) {
        return NULL;
    }

    triple = Py_BuildValue("Oni", text, nbytes, order);
    Py_DECREF(text);
    return triple;
}
/* unicode_escape_decode(data[, errors]) -> (unicode, length)

   Decodes via PyUnicode_DecodeUnicodeEscape() and reports the full input
   length as consumed.  The length is copied out of the Py_buffer before the
   buffer is released so that no field of a released view is read.
   A NULL decode result is propagated as NULL by codec_tuple(). */
static PyObject *
unicode_escape_decode(PyObject *self,
                      PyObject *args)
{
    Py_buffer pbuf;
    const char *errors = NULL;
    PyObject *unicode;
    Py_ssize_t size;

    if (!PyArg_ParseTuple(args, "s*|z:unicode_escape_decode",
                          &pbuf, &errors))
        return NULL;

    size = pbuf.len;
    unicode = PyUnicode_DecodeUnicodeEscape(pbuf.buf, size, errors);
    PyBuffer_Release(&pbuf);
    return codec_tuple(unicode, size);
}
/* raw_unicode_escape_decode(data[, errors]) -> (unicode, length)

   Decodes via PyUnicode_DecodeRawUnicodeEscape() and reports the full input
   length as consumed.  The length is copied out of the Py_buffer before the
   buffer is released so that no field of a released view is read.
   A NULL decode result is propagated as NULL by codec_tuple(). */
static PyObject *
raw_unicode_escape_decode(PyObject *self,
                          PyObject *args)
{
    Py_buffer pbuf;
    const char *errors = NULL;
    PyObject *unicode;
    Py_ssize_t size;

    if (!PyArg_ParseTuple(args, "s*|z:raw_unicode_escape_decode",
                          &pbuf, &errors))
        return NULL;

    size = pbuf.len;
    unicode = PyUnicode_DecodeRawUnicodeEscape(pbuf.buf, size, errors);
    PyBuffer_Release(&pbuf);
    return codec_tuple(unicode, size);
}
/* latin_1_decode(data[, errors]) -> (unicode, length)

   Decodes via PyUnicode_DecodeLatin1() and reports the full input length as
   consumed.  The length is copied out of the Py_buffer before the buffer is
   released so that no field of a released view is read.
   A NULL decode result is propagated as NULL by codec_tuple(). */
static PyObject *
latin_1_decode(PyObject *self,
               PyObject *args)
{
    Py_buffer pbuf;
    PyObject *unicode;
    const char *errors = NULL;
    Py_ssize_t size;

    if (!PyArg_ParseTuple(args, "s*|z:latin_1_decode",
                          &pbuf, &errors))
        return NULL;

    size = pbuf.len;
    unicode = PyUnicode_DecodeLatin1(pbuf.buf, size, errors);
    PyBuffer_Release(&pbuf);
    return codec_tuple(unicode, size);
}
/* ascii_decode(data[, errors]) -> (unicode, length)

   Decodes via PyUnicode_DecodeASCII() and reports the full input length as
   consumed.  The length is copied out of the Py_buffer before the buffer is
   released so that no field of a released view is read.
   A NULL decode result is propagated as NULL by codec_tuple(). */
static PyObject *
ascii_decode(PyObject *self,
             PyObject *args)
{
    Py_buffer pbuf;
    PyObject *unicode;
    const char *errors = NULL;
    Py_ssize_t size;

    if (!PyArg_ParseTuple(args, "s*|z:ascii_decode",
                          &pbuf, &errors))
        return NULL;

    size = pbuf.len;
    unicode = PyUnicode_DecodeASCII(pbuf.buf, size, errors);
    PyBuffer_Release(&pbuf);
    return codec_tuple(unicode, size);
}
/* charmap_decode(data[, errors[, mapping]]) -> (unicode, length)

   Decodes via PyUnicode_DecodeCharmap().  A mapping of None is treated the
   same as an omitted mapping (NULL is passed on).  The full input length is
   reported as consumed; it is copied out of the Py_buffer before the buffer
   is released so that no field of a released view is read.
   A NULL decode result is propagated as NULL by codec_tuple(). */
static PyObject *
charmap_decode(PyObject *self,
               PyObject *args)
{
    Py_buffer pbuf;
    PyObject *unicode;
    const char *errors = NULL;
    PyObject *mapping = NULL;
    Py_ssize_t size;

    if (!PyArg_ParseTuple(args, "s*|zO:charmap_decode",
                          &pbuf, &errors, &mapping))
        return NULL;
    if (mapping == Py_None)
        mapping = NULL;

    size = pbuf.len;
    unicode = PyUnicode_DecodeCharmap(pbuf.buf, size, mapping, errors);
    PyBuffer_Release(&pbuf);
    return codec_tuple(unicode, size);
}
#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
/* mbcs_decode(data[, errors[, final]]) -> (unicode, bytes consumed)

   Decodes via PyUnicode_DecodeMBCSStateful().  With `final` false the
   decoder receives a pointer through which it can report the consumed byte
   count; with `final` true it receives NULL and the full buffer length is
   reported.  Compiled only under the surrounding MS_WINDOWS guard. */
static PyObject *
mbcs_decode(PyObject *self,
            PyObject *args)
{
    Py_buffer pbuf;
    const char *errors = NULL;
    int final = 0;
    Py_ssize_t consumed;
    PyObject *decoded = NULL;

    if (!PyArg_ParseTuple(args, "s*|zi:mbcs_decode",
                          &pbuf, &errors, &final))
        return NULL;

    consumed = pbuf.len;
    decoded = PyUnicode_DecodeMBCSStateful(pbuf.buf, pbuf.len, errors,
                                           final ? NULL : &consumed);
    PyBuffer_Release(&pbuf);
    if (decoded == NULL)
        return NULL;
    return codec_tuple(decoded, consumed);
}
#endif /* MS_WINDOWS */
/* --- Encoder ------------------------------------------------------------ */
/* readbuffer_encode(data[, errors]) -> (string, length)

   Copies the bytes obtained through the "s#" format into a new string
   object and pairs it with the byte count.  `errors` is accepted for
   interface uniformity but is not used. */
static PyObject *
readbuffer_encode(PyObject *self,
                  PyObject *args)
{
    const char *data;
    Py_ssize_t size;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "s#|z:readbuffer_encode",
                          &data, &size, &errors))
        return NULL;

    return codec_tuple(PyString_FromStringAndSize(data, size),
                       size);
}
/* charbuffer_encode(data[, errors]) -> (string, length)

   Copies the bytes obtained through the "t#" format into a new string
   object and pairs it with the byte count.  `errors` is accepted for
   interface uniformity but is not used. */
static PyObject *
charbuffer_encode(PyObject *self,
                  PyObject *args)
{
    const char *data;
    Py_ssize_t size;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "t#|z:charbuffer_encode",
                          &data, &size, &errors))
        return NULL;

    return codec_tuple(PyString_FromStringAndSize(data, size),
                       size);
}
/* unicode_internal_encode(obj[, errors]) -> (string, length)

   For a unicode argument the raw internal data (PyUnicode_AS_DATA /
   PyUnicode_GET_DATA_SIZE) is copied into a new string.  For any other
   argument the bytes come from PyObject_AsReadBuffer().  In both cases the
   reported length is the number of bytes copied.  `errors` is parsed but
   not used.  Returns NULL on a parse failure or a buffer failure. */
static PyObject *
unicode_internal_encode(PyObject *self,
                        PyObject *args)
{
    PyObject *obj;
    const char *errors = NULL;
    const char *data;
    Py_ssize_t size;

    if (!PyArg_ParseTuple(args, "O|z:unicode_internal_encode",
                          &obj, &errors))
        return NULL;

    if (PyUnicode_Check(obj)) {
        data = PyUnicode_AS_DATA(obj);
        size = PyUnicode_GET_DATA_SIZE(obj);
    }
    else if (PyObject_AsReadBuffer(obj, (const void **)&data, &size)) {
        return NULL;
    }

    /* Both sources end in the same copy-and-wrap step. */
    return codec_tuple(PyString_FromStringAndSize(data, size),
                       size);
}
/* utf_7_encode(obj[, errors]) -> (string, characters consumed)

   Coerces the argument with PyUnicode_FromObject(), encodes it with
   PyUnicode_EncodeUTF7() (both integer flag arguments passed as 0) and
   reports the character count of the coerced text. */
static PyObject *
utf_7_encode(PyObject *self,
             PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *encoded;
    PyObject *result;
    const char *errors = NULL;
    Py_ssize_t nchars;

    if (!PyArg_ParseTuple(args, "O|z:utf_7_encode",
                          &input, &errors)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }

    nchars = PyUnicode_GET_SIZE(text);
    encoded = PyUnicode_EncodeUTF7(PyUnicode_AS_UNICODE(text), nchars,
                                   0, 0, errors);
    result = codec_tuple(encoded, nchars);
    Py_DECREF(text);
    return result;
}
/* utf_8_encode(obj[, errors]) -> (string, characters consumed)

   Coerces the argument with PyUnicode_FromObject(), encodes it with
   PyUnicode_EncodeUTF8() and reports the character count of the coerced
   text. */
static PyObject *
utf_8_encode(PyObject *self,
             PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *encoded;
    PyObject *result;
    const char *errors = NULL;
    Py_ssize_t nchars;

    if (!PyArg_ParseTuple(args, "O|z:utf_8_encode",
                          &input, &errors)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }

    nchars = PyUnicode_GET_SIZE(text);
    encoded = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(text), nchars,
                                   errors);
    result = codec_tuple(encoded, nchars);
    Py_DECREF(text);
    return result;
}
/* This version provides access to the byteorder parameter of the
   builtin UTF-16 codecs as optional third argument. It defaults to 0
   which means: use the native byte order and prepend the data with a
   BOM mark.

   Arguments: obj[, errors[, byteorder]].  The argument is coerced with
   PyUnicode_FromObject(); the second tuple item is the character count of
   the coerced text.  Returns NULL on a parse, coercion or encode failure. */
static PyObject *
utf_16_encode(PyObject *self,
              PyObject *args)
{
    PyObject *str, *v;
    const char *errors = NULL;
    int byteorder = 0;

    if (!PyArg_ParseTuple(args, "O|zi:utf_16_encode",
                          &str, &errors, &byteorder))
        return NULL;

    /* From here on str is a new reference that must be released. */
    str = PyUnicode_FromObject(str);
    if (str == NULL)
        return NULL;
    v = codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str),
                                          PyUnicode_GET_SIZE(str),
                                          errors,
                                          byteorder),
                    PyUnicode_GET_SIZE(str));
    Py_DECREF(str);
    return v;
}
/* utf_16_le_encode(obj[, errors]) -> (string, characters consumed)

   Coerces the argument with PyUnicode_FromObject() and encodes it with
   PyUnicode_EncodeUTF16() using a fixed byteorder of -1. */
static PyObject *
utf_16_le_encode(PyObject *self,
                 PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *encoded;
    PyObject *result;
    const char *errors = NULL;
    Py_ssize_t nchars;

    if (!PyArg_ParseTuple(args, "O|z:utf_16_le_encode",
                          &input, &errors)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }

    nchars = PyUnicode_GET_SIZE(text);
    encoded = PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(text), nchars,
                                    errors, -1);
    result = codec_tuple(encoded, nchars);
    Py_DECREF(text);
    return result;
}
/* utf_16_be_encode(obj[, errors]) -> (string, characters consumed)

   Coerces the argument with PyUnicode_FromObject() and encodes it with
   PyUnicode_EncodeUTF16() using a fixed byteorder of +1. */
static PyObject *
utf_16_be_encode(PyObject *self,
                 PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *encoded;
    PyObject *result;
    const char *errors = NULL;
    Py_ssize_t nchars;

    if (!PyArg_ParseTuple(args, "O|z:utf_16_be_encode",
                          &input, &errors)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }

    nchars = PyUnicode_GET_SIZE(text);
    encoded = PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(text), nchars,
                                    errors, +1);
    result = codec_tuple(encoded, nchars);
    Py_DECREF(text);
    return result;
}
/* UTF-32 encoder with an optional third argument selecting the byteorder
   parameter of the builtin UTF-32 codecs.  The default of 0 means: use the
   native byte order and prepend the data with a BOM mark.

   Arguments: obj[, errors[, byteorder]].  Returns (string, character count
   of the coerced text). */
static PyObject *
utf_32_encode(PyObject *self,
              PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *encoded;
    PyObject *result;
    const char *errors = NULL;
    int order = 0;
    Py_ssize_t nchars;

    if (!PyArg_ParseTuple(args, "O|zi:utf_32_encode",
                          &input, &errors, &order)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }

    nchars = PyUnicode_GET_SIZE(text);
    encoded = PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(text), nchars,
                                    errors, order);
    result = codec_tuple(encoded, nchars);
    Py_DECREF(text);
    return result;
}
/* utf_32_le_encode(obj[, errors]) -> (string, characters consumed)

   Coerces the argument with PyUnicode_FromObject() and encodes it with
   PyUnicode_EncodeUTF32() using a fixed byteorder of -1. */
static PyObject *
utf_32_le_encode(PyObject *self,
                 PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *encoded;
    PyObject *result;
    const char *errors = NULL;
    Py_ssize_t nchars;

    if (!PyArg_ParseTuple(args, "O|z:utf_32_le_encode",
                          &input, &errors)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }

    nchars = PyUnicode_GET_SIZE(text);
    encoded = PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(text), nchars,
                                    errors, -1);
    result = codec_tuple(encoded, nchars);
    Py_DECREF(text);
    return result;
}
/* utf_32_be_encode(obj[, errors]) -> (string, characters consumed)

   Coerces the argument with PyUnicode_FromObject() and encodes it with
   PyUnicode_EncodeUTF32() using a fixed byteorder of +1. */
static PyObject *
utf_32_be_encode(PyObject *self,
                 PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *encoded;
    PyObject *result;
    const char *errors = NULL;
    Py_ssize_t nchars;

    if (!PyArg_ParseTuple(args, "O|z:utf_32_be_encode",
                          &input, &errors)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }

    nchars = PyUnicode_GET_SIZE(text);
    encoded = PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(text), nchars,
                                    errors, +1);
    result = codec_tuple(encoded, nchars);
    Py_DECREF(text);
    return result;
}
/* unicode_escape_encode(obj[, errors]) -> (string, characters consumed)

   Coerces the argument with PyUnicode_FromObject() and encodes it with
   PyUnicode_EncodeUnicodeEscape().  `errors` is parsed but not passed to
   the encoder.  The second tuple item is the character count of the
   coerced text.  Returns NULL on a parse, coercion or encode failure. */
static PyObject *
unicode_escape_encode(PyObject *self,
                      PyObject *args)
{
    PyObject *str, *v;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "O|z:unicode_escape_encode",
                          &str, &errors))
        return NULL;

    /* From here on str is a new reference that must be released. */
    str = PyUnicode_FromObject(str);
    if (str == NULL)
        return NULL;
    v = codec_tuple(PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(str),
                                                  PyUnicode_GET_SIZE(str)),
                    PyUnicode_GET_SIZE(str));
    Py_DECREF(str);
    return v;
}
/* raw_unicode_escape_encode(obj[, errors]) -> (string, characters consumed)

   Coerces the argument with PyUnicode_FromObject() and encodes it with
   PyUnicode_EncodeRawUnicodeEscape().  `errors` is parsed but not handed to
   the encoder. */
static PyObject *
raw_unicode_escape_encode(PyObject *self,
                          PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *encoded;
    PyObject *result;
    const char *errors = NULL;
    Py_ssize_t nchars;

    if (!PyArg_ParseTuple(args, "O|z:raw_unicode_escape_encode",
                          &input, &errors)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }

    nchars = PyUnicode_GET_SIZE(text);
    encoded = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(text),
                                               nchars);
    result = codec_tuple(encoded, nchars);
    Py_DECREF(text);
    return result;
}
/* latin_1_encode(obj[, errors]) -> (string, characters consumed)

   Coerces the argument with PyUnicode_FromObject() and encodes it with
   PyUnicode_EncodeLatin1(). */
static PyObject *
latin_1_encode(PyObject *self,
               PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *encoded;
    PyObject *result;
    const char *errors = NULL;
    Py_ssize_t nchars;

    if (!PyArg_ParseTuple(args, "O|z:latin_1_encode",
                          &input, &errors)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }

    nchars = PyUnicode_GET_SIZE(text);
    encoded = PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(text), nchars,
                                     errors);
    result = codec_tuple(encoded, nchars);
    Py_DECREF(text);
    return result;
}
/* ascii_encode(obj[, errors]) -> (string, characters consumed)

   Coerces the argument with PyUnicode_FromObject() and encodes it with
   PyUnicode_EncodeASCII(). */
static PyObject *
ascii_encode(PyObject *self,
             PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *encoded;
    PyObject *result;
    const char *errors = NULL;
    Py_ssize_t nchars;

    if (!PyArg_ParseTuple(args, "O|z:ascii_encode",
                          &input, &errors)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }

    nchars = PyUnicode_GET_SIZE(text);
    encoded = PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(text), nchars,
                                    errors);
    result = codec_tuple(encoded, nchars);
    Py_DECREF(text);
    return result;
}
/* charmap_encode(obj[, errors[, mapping]]) -> (string, characters consumed)

   Coerces the argument with PyUnicode_FromObject() and encodes it with
   PyUnicode_EncodeCharmap().  A mapping of None is handed on as NULL, the
   same as an omitted mapping. */
static PyObject *
charmap_encode(PyObject *self,
               PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *encoded;
    PyObject *result;
    PyObject *mapping = NULL;
    const char *errors = NULL;
    Py_ssize_t nchars;

    if (!PyArg_ParseTuple(args, "O|zO:charmap_encode",
                          &input, &errors, &mapping)) {
        return NULL;
    }
    if (mapping == Py_None) {
        mapping = NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }

    nchars = PyUnicode_GET_SIZE(text);
    encoded = PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(text), nchars,
                                      mapping, errors);
    result = codec_tuple(encoded, nchars);
    Py_DECREF(text);
    return result;
}
/* charmap_build(map) -> encoding map

   Accepts exactly one unicode object ("U" format) and returns the result
   of PyUnicode_BuildEncodingMap() for it. */
static PyObject *
charmap_build(PyObject *self, PyObject *args)
{
    PyObject *decoding_table;

    if (PyArg_ParseTuple(args, "U:charmap_build", &decoding_table)) {
        return PyUnicode_BuildEncodingMap(decoding_table);
    }
    return NULL;
}
#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
/* mbcs_encode(obj[, errors]) -> (string, characters consumed)

   Coerces the argument with PyUnicode_FromObject() and encodes it with
   PyUnicode_EncodeMBCS().  Compiled only under the surrounding MS_WINDOWS
   guard. */
static PyObject *
mbcs_encode(PyObject *self,
            PyObject *args)
{
    PyObject *input;
    PyObject *text;
    PyObject *encoded;
    PyObject *result;
    const char *errors = NULL;
    Py_ssize_t nchars;

    if (!PyArg_ParseTuple(args, "O|z:mbcs_encode",
                          &input, &errors)) {
        return NULL;
    }

    text = PyUnicode_FromObject(input);
    if (text == NULL) {
        return NULL;
    }

    nchars = PyUnicode_GET_SIZE(text);
    encoded = PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(text), nchars,
                                   errors);
    result = codec_tuple(encoded, nchars);
    Py_DECREF(text);
    return result;
}
#endif /* MS_WINDOWS */
#endif /* Py_USING_UNICODE */
/* --- Error handler registry --------------------------------------------- */
PyDoc_STRVAR(register_error__doc__,
"register_error(errors, handler)\n"
"\n"
"Register the specified error handler under the name\n"
"errors. handler must be a callable object, that\n"
"will be called with an exception instance containing\n"
"information about the location of the encoding/decoding\n"
"error and must return a (replacement, new position) tuple.");

/* register_error(errors, handler) -> None

   Parses a name and a handler object and passes both to
   PyCodec_RegisterError().  Returns NULL when parsing fails or when that
   call returns non-zero. */
static PyObject *
register_error(PyObject *self, PyObject *args)
{
    const char *handler_name;
    PyObject *callback;

    if (!PyArg_ParseTuple(args, "sO:register_error",
                          &handler_name, &callback)) {
        return NULL;
    }
    if (PyCodec_RegisterError(handler_name, callback) != 0) {
        return NULL;
    }
    Py_RETURN_NONE;
}
PyDoc_STRVAR(lookup_error__doc__,
"lookup_error(errors) -> handler\n"
"\n"
"Return the error handler for the specified error handling name\n"
"or raise a LookupError, if no handler exists under this name.");

/* lookup_error(errors) -> handler

   Parses a single name and returns the result of PyCodec_LookupError()
   for it; NULL when parsing fails. */
static PyObject *
lookup_error(PyObject *self, PyObject *args)
{
    const char *handler_name;

    if (PyArg_ParseTuple(args, "s:lookup_error", &handler_name)) {
        return PyCodec_LookupError(handler_name);
    }
    return NULL;
}
/* --- Module API --------------------------------------------------------- */
/* Method table handed to Py_InitModule() in init_codecs().
   Each entry maps the Python-visible name to its C implementation above.
   Only the registry functions (register, lookup, encode, decode,
   register_error, lookup_error) supply a docstring; the per-codec entries
   give three fields and leave the doc slot unset. */
static PyMethodDef _codecs_functions[] = {
/* Codec registry access. */
{"register", codec_register, METH_O,
register__doc__},
{"lookup", codec_lookup, METH_VARARGS,
lookup__doc__},
{"encode", codec_encode, METH_VARARGS,
encode__doc__},
{"decode", codec_decode, METH_VARARGS,
decode__doc__},
/* String escape codecs: available with or without Unicode support. */
{"escape_encode", escape_encode, METH_VARARGS},
{"escape_decode", escape_decode, METH_VARARGS},
#ifdef Py_USING_UNICODE
/* Unicode codecs: present only in Unicode-enabled builds. */
{"utf_8_encode", utf_8_encode, METH_VARARGS},
{"utf_8_decode", utf_8_decode, METH_VARARGS},
{"utf_7_encode", utf_7_encode, METH_VARARGS},
{"utf_7_decode", utf_7_decode, METH_VARARGS},
{"utf_16_encode", utf_16_encode, METH_VARARGS},
{"utf_16_le_encode", utf_16_le_encode, METH_VARARGS},
{"utf_16_be_encode", utf_16_be_encode, METH_VARARGS},
{"utf_16_decode", utf_16_decode, METH_VARARGS},
{"utf_16_le_decode", utf_16_le_decode, METH_VARARGS},
{"utf_16_be_decode", utf_16_be_decode, METH_VARARGS},
{"utf_16_ex_decode", utf_16_ex_decode, METH_VARARGS},
{"utf_32_encode", utf_32_encode, METH_VARARGS},
{"utf_32_le_encode", utf_32_le_encode, METH_VARARGS},
{"utf_32_be_encode", utf_32_be_encode, METH_VARARGS},
{"utf_32_decode", utf_32_decode, METH_VARARGS},
{"utf_32_le_decode", utf_32_le_decode, METH_VARARGS},
{"utf_32_be_decode", utf_32_be_decode, METH_VARARGS},
{"utf_32_ex_decode", utf_32_ex_decode, METH_VARARGS},
{"unicode_escape_encode", unicode_escape_encode, METH_VARARGS},
{"unicode_escape_decode", unicode_escape_decode, METH_VARARGS},
{"unicode_internal_encode", unicode_internal_encode, METH_VARARGS},
{"unicode_internal_decode", unicode_internal_decode, METH_VARARGS},
{"raw_unicode_escape_encode", raw_unicode_escape_encode, METH_VARARGS},
{"raw_unicode_escape_decode", raw_unicode_escape_decode, METH_VARARGS},
{"latin_1_encode", latin_1_encode, METH_VARARGS},
{"latin_1_decode", latin_1_decode, METH_VARARGS},
{"ascii_encode", ascii_encode, METH_VARARGS},
{"ascii_decode", ascii_decode, METH_VARARGS},
{"charmap_encode", charmap_encode, METH_VARARGS},
{"charmap_decode", charmap_decode, METH_VARARGS},
{"charmap_build", charmap_build, METH_VARARGS},
{"readbuffer_encode", readbuffer_encode, METH_VARARGS},
{"charbuffer_encode", charbuffer_encode, METH_VARARGS},
#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
/* MBCS codec: same guard as the mbcs_encode/mbcs_decode definitions. */
{"mbcs_encode", mbcs_encode, METH_VARARGS},
{"mbcs_decode", mbcs_decode, METH_VARARGS},
#endif
#endif /* Py_USING_UNICODE */
/* Error handler registry. */
{"register_error", register_error, METH_VARARGS,
register_error__doc__},
{"lookup_error", lookup_error, METH_VARARGS,
lookup_error__doc__},
{NULL, NULL} /* sentinel */
};
/* Module initialisation entry point: creates the "_codecs" module from the
   _codecs_functions method table.  The module object returned by
   Py_InitModule() is not needed here and is deliberately discarded. */
PyMODINIT_FUNC
init_codecs(void)
{
    (void)Py_InitModule("_codecs", _codecs_functions);
}