diff --git a/Doc/lib/libcodecs.tex b/Doc/lib/libcodecs.tex index 136c5289923..85ca7a57266 100644 --- a/Doc/lib/libcodecs.tex +++ b/Doc/lib/libcodecs.tex @@ -17,7 +17,7 @@ This module defines base classes for standard Python codecs (encoders and decoders) and provides access to the internal Python codec -registry which manages the codec lookup process. +registry which manages the codec and error handling lookup process. It defines the following functions: @@ -98,6 +98,43 @@ Raises a \exception{LookupError} in case the encoding cannot be found. To simplify working with encoded files or stream, the module also defines these utility functions: +\begin{funcdesc}{register_error}{name, error_handler} +Register the error handling function \var{error_handler} under the +name \var{name}. \var{error_handler} will be called during encoding +and decoding in case of an error, when \var{name} is specified as the +errors parameter. \var{error_handler} will be called with an +\exception{UnicodeEncodeError}, \exception{UnicodeDecodeError} or +\exception{UnicodeTranslateError} instance and must return a tuple +with a replacement for the unencodable/undecodable part of the input +and a position where encoding/decoding should continue. +\end{funcdesc} + +\begin{funcdesc}{lookup_error}{name} +Return the error handler previously registered under the name \var{name}. + +Raises a \exception{LookupError} in case the handler cannot be found. +\end{funcdesc} + +\begin{funcdesc}{strict_errors}{exception} +Implements the \code{strict} error handling. +\end{funcdesc} + +\begin{funcdesc}{replace_errors}{exception} +Implements the \code{replace} error handling. +\end{funcdesc} + +\begin{funcdesc}{ignore_errors}{exception} +Implements the \code{ignore} error handling. +\end{funcdesc} + +\begin{funcdesc}{xmlcharrefreplace_errors}{exception} +Implements the \code{xmlcharrefreplace} error handling. 
+\end{funcdesc} + +\begin{funcdesc}{backslashreplace_errors}{exception} +Implements the \code{backslashreplace} error handling. +\end{funcdesc} + \begin{funcdesc}{open}{filename, mode\optional{, encoding\optional{, errors\optional{, buffering}}}} Open an encoded file using the given \var{mode} and return diff --git a/Doc/lib/libexcs.tex b/Doc/lib/libexcs.tex index 078fe3c12aa..54b141a9a14 100644 --- a/Doc/lib/libexcs.tex +++ b/Doc/lib/libexcs.tex @@ -335,6 +335,24 @@ Raised when an \keyword{assert} statement fails. \versionadded{2.0} \end{excdesc} +\begin{excdesc}{UnicodeEncodeError} + Raised when a Unicode-related error occurs during encoding. It + is a subclass of \exception{UnicodeError}. +\versionadded{2.3} +\end{excdesc} + +\begin{excdesc}{UnicodeDecodeError} + Raised when a Unicode-related error occurs during decoding. It + is a subclass of \exception{UnicodeError}. +\versionadded{2.3} +\end{excdesc} + +\begin{excdesc}{UnicodeTranslateError} + Raised when a Unicode-related error occurs during translating. It + is a subclass of \exception{UnicodeError}. +\versionadded{2.3} +\end{excdesc} + \begin{excdesc}{ValueError} Raised when a built-in operation or function receives an argument that has the right type but an inappropriate value, and the @@ -426,6 +444,9 @@ The class hierarchy for built-in exceptions is: | | +-- FloatingPointError | +-- ValueError | | +-- UnicodeError + | | +-- UnicodeEncodeError + | | +-- UnicodeDecodeError + | | +-- UnicodeTranslateError | +-- ReferenceError | +-- SystemError | +-- MemoryError diff --git a/Include/codecs.h b/Include/codecs.h index 2cc4d7d350a..82f18cdc5e7 100644 --- a/Include/codecs.h +++ b/Include/codecs.h @@ -117,6 +117,36 @@ PyAPI_FUNC(PyObject *) PyCodec_StreamWriter( const char *errors ); +/* Unicode encoding error handling callback registry API */ + +/* Register the error handling callback function error under the name + name. 
This function will be called by the codec when it encounters + unencodable characters/undecodable bytes and doesn't know the + callback name, when name is specified as the error parameter + in the call to the encode/decode function. + Return 0 on success, -1 on error */ +PyAPI_FUNC(int) PyCodec_RegisterError(const char *name, PyObject *error); + +/* Lookup the error handling callback function registered under the + given name. As a special case NULL can be passed, in which case + the error handling callback for "strict" will be returned. */ +PyAPI_FUNC(PyObject *) PyCodec_LookupError(const char *name); + +/* raise exc as an exception */ +PyAPI_FUNC(PyObject *) PyCodec_StrictErrors(PyObject *exc); + +/* ignore the unicode error, skipping the faulty input */ +PyAPI_FUNC(PyObject *) PyCodec_IgnoreErrors(PyObject *exc); + +/* replace the unicode error with ? or U+FFFD */ +PyAPI_FUNC(PyObject *) PyCodec_ReplaceErrors(PyObject *exc); + +/* replace the unicode encode error with XML character references */ +PyAPI_FUNC(PyObject *) PyCodec_XMLCharRefReplaceErrors(PyObject *exc); + +/* replace the unicode encode error with backslash escapes (\x, \u and \U) */ +PyAPI_FUNC(PyObject *) PyCodec_BackslashReplaceErrors(PyObject *exc); + #ifdef __cplusplus } #endif diff --git a/Include/pyerrors.h b/Include/pyerrors.h index b783b7ba1d6..756c4b2fcec 100644 --- a/Include/pyerrors.h +++ b/Include/pyerrors.h @@ -54,6 +54,9 @@ PyAPI_DATA(PyObject *) PyExc_SystemExit; PyAPI_DATA(PyObject *) PyExc_TypeError; PyAPI_DATA(PyObject *) PyExc_UnboundLocalError; PyAPI_DATA(PyObject *) PyExc_UnicodeError; +PyAPI_DATA(PyObject *) PyExc_UnicodeEncodeError; +PyAPI_DATA(PyObject *) PyExc_UnicodeDecodeError; +PyAPI_DATA(PyObject *) PyExc_UnicodeTranslateError; PyAPI_DATA(PyObject *) PyExc_ValueError; PyAPI_DATA(PyObject *) PyExc_ZeroDivisionError; #ifdef MS_WINDOWS @@ -114,6 +117,69 @@ PyAPI_FUNC(void) PyErr_SetInterrupt(void); PyAPI_FUNC(void) PyErr_SyntaxLocation(char *, int); PyAPI_FUNC(PyObject *) 
PyErr_ProgramText(char *, int); +/* The following functions are used to create and modify unicode + exceptions from C */ +/* create a UnicodeDecodeError object */ +PyAPI_FUNC(PyObject *) PyUnicodeDecodeError_Create( + const char *, const char *, int, int, int, const char *); + +/* create a UnicodeEncodeError object */ +PyAPI_FUNC(PyObject *) PyUnicodeEncodeError_Create( + const char *, const Py_UNICODE *, int, int, int, const char *); + +/* create a UnicodeTranslateError object */ +PyAPI_FUNC(PyObject *) PyUnicodeTranslateError_Create( + const Py_UNICODE *, int, int, int, const char *); + +/* get the encoding attribute */ +PyAPI_FUNC(PyObject *) PyUnicodeEncodeError_GetEncoding(PyObject *); +PyAPI_FUNC(PyObject *) PyUnicodeDecodeError_GetEncoding(PyObject *); +PyAPI_FUNC(PyObject *) PyUnicodeTranslateError_GetEncoding(PyObject *); + +/* get the object attribute */ +PyAPI_FUNC(PyObject *) PyUnicodeEncodeError_GetObject(PyObject *); +PyAPI_FUNC(PyObject *) PyUnicodeDecodeError_GetObject(PyObject *); +PyAPI_FUNC(PyObject *) PyUnicodeTranslateError_GetObject(PyObject *); + +/* get the value of the start attribute (the int * may not be NULL) + return 0 on success, -1 on failure */ +PyAPI_FUNC(int) PyUnicodeEncodeError_GetStart(PyObject *, int *); +PyAPI_FUNC(int) PyUnicodeDecodeError_GetStart(PyObject *, int *); +PyAPI_FUNC(int) PyUnicodeTranslateError_GetStart(PyObject *, int *); + +/* assign a new value to the start attribute + return 0 on success, -1 on failure */ +PyAPI_FUNC(int) PyUnicodeEncodeError_SetStart(PyObject *, int); +PyAPI_FUNC(int) PyUnicodeDecodeError_SetStart(PyObject *, int); +PyAPI_FUNC(int) PyUnicodeTranslateError_SetStart(PyObject *, int); + +/* get the value of the end attribute (the int *may not be NULL) + return 0 on success, -1 on failure */ +PyAPI_FUNC(int) PyUnicodeEncodeError_GetEnd(PyObject *, int *); +PyAPI_FUNC(int) PyUnicodeDecodeError_GetEnd(PyObject *, int *); +PyAPI_FUNC(int) PyUnicodeTranslateError_GetEnd(PyObject *, int *); + +/* 
assign a new value to the end attribute + return 0 on success, -1 on failure */ +PyAPI_FUNC(int) PyUnicodeEncodeError_SetEnd(PyObject *, int); +PyAPI_FUNC(int) PyUnicodeDecodeError_SetEnd(PyObject *, int); +PyAPI_FUNC(int) PyUnicodeTranslateError_SetEnd(PyObject *, int); + +/* get the value of the reason attribute */ +PyAPI_FUNC(PyObject *) PyUnicodeEncodeError_GetReason(PyObject *); +PyAPI_FUNC(PyObject *) PyUnicodeDecodeError_GetReason(PyObject *); +PyAPI_FUNC(PyObject *) PyUnicodeTranslateError_GetReason(PyObject *); + +/* assign a new value to the reason attribute + return 0 on success, -1 on failure */ +PyAPI_FUNC(int) PyUnicodeEncodeError_SetReason( + PyObject *, const char *); +PyAPI_FUNC(int) PyUnicodeDecodeError_SetReason( + PyObject *, const char *); +PyAPI_FUNC(int) PyUnicodeTranslateError_SetReason( + PyObject *, const char *); + + /* These APIs aren't really part of the error implementation, but often needed to format error messages; the native C lib APIs are not available on all platforms, which is why we provide emulations diff --git a/Lib/codecs.py b/Lib/codecs.py index b089e907662..40f0a2e2262 100644 --- a/Lib/codecs.py +++ b/Lib/codecs.py @@ -20,7 +20,10 @@ except ImportError, why: __all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE", "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE", - "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE"] + "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE", + "strict_errors", "ignore_errors", "replace_errors", + "xmlcharrefreplace_errors", + "register_error", "lookup_error"] ### Constants @@ -632,6 +635,14 @@ def make_encoding_map(decoding_map): m[v] = None return m +### error handlers + +strict_errors = lookup_error("strict") +ignore_errors = lookup_error("ignore") +replace_errors = lookup_error("replace") +xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace") +backslashreplace_errors = lookup_error("backslashreplace") + # Tell 
modulefinder that using codecs probably needs the encodings # package _false = 0 diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py new file mode 100644 index 00000000000..1650965a99a --- /dev/null +++ b/Lib/test/test_codeccallbacks.py @@ -0,0 +1,483 @@ +import test.test_support, unittest +import sys, codecs, htmlentitydefs, unicodedata + +class CodecCallbackTest(unittest.TestCase): + + def test_xmlcharrefreplace(self): + # replace unencodable characters which numeric character entities. + # For ascii, latin-1 and charmaps this is completely implemented + # in C and should be reasonably fast. + s = u"\u30b9\u30d1\u30e2 \xe4nd eggs" + self.assertEqual( + s.encode("ascii", "xmlcharrefreplace"), + "スパモ änd eggs" + ) + self.assertEqual( + s.encode("latin-1", "xmlcharrefreplace"), + "スパモ \xe4nd eggs" + ) + + def test_xmlcharnamereplace(self): + # This time use a named character entity for unencodable + # characters, if one is available. + names = {} + for (key, value) in htmlentitydefs.entitydefs.items(): + if len(value)==1: + names[unicode(value, "latin-1")] = unicode(key, "latin-1") + else: + names[unichr(int(value[2:-1]))] = unicode(key, "latin-1") + + def xmlcharnamereplace(exc): + if not isinstance(exc, UnicodeEncodeError): + raise TypeError("don't know how to handle %r" % exc) + l = [] + for c in exc.object[exc.start:exc.end]: + try: + l.append(u"&%s;" % names[c]) + except KeyError: + l.append(u"&#%d;" % ord(c)) + return (u"".join(l), exc.end) + + codecs.register_error( + "test.xmlcharnamereplace", xmlcharnamereplace) + + sin = u"\xab\u211c\xbb = \u2329\u1234\u20ac\u232a" + sout = "«ℜ» = ⟨ሴ€⟩" + self.assertEqual(sin.encode("ascii", "test.xmlcharnamereplace"), sout) + sout = "\xabℜ\xbb = ⟨ሴ€⟩" + self.assertEqual(sin.encode("latin-1", "test.xmlcharnamereplace"), sout) + sout = "\xabℜ\xbb = ⟨ሴ\xa4⟩" + self.assertEqual(sin.encode("iso-8859-15", "test.xmlcharnamereplace"), sout) + + def test_uninamereplace(self): + # We're using the names 
from the unicode database this time, + # and we're doing "systax highlighting" here, i.e. we include + # the replaced text in ANSI escape sequences. For this it is + # useful that the error handler is not called for every single + # unencodable character, but for a complete sequence of + # unencodable characters, otherwise we would output many + # unneccessary escape sequences. + + def uninamereplace(exc): + if not isinstance(exc, UnicodeEncodeError): + raise TypeError("don't know how to handle %r" % exc) + l = [] + for c in exc.object[exc.start:exc.end]: + l.append(unicodedata.name(c, u"0x%x" % ord(c))) + return (u"\033[1m%s\033[0m" % u", ".join(l), exc.end) + + codecs.register_error( + "test.uninamereplace", uninamereplace) + + sin = u"\xac\u1234\u20ac\u8000" + sout = "\033[1mNOT SIGN, ETHIOPIC SYLLABLE SEE, EURO SIGN, 0x8000\033[0m" + self.assertEqual(sin.encode("ascii", "test.uninamereplace"), sout) + + sout = "\xac\033[1mETHIOPIC SYLLABLE SEE, EURO SIGN, 0x8000\033[0m" + self.assertEqual(sin.encode("latin-1", "test.uninamereplace"), sout) + + sout = "\xac\033[1mETHIOPIC SYLLABLE SEE\033[0m\xa4\033[1m0x8000\033[0m" + self.assertEqual(sin.encode("iso-8859-15", "test.uninamereplace"), sout) + + def test_backslashescape(self): + # Does the same as the "unicode-escape" encoding, but with different + # base encodings. 
+ sin = u"a\xac\u1234\u20ac\u8000" + if sys.maxunicode > 0xffff: + sin += unichr(sys.maxunicode) + sout = "a\\xac\\u1234\\u20ac\\u8000" + if sys.maxunicode > 0xffff: + sout += "\\U%08x" % sys.maxunicode + self.assertEqual(sin.encode("ascii", "backslashreplace"), sout) + + sout = "a\xac\\u1234\\u20ac\\u8000" + if sys.maxunicode > 0xffff: + sout += "\\U%08x" % sys.maxunicode + self.assertEqual(sin.encode("latin-1", "backslashreplace"), sout) + + sout = "a\xac\\u1234\xa4\\u8000" + if sys.maxunicode > 0xffff: + sout += "\\U%08x" % sys.maxunicode + self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout) + + def test_relaxedutf8(self): + # This is the test for a decoding callback handler, + # that relaxes the UTF-8 minimal encoding restriction. + # A null byte that is encoded as "\xc0\x80" will be + # decoded as a null byte. All other illegal sequences + # will be handled strictly. + def relaxedutf8(exc): + if not isinstance(exc, UnicodeDecodeError): + raise TypeError("don't know how to handle %r" % exc) + if exc.object[exc.start:exc.end].startswith("\xc0\x80"): + return (u"\x00", exc.start+2) # retry after two bytes + else: + raise exc + + codecs.register_error( + "test.relaxedutf8", relaxedutf8) + + sin = "a\x00b\xc0\x80c\xc3\xbc\xc0\x80\xc0\x80" + sout = u"a\x00b\x00c\xfc\x00\x00" + self.assertEqual(sin.decode("utf-8", "test.relaxedutf8"), sout) + sin = "\xc0\x80\xc0\x81" + self.assertRaises(UnicodeError, sin.decode, "utf-8", "test.relaxedutf8") + + def test_charmapencode(self): + # For charmap encodings the replacement string will be + # mapped through the encoding again. This means, that + # to be able to use e.g. the "replace" handler, the + # charmap has to have a mapping for "?". 
+ charmap = dict([ (ord(c), 2*c.upper()) for c in "abcdefgh"]) + sin = u"abc" + sout = "AABBCC" + self.assertEquals(codecs.charmap_encode(sin, "strict", charmap)[0], sout) + + sin = u"abcA" + self.assertRaises(UnicodeError, codecs.charmap_encode, sin, "strict", charmap) + + charmap[ord("?")] = "XYZ" + sin = u"abcDEF" + sout = "AABBCCXYZXYZXYZ" + self.assertEquals(codecs.charmap_encode(sin, "replace", charmap)[0], sout) + + charmap[ord("?")] = u"XYZ" + self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap) + + charmap[ord("?")] = u"XYZ" + self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap) + + def test_callbacks(self): + def handler1(exc): + if not isinstance(exc, UnicodeEncodeError) \ + and not isinstance(exc, UnicodeDecodeError): + raise TypeError("don't know how to handle %r" % exc) + l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)] + return (u"[%s]" % u"".join(l), exc.end) + + codecs.register_error("test.handler1", handler1) + + def handler2(exc): + if not isinstance(exc, UnicodeDecodeError): + raise TypeError("don't know how to handle %r" % exc) + l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)] + return (u"[%s]" % u"".join(l), exc.end+1) # skip one character + + codecs.register_error("test.handler2", handler2) + + s = "\x00\x81\x7f\x80\xff" + + self.assertEqual( + s.decode("ascii", "test.handler1"), + u"\x00[<129>]\x7f[<128>][<255>]" + ) + self.assertEqual( + s.decode("ascii", "test.handler2"), + u"\x00[<129>][<128>]" + ) + + self.assertEqual( + "\\u3042\u3xxx".decode("unicode-escape", "test.handler1"), + u"\u3042[<92><117><51><120>]xx" + ) + + self.assertEqual( + "\\u3042\u3xx".decode("unicode-escape", "test.handler1"), + u"\u3042[<92><117><51><120><120>]" + ) + + self.assertEqual( + codecs.charmap_decode("abc", "test.handler1", {ord("a"): u"z"})[0], + u"z[<98>][<99>]" + ) + + self.assertEqual( + u"g\xfc\xdfrk".encode("ascii", "test.handler1"), + 
u"g[<252><223>]rk" + ) + + self.assertEqual( + u"g\xfc\xdf".encode("ascii", "test.handler1"), + u"g[<252><223>]" + ) + + def test_longstrings(self): + # test long strings to check for memory overflow problems + errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", "backslashreplace"] + # register the handlers under different names, + # to prevent the codec from recognizing the name + for err in errors: + codecs.register_error("test." + err, codecs.lookup_error(err)) + l = 1000 + errors += [ "test." + err for err in errors ] + for uni in [ s*l for s in (u"x", u"\u3042", u"a\xe4") ]: + for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", "utf-8", "utf-7", "utf-16"): + for err in errors: + try: + uni.encode(enc, err) + except UnicodeError: + pass + + def check_exceptionobjectargs(self, exctype, args, msg): + # Test UnicodeError subclasses: construction, attribute assignment and __str__ conversion + # check with one missing argument + self.assertRaises(TypeError, exctype, *args[:-1]) + # check with one missing argument + self.assertRaises(TypeError, exctype, *(args + ["too much"])) + # check with one argument of the wrong type + wrongargs = [ "spam", u"eggs", 42, 1.0, None ] + for i in xrange(len(args)): + for wrongarg in wrongargs: + if type(wrongarg) is type(args[i]): + continue + # build argument array + callargs = [] + for j in xrange(len(args)): + if i==j: + callargs.append(wrongarg) + else: + callargs.append(args[i]) + self.assertRaises(TypeError, exctype, *callargs) + exc = exctype(*args) + self.assertEquals(str(exc), msg) + + def test_unicodeencodeerror(self): + self.check_exceptionobjectargs( + UnicodeEncodeError, + ["ascii", u"g\xfcrk", 1, 2, "ouch"], + "'ascii' codec can't encode character '\ufc' in position 1: ouch" + ) + self.check_exceptionobjectargs( + UnicodeEncodeError, + ["ascii", u"g\xfcrk", 1, 4, "ouch"], + "'ascii' codec can't encode characters in position 1-3: ouch" + ) + self.check_exceptionobjectargs( + UnicodeEncodeError, + 
["ascii", u"\xfcx", 0, 1, "ouch"], + "'ascii' codec can't encode character '\ufc' in position 0: ouch" + ) + + def test_unicodedecodeerror(self): + self.check_exceptionobjectargs( + UnicodeDecodeError, + ["ascii", "g\xfcrk", 1, 2, "ouch"], + "'ascii' codec can't decode byte 0xfc in position 1: ouch" + ) + self.check_exceptionobjectargs( + UnicodeDecodeError, + ["ascii", "g\xfcrk", 1, 3, "ouch"], + "'ascii' codec can't decode bytes in position 1-2: ouch" + ) + + def test_unicodetranslateerror(self): + self.check_exceptionobjectargs( + UnicodeTranslateError, + [u"g\xfcrk", 1, 2, "ouch"], + "can't translate character '\\ufc' in position 1: ouch" + ) + self.check_exceptionobjectargs( + UnicodeTranslateError, + [u"g\xfcrk", 1, 3, "ouch"], + "can't translate characters in position 1-2: ouch" + ) + + def test_badandgoodstrictexceptions(self): + self.assertRaises( + TypeError, + codecs.strict_errors, + 42 + ) + self.assertRaises( + Exception, + codecs.strict_errors, + Exception("ouch") + ) + + self.assertRaises( + UnicodeEncodeError, + codecs.strict_errors, + UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch") + ) + + def test_badandgoodignoreexceptions(self): + self.assertRaises( + TypeError, + codecs.ignore_errors, + 42 + ) + self.assertRaises( + TypeError, + codecs.ignore_errors, + UnicodeError("ouch") + ) + self.assertEquals( + codecs.ignore_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")), + (u"", 1) + ) + self.assertEquals( + codecs.ignore_errors(UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")), + (u"", 1) + ) + self.assertEquals( + codecs.ignore_errors(UnicodeTranslateError(u"\u3042", 0, 1, "ouch")), + (u"", 1) + ) + + def test_badandgoodreplaceexceptions(self): + self.assertRaises( + TypeError, + codecs.replace_errors, + 42 + ) + self.assertRaises( + TypeError, + codecs.replace_errors, + UnicodeError("ouch") + ) + self.assertEquals( + codecs.replace_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")), + (u"?", 1) + ) + self.assertEquals( + 
codecs.replace_errors(UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")), + (u"\ufffd", 1) + ) + self.assertEquals( + codecs.replace_errors(UnicodeTranslateError(u"\u3042", 0, 1, "ouch")), + (u"\ufffd", 1) + ) + + def test_badandgoodxmlcharrefreplaceexceptions(self): + self.assertRaises( + TypeError, + codecs.xmlcharrefreplace_errors, + 42 + ) + self.assertRaises( + TypeError, + codecs.xmlcharrefreplace_errors, + UnicodeError("ouch") + ) + self.assertEquals( + codecs.xmlcharrefreplace_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")), + (u"&#%d;" % 0x3042, 1) + ) + self.assertRaises( + TypeError, + codecs.xmlcharrefreplace_errors, + UnicodeError("ouch") + ) + self.assertRaises( + TypeError, + codecs.xmlcharrefreplace_errors, + UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch") + ) + self.assertRaises( + TypeError, + codecs.xmlcharrefreplace_errors, + UnicodeTranslateError(u"\u3042", 0, 1, "ouch") + ) + + def test_badandgoodbackslashreplaceexceptions(self): + self.assertRaises( + TypeError, + codecs.backslashreplace_errors, + 42 + ) + self.assertRaises( + TypeError, + codecs.backslashreplace_errors, + UnicodeError("ouch") + ) + self.assertEquals( + codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")), + (u"\\u3042", 1) + ) + self.assertEquals( + codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\x00", 0, 1, "ouch")), + (u"\\x00", 1) + ) + self.assertEquals( + codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\xff", 0, 1, "ouch")), + (u"\\xff", 1) + ) + self.assertEquals( + codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\u0100", 0, 1, "ouch")), + (u"\\u0100", 1) + ) + self.assertEquals( + codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\uffff", 0, 1, "ouch")), + (u"\\uffff", 1) + ) + if sys.maxunicode>0xffff: + self.assertEquals( + codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\U00010000", 0, 1, "ouch")), + (u"\\U00010000", 1) + ) + self.assertEquals( + 
codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\U0010ffff", 0, 1, "ouch")), + (u"\\U0010ffff", 1) + ) + + self.assertRaises( + TypeError, + codecs.backslashreplace_errors, + UnicodeError("ouch") + ) + self.assertRaises( + TypeError, + codecs.backslashreplace_errors, + UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch") + ) + self.assertRaises( + TypeError, + codecs.backslashreplace_errors, + UnicodeTranslateError(u"\u3042", 0, 1, "ouch") + ) + + def test_badhandlerresults(self): + results = ( 42, u"foo", (1,2,3), (u"foo", 1, 3), (u"foo", None), (u"foo",), ("foo", 1, 3), ("foo", None), ("foo",) ) + encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15") + + for res in results: + codecs.register_error("test.badhandler", lambda: res) + for enc in encs: + self.assertRaises( + TypeError, + u"\u3042".encode, + enc, + "test.badhandler" + ) + for (enc, bytes) in ( + ("ascii", "\xff"), + ("utf-8", "\xff"), + ("utf-7", "+x-") + ): + self.assertRaises( + TypeError, + bytes.decode, + enc, + "test.badhandler" + ) + + def test_lookup(self): + self.assertEquals(codecs.strict_errors, codecs.lookup_error("strict")) + self.assertEquals(codecs.ignore_errors, codecs.lookup_error("ignore")) + self.assertEquals(codecs.strict_errors, codecs.lookup_error("strict")) + self.assertEquals( + codecs.xmlcharrefreplace_errors, + codecs.lookup_error("xmlcharrefreplace") + ) + self.assertEquals( + codecs.backslashreplace_errors, + codecs.lookup_error("backslashreplace") + ) + +def test_main(): + suite = unittest.TestSuite() + suite.addTest(unittest.makeSuite(CodecCallbackTest)) + test.test_support.run_suite(suite) + +if __name__ == "__main__": + test_main() diff --git a/Misc/NEWS b/Misc/NEWS index 7034d729633..ad8776225ae 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -57,6 +57,9 @@ Type/class unification and new-style classes Core and builtins +- Codec error handling callbacks (PEP 293) are implemented. + Error handling in unicode.encode or str.decode can now be customized. 
+ - A subtle change to the semantics of the built-in function intern(): interned strings are no longer immortal. You must keep a reference to the return value intern() around to get the benefit. diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c index 1e3fc5d5b8a..24fa1d54086 100644 --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -706,6 +706,32 @@ mbcs_encode(PyObject *self, #endif /* MS_WINDOWS */ #endif /* Py_USING_UNICODE */ +/* --- Error handler registry --------------------------------------------- */ + +static PyObject *register_error(PyObject *self, PyObject *args) +{ + const char *name; + PyObject *handler; + + if (!PyArg_ParseTuple(args, "sO:register_error", + &name, &handler)) + return NULL; + if (PyCodec_RegisterError(name, handler)) + return NULL; + Py_INCREF(Py_None); + return Py_None; +} + +static PyObject *lookup_error(PyObject *self, PyObject *args) +{ + const char *name; + + if (!PyArg_ParseTuple(args, "s:lookup_error", + &name)) + return NULL; + return PyCodec_LookupError(name); +} + /* --- Module API --------------------------------------------------------- */ static PyMethodDef _codecs_functions[] = { @@ -744,6 +770,8 @@ static PyMethodDef _codecs_functions[] = { {"mbcs_decode", mbcs_decode, METH_VARARGS}, #endif #endif /* Py_USING_UNICODE */ + {"register_error", register_error, METH_VARARGS}, + {"lookup_error", lookup_error, METH_VARARGS}, {NULL, NULL} /* sentinel */ }; diff --git a/Objects/stringobject.c b/Objects/stringobject.c index 8ae9407476e..31f188a5b9e 100644 --- a/Objects/stringobject.c +++ b/Objects/stringobject.c @@ -2468,7 +2468,9 @@ PyDoc_STRVAR(encode__doc__, Encodes S using the codec registered for encoding. encoding defaults\n\ to the default encoding. errors may be given to set a different error\n\ handling scheme. Default is 'strict' meaning that encoding errors raise\n\ -a ValueError. Other possible values are 'ignore' and 'replace'."); +a UnicodeEncodeError. 
Other possible values are 'ignore', 'replace' and\n\ +'xmlcharrefreplace' as well as any other name registered with\n\ +codecs.register_error that is able to handle UnicodeEncodeErrors."); static PyObject * string_encode(PyStringObject *self, PyObject *args) @@ -2487,7 +2489,9 @@ PyDoc_STRVAR(decode__doc__, Decodes S using the codec registered for encoding. encoding defaults\n\ to the default encoding. errors may be given to set a different error\n\ handling scheme. Default is 'strict' meaning that encoding errors raise\n\ -a ValueError. Other possible values are 'ignore' and 'replace'."); +a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\ +as well as any other name registerd with codecs.register_error that is\n\ +able to handle UnicodeDecodeErrors."); static PyObject * string_decode(PyStringObject *self, PyObject *args) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 920f9ea2d86..2108d948639 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -528,8 +528,8 @@ PyObject *PyUnicode_Decode(const char *s, const char *errors) { PyObject *buffer = NULL, *unicode; - - if (encoding == NULL) + + if (encoding == NULL) encoding = PyUnicode_GetDefaultEncoding(); /* Shortcuts for common default encodings */ @@ -680,6 +680,92 @@ int PyUnicode_SetDefaultEncoding(const char *encoding) return -1; } +/* error handling callback helper: + build arguments, call the callback and check the arguments, + if no exception occured, copy the replacement to the output + and adjust various state variables. 
+ return 0 on success, -1 on error +*/ + +static +int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, + const char *encoding, const char *reason, + const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr, + PyObject **output, int *outpos, Py_UNICODE **outptr) +{ + static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple"; + + PyObject *restuple = NULL; + PyObject *repunicode = NULL; + int outsize = PyUnicode_GET_SIZE(*output); + int requiredsize; + int newpos; + Py_UNICODE *repptr; + int repsize; + int res = -1; + + if (*errorHandler == NULL) { + *errorHandler = PyCodec_LookupError(errors); + if (*errorHandler == NULL) + goto onError; + } + + if (*exceptionObject == NULL) { + *exceptionObject = PyUnicodeDecodeError_Create( + encoding, input, insize, *startinpos, *endinpos, reason); + if (*exceptionObject == NULL) + goto onError; + } + else { + if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos)) + goto onError; + if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos)) + goto onError; + if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) + goto onError; + } + + restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); + if (restuple == NULL) + goto onError; + if (!PyTuple_Check(restuple)) { + PyErr_Format(PyExc_TypeError, &argparse[4]); + goto onError; + } + if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) + goto onError; + if (newpos<0) + newpos = 0; + else if (newpos>insize) + newpos = insize; + + /* need more space? 
(at least enough for what we + have+the replacement+the rest of the string (starting + at the new input position), so we won't have to check space + when there are no errors in the rest of the string) */ + repptr = PyUnicode_AS_UNICODE(repunicode); + repsize = PyUnicode_GET_SIZE(repunicode); + requiredsize = *outpos + repsize + insize-newpos; + if (requiredsize > outsize) { + if (requiredsize<2*outsize) + requiredsize = 2*outsize; + if (PyUnicode_Resize(output, requiredsize)) + goto onError; + *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; + } + *endinpos = newpos; + *inptr = input + newpos; + Py_UNICODE_COPY(*outptr, repptr, repsize); + *outptr += repsize; + *outpos += repsize; + /* we made it! */ + res = 0; + + onError: + Py_XDECREF(restuple); + return res; +} + /* --- UTF-7 Codec -------------------------------------------------------- */ /* see RFC2152 for details */ @@ -738,40 +824,14 @@ char utf7_special[128] = { } \ } \ -static -int utf7_decoding_error(Py_UNICODE **dest, - const char *errors, - const char *details) -{ - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "UTF-7 decoding error: %.400s", - details); - return -1; - } - else if (strcmp(errors,"ignore") == 0) { - return 0; - } - else if (strcmp(errors,"replace") == 0) { - if (dest != NULL) { - **dest = Py_UNICODE_REPLACEMENT_CHARACTER; - (*dest)++; - } - return 0; - } - else { - PyErr_Format(PyExc_ValueError, - "UTF-7 decoding error; unknown error handling code: %.400s", - errors); - return -1; - } -} - PyObject *PyUnicode_DecodeUTF7(const char *s, int size, const char *errors) { + const char *starts = s; + int startinpos; + int endinpos; + int outpos; const char *e; PyUnicodeObject *unicode; Py_UNICODE *p; @@ -779,7 +839,9 @@ PyObject *PyUnicode_DecodeUTF7(const char *s, int inShift = 0; unsigned int bitsleft = 0; unsigned long charsleft = 0; - int surrogate = 0; + int surrogate = 0; + PyObject *errorHandler = NULL; + PyObject *exc = NULL; unicode 
= _PyUnicode_New(size); if (!unicode) @@ -791,7 +853,9 @@ PyObject *PyUnicode_DecodeUTF7(const char *s, e = s + size; while (s < e) { - Py_UNICODE ch = *s; + Py_UNICODE ch; + restart: + ch = *s; if (inShift) { if ((ch == '-') || !B64CHAR(ch)) { @@ -836,6 +900,7 @@ PyObject *PyUnicode_DecodeUTF7(const char *s, } } else if ( ch == '+' ) { + startinpos = s-starts; s++; if (s < e && *s == '-') { s++; @@ -857,21 +922,39 @@ PyObject *PyUnicode_DecodeUTF7(const char *s, } continue; utf7Error: - if (utf7_decoding_error(&p, errors, errmsg)) - goto onError; + outpos = p-PyUnicode_AS_UNICODE(unicode); + endinpos = s-starts; + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "utf7", errmsg, + starts, size, &startinpos, &endinpos, &exc, &s, + (PyObject **)&unicode, &outpos, &p)) + goto onError; } if (inShift) { - if (utf7_decoding_error(&p, errors, "unterminated shift sequence")) + outpos = p-PyUnicode_AS_UNICODE(unicode); + endinpos = size; + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "utf7", "unterminated shift sequence", + starts, size, &startinpos, &endinpos, &exc, &s, + (PyObject **)&unicode, &outpos, &p)) goto onError; + if (s < e) + goto restart; } - if (_PyUnicode_Resize(&unicode, p - unicode->str)) + if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode))) goto onError; + Py_XDECREF(errorHandler); + Py_XDECREF(exc); return (PyObject *)unicode; onError: + Py_XDECREF(errorHandler); + Py_XDECREF(exc); Py_DECREF(unicode); return NULL; } @@ -1001,46 +1084,21 @@ char utf8_code_length[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 }; -static -int utf8_decoding_error(const char **source, - Py_UNICODE **dest, - const char *errors, - const char *details) -{ - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "UTF-8 decoding error: %.400s", - details); - return -1; - } - else if (strcmp(errors,"ignore") == 0) { - (*source)++; - return 0; - } - else if (strcmp(errors,"replace") 
== 0) { - (*source)++; - **dest = Py_UNICODE_REPLACEMENT_CHARACTER; - (*dest)++; - return 0; - } - else { - PyErr_Format(PyExc_ValueError, - "UTF-8 decoding error; unknown error handling code: %.400s", - errors); - return -1; - } -} - PyObject *PyUnicode_DecodeUTF8(const char *s, int size, const char *errors) { + const char *starts = s; int n; + int startinpos; + int endinpos; + int outpos; const char *e; PyUnicodeObject *unicode; Py_UNICODE *p; const char *errmsg = ""; + PyObject *errorHandler = NULL; + PyObject *exc = NULL; /* Note: size will always be longer than the resulting Unicode character count */ @@ -1067,6 +1125,8 @@ PyObject *PyUnicode_DecodeUTF8(const char *s, if (s + n > e) { errmsg = "unexpected end of data"; + startinpos = s-starts; + endinpos = size; goto utf8Error; } @@ -1074,19 +1134,27 @@ PyObject *PyUnicode_DecodeUTF8(const char *s, case 0: errmsg = "unexpected code byte"; + startinpos = s-starts; + endinpos = startinpos+1; goto utf8Error; case 1: errmsg = "internal error"; + startinpos = s-starts; + endinpos = startinpos+1; goto utf8Error; case 2: if ((s[1] & 0xc0) != 0x80) { errmsg = "invalid data"; + startinpos = s-starts; + endinpos = startinpos+2; goto utf8Error; } ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); if (ch < 0x80) { + startinpos = s-starts; + endinpos = startinpos+2; errmsg = "illegal encoding"; goto utf8Error; } @@ -1098,6 +1166,8 @@ PyObject *PyUnicode_DecodeUTF8(const char *s, if ((s[1] & 0xc0) != 0x80 || (s[2] & 0xc0) != 0x80) { errmsg = "invalid data"; + startinpos = s-starts; + endinpos = startinpos+3; goto utf8Error; } ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); @@ -1110,6 +1180,8 @@ PyObject *PyUnicode_DecodeUTF8(const char *s, unit. 
*/ errmsg = "illegal encoding"; + startinpos = s-starts; + endinpos = startinpos+3; goto utf8Error; } else @@ -1121,6 +1193,8 @@ PyObject *PyUnicode_DecodeUTF8(const char *s, (s[2] & 0xc0) != 0x80 || (s[3] & 0xc0) != 0x80) { errmsg = "invalid data"; + startinpos = s-starts; + endinpos = startinpos+4; goto utf8Error; } ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + @@ -1132,6 +1206,8 @@ PyObject *PyUnicode_DecodeUTF8(const char *s, UTF-16 */ { errmsg = "illegal encoding"; + startinpos = s-starts; + endinpos = startinpos+4; goto utf8Error; } #ifdef Py_UNICODE_WIDE @@ -1153,23 +1229,34 @@ PyObject *PyUnicode_DecodeUTF8(const char *s, default: /* Other sizes are only needed for UCS-4 */ errmsg = "unsupported Unicode code range"; + startinpos = s-starts; + endinpos = startinpos+n; goto utf8Error; } s += n; continue; utf8Error: - if (utf8_decoding_error(&s, &p, errors, errmsg)) - goto onError; + outpos = p-PyUnicode_AS_UNICODE(unicode); + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "utf8", errmsg, + starts, size, &startinpos, &endinpos, &exc, &s, + (PyObject **)&unicode, &outpos, &p)) + goto onError; } /* Adjust length */ if (_PyUnicode_Resize(&unicode, p - unicode->str)) goto onError; + Py_XDECREF(errorHandler); + Py_XDECREF(exc); return (PyObject *)unicode; onError: + Py_XDECREF(errorHandler); + Py_XDECREF(exc); Py_DECREF(unicode); return NULL; } @@ -1287,43 +1374,16 @@ PyObject *PyUnicode_AsUTF8String(PyObject *unicode) /* --- UTF-16 Codec ------------------------------------------------------- */ -static -int utf16_decoding_error(Py_UNICODE **dest, - const char *errors, - const char *details) -{ - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "UTF-16 decoding error: %.400s", - details); - return -1; - } - else if (strcmp(errors,"ignore") == 0) { - return 0; - } - else if (strcmp(errors,"replace") == 0) { - if (dest) { - **dest = Py_UNICODE_REPLACEMENT_CHARACTER; - (*dest)++; - } - return 0; 
- } - else { - PyErr_Format(PyExc_ValueError, - "UTF-16 decoding error; " - "unknown error handling code: %.400s", - errors); - return -1; - } -} - PyObject * PyUnicode_DecodeUTF16(const char *s, int size, const char *errors, int *byteorder) { + const char *starts = s; + int startinpos; + int endinpos; + int outpos; PyUnicodeObject *unicode; Py_UNICODE *p; const unsigned char *q, *e; @@ -1335,13 +1395,8 @@ PyUnicode_DecodeUTF16(const char *s, #else int ihi = 0, ilo = 1; #endif - - /* size should be an even number */ - if (size & 1) { - if (utf16_decoding_error(NULL, errors, "truncated data")) - return NULL; - --size; /* else ignore the oddball byte */ - } + PyObject *errorHandler = NULL; + PyObject *exc = NULL; /* Note: size will always be longer than the resulting Unicode character count */ @@ -1398,7 +1453,18 @@ PyUnicode_DecodeUTF16(const char *s, } while (q < e) { - Py_UNICODE ch = (q[ihi] << 8) | q[ilo]; + Py_UNICODE ch; + /* remaing bytes at the end? (size should be even) */ + if (e-q<2) { + errmsg = "truncated data"; + startinpos = ((const char *)q)-starts; + endinpos = ((const char *)e)-starts; + goto utf16Error; + /* The remaining input chars are ignored if the callback + chooses to skip the input */ + } + ch = (q[ihi] << 8) | q[ilo]; + q += 2; if (ch < 0xD800 || ch > 0xDFFF) { @@ -1409,6 +1475,8 @@ PyUnicode_DecodeUTF16(const char *s, /* UTF-16 code pair: */ if (q >= e) { errmsg = "unexpected end of data"; + startinpos = (((const char *)q)-2)-starts; + endinpos = ((const char *)e)-starts; goto utf16Error; } if (0xD800 <= ch && ch <= 0xDBFF) { @@ -1425,15 +1493,24 @@ PyUnicode_DecodeUTF16(const char *s, } else { errmsg = "illegal UTF-16 surrogate"; + startinpos = (((const char *)q)-4)-starts; + endinpos = startinpos+2; goto utf16Error; } } errmsg = "illegal encoding"; + startinpos = (((const char *)q)-2)-starts; + endinpos = startinpos+2; /* Fall through to report the error */ utf16Error: - if (utf16_decoding_error(&p, errors, errmsg)) + outpos = 
p-PyUnicode_AS_UNICODE(unicode); + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "utf16", errmsg, + starts, size, &startinpos, &endinpos, &exc, (const char **)&q, + (PyObject **)&unicode, &outpos, &p)) goto onError; } @@ -1444,10 +1521,14 @@ PyUnicode_DecodeUTF16(const char *s, if (_PyUnicode_Resize(&unicode, p - unicode->str)) goto onError; + Py_XDECREF(errorHandler); + Py_XDECREF(exc); return (PyObject *)unicode; onError: Py_DECREF(unicode); + Py_XDECREF(errorHandler); + Py_XDECREF(exc); return NULL; } @@ -1528,63 +1609,43 @@ PyObject *PyUnicode_AsUTF16String(PyObject *unicode) /* --- Unicode Escape Codec ----------------------------------------------- */ -static -int unicodeescape_decoding_error(Py_UNICODE **x, - const char *errors, - const char *details) -{ - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "Unicode-Escape decoding error: %.400s", - details); - return -1; - } - else if (strcmp(errors,"ignore") == 0) { - return 0; - } - else if (strcmp(errors,"replace") == 0) { - **x = Py_UNICODE_REPLACEMENT_CHARACTER; - (*x)++; - return 0; - } - else { - PyErr_Format(PyExc_ValueError, - "Unicode-Escape decoding error; " - "unknown error handling code: %.400s", - errors); - return -1; - } -} - static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, int size, const char *errors) { + const char *starts = s; + int startinpos; + int endinpos; + int outpos; + int i; PyUnicodeObject *v; - Py_UNICODE *p, *buf; + Py_UNICODE *p; const char *end; char* message; Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ + PyObject *errorHandler = NULL; + PyObject *exc = NULL; /* Escaped strings will always be longer than the resulting Unicode string, so we start with size here and then reduce the - length after conversion to the true value. */ + length after conversion to the true value. 
+ (but if the error callback returns a long replacement string + we'll have to allocate more space) */ v = _PyUnicode_New(size); if (v == NULL) goto onError; if (size == 0) return (PyObject *)v; - p = buf = PyUnicode_AS_UNICODE(v); + p = PyUnicode_AS_UNICODE(v); end = s + size; while (s < end) { unsigned char c; Py_UNICODE x; - int i, digits; + int digits; /* Non-escape characters are interpreted as Unicode ordinals */ if (*s != '\\') { @@ -1592,6 +1653,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, continue; } + startinpos = s-starts; /* \ - Escapes */ s++; switch (*s++) { @@ -1640,14 +1702,28 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, message = "truncated \\UXXXXXXXX escape"; hexescape: chr = 0; - for (i = 0; i < digits; i++) { + outpos = p-PyUnicode_AS_UNICODE(v); + if (s+digits>end) { + endinpos = size; + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "unicodeescape", "end of string in escape sequence", + starts, size, &startinpos, &endinpos, &exc, &s, + (PyObject **)&v, &outpos, &p)) + goto onError; + goto nextByte; + } + for (i = 0; i < digits; ++i) { c = (unsigned char) s[i]; if (!isxdigit(c)) { - if (unicodeescape_decoding_error(&p, errors, message)) + endinpos = (s+i+1)-starts; + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "unicodeescape", message, + starts, size, &startinpos, &endinpos, &exc, &s, + (PyObject **)&v, &outpos, &p)) goto onError; - chr = 0xffffffff; - i++; - break; + goto nextByte; } chr = (chr<<4) & ~0xF; if (c >= '0' && c <= '9') @@ -1659,9 +1735,9 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, } s += i; if (chr == 0xffffffff) - /* _decoding_error will have already written into the - target buffer. */ - break; + /* _decoding_error will have already written into the + target buffer. 
*/ + break; store: /* when we get here, chr is a 32-bit unicode character */ if (chr <= 0xffff) @@ -1678,10 +1754,13 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); #endif } else { - if (unicodeescape_decoding_error( - &p, errors, - "illegal Unicode character") - ) + endinpos = s-starts; + outpos = p-PyUnicode_AS_UNICODE(v); + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "unicodeescape", "illegal Unicode character", + starts, size, &startinpos, &endinpos, &exc, &s, + (PyObject **)&v, &outpos, &p)) goto onError; } break; @@ -1717,13 +1796,27 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, goto store; } } - if (unicodeescape_decoding_error(&p, errors, message)) + endinpos = s-starts; + outpos = p-PyUnicode_AS_UNICODE(v); + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "unicodeescape", message, + starts, size, &startinpos, &endinpos, &exc, &s, + (PyObject **)&v, &outpos, &p)) goto onError; break; default: if (s > end) { - if (unicodeescape_decoding_error(&p, errors, "\\ at end of string")) + message = "\\ at end of string"; + s--; + endinpos = s-starts; + outpos = p-PyUnicode_AS_UNICODE(v); + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "unicodeescape", message, + starts, size, &startinpos, &endinpos, &exc, &s, + (PyObject **)&v, &outpos, &p)) goto onError; } else { @@ -1732,9 +1825,11 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, } break; } + nextByte: + ; } - if (_PyUnicode_Resize(&v, (int)(p - buf))) - goto onError; + if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) + goto onError; return (PyObject *)v; ucnhashError: @@ -1742,10 +1837,14 @@ ucnhashError: PyExc_UnicodeError, "\\N escapes not supported (can't load unicodedata module)" ); + Py_XDECREF(errorHandler); + Py_XDECREF(exc); return NULL; onError: Py_XDECREF(v); + Py_XDECREF(errorHandler); + Py_XDECREF(exc); return NULL; } @@ -1909,20 +2008,27 @@ PyObject 
*PyUnicode_DecodeRawUnicodeEscape(const char *s, int size, const char *errors) { + const char *starts = s; + int startinpos; + int endinpos; + int outpos; PyUnicodeObject *v; - Py_UNICODE *p, *buf; + Py_UNICODE *p; const char *end; const char *bs; + PyObject *errorHandler = NULL; + PyObject *exc = NULL; /* Escaped strings will always be longer than the resulting Unicode string, so we start with size here and then reduce the - length after conversion to the true value. */ + length after conversion to the true value. (But decoding error + handler might have to resize the string) */ v = _PyUnicode_New(size); if (v == NULL) goto onError; if (size == 0) return (PyObject *)v; - p = buf = PyUnicode_AS_UNICODE(v); + p = PyUnicode_AS_UNICODE(v); end = s + size; while (s < end) { unsigned char c; @@ -1934,6 +2040,7 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, *p++ = (unsigned char)*s++; continue; } + startinpos = s-starts; /* \u-escapes are only interpreted iff the number of leading backslashes if odd */ @@ -1952,15 +2059,18 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, s++; /* \uXXXX with 4 hex digits */ - for (x = 0, i = 0; i < 4; i++) { - c = (unsigned char)s[i]; + outpos = p-PyUnicode_AS_UNICODE(v); + for (x = 0, i = 0; i < 4; ++i, ++s) { + c = (unsigned char)*s; if (!isxdigit(c)) { - if (unicodeescape_decoding_error(&p, errors, - "truncated \\uXXXX")) + endinpos = s-starts; + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "rawunicodeescape", "truncated \\uXXXX", + starts, size, &startinpos, &endinpos, &exc, &s, + (PyObject **)&v, &outpos, &p)) goto onError; - x = 0xffffffff; - i++; - break; + goto nextByte; } x = (x<<4) & ~0xF; if (c >= '0' && c <= '9') @@ -1970,16 +2080,20 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, else x += 10 + c - 'A'; } - s += i; - if (x != 0xffffffff) - *p++ = x; + *p++ = x; + nextByte: + ; } - if (_PyUnicode_Resize(&v, (int)(p - buf))) + if (_PyUnicode_Resize(&v, (int)(p - 
PyUnicode_AS_UNICODE(v)))) goto onError; + Py_XDECREF(errorHandler); + Py_XDECREF(exc); return (PyObject *)v; onError: Py_XDECREF(v); + Py_XDECREF(errorHandler); + Py_XDECREF(exc); return NULL; } @@ -2059,69 +2173,269 @@ PyObject *PyUnicode_DecodeLatin1(const char *s, return NULL; } -static -int latin1_encoding_error(const Py_UNICODE **source, - char **dest, - const char *errors, - const char *details) +/* create or adjust a UnicodeEncodeError */ +static void make_encode_exception(PyObject **exceptionObject, + const char *encoding, + const Py_UNICODE *unicode, int size, + int startpos, int endpos, + const char *reason) { - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "Latin-1 encoding error: %.400s", - details); - return -1; - } - else if (strcmp(errors,"ignore") == 0) { - return 0; - } - else if (strcmp(errors,"replace") == 0) { - **dest = '?'; - (*dest)++; - return 0; + if (*exceptionObject == NULL) { + *exceptionObject = PyUnicodeEncodeError_Create( + encoding, unicode, size, startpos, endpos, reason); } else { - PyErr_Format(PyExc_ValueError, - "Latin-1 encoding error; " - "unknown error handling code: %.400s", - errors); - return -1; + if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) + goto onError; + if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) + goto onError; + if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) + goto onError; + return; + onError: + Py_DECREF(*exceptionObject); + *exceptionObject = NULL; } } +/* raises a UnicodeEncodeError */ +static void raise_encode_exception(PyObject **exceptionObject, + const char *encoding, + const Py_UNICODE *unicode, int size, + int startpos, int endpos, + const char *reason) +{ + make_encode_exception(exceptionObject, + encoding, unicode, size, startpos, endpos, reason); + if (*exceptionObject != NULL) + PyCodec_StrictErrors(*exceptionObject); +} + +/* error handling callback helper: + build arguments, call the callback and 
check the arguments, + put the result into newpos and return the replacement string, which + has to be freed by the caller */ +static PyObject *unicode_encode_call_errorhandler(const char *errors, + PyObject **errorHandler, + const char *encoding, const char *reason, + const Py_UNICODE *unicode, int size, PyObject **exceptionObject, + int startpos, int endpos, + int *newpos) +{ + static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple"; + + PyObject *restuple; + PyObject *resunicode; + + if (*errorHandler == NULL) { + *errorHandler = PyCodec_LookupError(errors); + if (*errorHandler == NULL) + return NULL; + } + + make_encode_exception(exceptionObject, + encoding, unicode, size, startpos, endpos, reason); + if (*exceptionObject == NULL) + return NULL; + + restuple = PyObject_CallFunctionObjArgs( + *errorHandler, *exceptionObject, NULL); + if (restuple == NULL) + return NULL; + if (!PyTuple_Check(restuple)) { + PyErr_Format(PyExc_TypeError, &argparse[4]); + Py_DECREF(restuple); + return NULL; + } + if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, + &resunicode, newpos)) { + Py_DECREF(restuple); + return NULL; + } + if (*newpos<0) + *newpos = 0; + else if (*newpos>size) + *newpos = size; + Py_INCREF(resunicode); + Py_DECREF(restuple); + return resunicode; +} + +static PyObject *unicode_encode_ucs1(const Py_UNICODE *p, + int size, + const char *errors, + int limit) +{ + /* output object */ + PyObject *res; + /* pointers to the beginning and end+1 of input */ + const Py_UNICODE *startp = p; + const Py_UNICODE *endp = p + size; + /* pointer to the beginning of the unencodable characters */ + /* const Py_UNICODE *badp = NULL; */ + /* pointer into the output */ + char *str; + /* current output position */ + int respos = 0; + int ressize; + char *encoding = (limit == 256) ? "latin-1" : "ascii"; + char *reason = (limit == 256) ? 
"ordinal not in range(256)" : "ordinal not in range(128)"; + PyObject *errorHandler = NULL; + PyObject *exc = NULL; + /* the following variable is used for caching string comparisons + * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ + int known_errorHandler = -1; + + /* allocate enough for a simple encoding without + replacements, if we need more, we'll resize */ + res = PyString_FromStringAndSize(NULL, size); + if (res == NULL) + goto onError; + if (size == 0) + return res; + str = PyString_AS_STRING(res); + ressize = size; + + while (p=limit)) + ++collend; + /* cache callback name lookup (if not done yet, i.e. it's the first error) */ + if (known_errorHandler==-1) { + if ((errors==NULL) || (!strcmp(errors, "strict"))) + known_errorHandler = 1; + else if (!strcmp(errors, "replace")) + known_errorHandler = 2; + else if (!strcmp(errors, "ignore")) + known_errorHandler = 3; + else if (!strcmp(errors, "xmlcharrefreplace")) + known_errorHandler = 4; + else + known_errorHandler = 0; + } + switch (known_errorHandler) { + case 1: /* strict */ + raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); + goto onError; + case 2: /* replace */ + while (collstart++ ressize) { + if (requiredsize<2*ressize) + requiredsize = 2*ressize; + if (_PyString_Resize(&res, requiredsize)) + goto onError; + str = PyString_AS_STRING(res) + respos; + ressize = requiredsize; + } + /* generate replacement (temporarily (mis)uses p) */ + for (p = collstart; p < collend; ++p) { + str += sprintf(str, "&#%d;", (int)*p); + } + p = collend; + break; + default: + repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, + encoding, reason, startp, size, &exc, + collstart-startp, collend-startp, &newpos); + if (repunicode == NULL) + goto onError; + /* need more space? 
(at least enough for what we + have+the replacement+the rest of the string, so + we won't have to check space for encodable characters) */ + respos = str-PyString_AS_STRING(res); + repsize = PyUnicode_GET_SIZE(repunicode); + requiredsize = respos+repsize+(endp-collend); + if (requiredsize > ressize) { + if (requiredsize<2*ressize) + requiredsize = 2*ressize; + if (_PyString_Resize(&res, requiredsize)) { + Py_DECREF(repunicode); + goto onError; + } + str = PyString_AS_STRING(res) + respos; + ressize = requiredsize; + } + /* check if there is anything unencodable in the replacement + and copy it to the output */ + for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { + c = *uni2; + if (c >= limit) { + raise_encode_exception(&exc, encoding, startp, size, + unicodepos, unicodepos+1, reason); + Py_DECREF(repunicode); + goto onError; + } + *str = (char)c; + } + p = startp + newpos; + Py_DECREF(repunicode); + } + } + } + /* Resize if we allocated to much */ + respos = str-PyString_AS_STRING(res); + if (respos 0) { - Py_UNICODE ch = *p++; - if (ch >= 256) { - if (latin1_encoding_error(&p, &s, errors, - "ordinal not in range(256)")) - goto onError; - } - else - *s++ = (char)ch; - } - /* Resize if error handling skipped some characters */ - if (s - start < PyString_GET_SIZE(repr)) - _PyString_Resize(&repr, s - start); - return repr; - - onError: - Py_DECREF(repr); - return NULL; + return unicode_encode_ucs1(p, size, errors, 256); } PyObject *PyUnicode_AsLatin1String(PyObject *unicode) @@ -2137,42 +2451,19 @@ PyObject *PyUnicode_AsLatin1String(PyObject *unicode) /* --- 7-bit ASCII Codec -------------------------------------------------- */ -static -int ascii_decoding_error(const char **source, - Py_UNICODE **dest, - const char *errors, - const char *details) -{ - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "ASCII decoding error: %.400s", - details); - return -1; - } - else if (strcmp(errors,"ignore") == 
0) { - return 0; - } - else if (strcmp(errors,"replace") == 0) { - **dest = Py_UNICODE_REPLACEMENT_CHARACTER; - (*dest)++; - return 0; - } - else { - PyErr_Format(PyExc_ValueError, - "ASCII decoding error; " - "unknown error handling code: %.400s", - errors); - return -1; - } -} - PyObject *PyUnicode_DecodeASCII(const char *s, int size, const char *errors) { + const char *starts = s; PyUnicodeObject *v; Py_UNICODE *p; + int startinpos; + int endinpos; + int outpos; + const char *e; + PyObject *errorHandler = NULL; + PyObject *exc = NULL; /* ASCII is equivalent to the first 128 ordinals in Unicode. */ if (size == 1 && *(unsigned char*)s < 128) { @@ -2186,89 +2477,44 @@ PyObject *PyUnicode_DecodeASCII(const char *s, if (size == 0) return (PyObject *)v; p = PyUnicode_AS_UNICODE(v); - while (size-- > 0) { - register unsigned char c; - - c = (unsigned char)*s++; - if (c < 128) + e = s + size; + while (s < e) { + register unsigned char c = (unsigned char)*s; + if (c < 128) { *p++ = c; - else if (ascii_decoding_error(&s, &p, errors, - "ordinal not in range(128)")) + ++s; + } + else { + startinpos = s-starts; + endinpos = startinpos + 1; + outpos = p-PyUnicode_AS_UNICODE(v); + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "ascii", "ordinal not in range(128)", + starts, size, &startinpos, &endinpos, &exc, &s, + (PyObject **)&v, &outpos, &p)) goto onError; + } } if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v)) if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) goto onError; + Py_XDECREF(errorHandler); + Py_XDECREF(exc); return (PyObject *)v; onError: Py_XDECREF(v); + Py_XDECREF(errorHandler); + Py_XDECREF(exc); return NULL; } -static -int ascii_encoding_error(const Py_UNICODE **source, - char **dest, - const char *errors, - const char *details) -{ - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "ASCII encoding error: %.400s", - details); - return -1; - } - else if 
(strcmp(errors,"ignore") == 0) { - return 0; - } - else if (strcmp(errors,"replace") == 0) { - **dest = '?'; - (*dest)++; - return 0; - } - else { - PyErr_Format(PyExc_ValueError, - "ASCII encoding error; " - "unknown error handling code: %.400s", - errors); - return -1; - } -} - PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, int size, const char *errors) { - PyObject *repr; - char *s, *start; - - repr = PyString_FromStringAndSize(NULL, size); - if (repr == NULL) - return NULL; - if (size == 0) - return repr; - - s = PyString_AS_STRING(repr); - start = s; - while (size-- > 0) { - Py_UNICODE ch = *p++; - if (ch >= 128) { - if (ascii_encoding_error(&p, &s, errors, - "ordinal not in range(128)")) - goto onError; - } - else - *s++ = (char)ch; - } - /* Resize if error handling skipped some characters */ - if (s - start < PyString_GET_SIZE(repr)) - _PyString_Resize(&repr, s - start); - return repr; - - onError: - Py_DECREF(repr); - return NULL; + return unicode_encode_ucs1(p, size, errors, 128); } PyObject *PyUnicode_AsASCIIString(PyObject *unicode) @@ -2348,44 +2594,21 @@ PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, /* --- Character Mapping Codec -------------------------------------------- */ -static -int charmap_decoding_error(const char **source, - Py_UNICODE **dest, - const char *errors, - const char *details) -{ - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "charmap decoding error: %.400s", - details); - return -1; - } - else if (strcmp(errors,"ignore") == 0) { - return 0; - } - else if (strcmp(errors,"replace") == 0) { - **dest = Py_UNICODE_REPLACEMENT_CHARACTER; - (*dest)++; - return 0; - } - else { - PyErr_Format(PyExc_ValueError, - "charmap decoding error; " - "unknown error handling code: %.400s", - errors); - return -1; - } -} - PyObject *PyUnicode_DecodeCharmap(const char *s, int size, PyObject *mapping, const char *errors) { + const char *starts = s; + int startinpos; + int endinpos; + int 
outpos; + const char *e; PyUnicodeObject *v; Py_UNICODE *p; int extrachars = 0; + PyObject *errorHandler = NULL; + PyObject *exc = NULL; /* Default to Latin-1 */ if (mapping == NULL) @@ -2397,8 +2620,9 @@ PyObject *PyUnicode_DecodeCharmap(const char *s, if (size == 0) return (PyObject *)v; p = PyUnicode_AS_UNICODE(v); - while (size-- > 0) { - unsigned char ch = *s++; + e = s + size; + while (s < e) { + unsigned char ch = *s; PyObject *w, *x; /* Get mapping (char ordinal -> integer, Unicode char or None) */ @@ -2430,11 +2654,18 @@ PyObject *PyUnicode_DecodeCharmap(const char *s, } else if (x == Py_None) { /* undefined mapping */ - if (charmap_decoding_error(&s, &p, errors, - "character maps to ")) { + outpos = p-PyUnicode_AS_UNICODE(v); + startinpos = s-starts; + endinpos = startinpos+1; + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "charmap", "character maps to ", + starts, size, &startinpos, &endinpos, &exc, &s, + (PyObject **)&v, &outpos, &p)) { Py_DECREF(x); goto onError; } + continue; } else if (PyUnicode_Check(x)) { int targetsize = PyUnicode_GET_SIZE(x); @@ -2474,147 +2705,296 @@ PyObject *PyUnicode_DecodeCharmap(const char *s, goto onError; } Py_DECREF(x); + ++s; } if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) goto onError; + Py_XDECREF(errorHandler); + Py_XDECREF(exc); return (PyObject *)v; onError: + Py_XDECREF(errorHandler); + Py_XDECREF(exc); Py_XDECREF(v); return NULL; } -static -int charmap_encoding_error(const Py_UNICODE **source, - char **dest, - const char *errors, - const char *details) +/* Lookup the character ch in the mapping. If the character + can't be found, Py_None is returned (or NULL, if another + error occured). 
*/ +static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping) { - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "charmap encoding error: %.400s", - details); - return -1; + PyObject *w = PyInt_FromLong((long)c); + PyObject *x; + + if (w == NULL) + return NULL; + x = PyObject_GetItem(mapping, w); + Py_DECREF(w); + if (x == NULL) { + if (PyErr_ExceptionMatches(PyExc_LookupError)) { + /* No mapping found means: mapping is undefined. */ + PyErr_Clear(); + x = Py_None; + Py_INCREF(x); + return x; + } else + return NULL; } - else if (strcmp(errors,"ignore") == 0) { - return 0; - } - else if (strcmp(errors,"replace") == 0) { - **dest = '?'; - (*dest)++; - return 0; + else if (PyInt_Check(x)) { + long value = PyInt_AS_LONG(x); + if (value < 0 || value > 255) { + PyErr_SetString(PyExc_TypeError, + "character mapping must be in range(256)"); + Py_DECREF(x); + return NULL; + } + return x; } + else if (PyString_Check(x)) + return x; else { - PyErr_Format(PyExc_ValueError, - "charmap encoding error; " - "unknown error handling code: %.400s", - errors); - return -1; + /* wrong return value */ + PyErr_SetString(PyExc_TypeError, + "character mapping must return integer, None or str"); + Py_DECREF(x); + return NULL; } } +/* lookup the character, put the result in the output string and adjust + various state variables. Reallocate the output string if not enough + space is available. Return a new reference to the object that + was put in the output buffer, or Py_None, if the mapping was undefined + (in which case no character was written) or NULL, if a + reallocation error ocurred. 
The called must decref the result */ +static +PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping, + PyObject **outobj, int *outpos) +{ + PyObject *rep = charmapencode_lookup(c, mapping); + + if (rep==NULL) + return NULL; + else if (rep==Py_None) + return rep; + else { + char *outstart = PyString_AS_STRING(*outobj); + int outsize = PyString_GET_SIZE(*outobj); + if (PyInt_Check(rep)) { + int requiredsize = *outpos+1; + if (outsize0; ++uni2) { + x = charmapencode_output(*uni2, mapping, res, respos); + if (x==NULL) { + Py_DECREF(repunicode); + return -1; + } + else if (x==Py_None) { + Py_DECREF(repunicode); + Py_DECREF(x); + raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); + return -1; + } + Py_DECREF(x); + } + *inpos = newpos; + Py_DECREF(repunicode); + } + return 0; +} + PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, int size, PyObject *mapping, const char *errors) { - PyObject *v; - char *s; - int extrachars = 0; + /* output object */ + PyObject *res = NULL; + /* current input position */ + int inpos = 0; + /* current output position */ + int respos = 0; + PyObject *errorHandler = NULL; + PyObject *exc = NULL; + /* the following variable is used for caching string comparisons + * -1=not initialized, 0=unknown, 1=strict, 2=replace, + * 3=ignore, 4=xmlcharrefreplace */ + int known_errorHandler = -1; /* Default to Latin-1 */ if (mapping == NULL) return PyUnicode_EncodeLatin1(p, size, errors); - v = PyString_FromStringAndSize(NULL, size); - if (v == NULL) - return NULL; + /* allocate enough for a simple encoding without + replacements, if we need more, we'll resize */ + res = PyString_FromStringAndSize(NULL, size); + if (res == NULL) + goto onError; if (size == 0) - return v; - s = PyString_AS_STRING(v); - while (size-- > 0) { - Py_UNICODE ch = *p++; - PyObject *w, *x; + return res; - /* Get mapping (Unicode ordinal -> string char, integer or None) */ - w = PyInt_FromLong((long)ch); - if (w == NULL) + 
while (inpos 255) { - PyErr_SetString(PyExc_TypeError, - "character mapping must be in range(256)"); - Py_DECREF(x); - goto onError; - } - *s++ = (char)value; - } - else if (x == Py_None) { - /* undefined mapping */ - if (charmap_encoding_error(&p, &s, errors, - "character maps to ")) { - Py_DECREF(x); - goto onError; - } - } - else if (PyString_Check(x)) { - int targetsize = PyString_GET_SIZE(x); - - if (targetsize == 1) - /* 1-1 mapping */ - *s++ = *PyString_AS_STRING(x); - - else if (targetsize > 1) { - /* 1-n mapping */ - if (targetsize > extrachars) { - /* resize first */ - int oldpos = (int)(s - PyString_AS_STRING(v)); - int needed = (targetsize - extrachars) + \ - (targetsize << 2); - extrachars += needed; - if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) { - Py_DECREF(x); - goto onError; - } - s = PyString_AS_STRING(v) + oldpos; - } - memcpy(s, PyString_AS_STRING(x), targetsize); - s += targetsize; - extrachars -= targetsize; - } - /* 1-0 mapping: skip the character */ - } - else { - /* wrong return value */ - PyErr_SetString(PyExc_TypeError, - "character mapping must return integer, None or unicode"); - Py_DECREF(x); - goto onError; - } + else + /* done with this character => adjust input position */ + ++inpos; Py_DECREF(x); } - if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v)) - _PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))); - return v; - onError: - Py_XDECREF(v); + /* Resize if we allocated to much */ + if (respossize) + *newpos = size; + Py_INCREF(resunicode); + Py_DECREF(restuple); + return resunicode; +} + +/* Lookup the character ch in the mapping and put the result in result, + which must be decrefed by the caller. 
+ Return 0 on success, -1 on error */ +static +int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result) +{ + PyObject *w = PyInt_FromLong((long)c); + PyObject *x; + + if (w == NULL) + return -1; + x = PyObject_GetItem(mapping, w); + Py_DECREF(w); + if (x == NULL) { + if (PyErr_ExceptionMatches(PyExc_LookupError)) { + /* No mapping found means: use 1:1 mapping. */ + PyErr_Clear(); + *result = NULL; + return 0; + } else + return -1; + } + else if (x == Py_None) { + *result = x; + return 0; + } + else if (PyInt_Check(x)) { + long value = PyInt_AS_LONG(x); + long max = PyUnicode_GetMax(); + if (value < 0 || value > max) { + PyErr_Format(PyExc_TypeError, + "character mapping must be in range(0x%lx)", max+1); + Py_DECREF(x); + return -1; + } + *result = x; + return 0; + } + else if (PyUnicode_Check(x)) { + *result = x; + return 0; + } + else { + /* wrong return value */ + PyErr_SetString(PyExc_TypeError, + "character mapping must return integer, None or unicode"); + return -1; + } +} +/* ensure that *outobj is at least requiredsize characters long, +if not reallocate and adjust various state variables. +Return 0 on success, -1 on error */ +static +int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, int *outsize, + int requiredsize) +{ + if (requiredsize > *outsize) { + /* remember old output position */ + int outpos = *outp-PyUnicode_AS_UNICODE(*outobj); + /* exponentially overallocate to minimize reallocations */ + if (requiredsize < 2 * *outsize) + requiredsize = 2 * *outsize; + if (_PyUnicode_Resize(outobj, requiredsize)) + return -1; + *outp = PyUnicode_AS_UNICODE(*outobj) + outpos; + *outsize = requiredsize; + } + return 0; +} +/* lookup the character, put the result in the output string and adjust + various state variables. Return a new reference to the object that + was put in the output buffer in *result, or Py_None, if the mapping was + undefined (in which case no character was written). + The called must decref result. 
+ Return 0 on success, -1 on error. */ +static +int charmaptranslate_output(Py_UNICODE c, PyObject *mapping, + PyObject **outobj, int *outsize, Py_UNICODE **outp, PyObject **res) +{ + if (charmaptranslate_lookup(c, mapping, res)) + return -1; + if (*res==NULL) { + /* not found => default to 1:1 mapping */ + *(*outp)++ = (Py_UNICODE)c; + } + else if (*res==Py_None) + ; + else if (PyInt_Check(*res)) { + /* no overflow check, because we know that the space is enough */ + *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res); + } + else if (PyUnicode_Check(*res)) { + int repsize = PyUnicode_GET_SIZE(*res); + if (repsize==1) { + /* no overflow check, because we know that the space is enough */ + *(*outp)++ = *PyUnicode_AS_UNICODE(*res); + } + else if (repsize!=0) { + /* more than one character */ + int requiredsize = *outsize + repsize - 1; + if (charmaptranslate_makespace(outobj, outp, outsize, requiredsize)) + return -1; + memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize); + *outp += repsize; + } + } + else + return -1; + return 0; +} + +PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p, int size, PyObject *mapping, const char *errors) { - PyUnicodeObject *v; - Py_UNICODE *p; - + /* output object */ + PyObject *res = NULL; + /* pointers to the beginning and end+1 of input */ + const Py_UNICODE *startp = p; + const Py_UNICODE *endp = p + size; + /* pointer into the output */ + Py_UNICODE *str; + /* current output position */ + int respos = 0; + int ressize; + char *reason = "character maps to "; + PyObject *errorHandler = NULL; + PyObject *exc = NULL; + /* the following variable is used for caching string comparisons + * -1=not initialized, 0=unknown, 1=strict, 2=replace, + * 3=ignore, 4=xmlcharrefreplace */ + int known_errorHandler = -1; + if (mapping == NULL) { PyErr_BadArgument(); return NULL; } - - /* Output will never be longer than input */ - v = _PyUnicode_New(size); - if (v == NULL) - goto onError; + + /* allocate enough for a simple 1:1 
translation without + replacements, if we need more, we'll resize */ + res = PyUnicode_FromUnicode(NULL, size); + if (res == NULL) + goto onError; if (size == 0) - goto done; - p = PyUnicode_AS_UNICODE(v); - while (size-- > 0) { - Py_UNICODE ch = *s++; - PyObject *w, *x; + return res; + str = PyUnicode_AS_UNICODE(res); + ressize = size; - /* Get mapping */ - w = PyInt_FromLong(ch); - if (w == NULL) - goto onError; - x = PyObject_GetItem(mapping, w); - Py_DECREF(w); - if (x == NULL) { - if (PyErr_ExceptionMatches(PyExc_LookupError)) { - /* No mapping found: default to 1-1 mapping */ - PyErr_Clear(); - *p++ = ch; - continue; - } + while (p adjust input pointer */ + ++p; + else { /* untranslatable character */ + PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ + int repsize; + int newpos; + Py_UNICODE *uni2; + /* startpos for collecting untranslatable chars */ + const Py_UNICODE *collstart = p; + const Py_UNICODE *collend = p+1; + const Py_UNICODE *coll; - /* Apply mapping */ - if (PyInt_Check(x)) - *p++ = (Py_UNICODE)PyInt_AS_LONG(x); - else if (x == Py_None) { - /* undefined mapping */ - if (translate_error(&s, &p, errors, - "character maps to ")) { - Py_DECREF(x); - goto onError; + Py_XDECREF(x); + /* find all untranslatable characters */ + while (collend < endp) { + if (charmaptranslate_lookup(*collend, mapping, &x)) + goto onError; + Py_XDECREF(x); + if (x!=Py_None) + break; + ++collend; + } + /* cache callback name lookup + * (if not done yet, i.e. 
it's the first error) */ + if (known_errorHandler==-1) { + if ((errors==NULL) || (!strcmp(errors, "strict"))) + known_errorHandler = 1; + else if (!strcmp(errors, "replace")) + known_errorHandler = 2; + else if (!strcmp(errors, "ignore")) + known_errorHandler = 3; + else if (!strcmp(errors, "xmlcharrefreplace")) + known_errorHandler = 4; + else + known_errorHandler = 0; + } + switch (known_errorHandler) { + case 1: /* strict */ + raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason); + goto onError; + case 2: /* replace */ + /* No need to check for space, this is a 1:1 replacement */ + for (coll = collstart; coll0; ++uni2) + *str++ = *uni2; + p = startp + newpos; + Py_DECREF(repunicode); } } - else if (PyUnicode_Check(x)) { - if (PyUnicode_GET_SIZE(x) != 1) { - /* 1-n mapping */ - PyErr_SetString(PyExc_NotImplementedError, - "1-n mappings are currently not implemented"); - Py_DECREF(x); - goto onError; - } - *p++ = *PyUnicode_AS_UNICODE(x); - } - else { - /* wrong return value */ - PyErr_SetString(PyExc_TypeError, - "translate mapping must return integer, None or unicode"); - Py_DECREF(x); - goto onError; - } - Py_DECREF(x); } - if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) - if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) + /* Resize if we allocated to much */ + respos = str-PyUnicode_AS_UNICODE(res); + if (respos= 0) { *output++ = '0' + decimal; + ++p; continue; } if (0 < ch && ch < 256) { *output++ = (char)ch; + ++p; continue; } - /* All other characters are considered invalid */ - if (errors == NULL || strcmp(errors, "strict") == 0) { - PyErr_SetString(PyExc_ValueError, - "invalid decimal Unicode string"); - goto onError; + /* All other characters are considered unencodable */ + collstart = p; + collend = p+1; + while (collend < end) { + if ((0 < *collend && *collend < 256) || + !Py_UNICODE_ISSPACE(*collend) || + Py_UNICODE_TODECIMAL(*collend)) + break; } - else if (strcmp(errors, "ignore") == 0) - 
continue; - else if (strcmp(errors, "replace") == 0) { - *output++ = '?'; - continue; + /* cache callback name lookup + * (if not done yet, i.e. it's the first error) */ + if (known_errorHandler==-1) { + if ((errors==NULL) || (!strcmp(errors, "strict"))) + known_errorHandler = 1; + else if (!strcmp(errors, "replace")) + known_errorHandler = 2; + else if (!strcmp(errors, "ignore")) + known_errorHandler = 3; + else if (!strcmp(errors, "xmlcharrefreplace")) + known_errorHandler = 4; + else + known_errorHandler = 0; + } + switch (known_errorHandler) { + case 1: /* strict */ + raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason); + goto onError; + case 2: /* replace */ + for (p = collstart; p < collend; ++p) + *output++ = '?'; + /* fall through */ + case 3: /* ignore */ + p = collend; + break; + case 4: /* xmlcharrefreplace */ + /* generate replacement (temporarily (mis)uses p) */ + for (p = collstart; p < collend; ++p) + output += sprintf(output, "&#%d;", (int)*p); + p = collend; + break; + default: + repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, + encoding, reason, s, length, &exc, + collstart-s, collend-s, &newpos); + if (repunicode == NULL) + goto onError; + /* generate replacement */ + repsize = PyUnicode_GET_SIZE(repunicode); + for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { + Py_UNICODE ch = *uni2; + if (Py_UNICODE_ISSPACE(ch)) + *output++ = ' '; + else { + decimal = Py_UNICODE_TODECIMAL(ch); + if (decimal >= 0) + *output++ = '0' + decimal; + else if (0 < ch && ch < 256) + *output++ = (char)ch; + else { + Py_DECREF(repunicode); + raise_encode_exception(&exc, encoding, + s, length, collstart-s, collend-s, reason); + goto onError; + } + } + } + p = s + newpos; + Py_DECREF(repunicode); } } /* 0-terminate the output string */ *output++ = '\0'; + Py_XDECREF(exc); + Py_XDECREF(errorHandler); return 0; onError: + Py_XDECREF(exc); + Py_XDECREF(errorHandler); return -1; } @@ -3927,7 +4613,9 @@ 
PyDoc_STRVAR(encode__doc__, Return an encoded string version of S. Default encoding is the current\n\ default string encoding. errors may be given to set a different error\n\ handling scheme. Default is 'strict' meaning that encoding errors raise\n\ -a ValueError. Other possible values are 'ignore' and 'replace'."); +a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ +'xmlcharrefreplace' as well as any other name registered with\n\ +codecs.register_error that can handle UnicodeEncodeErrors."); static PyObject * unicode_encode(PyUnicodeObject *self, PyObject *args) diff --git a/Python/codecs.c b/Python/codecs.c index 3e54d8f9206..09cba7516c9 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -422,12 +422,409 @@ PyObject *PyCodec_Decode(PyObject *object, return NULL; } +static PyObject *_PyCodec_ErrorRegistry; + +/* Register the error handling callback function error under the name + name. This function will be called by the codec when it encounters + an unencodable characters/undecodable bytes and doesn't know the + callback name, when name is specified as the error parameter + in the call to the encode/decode function. + Return 0 on success, -1 on error */ +int PyCodec_RegisterError(const char *name, PyObject *error) +{ + if (!PyCallable_Check(error)) { + PyErr_SetString(PyExc_TypeError, "handler must be callable"); + return -1; + } + return PyDict_SetItemString( _PyCodec_ErrorRegistry, (char *)name, error); +} + +/* Lookup the error handling callback function registered under the + name error. As a special case NULL can be passed, in which case + the error handling callback for strict encoding will be returned. 
*/ +PyObject *PyCodec_LookupError(const char *name) +{ + PyObject *handler = NULL; + + if (name==NULL) + name = "strict"; + handler = PyDict_GetItemString(_PyCodec_ErrorRegistry, (char *)name); + if (!handler) + PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name); + else + Py_INCREF(handler); + return handler; +} + +static void wrong_exception_type(PyObject *exc) +{ + PyObject *type = PyObject_GetAttrString(exc, "__class__"); + if (type != NULL) { + PyObject *name = PyObject_GetAttrString(type, "__name__"); + Py_DECREF(type); + if (name != NULL) { + PyObject *string = PyObject_Str(name); + Py_DECREF(name); + PyErr_Format(PyExc_TypeError, "don't know how to handle %.400s in error callback", + PyString_AS_STRING(string)); + Py_DECREF(string); + } + } +} + +PyObject *PyCodec_StrictErrors(PyObject *exc) +{ + if (PyInstance_Check(exc)) + PyErr_SetObject((PyObject*)((PyInstanceObject*)exc)->in_class, + exc); + else + PyErr_SetString(PyExc_TypeError, "codec must pass exception instance"); + return NULL; +} + + +PyObject *PyCodec_IgnoreErrors(PyObject *exc) +{ + int end; + if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { + if (PyUnicodeEncodeError_GetEnd(exc, &end)) + return NULL; + } + else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) { + if (PyUnicodeDecodeError_GetEnd(exc, &end)) + return NULL; + } + else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) { + if (PyUnicodeTranslateError_GetEnd(exc, &end)) + return NULL; + } + else { + wrong_exception_type(exc); + return NULL; + } + /* ouch: passing NULL, 0, pos gives None instead of u'' */ + return Py_BuildValue("(u#i)", &end, 0, end); +} + + +PyObject *PyCodec_ReplaceErrors(PyObject *exc) +{ + PyObject *restuple; + int start; + int end; + int i; + + if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { + PyObject *res; + Py_UNICODE *p; + if (PyUnicodeEncodeError_GetStart(exc, &start)) + return NULL; + if (PyUnicodeEncodeError_GetEnd(exc, &end)) + return NULL; + 
res = PyUnicode_FromUnicode(NULL, end-start); + if (res == NULL) + return NULL; + for (p = PyUnicode_AS_UNICODE(res), i = start; + i0) { + *outp++ = '0' + c/base; + c %= base; + base /= 10; + } + *outp++ = ';'; + } + restuple = Py_BuildValue("(Oi)", res, end); + Py_DECREF(res); + Py_DECREF(object); + return restuple; + } + else { + wrong_exception_type(exc); + return NULL; + } +} + +static Py_UNICODE hexdigits[] = { + '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' +}; + +PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) +{ + if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { + PyObject *restuple; + PyObject *object; + int start; + int end; + PyObject *res; + Py_UNICODE *p; + Py_UNICODE *startp; + Py_UNICODE *outp; + int ressize; + if (PyUnicodeEncodeError_GetStart(exc, &start)) + return NULL; + if (PyUnicodeEncodeError_GetEnd(exc, &end)) + return NULL; + if (!(object = PyUnicodeEncodeError_GetObject(exc))) + return NULL; + startp = PyUnicode_AS_UNICODE(object); + for (p = startp+start, ressize = 0; p < startp+end; ++p) { + if (*p >= 0x00010000) + ressize += 1+1+8; + else if (*p >= 0x100) { + ressize += 1+1+4; + } + else + ressize += 1+1+2; + } + res = PyUnicode_FromUnicode(NULL, ressize); + if (res==NULL) + return NULL; + for (p = startp+start, outp = PyUnicode_AS_UNICODE(res); + p < startp+end; ++p) { + Py_UNICODE c = *p; + *outp++ = '\\'; + if (c >= 0x00010000) { + *outp++ = 'U'; + *outp++ = hexdigits[(c>>28)&0xf]; + *outp++ = hexdigits[(c>>24)&0xf]; + *outp++ = hexdigits[(c>>20)&0xf]; + *outp++ = hexdigits[(c>>16)&0xf]; + *outp++ = hexdigits[(c>>12)&0xf]; + *outp++ = hexdigits[(c>>8)&0xf]; + } + else if (c >= 0x100) { + *outp++ = 'u'; + *outp++ = hexdigits[(c>>12)&0xf]; + *outp++ = hexdigits[(c>>8)&0xf]; + } + else + *outp++ = 'x'; + *outp++ = hexdigits[(c>>4)&0xf]; + *outp++ = hexdigits[c&0xf]; + } + + restuple = Py_BuildValue("(Oi)", res, end); + Py_DECREF(res); + Py_DECREF(object); + return restuple; + } + else { 
+ wrong_exception_type(exc); + return NULL; + } +} + +static PyObject *strict_errors(PyObject *self, PyObject *exc) +{ + return PyCodec_StrictErrors(exc); +} + + +static PyObject *ignore_errors(PyObject *self, PyObject *exc) +{ + return PyCodec_IgnoreErrors(exc); +} + + +static PyObject *replace_errors(PyObject *self, PyObject *exc) +{ + return PyCodec_ReplaceErrors(exc); +} + + +static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc) +{ + return PyCodec_XMLCharRefReplaceErrors(exc); +} + + +static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc) +{ + return PyCodec_BackslashReplaceErrors(exc); +} + + void _PyCodecRegistry_Init(void) { + static struct { + char *name; + PyMethodDef def; + } methods[] = + { + { + "strict", + { + "strict_errors", + strict_errors, + METH_O + } + }, + { + "ignore", + { + "ignore_errors", + ignore_errors, + METH_O + } + }, + { + "replace", + { + "replace_errors", + replace_errors, + METH_O + } + }, + { + "xmlcharrefreplace", + { + "xmlcharrefreplace_errors", + xmlcharrefreplace_errors, + METH_O + } + }, + { + "backslashreplace", + { + "backslashreplace_errors", + backslashreplace_errors, + METH_O + } + } + }; if (_PyCodec_SearchPath == NULL) _PyCodec_SearchPath = PyList_New(0); if (_PyCodec_SearchCache == NULL) _PyCodec_SearchCache = PyDict_New(); + if (_PyCodec_ErrorRegistry == NULL) { + int i; + _PyCodec_ErrorRegistry = PyDict_New(); + + if (_PyCodec_ErrorRegistry) { + for (i = 0; i < 5; ++i) { + PyObject *func = PyCFunction_New(&methods[i].def, NULL); + int res; + if (!func) + Py_FatalError("can't initialize codec error registry"); + res = PyCodec_RegisterError(methods[i].name, func); + Py_DECREF(func); + if (res) + Py_FatalError("can't initialize codec error registry"); + } + } + } if (_PyCodec_SearchPath == NULL || _PyCodec_SearchCache == NULL) Py_FatalError("can't initialize codec registry"); @@ -439,4 +836,6 @@ void _PyCodecRegistry_Fini(void) _PyCodec_SearchPath = NULL; 
Py_XDECREF(_PyCodec_SearchCache); _PyCodec_SearchCache = NULL; + Py_XDECREF(_PyCodec_ErrorRegistry); + _PyCodec_ErrorRegistry = NULL; } diff --git a/Python/exceptions.c b/Python/exceptions.c index c4bd626dedf..1667cd9b66a 100644 --- a/Python/exceptions.c +++ b/Python/exceptions.c @@ -100,6 +100,10 @@ Exception\n\ | +-- ValueError\n\ | | |\n\ | | +-- UnicodeError\n\ + | | |\n\ + | | +-- UnicodeEncodeError\n\ + | | +-- UnicodeDecodeError\n\ + | | +-- UnicodeTranslateError\n\ | |\n\ | +-- ReferenceError\n\ | +-- SystemError\n\ @@ -840,6 +844,590 @@ static PyMethodDef SyntaxError_methods[] = { }; +static +int get_int(PyObject *exc, const char *name, int *value) +{ + PyObject *attr = PyObject_GetAttrString(exc, (char *)name); + + if (!attr) + return -1; + if (!PyInt_Check(attr)) { + PyErr_Format(PyExc_TypeError, "%s attribute must be int", name); + Py_DECREF(attr); + return -1; + } + *value = PyInt_AS_LONG(attr); + Py_DECREF(attr); + return 0; +} + + +static +int set_int(PyObject *exc, const char *name, int value) +{ + PyObject *obj = PyInt_FromLong(value); + int result; + + if (!obj) + return -1; + result = PyObject_SetAttrString(exc, (char *)name, obj); + Py_DECREF(obj); + return result; +} + + +static +PyObject *get_string(PyObject *exc, const char *name) +{ + PyObject *attr = PyObject_GetAttrString(exc, (char *)name); + + if (!attr) + return NULL; + if (!PyString_Check(attr)) { + PyErr_Format(PyExc_TypeError, "%s attribute must be str", name); + Py_DECREF(attr); + return NULL; + } + return attr; +} + + +static +int set_string(PyObject *exc, const char *name, const char *value) +{ + PyObject *obj = PyString_FromString(value); + int result; + + if (!obj) + return -1; + result = PyObject_SetAttrString(exc, (char *)name, obj); + Py_DECREF(obj); + return result; +} + + +static +PyObject *get_unicode(PyObject *exc, const char *name) +{ + PyObject *attr = PyObject_GetAttrString(exc, (char *)name); + + if (!attr) + return NULL; + if (!PyUnicode_Check(attr)) { + 
PyErr_Format(PyExc_TypeError, "%s attribute must be unicode", name); + Py_DECREF(attr); + return NULL; + } + return attr; +} + +PyObject * PyUnicodeEncodeError_GetEncoding(PyObject *exc) +{ + return get_string(exc, "encoding"); +} + +PyObject * PyUnicodeDecodeError_GetEncoding(PyObject *exc) +{ + return get_string(exc, "encoding"); +} + +PyObject * PyUnicodeTranslateError_GetEncoding(PyObject *exc) +{ + return get_string(exc, "encoding"); +} + +PyObject *PyUnicodeEncodeError_GetObject(PyObject *exc) +{ + return get_unicode(exc, "object"); +} + +PyObject *PyUnicodeDecodeError_GetObject(PyObject *exc) +{ + return get_string(exc, "object"); +} + +PyObject *PyUnicodeTranslateError_GetObject(PyObject *exc) +{ + return get_unicode(exc, "object"); +} + +int PyUnicodeEncodeError_GetStart(PyObject *exc, int *start) +{ + if (!get_int(exc, "start", start)) { + PyObject *object = PyUnicodeEncodeError_GetObject(exc); + int size; + if (!object) + return -1; + size = PyUnicode_GET_SIZE(object); + if (*start<0) + *start = 0; + if (*start>=size) + *start = size-1; + Py_DECREF(object); + return 0; + } + return -1; +} + + +int PyUnicodeDecodeError_GetStart(PyObject *exc, int *start) +{ + if (!get_int(exc, "start", start)) { + PyObject *object = PyUnicodeDecodeError_GetObject(exc); + int size; + if (!object) + return -1; + size = PyString_GET_SIZE(object); + if (*start<0) + *start = 0; + if (*start>=size) + *start = size-1; + Py_DECREF(object); + return 0; + } + return -1; +} + + +int PyUnicodeTranslateError_GetStart(PyObject *exc, int *start) +{ + return PyUnicodeEncodeError_GetStart(exc, start); +} + + +int PyUnicodeEncodeError_SetStart(PyObject *exc, int start) +{ + return set_int(exc, "start", start); +} + + +int PyUnicodeDecodeError_SetStart(PyObject *exc, int start) +{ + return set_int(exc, "start", start); +} + + +int PyUnicodeTranslateError_SetStart(PyObject *exc, int start) +{ + return set_int(exc, "start", start); +} + + +int PyUnicodeEncodeError_GetEnd(PyObject *exc, int 
*end) +{ + if (!get_int(exc, "end", end)) { + PyObject *object = PyUnicodeEncodeError_GetObject(exc); + int size; + if (!object) + return -1; + size = PyUnicode_GET_SIZE(object); + if (*end<1) + *end = 1; + if (*end>size) + *end = size; + Py_DECREF(object); + return 0; + } + return -1; +} + + +int PyUnicodeDecodeError_GetEnd(PyObject *exc, int *end) +{ + if (!get_int(exc, "end", end)) { + PyObject *object = PyUnicodeDecodeError_GetObject(exc); + int size; + if (!object) + return -1; + size = PyString_GET_SIZE(object); + if (*end<1) + *end = 1; + if (*end>size) + *end = size; + Py_DECREF(object); + return 0; + } + return -1; +} + + +int PyUnicodeTranslateError_GetEnd(PyObject *exc, int *start) +{ + return PyUnicodeEncodeError_GetEnd(exc, start); +} + + +int PyUnicodeEncodeError_SetEnd(PyObject *exc, int end) +{ + return set_int(exc, "end", end); +} + + +int PyUnicodeDecodeError_SetEnd(PyObject *exc, int end) +{ + return set_int(exc, "end", end); +} + + +int PyUnicodeTranslateError_SetEnd(PyObject *exc, int end) +{ + return set_int(exc, "end", end); +} + + +PyObject *PyUnicodeEncodeError_GetReason(PyObject *exc) +{ + return get_string(exc, "reason"); +} + + +PyObject *PyUnicodeDecodeError_GetReason(PyObject *exc) +{ + return get_string(exc, "reason"); +} + + +PyObject *PyUnicodeTranslateError_GetReason(PyObject *exc) +{ + return get_string(exc, "reason"); +} + + +int PyUnicodeEncodeError_SetReason(PyObject *exc, const char *reason) +{ + return set_string(exc, "reason", reason); +} + + +int PyUnicodeDecodeError_SetReason(PyObject *exc, const char *reason) +{ + return set_string(exc, "reason", reason); +} + + +int PyUnicodeTranslateError_SetReason(PyObject *exc, const char *reason) +{ + return set_string(exc, "reason", reason); +} + + +static PyObject * +UnicodeError__init__(PyObject *self, PyObject *args, PyTypeObject *objecttype) +{ + PyObject *rtnval = NULL; + PyObject *encoding; + PyObject *object; + PyObject *start; + PyObject *end; + PyObject *reason; + + if 
(!(self = get_self(args))) + return NULL; + + if (!(args = PySequence_GetSlice(args, 1, PySequence_Size(args)))) + return NULL; + + if (!PyArg_ParseTuple(args, "O!O!O!O!O!", + &PyString_Type, &encoding, + objecttype, &object, + &PyInt_Type, &start, + &PyInt_Type, &end, + &PyString_Type, &reason)) + return NULL; + + if (PyObject_SetAttrString(self, "args", args)) + goto finally; + + if (PyObject_SetAttrString(self, "encoding", encoding)) + goto finally; + if (PyObject_SetAttrString(self, "object", object)) + goto finally; + if (PyObject_SetAttrString(self, "start", start)) + goto finally; + if (PyObject_SetAttrString(self, "end", end)) + goto finally; + if (PyObject_SetAttrString(self, "reason", reason)) + goto finally; + + Py_INCREF(Py_None); + rtnval = Py_None; + + finally: + Py_DECREF(args); + return rtnval; +} + + +static PyObject * +UnicodeEncodeError__init__(PyObject *self, PyObject *args) +{ + return UnicodeError__init__(self, args, &PyUnicode_Type); +} + +static PyObject * +UnicodeEncodeError__str__(PyObject *self, PyObject *arg) +{ + PyObject *encodingObj = NULL; + PyObject *objectObj = NULL; + int length; + int start; + int end; + PyObject *reasonObj = NULL; + char buffer[1000]; + PyObject *result = NULL; + + self = arg; + + if (!(encodingObj = PyUnicodeEncodeError_GetEncoding(self))) + goto error; + + if (!(objectObj = PyUnicodeEncodeError_GetObject(self))) + goto error; + + length = PyUnicode_GET_SIZE(objectObj); + + if (PyUnicodeEncodeError_GetStart(self, &start)) + goto error; + + if (PyUnicodeEncodeError_GetEnd(self, &end)) + goto error; + + if (!(reasonObj = PyUnicodeEncodeError_GetReason(self))) + goto error; + + if (end==start+1) { + PyOS_snprintf(buffer, sizeof(buffer), + "'%.400s' codec can't encode character '\\u%x' in position %d: %.400s", + PyString_AS_STRING(encodingObj), + (int)PyUnicode_AS_UNICODE(objectObj)[start], + start, + PyString_AS_STRING(reasonObj) + ); + } + else { + PyOS_snprintf(buffer, sizeof(buffer), + "'%.400s' codec can't 
encode characters in position %d-%d: %.400s", + PyString_AS_STRING(encodingObj), + start, + end-1, + PyString_AS_STRING(reasonObj) + ); + } + result = PyString_FromString(buffer); + +error: + Py_XDECREF(reasonObj); + Py_XDECREF(objectObj); + Py_XDECREF(encodingObj); + return result; +} + +static PyMethodDef UnicodeEncodeError_methods[] = { + {"__init__", UnicodeEncodeError__init__, METH_VARARGS}, + {"__str__", UnicodeEncodeError__str__, METH_O}, + {NULL, NULL} +}; + + +PyObject * PyUnicodeEncodeError_Create( + const char *encoding, const Py_UNICODE *object, int length, + int start, int end, const char *reason) +{ + return PyObject_CallFunction(PyExc_UnicodeEncodeError, "su#iis", + encoding, object, length, start, end, reason); +} + + +static PyObject * +UnicodeDecodeError__init__(PyObject *self, PyObject *args) +{ + return UnicodeError__init__(self, args, &PyString_Type); +} + +static PyObject * +UnicodeDecodeError__str__(PyObject *self, PyObject *arg) +{ + PyObject *encodingObj = NULL; + PyObject *objectObj = NULL; + int length; + int start; + int end; + PyObject *reasonObj = NULL; + char buffer[1000]; + PyObject *result = NULL; + + self = arg; + + if (!(encodingObj = PyUnicodeDecodeError_GetEncoding(self))) + goto error; + + if (!(objectObj = PyUnicodeDecodeError_GetObject(self))) + goto error; + + length = PyString_GET_SIZE(objectObj); + + if (PyUnicodeDecodeError_GetStart(self, &start)) + goto error; + + if (PyUnicodeDecodeError_GetEnd(self, &end)) + goto error; + + if (!(reasonObj = PyUnicodeDecodeError_GetReason(self))) + goto error; + + if (end==start+1) { + PyOS_snprintf(buffer, sizeof(buffer), + "'%.400s' codec can't decode byte 0x%x in position %d: %.400s", + PyString_AS_STRING(encodingObj), + ((int)PyString_AS_STRING(objectObj)[start])&0xff, + start, + PyString_AS_STRING(reasonObj) + ); + } + else { + PyOS_snprintf(buffer, sizeof(buffer), + "'%.400s' codec can't decode bytes in position %d-%d: %.400s", + PyString_AS_STRING(encodingObj), + start, + end-1, 
+ PyString_AS_STRING(reasonObj) + ); + } + result = PyString_FromString(buffer); + +error: + Py_XDECREF(reasonObj); + Py_XDECREF(objectObj); + Py_XDECREF(encodingObj); + return result; +} + +static PyMethodDef UnicodeDecodeError_methods[] = { + {"__init__", UnicodeDecodeError__init__, METH_VARARGS}, + {"__str__", UnicodeDecodeError__str__, METH_O}, + {NULL, NULL} +}; + + +PyObject * PyUnicodeDecodeError_Create( + const char *encoding, const char *object, int length, + int start, int end, const char *reason) +{ + return PyObject_CallFunction(PyExc_UnicodeDecodeError, "ss#iis", + encoding, object, length, start, end, reason); +} + + +static PyObject * +UnicodeTranslateError__init__(PyObject *self, PyObject *args) +{ + PyObject *rtnval = NULL; + PyObject *object; + PyObject *start; + PyObject *end; + PyObject *reason; + + if (!(self = get_self(args))) + return NULL; + + if (!(args = PySequence_GetSlice(args, 1, PySequence_Size(args)))) + return NULL; + + if (!PyArg_ParseTuple(args, "O!O!O!O!", + &PyUnicode_Type, &object, + &PyInt_Type, &start, + &PyInt_Type, &end, + &PyString_Type, &reason)) + goto finally; + + if (PyObject_SetAttrString(self, "args", args)) + goto finally; + + if (PyObject_SetAttrString(self, "object", object)) + goto finally; + if (PyObject_SetAttrString(self, "start", start)) + goto finally; + if (PyObject_SetAttrString(self, "end", end)) + goto finally; + if (PyObject_SetAttrString(self, "reason", reason)) + goto finally; + + Py_INCREF(Py_None); + rtnval = Py_None; + + finally: + Py_DECREF(args); + return rtnval; +} + + +static PyObject * +UnicodeTranslateError__str__(PyObject *self, PyObject *arg) +{ + PyObject *objectObj = NULL; + int length; + int start; + int end; + PyObject *reasonObj = NULL; + char buffer[1000]; + PyObject *result = NULL; + + self = arg; + + if (!(objectObj = PyUnicodeTranslateError_GetObject(self))) + goto error; + + length = PyUnicode_GET_SIZE(objectObj); + + if (PyUnicodeTranslateError_GetStart(self, &start)) + goto 
error; + + if (PyUnicodeTranslateError_GetEnd(self, &end)) + goto error; + + if (!(reasonObj = PyUnicodeTranslateError_GetReason(self))) + goto error; + + if (end==start+1) { + PyOS_snprintf(buffer, sizeof(buffer), + "can't translate character '\\u%x' in position %d: %.400s", + (int)PyUnicode_AS_UNICODE(objectObj)[start], + start, + PyString_AS_STRING(reasonObj) + ); + } + else { + PyOS_snprintf(buffer, sizeof(buffer), + "can't translate characters in position %d-%d: %.400s", + start, + end-1, + PyString_AS_STRING(reasonObj) + ); + } + result = PyString_FromString(buffer); + +error: + Py_XDECREF(reasonObj); + Py_XDECREF(objectObj); + return result; +} + +static PyMethodDef UnicodeTranslateError_methods[] = { + {"__init__", UnicodeTranslateError__init__, METH_VARARGS}, + {"__str__", UnicodeTranslateError__str__, METH_O}, + {NULL, NULL} +}; + + +PyObject * PyUnicodeTranslateError_Create( + const Py_UNICODE *object, int length, + int start, int end, const char *reason) +{ + return PyObject_CallFunction(PyExc_UnicodeTranslateError, "u#iis", + object, length, start, end, reason); +} + + /* Exception doc strings */ @@ -865,6 +1453,12 @@ PyDoc_STRVAR(ValueError__doc__, PyDoc_STRVAR(UnicodeError__doc__, "Unicode related error."); +PyDoc_STRVAR(UnicodeEncodeError__doc__, "Unicode encoding error."); + +PyDoc_STRVAR(UnicodeDecodeError__doc__, "Unicode decoding error."); + +PyDoc_STRVAR(UnicodeTranslateError__doc__, "Unicode translation error."); + PyDoc_STRVAR(SystemError__doc__, "Internal error in the Python interpreter.\n\ \n\ @@ -949,6 +1543,9 @@ PyObject *PyExc_SystemError; PyObject *PyExc_SystemExit; PyObject *PyExc_UnboundLocalError; PyObject *PyExc_UnicodeError; +PyObject *PyExc_UnicodeEncodeError; +PyObject *PyExc_UnicodeDecodeError; +PyObject *PyExc_UnicodeTranslateError; PyObject *PyExc_TypeError; PyObject *PyExc_ValueError; PyObject *PyExc_ZeroDivisionError; @@ -1035,6 +1632,12 @@ static struct { FloatingPointError__doc__}, {"ValueError", &PyExc_ValueError, 0, 
ValueError__doc__}, {"UnicodeError", &PyExc_UnicodeError, &PyExc_ValueError, UnicodeError__doc__}, + {"UnicodeEncodeError", &PyExc_UnicodeEncodeError, &PyExc_UnicodeError, + UnicodeEncodeError__doc__, UnicodeEncodeError_methods}, + {"UnicodeDecodeError", &PyExc_UnicodeDecodeError, &PyExc_UnicodeError, + UnicodeDecodeError__doc__, UnicodeDecodeError_methods}, + {"UnicodeTranslateError", &PyExc_UnicodeTranslateError, &PyExc_UnicodeError, + UnicodeTranslateError__doc__, UnicodeTranslateError_methods}, {"ReferenceError", &PyExc_ReferenceError, 0, ReferenceError__doc__}, {"SystemError", &PyExc_SystemError, 0, SystemError__doc__}, {"MemoryError", &PyExc_MemoryError, 0, MemoryError__doc__},