Close #17828: better handling of codec errors

- output type errors now redirect users to the type-neutral convenience functions in the codecs module - stateless errors that occur during encoding and decoding will now be automatically wrapped in exceptions that give the name of the codec involved
2013-11-13 23:49:21 +10:00 · 2013-11-13 23:49:21 +10:00 · 8b097b4ed7
parent 59799a8399
commit 8b097b4ed7
7 changed files with 414 additions and 46 deletions
--- a/Doc/whatsnew/3.4.rst
+++ b/Doc/whatsnew/3.4.rst
@ -102,6 +102,7 @@ New expected features for Python implementations:
 * :ref:`PEP 446: Make newly created file descriptors non-inheritable <pep-446>`.
 * command line option for :ref:`isolated mode <using-on-misc-options>`,
  (:issue:`16499`).
+* improvements to handling of non-Unicode codecs

 Significantly Improved Library Modules:

@ -170,6 +171,70 @@ PEP 446: Make newly created file descriptors non-inheritable
      PEP written and implemented by Victor Stinner.


+Improvements to handling of non-Unicode codecs
+==============================================
+
+Since it was first introduced, the :mod:`codecs` module has always been
+intended to operate as a type-neutral dynamic encoding and decoding
+system. However, its close coupling with the Python text model, especially
+the type restricted convenience methods on the builtin :class:`str`,
+:class:`bytes` and :class:`bytearray` types, has historically obscured that
+fact.
+
+As a key step in clarifying the situation, the :meth:`codecs.encode` and
+:meth:`codecs.decode` convenience functions are now properly documented in
+Python 2.7, 3.3 and 3.4. These functions have existed in the :mod:`codecs`
+module and have been covered by the regression test suite since Python 2.4,
+but were previously only discoverable through runtime introspection.
+
+Unlike the convenience methods on :class:`str`, :class:`bytes` and
+:class:`bytearray`, these convenience functions support arbitrary codecs
+in both Python 2 and Python 3, rather than being limited to Unicode text
+encodings (in Python 3) or ``basestring`` <-> ``basestring`` conversions
+(in Python 2).
+
+In Python 3.4, the errors raised by the convenience methods when a codec
+produces the incorrect output type have also been updated to direct users
+towards these general purpose convenience functions::
+
+    >>> import codecs
+
+    >>> codecs.encode(b"hello", "bz2_codec").decode("bz2_codec")
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+    TypeError: 'bz2_codec' decoder returned 'bytes' instead of 'str'; use codecs.decode() to decode to arbitrary types
+
+    >>> "hello".encode("rot_13")
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+    TypeError: 'rot_13' encoder returned 'str' instead of 'bytes'; use codecs.encode() to encode to arbitrary types
+
+In a related change, whenever it is feasible without breaking backwards
+compatibility, exceptions raised during encoding and decoding operations
+will be wrapped in a chained exception of the same type that mentions the
+name of the codec responsible for producing the error::
+
+    >>> b"hello".decode("uu_codec")
+    ValueError: Missing "begin" line in input data
+
+    The above exception was the direct cause of the following exception:
+
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+    ValueError: decoding with 'uu_codec' codec failed (ValueError: Missing "begin" line in input data)
+
+    >>> "hello".encode("bz2_codec")
+    TypeError: 'str' does not support the buffer interface
+
+    The above exception was the direct cause of the following exception:
+
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+    TypeError: encoding with 'bz2_codec' codec failed (TypeError: 'str' does not support the buffer interface)
+
+(Contributed by Nick Coghlan in :issue:`17827` and :issue:`17828`)
+
+
 Other Language Changes
 ======================

@ -262,19 +327,6 @@ audioop
 Added support for 24-bit samples (:issue:`12866`).


-codecs
------
-
-The :meth:`codecs.encode` and :meth:`codecs.decode` convenience functions are
-now properly documented. These functions have existed in the :mod:`codecs`
-module since ~2004, but were previously only discoverable through runtime
-introspection.
-
-Unlike the convenience methods on :class:`str`, :class:`bytes` and
-:class:`bytearray`, these convenience functions support arbitrary codecs,
-rather than being limited to Unicode text encodings.
-
-
 colorsys
 --------

--- a/Include/pyerrors.h
+++ b/Include/pyerrors.h
@ -285,6 +285,28 @@ PyAPI_FUNC(PyObject *) PyErr_NewExceptionWithDoc(
    const char *name, const char *doc, PyObject *base, PyObject *dict);
 PyAPI_FUNC(void) PyErr_WriteUnraisable(PyObject *);

+/* In exceptions.c */
+#ifndef Py_LIMITED_API
+/* Helper that attempts to replace the current exception with one of the
+ * same type but with a prefix added to the exception text. The resulting
+ * exception description looks like:
+ *
+ *     prefix (exc_type: original_exc_str)
+ *
+ * Only some exceptions can be safely replaced. If the function determines
+ * it isn't safe to perform the replacement, it will leave the original
+ * unmodified exception in place.
+ *
+ * Returns a borrowed reference to the new exception (if any), NULL if the
+ * existing exception was left in place.
+ */
+PyAPI_FUNC(PyObject *) _PyErr_TrySetFromCause(
+    const char *prefix_format,   /* ASCII-encoded string  */
+    ...
+    );
+#endif
+
+
 /* In sigcheck.c or signalmodule.c */
 PyAPI_FUNC(int) PyErr_CheckSignals(void);
 PyAPI_FUNC(void) PyErr_SetInterrupt(void);
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@ -1,5 +1,6 @@
 import _testcapi
 import codecs
+import contextlib
 import io
 import locale
 import sys
@ -2292,28 +2293,31 @@ class TransformCodecTest(unittest.TestCase):
    def test_basics(self):
        binput = bytes(range(256))
        for encoding in bytes_transform_encodings:
-            # generic codecs interface
-            (o, size) = codecs.getencoder(encoding)(binput)
-            self.assertEqual(size, len(binput))
-            (i, size) = codecs.getdecoder(encoding)(o)
-            self.assertEqual(size, len(o))
-            self.assertEqual(i, binput)
+            with self.subTest(encoding=encoding):
+                # generic codecs interface
+                (o, size) = codecs.getencoder(encoding)(binput)
+                self.assertEqual(size, len(binput))
+                (i, size) = codecs.getdecoder(encoding)(o)
+                self.assertEqual(size, len(o))
+                self.assertEqual(i, binput)

    def test_read(self):
        for encoding in bytes_transform_encodings:
-            sin = codecs.encode(b"\x80", encoding)
-            reader = codecs.getreader(encoding)(io.BytesIO(sin))
-            sout = reader.read()
-            self.assertEqual(sout, b"\x80")
+            with self.subTest(encoding=encoding):
+                sin = codecs.encode(b"\x80", encoding)
+                reader = codecs.getreader(encoding)(io.BytesIO(sin))
+                sout = reader.read()
+                self.assertEqual(sout, b"\x80")

    def test_readline(self):
        for encoding in bytes_transform_encodings:
            if encoding in ['uu_codec', 'zlib_codec']:
                continue
-            sin = codecs.encode(b"\x80", encoding)
-            reader = codecs.getreader(encoding)(io.BytesIO(sin))
-            sout = reader.readline()
-            self.assertEqual(sout, b"\x80")
+            with self.subTest(encoding=encoding):
+                sin = codecs.encode(b"\x80", encoding)
+                reader = codecs.getreader(encoding)(io.BytesIO(sin))
+                sout = reader.readline()
+                self.assertEqual(sout, b"\x80")

    def test_buffer_api_usage(self):
        # We check all the transform codecs accept memoryview input
@ -2321,17 +2325,158 @@ class TransformCodecTest(unittest.TestCase):
        # and also that they roundtrip correctly
        original = b"12345\x80"
        for encoding in bytes_transform_encodings:
-            data = original
-            view = memoryview(data)
-            data = codecs.encode(data, encoding)
-            view_encoded = codecs.encode(view, encoding)
-            self.assertEqual(view_encoded, data)
-            view = memoryview(data)
-            data = codecs.decode(data, encoding)
-            self.assertEqual(data, original)
-            view_decoded = codecs.decode(view, encoding)
-            self.assertEqual(view_decoded, data)
+            with self.subTest(encoding=encoding):
+                data = original
+                view = memoryview(data)
+                data = codecs.encode(data, encoding)
+                view_encoded = codecs.encode(view, encoding)
+                self.assertEqual(view_encoded, data)
+                view = memoryview(data)
+                data = codecs.decode(data, encoding)
+                self.assertEqual(data, original)
+                view_decoded = codecs.decode(view, encoding)
+                self.assertEqual(view_decoded, data)

+    def test_type_error_for_text_input(self):
+        # Check binary -> binary codecs give a good error for str input
+        bad_input = "bad input type"
+        for encoding in bytes_transform_encodings:
+            with self.subTest(encoding=encoding):
+                msg = "^encoding with '{}' codec failed".format(encoding)
+                with self.assertRaisesRegex(TypeError, msg) as failure:
+                    bad_input.encode(encoding)
+                self.assertTrue(isinstance(failure.exception.__cause__,
+                                           TypeError))
+
+    def test_type_error_for_binary_input(self):
+        # Check str -> str codec gives a good error for binary input
+        for bad_input in (b"immutable", bytearray(b"mutable")):
+            with self.subTest(bad_input=bad_input):
+                msg = "^decoding with 'rot_13' codec failed"
+                with self.assertRaisesRegex(AttributeError, msg) as failure:
+                    bad_input.decode("rot_13")
+                self.assertTrue(isinstance(failure.exception.__cause__,
+                                           AttributeError))
+
+    def test_bad_decoding_output_type(self):
+        # Check bytes.decode and bytearray.decode give a good error
+        # message for binary -> binary codecs
+        data = b"encode first to ensure we meet any format restrictions"
+        for encoding in bytes_transform_encodings:
+            with self.subTest(encoding=encoding):
+                encoded_data = codecs.encode(data, encoding)
+                fmt = ("'{}' decoder returned 'bytes' instead of 'str'; "
+                       "use codecs.decode\(\) to decode to arbitrary types")
+                msg = fmt.format(encoding)
+                with self.assertRaisesRegex(TypeError, msg):
+                    encoded_data.decode(encoding)
+                with self.assertRaisesRegex(TypeError, msg):
+                    bytearray(encoded_data).decode(encoding)
+
+    def test_bad_encoding_output_type(self):
+        # Check str.encode gives a good error message for str -> str codecs
+        msg = ("'rot_13' encoder returned 'str' instead of 'bytes'; "
+               "use codecs.encode\(\) to encode to arbitrary types")
+        with self.assertRaisesRegex(TypeError, msg):
+            "just an example message".encode("rot_13")
+
+
+# The codec system tries to wrap exceptions in order to ensure the error
+# mentions the operation being performed and the codec involved. We
+# currently *only* want this to happen for relatively stateless
+# exceptions, where the only significant information they contain is their
+# type and a single str argument.
+class ExceptionChainingTest(unittest.TestCase):
+
+    def setUp(self):
+        # There's no way to unregister a codec search function, so we just
+        # ensure we render this one fairly harmless after the test
+        # case finishes by using the test case repr as the codec name
+        # The codecs module normalizes codec names, although this doesn't
+        # appear to be formally documented...
+        self.codec_name = repr(self).lower().replace(" ", "-")
+        self.codec_info = None
+        codecs.register(self.get_codec)
+
+    def get_codec(self, codec_name):
+        if codec_name != self.codec_name:
+            return None
+        return self.codec_info
+
+    def set_codec(self, obj_to_raise):
+        def raise_obj(*args, **kwds):
+            raise obj_to_raise
+        self.codec_info = codecs.CodecInfo(raise_obj, raise_obj,
+                                           name=self.codec_name)
+
+    @contextlib.contextmanager
+    def assertWrapped(self, operation, exc_type, msg):
+        full_msg = "{} with '{}' codec failed \({}: {}\)".format(
+                  operation, self.codec_name, exc_type.__name__, msg)
+        with self.assertRaisesRegex(exc_type, full_msg) as caught:
+            yield caught
+
+    def check_wrapped(self, obj_to_raise, msg):
+        self.set_codec(obj_to_raise)
+        with self.assertWrapped("encoding", RuntimeError, msg):
+            "str_input".encode(self.codec_name)
+        with self.assertWrapped("encoding", RuntimeError, msg):
+            codecs.encode("str_input", self.codec_name)
+        with self.assertWrapped("decoding", RuntimeError, msg):
+            b"bytes input".decode(self.codec_name)
+        with self.assertWrapped("decoding", RuntimeError, msg):
+            codecs.decode(b"bytes input", self.codec_name)
+
+    def test_raise_by_type(self):
+        self.check_wrapped(RuntimeError, "")
+
+    def test_raise_by_value(self):
+        msg = "This should be wrapped"
+        self.check_wrapped(RuntimeError(msg), msg)
+
+    @contextlib.contextmanager
+    def assertNotWrapped(self, operation, exc_type, msg):
+        with self.assertRaisesRegex(exc_type, msg) as caught:
+            yield caught
+        actual_msg = str(caught.exception)
+        self.assertNotIn(operation, actual_msg)
+        self.assertNotIn(self.codec_name, actual_msg)
+
+    def check_not_wrapped(self, obj_to_raise, msg):
+        self.set_codec(obj_to_raise)
+        with self.assertNotWrapped("encoding", RuntimeError, msg):
+            "str input".encode(self.codec_name)
+        with self.assertNotWrapped("encoding", RuntimeError, msg):
+            codecs.encode("str input", self.codec_name)
+        with self.assertNotWrapped("decoding", RuntimeError, msg):
+            b"bytes input".decode(self.codec_name)
+        with self.assertNotWrapped("decoding", RuntimeError, msg):
+            codecs.decode(b"bytes input", self.codec_name)
+
+    def test_init_override_is_not_wrapped(self):
+        class CustomInit(RuntimeError):
+            def __init__(self):
+                pass
+        self.check_not_wrapped(CustomInit, "")
+
+    def test_new_override_is_not_wrapped(self):
+        class CustomNew(RuntimeError):
+            def __new__(cls):
+                return super().__new__(cls)
+        self.check_not_wrapped(CustomNew, "")
+
+    def test_instance_attribute_is_not_wrapped(self):
+        msg = "This should NOT be wrapped"
+        exc = RuntimeError(msg)
+        exc.attr = 1
+        self.check_not_wrapped(exc, msg)
+
+    def test_non_str_arg_is_not_wrapped(self):
+        self.check_not_wrapped(RuntimeError(1), "1")
+
+    def test_multiple_args_is_not_wrapped(self):
+        msg = "\('a', 'b', 'c'\)"
+        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg)


@unittest.skipUnless(sys.platform == 'win32',
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -10,6 +10,15 @@ Projected release date: 2013-11-24
 Core and Builtins
 -----------------

+- Issue #17828: Output type errors in str.encode(), bytes.decode() and
+  bytearray.decode() now direct users to codecs.encode() or codecs.decode()
+  as appropriate.
+
+- Issue #17828: The interpreter now attempts to chain errors that occur in
+  codec processing with a replacement exception of the same type that
+  includes the codec name in the error message. It ensures it only does this
+  when the creation of the replacement exception won't lose any information.
+
 - Issue #19466: Clear the frames of daemon threads earlier during the
  Python shutdown to call objects destructors. So "unclosed file" resource
  warnings are now corretly emitted for daemon threads.
--- a/Objects/exceptions.c
+++ b/Objects/exceptions.c
@ -2591,3 +2591,116 @@ _PyExc_Fini(void)
    free_preallocated_memerrors();
    Py_CLEAR(errnomap);
 }
+
+/* Helper to do the equivalent of "raise X from Y" in C, but always using
+ * the current exception rather than passing one in.
+ *
+ * We currently limit this to *only* exceptions that use the BaseException
+ * tp_init and tp_new methods, since we can be reasonably sure we can wrap
+ * those correctly without losing data and without losing backwards
+ * compatibility.
+ *
+ * We also aim to rule out *all* exceptions that might be storing additional
+ * state, whether by having a size difference relative to BaseException,
+ * additional arguments passed in during construction or by having a
+ * non-empty instance dict.
+ *
+ * We need to be very careful with what we wrap, since changing types to
+ * a broader exception type would be backwards incompatible for
+ * existing codecs, and with different init or new method implementations
+ * may either not support instantiation with PyErr_Format or lose
+ * information when instantiated that way.
+ *
+ * XXX (ncoghlan): This could be made more comprehensive by exploiting the
+ * fact that exceptions are expected to support pickling. If more builtin
+ * exceptions (e.g. AttributeError) start to be converted to rich
+ * exceptions with additional attributes, that's probably a better approach
+ * to pursue over adding special cases for particular stateful subclasses.
+ *
+ * Returns a borrowed reference to the new exception (if any), NULL if the
+ * existing exception was left in place.
+ */
+PyObject *
+_PyErr_TrySetFromCause(const char *format, ...)
+{
+    PyObject* msg_prefix;
+    PyObject *exc, *val, *tb;
+    PyTypeObject *caught_type;
+    PyObject *instance_dict;
+    PyObject *instance_args;
+    Py_ssize_t num_args;
+    PyObject *new_exc, *new_val, *new_tb;
+    va_list vargs;
+
+#ifdef HAVE_STDARG_PROTOTYPES
+    va_start(vargs, format);
+#else
+    va_start(vargs);
+#endif
+
+    PyErr_Fetch(&exc, &val, &tb);
+    caught_type = (PyTypeObject *) exc;
+    /* Ensure type info indicates no extra state is stored at the C level */
+    if (caught_type->tp_init != (initproc) BaseException_init ||
+        caught_type->tp_new != BaseException_new ||
+        caught_type->tp_basicsize != _PyExc_BaseException.tp_basicsize ||
+        caught_type->tp_itemsize != _PyExc_BaseException.tp_itemsize
+       ) {
+        /* We can't be sure we can wrap this safely, since it may contain
+         * more state than just the exception type. Accordingly, we just
+         * leave it alone.
+         */
+        PyErr_Restore(exc, val, tb);
+        return NULL;
+    }
+
+    /* Check the args are empty or contain a single string */
+    PyErr_NormalizeException(&exc, &val, &tb);
+    instance_args = ((PyBaseExceptionObject *) val)->args;
+    num_args = PyTuple_GET_SIZE(instance_args);
+    if ((num_args > 1) ||
+        (num_args == 1 &&
+         !PyUnicode_CheckExact(PyTuple_GET_ITEM(instance_args, 0))
+        )
+       ) {
+        /* More than 1 arg, or the one arg we do have isn't a string
+         */
+        PyErr_Restore(exc, val, tb);
+        return NULL;
+    }
+
+    /* Ensure the instance dict is also empty */
+    instance_dict = *_PyObject_GetDictPtr(val);
+    if (instance_dict != NULL && PyObject_Length(instance_dict) > 0) {
+        /* While we could potentially copy a non-empty instance dictionary
+         * to the replacement exception, for now we take the more
+         * conservative path of leaving exceptions with attributes set
+         * alone.
+         */
+        PyErr_Restore(exc, val, tb);
+        return NULL;
+    }
+
+    /* For exceptions that we can wrap safely, we chain the original
+     * exception to a new one of the exact same type with an
+     * error message that mentions the additional details and the
+     * original exception.
+     *
+     * It would be nice to wrap OSError and various other exception
+     * types as well, but that's quite a bit trickier due to the extra
+     * state potentially stored on OSError instances.
+     */
+    msg_prefix = PyUnicode_FromFormatV(format, vargs);
+    if (msg_prefix == NULL)
+        return NULL;
+
+    PyErr_Format(exc, "%U (%s: %S)",
+                 msg_prefix, Py_TYPE(val)->tp_name, val);
+    Py_DECREF(exc);
+    Py_XDECREF(tb);
+    PyErr_Fetch(&new_exc, &new_val, &new_tb);
+    PyErr_NormalizeException(&new_exc, &new_val, &new_tb);
+    PyException_SetCause(new_val, val);
+    PyErr_Restore(new_exc, new_val, new_tb);
+    return new_val;
+}
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -3054,8 +3054,10 @@ PyUnicode_Decode(const char *s,
        goto onError;
    if (!PyUnicode_Check(unicode)) {
        PyErr_Format(PyExc_TypeError,
-                     "decoder did not return a str object (type=%.400s)",
-                     Py_TYPE(unicode)->tp_name);
+                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
+                     "use codecs.decode() to decode to arbitrary types",
+                     encoding,
+                     Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
        Py_DECREF(unicode);
        goto onError;
    }
@ -3113,8 +3115,10 @@ PyUnicode_AsDecodedUnicode(PyObject *unicode,
        goto onError;
    if (!PyUnicode_Check(v)) {
        PyErr_Format(PyExc_TypeError,
-                     "decoder did not return a str object (type=%.400s)",
-                     Py_TYPE(v)->tp_name);
+                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
+                     "use codecs.decode() to decode to arbitrary types",
+                     encoding,
+                     Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
        Py_DECREF(v);
        goto onError;
    }
@ -3425,7 +3429,8 @@ PyUnicode_AsEncodedString(PyObject *unicode,
        PyObject *b;

        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
-            "encoder %s returned bytearray instead of bytes",
+            "encoder %s returned bytearray instead of bytes; "
+            "use codecs.encode() to encode to arbitrary types",
            encoding);
        if (error) {
            Py_DECREF(v);
@ -3438,8 +3443,10 @@ PyUnicode_AsEncodedString(PyObject *unicode,
    }

    PyErr_Format(PyExc_TypeError,
-                 "encoder did not return a bytes object (type=%.400s)",
-                 Py_TYPE(v)->tp_name);
+                 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
+                 "use codecs.encode() to encode to arbitrary types",
+                 encoding,
+                 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
    Py_DECREF(v);
    return NULL;
 }
@ -3465,8 +3472,10 @@ PyUnicode_AsEncodedUnicode(PyObject *unicode,
        goto onError;
    if (!PyUnicode_Check(v)) {
        PyErr_Format(PyExc_TypeError,
-                     "encoder did not return an str object (type=%.400s)",
-                     Py_TYPE(v)->tp_name);
+                     "'%.400s' encoder returned '%.400s' instead of 'str'; "
+                     "use codecs.encode() to encode to arbitrary types",
+                     encoding,
+                     Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
        Py_DECREF(v);
        goto onError;
    }
--- a/Python/codecs.c
+++ b/Python/codecs.c
@ -332,6 +332,22 @@ PyObject *PyCodec_StreamWriter(const char *encoding,
    return codec_getstreamcodec(encoding, stream, errors, 3);
 }

+/* Helper that tries to ensure the reported exception chain indicates the
+ * codec that was invoked to trigger the failure without changing the type
+ * of the exception raised.
+ */
+static void
+wrap_codec_error(const char *operation,
+                 const char *encoding)
+{
+    /* TrySetFromCause will replace the active exception with a suitably
+     * updated clone if it can, otherwise it will leave the original
+     * exception alone.
+     */
+    _PyErr_TrySetFromCause("%s with '%s' codec failed",
+                           operation, encoding);
+}
+
 /* Encode an object (e.g. an Unicode object) using the given encoding
   and return the resulting encoded object (usually a Python string).

@ -376,6 +392,7 @@ PyObject *PyCodec_Encode(PyObject *object,
    Py_XDECREF(result);
    Py_XDECREF(args);
    Py_XDECREF(encoder);
+    wrap_codec_error("encoding", encoding);
    return NULL;
 }

@ -422,6 +439,7 @@ PyObject *PyCodec_Decode(PyObject *object,
    Py_XDECREF(args);
    Py_XDECREF(decoder);
    Py_XDECREF(result);
+    wrap_codec_error("decoding", encoding);
    return NULL;
 }