Issue #20404: reject non-text encodings early in TextIOWrapper.

2014-03-02 09:18:31 +01:00 · 2014-03-02 09:18:31 +01:00 · 2fc8f773e1
parent 2658bad090
commit 2fc8f773e1
5 changed files with 134 additions and 39 deletions
--- a/Include/codecs.h
+++ b/Include/codecs.h
@@ -104,7 +104,14 @@ PyAPI_FUNC(PyObject *) PyCodec_Decode(
   Please note that these APIs are internal and should not
   be used in Python C extensions.

+   XXX (ncoghlan): should we make these, or something like them, public
+   in Python 3.5+?
+
 */
+PyAPI_FUNC(PyObject *) _PyCodec_LookupTextEncoding(
+       const char *encoding,
+       const char *alternate_command
+       );

 PyAPI_FUNC(PyObject *) _PyCodec_EncodeText(
       PyObject *object,
@@ -117,6 +124,19 @@ PyAPI_FUNC(PyObject *) _PyCodec_DecodeText(
       const char *encoding,
       const char *errors
       );
+
+/* These two aren't actually text encoding specific, but _io.TextIOWrapper
+ * is the only current API consumer.
+ */
+PyAPI_FUNC(PyObject *) _PyCodecInfo_GetIncrementalDecoder(
+       PyObject *codec_info,
+       const char *errors
+       );
+
+PyAPI_FUNC(PyObject *) _PyCodecInfo_GetIncrementalEncoder(
+       PyObject *codec_info,
+       const char *errors
+       );
 #endif


--- a/Lib/_pyio.py
+++ b/Lib/_pyio.py
@@ -1495,6 +1495,11 @@ class TextIOWrapper(TextIOBase):
        if not isinstance(encoding, str):
            raise ValueError("invalid encoding: %r" % encoding)

+        if not codecs.lookup(encoding)._is_text_encoding:
+            msg = ("%r is not a text encoding; "
+                   "use codecs.open() to handle arbitrary codecs")
+            raise LookupError(msg % encoding)
+
        if errors is None:
            errors = "strict"
        else:
--- a/Lib/test/test_io.py
+++ b/Lib/test/test_io.py
@@ -1955,6 +1955,15 @@ class TextIOWrapperTest(unittest.TestCase):
        self.assertRaises(TypeError, t.__init__, b, newline=42)
        self.assertRaises(ValueError, t.__init__, b, newline='xyzzy')

+    def test_non_text_encoding_codecs_are_rejected(self):
+        # Ensure the constructor complains if passed a codec that isn't
+        # marked as a text encoding
+        # http://bugs.python.org/issue20404
+        r = self.BytesIO()
+        b = self.BufferedWriter(r)
+        with self.assertRaisesRegex(LookupError, "is not a text encoding"):
+            self.TextIOWrapper(b, encoding="hex_codec")
+
    def test_detach(self):
        r = self.BytesIO()
        b = self.BufferedWriter(r)
@@ -2607,15 +2616,22 @@ class TextIOWrapperTest(unittest.TestCase):

    def test_illegal_decoder(self):
        # Issue #17106
+        # Bypass the early encoding check added in issue 20404
+        def _make_illegal_wrapper():
+            quopri = codecs.lookup("quopri_codec")
+            quopri._is_text_encoding = True
+            try:
+                t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'),
+                                       newline='\n', encoding="quopri_codec")
+            finally:
+                quopri._is_text_encoding = False
+            return t
        # Crash when decoder returns non-string
-        t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n',
-                               encoding='quopri_codec')
+        t = _make_illegal_wrapper()
        self.assertRaises(TypeError, t.read, 1)
-        t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n',
-                               encoding='quopri_codec')
+        t = _make_illegal_wrapper()
        self.assertRaises(TypeError, t.readline)
-        t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n',
-                               encoding='quopri_codec')
+        t = _make_illegal_wrapper()
        self.assertRaises(TypeError, t.read)


@@ -3053,6 +3069,7 @@ class MiscIOTest(unittest.TestCase):

 class CMiscIOTest(MiscIOTest):
    io = io
+    shutdown_error = "RuntimeError: could not find io module state"

    def test_readinto_buffer_overflow(self):
        # Issue #18025
@@ -3065,6 +3082,7 @@ class CMiscIOTest(MiscIOTest):

 class PyMiscIOTest(MiscIOTest):
    io = pyio
+    shutdown_error = "LookupError: unknown encoding: ascii"


@unittest.skipIf(os.name == 'nt', 'POSIX signals required for this test.')
--- a/Modules/_io/textio.c
+++ b/Modules/_io/textio.c
@@ -836,7 +836,7 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds)
    char *kwlist[] = {"buffer", "encoding", "errors",
                      "newline", "line_buffering", "write_through",
                      NULL};
-    PyObject *buffer, *raw;
+    PyObject *buffer, *raw, *codec_info = NULL;
    char *encoding = NULL;
    char *errors = NULL;
    char *newline = NULL;
@@ -951,6 +951,17 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds)
                        "could not determine default encoding");
    }

+    /* Check we have been asked for a real text encoding */
+    codec_info = _PyCodec_LookupTextEncoding(encoding, "codecs.open()");
+    if (codec_info == NULL) {
+        Py_CLEAR(self->encoding);
+        goto error;
+    }
+
+    /* XXX: Failures beyond this point have the potential to leak elements
+     * of the partially constructed object (like self->encoding)
+     */
+
    if (errors == NULL)
        errors = "strict";
    self->errors = PyBytes_FromString(errors);
@@ -965,7 +976,7 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds)
    if (newline) {
        self->readnl = PyUnicode_FromString(newline);
        if (self->readnl == NULL)
-            return -1;
+            goto error;
    }
    self->writetranslate = (newline == NULL || newline[0] != '\0');
    if (!self->readuniversal && self->readnl) {
@@ -989,8 +1000,8 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds)
    if (r == -1)
        goto error;
    if (r == 1) {
-        self->decoder = PyCodec_IncrementalDecoder(
-            encoding, errors);
+        self->decoder = _PyCodecInfo_GetIncrementalDecoder(codec_info,
+                                                           errors);
        if (self->decoder == NULL)
            goto error;

@@ -1014,17 +1025,12 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds)
    if (r == -1)
        goto error;
    if (r == 1) {
-        PyObject *ci;
-        self->encoder = PyCodec_IncrementalEncoder(
-            encoding, errors);
+        self->encoder = _PyCodecInfo_GetIncrementalEncoder(codec_info,
+                                                           errors);
        if (self->encoder == NULL)
            goto error;
        /* Get the normalized named of the codec */
-        ci = _PyCodec_Lookup(encoding);
-        if (ci == NULL)
-            goto error;
-        res = _PyObject_GetAttrId(ci, &PyId_name);
-        Py_DECREF(ci);
+        res = _PyObject_GetAttrId(codec_info, &PyId_name);
        if (res == NULL) {
            if (PyErr_ExceptionMatches(PyExc_AttributeError))
                PyErr_Clear();
@@ -1044,6 +1050,9 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds)
        Py_XDECREF(res);
    }

+    /* Finished sorting out the codec details */
+    Py_DECREF(codec_info);
+
    self->buffer = buffer;
    Py_INCREF(buffer);

@@ -1106,6 +1115,7 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds)
    return 0;

  error:
+    Py_XDECREF(codec_info);
    return -1;
 }

--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -243,20 +243,15 @@ PyObject *codec_getitem(const char *encoding, int index)
    return v;
 }

-/* Helper function to create an incremental codec. */
-
+/* Helper functions to create an incremental codec. */
 static
-PyObject *codec_getincrementalcodec(const char *encoding,
-                                    const char *errors,
-                                    const char *attrname)
+PyObject *codec_makeincrementalcodec(PyObject *codec_info,
+                                     const char *errors,
+                                     const char *attrname)
 {
-    PyObject *codecs, *ret, *inccodec;
+    PyObject *ret, *inccodec;

-    codecs = _PyCodec_Lookup(encoding);
-    if (codecs == NULL)
-        return NULL;
-    inccodec = PyObject_GetAttrString(codecs, attrname);
-    Py_DECREF(codecs);
+    inccodec = PyObject_GetAttrString(codec_info, attrname);
    if (inccodec == NULL)
        return NULL;
    if (errors)
@@ -267,6 +262,21 @@ PyObject *codec_getincrementalcodec(const char *encoding,
    return ret;
 }

+static
+PyObject *codec_getincrementalcodec(const char *encoding,
+                                    const char *errors,
+                                    const char *attrname)
+{
+    PyObject *codec_info, *ret;
+
+    codec_info = _PyCodec_Lookup(encoding);
+    if (codec_info == NULL)
+        return NULL;
+    ret = codec_makeincrementalcodec(codec_info, errors, attrname);
+    Py_DECREF(codec_info);
+    return ret;
+}
+
 /* Helper function to create a stream codec. */

 static
@@ -290,6 +300,24 @@ PyObject *codec_getstreamcodec(const char *encoding,
    return streamcodec;
 }

+/* Helpers to work with the result of _PyCodec_Lookup
+
+ */
+PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
+                                             const char *errors)
+{
+    return codec_makeincrementalcodec(codec_info, errors,
+                                      "incrementaldecoder");
+}
+
+PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
+                                             const char *errors)
+{
+    return codec_makeincrementalcodec(codec_info, errors,
+                                      "incrementalencoder");
+}
+
+
 /* Convenience APIs to query the Codec registry.

   All APIs return a codec object with incremented refcount.
@@ -447,15 +475,12 @@ PyObject *PyCodec_Decode(PyObject *object,
 }

 /* Text encoding/decoding API */
-static
-PyObject *codec_getitem_checked(const char *encoding,
-                                const char *operation_name,
-                                int index)
+PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
+                                       const char *alternate_command)
 {
    _Py_IDENTIFIER(_is_text_encoding);
    PyObject *codec;
    PyObject *attr;
-    PyObject *v;
    int is_text_codec;

    codec = _PyCodec_Lookup(encoding);
@@ -482,27 +507,44 @@ PyObject *codec_getitem_checked(const char *encoding,
                Py_DECREF(codec);
                PyErr_Format(PyExc_LookupError,
                             "'%.400s' is not a text encoding; "
-                             "use codecs.%s() to handle arbitrary codecs",
-                             encoding, operation_name);
+                             "use %s to handle arbitrary codecs",
+                             encoding, alternate_command);
                return NULL;
            }
        }
    }

+    /* This appears to be a valid text encoding */
+    return codec;
+}
+
+
+static
+PyObject *codec_getitem_checked(const char *encoding,
+                                const char *alternate_command,
+                                int index)
+{
+    PyObject *codec;
+    PyObject *v;
+
+    codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
+    if (codec == NULL)
+        return NULL;
+
    v = PyTuple_GET_ITEM(codec, index);
-    Py_DECREF(codec);
    Py_INCREF(v);
+    Py_DECREF(codec);
    return v;
 }

 static PyObject * _PyCodec_TextEncoder(const char *encoding)
 {
-    return codec_getitem_checked(encoding, "encode", 0);
+    return codec_getitem_checked(encoding, "codecs.encode()", 0);
 }

 static PyObject * _PyCodec_TextDecoder(const char *encoding)
 {
-    return codec_getitem_checked(encoding, "decode", 1);
+    return codec_getitem_checked(encoding, "codecs.decode()", 1);
 }

 PyObject *_PyCodec_EncodeText(PyObject *object,