Close #20404: blacklist non-text encodings in io.TextIOWrapper
- io.TextIOWrapper (and hence the open() builtin) now use the internal codec marking system added for issue #19619 - also tweaked the C code to only look up the encoding once, rather than multiple times - the existing output type checks remain in place to deal with unmarked third party codecs.
This commit is contained in:
parent
1ea4e4174b
commit
a9b15241c6
|
@ -104,7 +104,14 @@ PyAPI_FUNC(PyObject *) PyCodec_Decode(
|
||||||
Please note that these APIs are internal and should not
|
Please note that these APIs are internal and should not
|
||||||
be used in Python C extensions.
|
be used in Python C extensions.
|
||||||
|
|
||||||
|
XXX (ncoghlan): should we make these, or something like them, public
|
||||||
|
in Python 3.5+?
|
||||||
|
|
||||||
*/
|
*/
|
||||||
|
PyAPI_FUNC(PyObject *) _PyCodec_LookupTextEncoding(
|
||||||
|
const char *encoding,
|
||||||
|
const char *alternate_command
|
||||||
|
);
|
||||||
|
|
||||||
PyAPI_FUNC(PyObject *) _PyCodec_EncodeText(
|
PyAPI_FUNC(PyObject *) _PyCodec_EncodeText(
|
||||||
PyObject *object,
|
PyObject *object,
|
||||||
|
@ -117,6 +124,19 @@ PyAPI_FUNC(PyObject *) _PyCodec_DecodeText(
|
||||||
const char *encoding,
|
const char *encoding,
|
||||||
const char *errors
|
const char *errors
|
||||||
);
|
);
|
||||||
|
|
||||||
|
/* These two aren't actually text encoding specific, but _io.TextIOWrapper
|
||||||
|
* is the only current API consumer.
|
||||||
|
*/
|
||||||
|
PyAPI_FUNC(PyObject *) _PyCodecInfo_GetIncrementalDecoder(
|
||||||
|
PyObject *codec_info,
|
||||||
|
const char *errors
|
||||||
|
);
|
||||||
|
|
||||||
|
PyAPI_FUNC(PyObject *) _PyCodecInfo_GetIncrementalEncoder(
|
||||||
|
PyObject *codec_info,
|
||||||
|
const char *errors
|
||||||
|
);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1503,6 +1503,11 @@ class TextIOWrapper(TextIOBase):
|
||||||
if not isinstance(encoding, str):
|
if not isinstance(encoding, str):
|
||||||
raise ValueError("invalid encoding: %r" % encoding)
|
raise ValueError("invalid encoding: %r" % encoding)
|
||||||
|
|
||||||
|
if not codecs.lookup(encoding)._is_text_encoding:
|
||||||
|
msg = ("%r is not a text encoding; "
|
||||||
|
"use codecs.open() to handle arbitrary codecs")
|
||||||
|
raise LookupError(msg % encoding)
|
||||||
|
|
||||||
if errors is None:
|
if errors is None:
|
||||||
errors = "strict"
|
errors = "strict"
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -1929,6 +1929,15 @@ class TextIOWrapperTest(unittest.TestCase):
|
||||||
self.assertRaises(TypeError, t.__init__, b, newline=42)
|
self.assertRaises(TypeError, t.__init__, b, newline=42)
|
||||||
self.assertRaises(ValueError, t.__init__, b, newline='xyzzy')
|
self.assertRaises(ValueError, t.__init__, b, newline='xyzzy')
|
||||||
|
|
||||||
|
def test_non_text_encoding_codecs_are_rejected(self):
|
||||||
|
# Ensure the constructor complains if passed a codec that isn't
|
||||||
|
# marked as a text encoding
|
||||||
|
# http://bugs.python.org/issue20404
|
||||||
|
r = self.BytesIO()
|
||||||
|
b = self.BufferedWriter(r)
|
||||||
|
with self.assertRaisesRegex(LookupError, "is not a text encoding"):
|
||||||
|
self.TextIOWrapper(b, encoding="hex")
|
||||||
|
|
||||||
def test_detach(self):
|
def test_detach(self):
|
||||||
r = self.BytesIO()
|
r = self.BytesIO()
|
||||||
b = self.BufferedWriter(r)
|
b = self.BufferedWriter(r)
|
||||||
|
@ -2579,15 +2588,22 @@ class TextIOWrapperTest(unittest.TestCase):
|
||||||
|
|
||||||
def test_illegal_decoder(self):
|
def test_illegal_decoder(self):
|
||||||
# Issue #17106
|
# Issue #17106
|
||||||
|
# Bypass the early encoding check added in issue 20404
|
||||||
|
def _make_illegal_wrapper():
|
||||||
|
quopri = codecs.lookup("quopri")
|
||||||
|
quopri._is_text_encoding = True
|
||||||
|
try:
|
||||||
|
t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'),
|
||||||
|
newline='\n', encoding="quopri")
|
||||||
|
finally:
|
||||||
|
quopri._is_text_encoding = False
|
||||||
|
return t
|
||||||
# Crash when decoder returns non-string
|
# Crash when decoder returns non-string
|
||||||
t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n',
|
t = _make_illegal_wrapper()
|
||||||
encoding='quopri_codec')
|
|
||||||
self.assertRaises(TypeError, t.read, 1)
|
self.assertRaises(TypeError, t.read, 1)
|
||||||
t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n',
|
t = _make_illegal_wrapper()
|
||||||
encoding='quopri_codec')
|
|
||||||
self.assertRaises(TypeError, t.readline)
|
self.assertRaises(TypeError, t.readline)
|
||||||
t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n',
|
t = _make_illegal_wrapper()
|
||||||
encoding='quopri_codec')
|
|
||||||
self.assertRaises(TypeError, t.read)
|
self.assertRaises(TypeError, t.read)
|
||||||
|
|
||||||
def _check_create_at_shutdown(self, **kwargs):
|
def _check_create_at_shutdown(self, **kwargs):
|
||||||
|
@ -2616,8 +2632,7 @@ class TextIOWrapperTest(unittest.TestCase):
|
||||||
if err:
|
if err:
|
||||||
# Can error out with a RuntimeError if the module state
|
# Can error out with a RuntimeError if the module state
|
||||||
# isn't found.
|
# isn't found.
|
||||||
self.assertIn("RuntimeError: could not find io module state",
|
self.assertIn(self.shutdown_error, err.decode())
|
||||||
err.decode())
|
|
||||||
else:
|
else:
|
||||||
self.assertEqual("ok", out.decode().strip())
|
self.assertEqual("ok", out.decode().strip())
|
||||||
|
|
||||||
|
@ -2630,6 +2645,7 @@ class TextIOWrapperTest(unittest.TestCase):
|
||||||
|
|
||||||
class CTextIOWrapperTest(TextIOWrapperTest):
|
class CTextIOWrapperTest(TextIOWrapperTest):
|
||||||
io = io
|
io = io
|
||||||
|
shutdown_error = "RuntimeError: could not find io module state"
|
||||||
|
|
||||||
def test_initialization(self):
|
def test_initialization(self):
|
||||||
r = self.BytesIO(b"\xc3\xa9\n\n")
|
r = self.BytesIO(b"\xc3\xa9\n\n")
|
||||||
|
@ -2674,6 +2690,7 @@ class CTextIOWrapperTest(TextIOWrapperTest):
|
||||||
|
|
||||||
class PyTextIOWrapperTest(TextIOWrapperTest):
|
class PyTextIOWrapperTest(TextIOWrapperTest):
|
||||||
io = pyio
|
io = pyio
|
||||||
|
shutdown_error = "LookupError: unknown encoding: ascii"
|
||||||
|
|
||||||
|
|
||||||
class IncrementalNewlineDecoderTest(unittest.TestCase):
|
class IncrementalNewlineDecoderTest(unittest.TestCase):
|
||||||
|
|
|
@ -10,6 +10,12 @@ Release date: 2014-02-09
|
||||||
Core and Builtins
|
Core and Builtins
|
||||||
-----------------
|
-----------------
|
||||||
|
|
||||||
|
- Issue #20404: io.TextIOWrapper (and hence the open() builtin) now uses the
|
||||||
|
internal codec marking system added for issue #19619 to throw LookupError
|
||||||
|
for known non-text encodings at stream construction time. The existing
|
||||||
|
output type checks remain in place to deal with unmarked third party
|
||||||
|
codecs.
|
||||||
|
|
||||||
- Issue #17162: Add PyType_GetSlot.
|
- Issue #17162: Add PyType_GetSlot.
|
||||||
|
|
||||||
- Issue #20162: Fix an alignment issue in the siphash24() hash function which
|
- Issue #20162: Fix an alignment issue in the siphash24() hash function which
|
||||||
|
|
|
@ -849,7 +849,7 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds)
|
||||||
char *kwlist[] = {"buffer", "encoding", "errors",
|
char *kwlist[] = {"buffer", "encoding", "errors",
|
||||||
"newline", "line_buffering", "write_through",
|
"newline", "line_buffering", "write_through",
|
||||||
NULL};
|
NULL};
|
||||||
PyObject *buffer, *raw;
|
PyObject *buffer, *raw, *codec_info = NULL;
|
||||||
char *encoding = NULL;
|
char *encoding = NULL;
|
||||||
char *errors = NULL;
|
char *errors = NULL;
|
||||||
char *newline = NULL;
|
char *newline = NULL;
|
||||||
|
@ -961,6 +961,17 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds)
|
||||||
"could not determine default encoding");
|
"could not determine default encoding");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Check we have been asked for a real text encoding */
|
||||||
|
codec_info = _PyCodec_LookupTextEncoding(encoding, "codecs.open()");
|
||||||
|
if (codec_info == NULL) {
|
||||||
|
Py_CLEAR(self->encoding);
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* XXX: Failures beyond this point have the potential to leak elements
|
||||||
|
* of the partially constructed object (like self->encoding)
|
||||||
|
*/
|
||||||
|
|
||||||
if (errors == NULL)
|
if (errors == NULL)
|
||||||
errors = "strict";
|
errors = "strict";
|
||||||
self->errors = PyBytes_FromString(errors);
|
self->errors = PyBytes_FromString(errors);
|
||||||
|
@ -975,7 +986,7 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds)
|
||||||
if (newline) {
|
if (newline) {
|
||||||
self->readnl = PyUnicode_FromString(newline);
|
self->readnl = PyUnicode_FromString(newline);
|
||||||
if (self->readnl == NULL)
|
if (self->readnl == NULL)
|
||||||
return -1;
|
goto error;
|
||||||
}
|
}
|
||||||
self->writetranslate = (newline == NULL || newline[0] != '\0');
|
self->writetranslate = (newline == NULL || newline[0] != '\0');
|
||||||
if (!self->readuniversal && self->readnl) {
|
if (!self->readuniversal && self->readnl) {
|
||||||
|
@ -999,8 +1010,8 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds)
|
||||||
if (r == -1)
|
if (r == -1)
|
||||||
goto error;
|
goto error;
|
||||||
if (r == 1) {
|
if (r == 1) {
|
||||||
self->decoder = PyCodec_IncrementalDecoder(
|
self->decoder = _PyCodecInfo_GetIncrementalDecoder(codec_info,
|
||||||
encoding, errors);
|
errors);
|
||||||
if (self->decoder == NULL)
|
if (self->decoder == NULL)
|
||||||
goto error;
|
goto error;
|
||||||
|
|
||||||
|
@ -1024,17 +1035,12 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds)
|
||||||
if (r == -1)
|
if (r == -1)
|
||||||
goto error;
|
goto error;
|
||||||
if (r == 1) {
|
if (r == 1) {
|
||||||
PyObject *ci;
|
self->encoder = _PyCodecInfo_GetIncrementalEncoder(codec_info,
|
||||||
self->encoder = PyCodec_IncrementalEncoder(
|
errors);
|
||||||
encoding, errors);
|
|
||||||
if (self->encoder == NULL)
|
if (self->encoder == NULL)
|
||||||
goto error;
|
goto error;
|
||||||
/* Get the normalized named of the codec */
|
/* Get the normalized named of the codec */
|
||||||
ci = _PyCodec_Lookup(encoding);
|
res = _PyObject_GetAttrId(codec_info, &PyId_name);
|
||||||
if (ci == NULL)
|
|
||||||
goto error;
|
|
||||||
res = _PyObject_GetAttrId(ci, &PyId_name);
|
|
||||||
Py_DECREF(ci);
|
|
||||||
if (res == NULL) {
|
if (res == NULL) {
|
||||||
if (PyErr_ExceptionMatches(PyExc_AttributeError))
|
if (PyErr_ExceptionMatches(PyExc_AttributeError))
|
||||||
PyErr_Clear();
|
PyErr_Clear();
|
||||||
|
@ -1054,6 +1060,9 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds)
|
||||||
Py_XDECREF(res);
|
Py_XDECREF(res);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Finished sorting out the codec details */
|
||||||
|
Py_DECREF(codec_info);
|
||||||
|
|
||||||
self->buffer = buffer;
|
self->buffer = buffer;
|
||||||
Py_INCREF(buffer);
|
Py_INCREF(buffer);
|
||||||
|
|
||||||
|
@ -1116,6 +1125,7 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
error:
|
error:
|
||||||
|
Py_XDECREF(codec_info);
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -243,20 +243,15 @@ PyObject *codec_getitem(const char *encoding, int index)
|
||||||
return v;
|
return v;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Helper function to create an incremental codec. */
|
/* Helper functions to create an incremental codec. */
|
||||||
|
|
||||||
static
|
static
|
||||||
PyObject *codec_getincrementalcodec(const char *encoding,
|
PyObject *codec_makeincrementalcodec(PyObject *codec_info,
|
||||||
const char *errors,
|
const char *errors,
|
||||||
const char *attrname)
|
const char *attrname)
|
||||||
{
|
{
|
||||||
PyObject *codecs, *ret, *inccodec;
|
PyObject *ret, *inccodec;
|
||||||
|
|
||||||
codecs = _PyCodec_Lookup(encoding);
|
inccodec = PyObject_GetAttrString(codec_info, attrname);
|
||||||
if (codecs == NULL)
|
|
||||||
return NULL;
|
|
||||||
inccodec = PyObject_GetAttrString(codecs, attrname);
|
|
||||||
Py_DECREF(codecs);
|
|
||||||
if (inccodec == NULL)
|
if (inccodec == NULL)
|
||||||
return NULL;
|
return NULL;
|
||||||
if (errors)
|
if (errors)
|
||||||
|
@ -267,6 +262,21 @@ PyObject *codec_getincrementalcodec(const char *encoding,
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static
|
||||||
|
PyObject *codec_getincrementalcodec(const char *encoding,
|
||||||
|
const char *errors,
|
||||||
|
const char *attrname)
|
||||||
|
{
|
||||||
|
PyObject *codec_info, *ret;
|
||||||
|
|
||||||
|
codec_info = _PyCodec_Lookup(encoding);
|
||||||
|
if (codec_info == NULL)
|
||||||
|
return NULL;
|
||||||
|
ret = codec_makeincrementalcodec(codec_info, errors, attrname);
|
||||||
|
Py_DECREF(codec_info);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
/* Helper function to create a stream codec. */
|
/* Helper function to create a stream codec. */
|
||||||
|
|
||||||
static
|
static
|
||||||
|
@ -290,6 +300,24 @@ PyObject *codec_getstreamcodec(const char *encoding,
|
||||||
return streamcodec;
|
return streamcodec;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Helpers to work with the result of _PyCodec_Lookup
|
||||||
|
|
||||||
|
*/
|
||||||
|
PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
|
||||||
|
const char *errors)
|
||||||
|
{
|
||||||
|
return codec_makeincrementalcodec(codec_info, errors,
|
||||||
|
"incrementaldecoder");
|
||||||
|
}
|
||||||
|
|
||||||
|
PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
|
||||||
|
const char *errors)
|
||||||
|
{
|
||||||
|
return codec_makeincrementalcodec(codec_info, errors,
|
||||||
|
"incrementalencoder");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/* Convenience APIs to query the Codec registry.
|
/* Convenience APIs to query the Codec registry.
|
||||||
|
|
||||||
All APIs return a codec object with incremented refcount.
|
All APIs return a codec object with incremented refcount.
|
||||||
|
@ -467,15 +495,12 @@ PyObject *PyCodec_Decode(PyObject *object,
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Text encoding/decoding API */
|
/* Text encoding/decoding API */
|
||||||
static
|
PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
|
||||||
PyObject *codec_getitem_checked(const char *encoding,
|
const char *alternate_command)
|
||||||
const char *operation_name,
|
|
||||||
int index)
|
|
||||||
{
|
{
|
||||||
_Py_IDENTIFIER(_is_text_encoding);
|
_Py_IDENTIFIER(_is_text_encoding);
|
||||||
PyObject *codec;
|
PyObject *codec;
|
||||||
PyObject *attr;
|
PyObject *attr;
|
||||||
PyObject *v;
|
|
||||||
int is_text_codec;
|
int is_text_codec;
|
||||||
|
|
||||||
codec = _PyCodec_Lookup(encoding);
|
codec = _PyCodec_Lookup(encoding);
|
||||||
|
@ -502,27 +527,44 @@ PyObject *codec_getitem_checked(const char *encoding,
|
||||||
Py_DECREF(codec);
|
Py_DECREF(codec);
|
||||||
PyErr_Format(PyExc_LookupError,
|
PyErr_Format(PyExc_LookupError,
|
||||||
"'%.400s' is not a text encoding; "
|
"'%.400s' is not a text encoding; "
|
||||||
"use codecs.%s() to handle arbitrary codecs",
|
"use %s to handle arbitrary codecs",
|
||||||
encoding, operation_name);
|
encoding, alternate_command);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* This appears to be a valid text encoding */
|
||||||
|
return codec;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static
|
||||||
|
PyObject *codec_getitem_checked(const char *encoding,
|
||||||
|
const char *alternate_command,
|
||||||
|
int index)
|
||||||
|
{
|
||||||
|
PyObject *codec;
|
||||||
|
PyObject *v;
|
||||||
|
|
||||||
|
codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
|
||||||
|
if (codec == NULL)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
v = PyTuple_GET_ITEM(codec, index);
|
v = PyTuple_GET_ITEM(codec, index);
|
||||||
Py_DECREF(codec);
|
|
||||||
Py_INCREF(v);
|
Py_INCREF(v);
|
||||||
|
Py_DECREF(codec);
|
||||||
return v;
|
return v;
|
||||||
}
|
}
|
||||||
|
|
||||||
static PyObject * _PyCodec_TextEncoder(const char *encoding)
|
static PyObject * _PyCodec_TextEncoder(const char *encoding)
|
||||||
{
|
{
|
||||||
return codec_getitem_checked(encoding, "encode", 0);
|
return codec_getitem_checked(encoding, "codecs.encode()", 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
static PyObject * _PyCodec_TextDecoder(const char *encoding)
|
static PyObject * _PyCodec_TextDecoder(const char *encoding)
|
||||||
{
|
{
|
||||||
return codec_getitem_checked(encoding, "decode", 1);
|
return codec_getitem_checked(encoding, "codecs.decode()", 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
PyObject *_PyCodec_EncodeText(PyObject *object,
|
PyObject *_PyCodec_EncodeText(PyObject *object,
|
||||||
|
|
Loading…
Reference in New Issue