Issue #19619: Blacklist non-text codecs in method API
str.encode, bytes.decode and bytearray.decode now use an internal API to throw LookupError for known non-text encodings, rather than attempting the encoding or decoding operation and then throwing a TypeError for an unexpected output type. The latter mechanism remains in place for third party non-text encodings. Backported changeset d68df99d7a57.
This commit is contained in:
parent
20f8728bf0
commit
94ee389308
|
@ -94,6 +94,33 @@ PyAPI_FUNC(PyObject *) PyCodec_Decode(
|
|||
const char *errors
|
||||
);
|
||||
|
||||
#ifndef PY_LIMITED_API
|
||||
/* Text codec specific encoding and decoding API.
|
||||
|
||||
Checks the encoding against a list of codecs which do not
|
||||
implement a str<->bytes encoding before attempting the
|
||||
operation.
|
||||
|
||||
Please note that these APIs are internal and should not
|
||||
be used in Python C extensions.
|
||||
|
||||
*/
|
||||
|
||||
PyAPI_FUNC(PyObject *) _PyCodec_EncodeText(
|
||||
PyObject *object,
|
||||
const char *encoding,
|
||||
const char *errors
|
||||
);
|
||||
|
||||
PyAPI_FUNC(PyObject *) _PyCodec_DecodeText(
|
||||
PyObject *object,
|
||||
const char *encoding,
|
||||
const char *errors
|
||||
);
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/* --- Codec Lookup APIs --------------------------------------------------
|
||||
|
||||
All APIs return a codec object with incremented refcount and are
|
||||
|
|
|
@ -73,9 +73,19 @@ BOM64_BE = BOM_UTF32_BE
|
|||
### Codec base classes (defining the API)
|
||||
|
||||
class CodecInfo(tuple):
|
||||
"""Codec details when looking up the codec registry"""
|
||||
|
||||
# Private API to allow Python 3.4 to blacklist the known non-Unicode
|
||||
# codecs in the standard library. A more general mechanism to
|
||||
# reliably distinguish text encodings from other codecs will hopefully
|
||||
# be defined for Python 3.5
|
||||
#
|
||||
# See http://bugs.python.org/issue19619
|
||||
_is_text_encoding = True # Assume codecs are text encodings by default
|
||||
|
||||
def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
|
||||
incrementalencoder=None, incrementaldecoder=None, name=None):
|
||||
incrementalencoder=None, incrementaldecoder=None, name=None,
|
||||
*, _is_text_encoding=None):
|
||||
self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
|
||||
self.name = name
|
||||
self.encode = encode
|
||||
|
@ -84,6 +94,8 @@ class CodecInfo(tuple):
|
|||
self.incrementaldecoder = incrementaldecoder
|
||||
self.streamwriter = streamwriter
|
||||
self.streamreader = streamreader
|
||||
if _is_text_encoding is not None:
|
||||
self._is_text_encoding = _is_text_encoding
|
||||
return self
|
||||
|
||||
def __repr__(self):
|
||||
|
|
|
@ -52,4 +52,5 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamwriter=StreamWriter,
|
||||
streamreader=StreamReader,
|
||||
_is_text_encoding=False,
|
||||
)
|
||||
|
|
|
@ -74,4 +74,5 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamwriter=StreamWriter,
|
||||
streamreader=StreamReader,
|
||||
_is_text_encoding=False,
|
||||
)
|
||||
|
|
|
@ -52,4 +52,5 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamwriter=StreamWriter,
|
||||
streamreader=StreamReader,
|
||||
_is_text_encoding=False,
|
||||
)
|
||||
|
|
|
@ -53,4 +53,5 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamwriter=StreamWriter,
|
||||
streamreader=StreamReader,
|
||||
_is_text_encoding=False,
|
||||
)
|
||||
|
|
|
@ -43,6 +43,7 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamwriter=StreamWriter,
|
||||
streamreader=StreamReader,
|
||||
_is_text_encoding=False,
|
||||
)
|
||||
|
||||
### Map
|
||||
|
|
|
@ -96,4 +96,5 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_is_text_encoding=False,
|
||||
)
|
||||
|
|
|
@ -74,4 +74,5 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_is_text_encoding=False,
|
||||
)
|
||||
|
|
|
@ -4,6 +4,7 @@ import locale
|
|||
import sys
|
||||
import unittest
|
||||
import warnings
|
||||
import encodings
|
||||
|
||||
from test import support
|
||||
|
||||
|
@ -2408,6 +2409,47 @@ class TransformCodecTest(unittest.TestCase):
|
|||
sout = reader.readline()
|
||||
self.assertEqual(sout, b"\x80")
|
||||
|
||||
def test_text_to_binary_blacklists_binary_transforms(self):
    # str.encode() must refuse every binary -> binary codec up front
    # with a LookupError that points the caller at codecs.encode().
    template = (r"{!r} is not a text encoding; "
                r"use codecs.encode\(\) to handle arbitrary codecs")
    text = "bad input type"
    for codec_name in bytes_transform_encodings:
        expected = template.format(codec_name)
        with self.assertRaisesRegex(LookupError, expected) as caught:
            text.encode(codec_name)
        # The error must not be chained onto some other exception.
        self.assertIsNone(caught.exception.__cause__)
|
||||
|
||||
def test_text_to_binary_blacklists_text_transforms(self):
    # str.encode() must refuse the str -> str rot_13 codec with a
    # LookupError that points the caller at codecs.encode().
    expected = (r"^'rot_13' is not a text encoding; "
                r"use codecs.encode\(\) to handle arbitrary codecs")
    sample = "just an example message"
    with self.assertRaisesRegex(LookupError, expected):
        sample.encode("rot_13")
|
||||
|
||||
def test_binary_to_text_blacklists_binary_transforms(self):
    # bytes.decode() and bytearray.decode() must both refuse every
    # binary -> binary codec with a LookupError that points the caller
    # at codecs.decode().
    template = (r"{!r} is not a text encoding; "
                r"use codecs.decode\(\) to handle arbitrary codecs")
    payload = b"encode first to ensure we meet any format restrictions"
    for codec_name in bytes_transform_encodings:
        # Feed each codec its own encoded form so the input is valid
        # for that codec's format.
        encoded = codecs.encode(payload, codec_name)
        expected = template.format(codec_name)
        # Immutable input first, then the mutable equivalent.
        for candidate in (encoded, bytearray(encoded)):
            with self.assertRaisesRegex(LookupError, expected):
                candidate.decode(codec_name)
|
||||
|
||||
def test_binary_to_text_blacklists_text_transforms(self):
    # bytes.decode() and bytearray.decode() must refuse the str -> str
    # rot_13 codec with a LookupError that points at codecs.decode().
    expected = (r"^'rot_13' is not a text encoding; "
                r"use codecs.decode\(\) to handle arbitrary codecs")
    candidates = (b"immutable", bytearray(b"mutable"))
    for binary_input in candidates:
        with self.assertRaisesRegex(LookupError, expected) as caught:
            binary_input.decode("rot_13")
        # The error must not be chained onto some other exception.
        self.assertIsNone(caught.exception.__cause__)
|
||||
|
||||
|
||||
@unittest.skipUnless(sys.platform == 'win32',
|
||||
'code pages are specific to Windows')
|
||||
|
|
|
@ -10,6 +10,12 @@ What's New in Python 3.3.5 release candidate 1?
|
|||
Core and Builtins
|
||||
-----------------
|
||||
|
||||
- Issue #19619: str.encode, bytes.decode and bytearray.decode now use an
|
||||
internal API to throw LookupError for known non-text encodings, rather
|
||||
than attempting the encoding or decoding operation and then throwing a
|
||||
TypeError for an unexpected output type. (The latter mechanism remains
|
||||
in place for third party non-text encodings)
|
||||
|
||||
- Issue #20588: Make Python-ast.c C89 compliant.
|
||||
|
||||
- Issue #20437: Fixed 21 potential bugs when deleting object references.
|
||||
|
|
|
@ -3129,7 +3129,7 @@ PyUnicode_Decode(const char *s,
|
|||
buffer = PyMemoryView_FromBuffer(&info);
|
||||
if (buffer == NULL)
|
||||
goto onError;
|
||||
unicode = PyCodec_Decode(buffer, encoding, errors);
|
||||
unicode = _PyCodec_DecodeText(buffer, encoding, errors);
|
||||
if (unicode == NULL)
|
||||
goto onError;
|
||||
if (!PyUnicode_Check(unicode)) {
|
||||
|
@ -3489,7 +3489,7 @@ PyUnicode_AsEncodedString(PyObject *unicode,
|
|||
}
|
||||
|
||||
/* Encode via the codec registry */
|
||||
v = PyCodec_Encode(unicode, encoding, errors);
|
||||
v = _PyCodec_EncodeText(unicode, encoding, errors);
|
||||
if (v == NULL)
|
||||
return NULL;
|
||||
|
||||
|
|
138
Python/codecs.c
138
Python/codecs.c
|
@ -337,18 +337,15 @@ PyObject *PyCodec_StreamWriter(const char *encoding,
|
|||
|
||||
errors is passed to the encoder factory as argument if non-NULL. */
|
||||
|
||||
PyObject *PyCodec_Encode(PyObject *object,
|
||||
const char *encoding,
|
||||
const char *errors)
|
||||
static PyObject *
|
||||
_PyCodec_EncodeInternal(PyObject *object,
|
||||
PyObject *encoder,
|
||||
const char *encoding,
|
||||
const char *errors)
|
||||
{
|
||||
PyObject *encoder = NULL;
|
||||
PyObject *args = NULL, *result = NULL;
|
||||
PyObject *v = NULL;
|
||||
|
||||
encoder = PyCodec_Encoder(encoding);
|
||||
if (encoder == NULL)
|
||||
goto onError;
|
||||
|
||||
args = args_tuple(object, errors);
|
||||
if (args == NULL)
|
||||
goto onError;
|
||||
|
@ -384,18 +381,15 @@ PyObject *PyCodec_Encode(PyObject *object,
|
|||
|
||||
errors is passed to the decoder factory as argument if non-NULL. */
|
||||
|
||||
PyObject *PyCodec_Decode(PyObject *object,
|
||||
const char *encoding,
|
||||
const char *errors)
|
||||
static PyObject *
|
||||
_PyCodec_DecodeInternal(PyObject *object,
|
||||
PyObject *decoder,
|
||||
const char *encoding,
|
||||
const char *errors)
|
||||
{
|
||||
PyObject *decoder = NULL;
|
||||
PyObject *args = NULL, *result = NULL;
|
||||
PyObject *v;
|
||||
|
||||
decoder = PyCodec_Decoder(encoding);
|
||||
if (decoder == NULL)
|
||||
goto onError;
|
||||
|
||||
args = args_tuple(object, errors);
|
||||
if (args == NULL)
|
||||
goto onError;
|
||||
|
@ -425,6 +419,118 @@ PyObject *PyCodec_Decode(PyObject *object,
|
|||
return NULL;
|
||||
}
|
||||
|
||||
/* Generic encoding/decoding API */
|
||||
/* Encode `object` with whatever encoder is registered for `encoding`.
   No text-encoding check is made here.

   Returns NULL if the encoder lookup fails; otherwise returns whatever
   _PyCodec_EncodeInternal() produces. */
PyObject *
PyCodec_Encode(PyObject *object,
               const char *encoding,
               const char *errors)
{
    PyObject *encoder = PyCodec_Encoder(encoding);

    if (encoder == NULL) {
        return NULL;
    }
    /* NOTE(review): the encoder reference is handed to the helper and not
       released here -- presumably the helper consumes it; confirm. */
    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
}
|
||||
|
||||
/* Decode `object` with whatever decoder is registered for `encoding`.
   No text-encoding check is made here.

   Returns NULL if the decoder lookup fails; otherwise returns whatever
   _PyCodec_DecodeInternal() produces. */
PyObject *
PyCodec_Decode(PyObject *object,
               const char *encoding,
               const char *errors)
{
    PyObject *decoder = PyCodec_Decoder(encoding);

    if (decoder == NULL) {
        return NULL;
    }
    /* NOTE(review): the decoder reference is handed to the helper and not
       released here -- presumably the helper consumes it; confirm. */
    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
}
|
||||
|
||||
/* Text encoding/decoding API */
|
||||
static
|
||||
PyObject *codec_getitem_checked(const char *encoding,
|
||||
const char *operation_name,
|
||||
int index)
|
||||
{
|
||||
_Py_IDENTIFIER(_is_text_encoding);
|
||||
PyObject *codec;
|
||||
PyObject *attr;
|
||||
PyObject *v;
|
||||
int is_text_codec;
|
||||
|
||||
codec = _PyCodec_Lookup(encoding);
|
||||
if (codec == NULL)
|
||||
return NULL;
|
||||
|
||||
/* Backwards compatibility: assume any raw tuple describes a text
|
||||
* encoding, and the same for anything lacking the private
|
||||
* attribute.
|
||||
*/
|
||||
if (!PyTuple_CheckExact(codec)) {
|
||||
attr = _PyObject_GetAttrId(codec, &PyId__is_text_encoding);
|
||||
if (attr == NULL) {
|
||||
if (PyErr_ExceptionMatches(PyExc_AttributeError)) {
|
||||
PyErr_Clear();
|
||||
} else {
|
||||
Py_DECREF(codec);
|
||||
return NULL;
|
||||
}
|
||||
} else {
|
||||
is_text_codec = PyObject_IsTrue(attr);
|
||||
Py_DECREF(attr);
|
||||
if (!is_text_codec) {
|
||||
Py_DECREF(codec);
|
||||
PyErr_Format(PyExc_LookupError,
|
||||
"'%.400s' is not a text encoding; "
|
||||
"use codecs.%s() to handle arbitrary codecs",
|
||||
encoding, operation_name);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
v = PyTuple_GET_ITEM(codec, index);
|
||||
Py_DECREF(codec);
|
||||
Py_INCREF(v);
|
||||
return v;
|
||||
}
|
||||
|
||||
/* Return item 0 of the codec tuple for `encoding` via
   codec_getitem_checked(), or NULL with an exception set if the lookup
   or the text-encoding check fails. */
static PyObject *
_PyCodec_TextEncoder(const char *encoding)
{
    PyObject *encoder = codec_getitem_checked(encoding, "encode", 0);

    return encoder;
}
|
||||
|
||||
/* Return item 1 of the codec tuple for `encoding` via
   codec_getitem_checked(), or NULL with an exception set if the lookup
   or the text-encoding check fails. */
static PyObject *
_PyCodec_TextDecoder(const char *encoding)
{
    PyObject *decoder = codec_getitem_checked(encoding, "decode", 1);

    return decoder;
}
|
||||
|
||||
/* Text-only counterpart of PyCodec_Encode(): the encoder is obtained
   through _PyCodec_TextEncoder(), so the operation is not attempted when
   that lookup fails.

   Returns NULL if no encoder is obtained; otherwise returns whatever
   _PyCodec_EncodeInternal() produces. */
PyObject *
_PyCodec_EncodeText(PyObject *object,
                    const char *encoding,
                    const char *errors)
{
    PyObject *encoder = _PyCodec_TextEncoder(encoding);

    if (encoder == NULL) {
        return NULL;
    }
    /* NOTE(review): the encoder reference is handed to the helper and not
       released here -- presumably the helper consumes it; confirm. */
    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
}
|
||||
|
||||
/* Text-only counterpart of PyCodec_Decode(): the decoder is obtained
   through _PyCodec_TextDecoder(), so the operation is not attempted when
   that lookup fails.

   Returns NULL if no decoder is obtained; otherwise returns whatever
   _PyCodec_DecodeInternal() produces. */
PyObject *
_PyCodec_DecodeText(PyObject *object,
                    const char *encoding,
                    const char *errors)
{
    PyObject *decoder = _PyCodec_TextDecoder(encoding);

    if (decoder == NULL) {
        return NULL;
    }
    /* NOTE(review): the decoder reference is handed to the helper and not
       released here -- presumably the helper consumes it; confirm. */
    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
}
|
||||
|
||||
/* Register the error handling callback function error under the name
|
||||
name. This function will be called by the codec when it encounters
|
||||
unencodable characters/undecodable bytes and doesn't know the
|
||||
|
|
Loading…
Reference in New Issue