bpo-37388: Development mode check encoding and errors (GH-14341)

In development mode and in debug build, encoding and errors arguments are now checked on string encoding and decoding operations. Examples: open(), str.encode() and bytes.decode(). By default, for best performances, the errors argument is only checked at the first encoding/decoding error, and the encoding argument is sometimes ignored for empty strings.
2019-06-26 00:51:05 +02:00 · 2019-06-26 00:51:05 +02:00 · 22eb689cf3
parent e1a63c4f21
commit 22eb689cf3
10 changed files with 315 additions and 6 deletions
--- a/Doc/library/stdtypes.rst
+++ b/Doc/library/stdtypes.rst
@ -1559,9 +1559,16 @@ expression support in the :mod:`re` module).
   :func:`codecs.register_error`, see section :ref:`error-handlers`. For a
   list of possible encodings, see section :ref:`standard-encodings`.
   By default, the *errors* argument is not checked for best performances, but
   only used at the first encoding error. Enable the development mode
   (:option:`-X` ``dev`` option), or use a debug build, to check *errors*.
   .. versionchanged:: 3.1
      Support for keyword arguments added.
   .. versionchanged:: 3.9
      The *errors* is now checked in development mode and in debug mode.
 .. method:: str.endswith(suffix[, start[, end]])
@ -2575,6 +2582,10 @@ arbitrary binary data.
   :func:`codecs.register_error`, see section :ref:`error-handlers`. For a
   list of possible encodings, see section :ref:`standard-encodings`.
   By default, the *errors* argument is not checked for best performances, but
   only used at the first decoding error. Enable the development mode
   (:option:`-X` ``dev`` option), or use a debug build, to check *errors*.
   .. note::
      Passing the *encoding* argument to :class:`str` allows decoding any
@ -2584,6 +2595,9 @@ arbitrary binary data.
   .. versionchanged:: 3.1
      Added support for keyword arguments.
   .. versionchanged:: 3.9
      The *errors* is now checked in development mode and in debug mode.
 .. method:: bytes.endswith(suffix[, start[, end]])
            bytearray.endswith(suffix[, start[, end]])
--- a/Doc/using/cmdline.rst
+++ b/Doc/using/cmdline.rst
@ -429,6 +429,9 @@ Miscellaneous options
     not be more verbose than the default if the code is correct: new warnings
     are only emitted when an issue is detected. Effect of the developer mode:
     * Check *encoding* and *errors* arguments on string encoding and decoding
       operations. Examples: :func:`open`, :meth:`str.encode` and
       :meth:`bytes.decode`.
     * Add ``default`` warning filter, as :option:`-W` ``default``.
     * Install debug hooks on memory allocators: see the
       :c:func:`PyMem_SetupDebugHooks` C function.
@ -469,6 +472,10 @@ Miscellaneous options
      The ``-X pycache_prefix`` option. The ``-X dev`` option now logs
      ``close()`` exceptions in :class:`io.IOBase` destructor.
   .. versionchanged:: 3.9
      Using ``-X dev`` option, check *encoding* and *errors* arguments on
      string encoding and decoding operations.
 Options you shouldn't use
 ~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/Doc/whatsnew/3.9.rst
+++ b/Doc/whatsnew/3.9.rst
@ -84,6 +84,15 @@ Other Language Changes
  this case.
  (Contributed by Victor Stinner in :issue:`20443`.)
 * In development mode and in debug build, *encoding* and *errors* arguments are
  now checked on string encoding and decoding operations. Examples:
  :func:`open`, :meth:`str.encode` and :meth:`bytes.decode`.
  By default, for best performances, the *errors* argument is only checked at
  the first encoding/decoding error, and the *encoding* argument is sometimes
  ignored for empty strings.
  (Contributed by Victor Stinner in :issue:`37388`.)
 New Modules
 ===========
--- a/Lib/_pyio.py
+++ b/Lib/_pyio.py
@ -36,6 +36,8 @@ BlockingIOError = BlockingIOError
 # Does io.IOBase finalizer log the exception if the close() method fails?
 # The exception is ignored silently by default in release build.
 _IOBASE_EMITS_UNRAISABLE = (hasattr(sys, "gettotalrefcount") or sys.flags.dev_mode)
 # Does open() check its 'errors' argument?
 _CHECK_ERRORS = _IOBASE_EMITS_UNRAISABLE
 def open(file, mode="r", buffering=-1, encoding=None, errors=None,
@ -2022,6 +2024,8 @@ class TextIOWrapper(TextIOBase):
        else:
            if not isinstance(errors, str):
                raise ValueError("invalid errors: %r" % errors)
            if _CHECK_ERRORS:
                codecs.lookup_error(errors)
        self._buffer = buffer
        self._decoded_chars = ''  # buffer for text returned from decoder
--- a/Lib/test/test_bytes.py
+++ b/Lib/test/test_bytes.py
@ -12,12 +12,14 @@ import copy
 import functools
 import pickle
 import tempfile
 import textwrap
 import unittest
 import test.support
 import test.string_tests
 import test.list_tests
 from test.support import bigaddrspacetest, MAX_Py_ssize_t
 from test.support.script_helper import assert_python_failure
 if sys.flags.bytes_warning:
@ -315,6 +317,62 @@ class BaseBytesTest:
        # Default encoding is utf-8
        self.assertEqual(self.type2test(b'\xe2\x98\x83').decode(), '\u2603')
    def test_check_encoding_errors(self):
        # bpo-37388: bytes(str) and bytes.encode() must check encoding
        # and errors arguments in dev mode
        invalid = 'Boom, Shaka Laka, Boom!'
        encodings = ('ascii', 'utf8', 'latin1')
        code = textwrap.dedent(f'''
            import sys
            type2test = {self.type2test.__name__}
            encodings = {encodings!r}
            for data in ('', 'short string'):
                try:
                    type2test(data, encoding={invalid!r})
                except LookupError:
                    pass
                else:
                    sys.exit(21)
                for encoding in encodings:
                    try:
                        type2test(data, encoding=encoding, errors={invalid!r})
                    except LookupError:
                        pass
                    else:
                        sys.exit(22)
            for data in (b'', b'short string'):
                data = type2test(data)
                print(repr(data))
                try:
                    data.decode(encoding={invalid!r})
                except LookupError:
                    sys.exit(10)
                else:
                    sys.exit(23)
                try:
                    data.decode(errors={invalid!r})
                except LookupError:
                    pass
                else:
                    sys.exit(24)
                for encoding in encodings:
                    try:
                        data.decode(encoding=encoding, errors={invalid!r})
                    except LookupError:
                        pass
                    else:
                        sys.exit(25)
            sys.exit(10)
        ''')
        proc = assert_python_failure('-X', 'dev', '-c', code)
        self.assertEqual(proc.rc, 10, proc)
    def test_from_int(self):
        b = self.type2test(0)
        self.assertEqual(b, self.type2test())
--- a/Lib/test/test_io.py
+++ b/Lib/test/test_io.py
@ -29,6 +29,7 @@ import random
 import signal
 import sys
 import sysconfig
 import textwrap
 import threading
 import time
 import unittest
@ -37,7 +38,8 @@ import weakref
 from collections import deque, UserList
 from itertools import cycle, count
 from test import support
-from test.support.script_helper import assert_python_ok, run_python_until_end
+from test.support.script_helper import (
    assert_python_ok, assert_python_failure, run_python_until_end)
 from test.support import FakePath
 import codecs
@ -4130,6 +4132,51 @@ class MiscIOTest(unittest.TestCase):
        # there used to be a buffer overflow in the parser for rawmode
        self.assertRaises(ValueError, self.open, support.TESTFN, 'rwax+')
    def test_check_encoding_errors(self):
        # bpo-37388: open() and TextIOWrapper must check encoding and errors
        # arguments in dev mode
        mod = self.io.__name__
        filename = __file__
        invalid = 'Boom, Shaka Laka, Boom!'
        code = textwrap.dedent(f'''
            import sys
            from {mod} import open, TextIOWrapper
            try:
                open({filename!r}, encoding={invalid!r})
            except LookupError:
                pass
            else:
                sys.exit(21)
            try:
                open({filename!r}, errors={invalid!r})
            except LookupError:
                pass
            else:
                sys.exit(22)
            fp = open({filename!r}, "rb")
            with fp:
                try:
                    TextIOWrapper(fp, encoding={invalid!r})
                except LookupError:
                    pass
                else:
                    sys.exit(23)
                try:
                    TextIOWrapper(fp, errors={invalid!r})
                except LookupError:
                    pass
                else:
                    sys.exit(24)
            sys.exit(10)
        ''')
        proc = assert_python_failure('-X', 'dev', '-c', code)
        self.assertEqual(proc.rc, 10, proc)
 class CMiscIOTest(MiscIOTest):
    io = io
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@ -11,9 +11,11 @@ import itertools
 import operator
 import struct
 import sys
 import textwrap
 import unittest
 import warnings
 from test import support, string_tests
 from test.support.script_helper import assert_python_failure
 # Error handling (bad decoder return)
 def search_function(encoding):
@ -2436,6 +2438,66 @@ class UnicodeTest(string_tests.CommonTest,
        support.check_free_after_iterating(self, iter, str)
        support.check_free_after_iterating(self, reversed, str)
    def test_check_encoding_errors(self):
        # bpo-37388: str(bytes) and str.decode() must check encoding and errors
        # arguments in dev mode
        encodings = ('ascii', 'utf8', 'latin1')
        invalid = 'Boom, Shaka Laka, Boom!'
        code = textwrap.dedent(f'''
            import sys
            encodings = {encodings!r}
            for data in (b'', b'short string'):
                try:
                    str(data, encoding={invalid!r})
                except LookupError:
                    pass
                else:
                    sys.exit(21)
                try:
                    str(data, errors={invalid!r})
                except LookupError:
                    pass
                else:
                    sys.exit(22)
                for encoding in encodings:
                    try:
                        str(data, encoding, errors={invalid!r})
                    except LookupError:
                        pass
                    else:
                        sys.exit(22)
            for data in ('', 'short string'):
                try:
                    data.encode(encoding={invalid!r})
                except LookupError:
                    pass
                else:
                    sys.exit(23)
                try:
                    data.encode(errors={invalid!r})
                except LookupError:
                    pass
                else:
                    sys.exit(24)
                for encoding in encodings:
                    try:
                        data.encode(encoding, errors={invalid!r})
                    except LookupError:
                        pass
                    else:
                        sys.exit(24)
            sys.exit(10)
        ''')
        proc = assert_python_failure('-X', 'dev', '-c', code)
        self.assertEqual(proc.rc, 10, proc)
 class CAPITest(unittest.TestCase):
--- a/Builtins/2019-06-24-21-53-52.bpo-37388.0XTZmW.rst
+++ b/Builtins/2019-06-24-21-53-52.bpo-37388.0XTZmW.rst
@ -0,0 +1,7 @@
 In development mode and in debug build, *encoding* and *errors* arguments are
 now checked on string encoding and decoding operations. Examples: :func:`open`,
 :meth:`str.encode` and :meth:`bytes.decode`.
 By default, for best performances, the *errors* argument is only checked at the
 first encoding/decoding error, and the *encoding* argument is sometimes ignored
 for empty strings.
--- a/Modules/_io/textio.c
+++ b/Modules/_io/textio.c
@ -988,6 +988,46 @@ _textiowrapper_fix_encoder_state(textio *self)
    return 0;
 }
 static int
 io_check_errors(PyObject *errors)
 {
    assert(errors != NULL && errors != Py_None);
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
 #ifndef Py_DEBUG
    /* In release mode, only check in development mode (-X dev) */
    if (!interp->config.dev_mode) {
        return 0;
    }
 #else
    /* Always check in debug mode */
 #endif
    /* Avoid calling PyCodec_LookupError() before the codec registry is ready:
       before_PyUnicode_InitEncodings() is called. */
    if (!interp->fs_codec.encoding) {
        return 0;
    }
    Py_ssize_t name_length;
    const char *name = PyUnicode_AsUTF8AndSize(errors, &name_length);
    if (name == NULL) {
        return -1;
    }
    if (strlen(name) != (size_t)name_length) {
        PyErr_SetString(PyExc_ValueError, "embedded null character in errors");
        return -1;
    }
    PyObject *handler = PyCodec_LookupError(name);
    if (handler != NULL) {
        Py_DECREF(handler);
        return 0;
    }
    return -1;
 }
 /*[clinic input]
 _io.TextIOWrapper.__init__
    buffer: object
@ -1057,6 +1097,9 @@ _io_TextIOWrapper___init___impl(textio *self, PyObject *buffer,
            errors->ob_type->tp_name);
        return -1;
    }
    else if (io_check_errors(errors)) {
        return -1;
    }
    if (validate_newline(newline) < 0) {
        return -1;
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -427,6 +427,48 @@ get_error_handler_wide(const wchar_t *errors)
 }
 static inline int
 unicode_check_encoding_errors(const char *encoding, const char *errors)
 {
    if (encoding == NULL && errors == NULL) {
        return 0;
    }
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
 #ifndef Py_DEBUG
    /* In release mode, only check in development mode (-X dev) */
    if (!interp->config.dev_mode) {
        return 0;
    }
 #else
    /* Always check in debug mode */
 #endif
    /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
       codec registry is ready: before_PyUnicode_InitEncodings() is called. */
    if (!interp->fs_codec.encoding) {
        return 0;
    }
    if (encoding != NULL) {
        PyObject *handler = _PyCodec_Lookup(encoding);
        if (handler == NULL) {
            return -1;
        }
        Py_DECREF(handler);
    }
    if (errors != NULL) {
        PyObject *handler = PyCodec_LookupError(errors);
        if (handler == NULL) {
            return -1;
        }
        Py_DECREF(handler);
    }
    return 0;
 }
 /* The max unicode value is always 0x10FFFF while using the PEP-393 API.
   This function is kept for backward compatibility with the old API. */
 Py_UNICODE
@ -3211,12 +3253,15 @@ PyUnicode_FromEncodedObject(PyObject *obj,
    /* Decoding bytes objects is the most common case and should be fast */
    if (PyBytes_Check(obj)) {
-        if (PyBytes_GET_SIZE(obj) == 0)
+        if (PyBytes_GET_SIZE(obj) == 0) {
            if (unicode_check_encoding_errors(encoding, errors) < 0) {
                return NULL;
            }
            _Py_RETURN_UNICODE_EMPTY();
-        v = PyUnicode_Decode(
+        }
        return PyUnicode_Decode(
                PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
                encoding, errors);
        return v;
    }
    if (PyUnicode_Check(obj)) {
@ -3235,6 +3280,9 @@ PyUnicode_FromEncodedObject(PyObject *obj,
    if (buffer.len == 0) {
        PyBuffer_Release(&buffer);
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }
@ -3302,6 +3350,10 @@ PyUnicode_Decode(const char *s,
    Py_buffer info;
    char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
        return NULL;
    }
    if (encoding == NULL) {
        return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
    }
@ -3562,7 +3614,8 @@ PyUnicode_EncodeFSDefault(PyObject *unicode)
       cannot use it to encode and decode filenames before it is loaded. Load
       the Python codec requires to encode at least its own filename. Use the C
       implementation of the locale codec until the codec registry is
-       initialized and the Python codec is loaded. See initfsencoding(). */
+       initialized and the Python codec is loaded.
       See _PyUnicode_InitEncodings(). */
    if (interp->fs_codec.encoding) {
        return PyUnicode_AsEncodedString(unicode,
                                         interp->fs_codec.encoding,
@ -3591,6 +3644,10 @@ PyUnicode_AsEncodedString(PyObject *unicode,
        return NULL;
    }
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
        return NULL;
    }
    if (encoding == NULL) {
        return _PyUnicode_AsUTF8String(unicode, errors);
    }
@ -3800,7 +3857,8 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
       cannot use it to encode and decode filenames before it is loaded. Load
       the Python codec requires to encode at least its own filename. Use the C
       implementation of the locale codec until the codec registry is
-       initialized and the Python codec is loaded. See initfsencoding(). */
+       initialized and the Python codec is loaded.
       See _PyUnicode_InitEncodings(). */
    if (interp->fs_codec.encoding) {
        return PyUnicode_Decode(s, size,
                                interp->fs_codec.encoding,