From 4827483f47906fecee6b5d9097df2a69a293a85c Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Mon, 29 Mar 2021 12:28:14 +0900 Subject: [PATCH] bpo-43510: Implement PEP 597 opt-in EncodingWarning. (GH-19481) See [PEP 597](https://www.python.org/dev/peps/pep-0597/). * Add `-X warn_default_encoding` and `PYTHONWARNDEFAULTENCODING`. * Add EncodingWarning * Add io.text_encoding() * open(), TextIOWrapper() emits EncodingWarning when encoding is omitted and warn_default_encoding is enabled. * _pyio.TextIOWrapper() uses UTF-8 as fallback default encoding used when failed to import locale module. (used during building Python) * bz2, configparser, gzip, lzma, pathlib, tempfile modules use io.text_encoding(). * What's new entry --- Doc/c-api/init_config.rst | 9 +++ Doc/library/exceptions.rst | 9 +++ Doc/library/io.rst | 81 +++++++++++++++++++ Doc/using/cmdline.rst | 15 ++++ Doc/whatsnew/3.10.rst | 24 ++++++ Include/cpython/initconfig.h | 1 + Include/internal/pycore_initconfig.h | 1 + Include/pyerrors.h | 1 + Lib/_pyio.py | 47 ++++++++--- Lib/bz2.py | 1 + Lib/configparser.py | 1 + Lib/gzip.py | 1 + Lib/io.py | 2 +- Lib/lzma.py | 1 + Lib/pathlib.py | 4 + Lib/site.py | 4 +- Lib/subprocess.py | 9 ++- Lib/tempfile.py | 7 ++ Lib/test/exception_hierarchy.txt | 1 + Lib/test/test_embed.py | 1 + Lib/test/test_io.py | 23 ++++++ Lib/test/test_pickle.py | 3 +- Lib/test/test_sys.py | 3 +- .../2021-03-16-17-20-33.bpo-43510.-BeQH_.rst | 3 + Modules/_io/_iomodule.c | 41 ++++++++++ Modules/_io/clinic/_iomodule.c.h | 48 ++++++++++- Modules/_io/textio.c | 11 +++ Objects/exceptions.c | 9 +++ PC/python3dll.c | 1 + Python/initconfig.c | 9 ++- Python/preconfig.c | 9 +++ Python/sysmodule.c | 4 +- 32 files changed, 366 insertions(+), 18 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2021-03-16-17-20-33.bpo-43510.-BeQH_.rst diff --git a/Doc/c-api/init_config.rst b/Doc/c-api/init_config.rst index db7c1f43765..29fbb68195b 100644 --- a/Doc/c-api/init_config.rst +++ b/Doc/c-api/init_config.rst 
@@ -583,6 +583,15 @@ PyConfig Default: ``0``. + .. c:member:: int warn_default_encoding + + If non-zero, emit a :exc:`EncodingWarning` warning when :class:`io.TextIOWrapper` + uses its default encoding. See :ref:`io-encoding-warning` for details. + + Default: ``0``. + + .. versionadded:: 3.10 + .. c:member:: wchar_t* check_hash_pycs_mode Control the validation behavior of hash-based ``.pyc`` files: diff --git a/Doc/library/exceptions.rst b/Doc/library/exceptions.rst index 1028213699d..40ccde72d07 100644 --- a/Doc/library/exceptions.rst +++ b/Doc/library/exceptions.rst @@ -741,6 +741,15 @@ The following exceptions are used as warning categories; see the Base class for warnings related to Unicode. +.. exception:: EncodingWarning + + Base class for warnings related to encodings. + + See :ref:`io-encoding-warning` for details. + + .. versionadded:: 3.10 + + .. exception:: BytesWarning Base class for warnings related to :class:`bytes` and :class:`bytearray`. diff --git a/Doc/library/io.rst b/Doc/library/io.rst index 96e02e839ae..f9ffc19fac4 100644 --- a/Doc/library/io.rst +++ b/Doc/library/io.rst @@ -106,6 +106,56 @@ stream by opening a file in binary mode with buffering disabled:: The raw stream API is described in detail in the docs of :class:`RawIOBase`. +.. _io-text-encoding: + +Text Encoding +------------- + +The default encoding of :class:`TextIOWrapper` and :func:`open` is +locale-specific (:func:`locale.getpreferredencoding(False) `). + +However, many developers forget to specify the encoding when opening text files +encoded in UTF-8 (e.g. JSON, TOML, Markdown, etc...) since most Unix +platforms use UTF-8 locale by default. This causes bugs because the locale +encoding is not UTF-8 for most Windows users. For example:: + + # May not work on Windows when non-ASCII characters in the file. 
+ with open("README.md") as f: + long_description = f.read() + +Additionally, while there is no concrete plan as of yet, Python may change +the default text file encoding to UTF-8 in the future. + +Accordingly, it is highly recommended that you specify the encoding +explicitly when opening text files. If you want to use UTF-8, pass +``encoding="utf-8"``. To use the current locale encoding, +``encoding="locale"`` is supported in Python 3.10. + +When you need to run existing code on Windows that attempts to open +UTF-8 files using the default locale encoding, you can enable the UTF-8 +mode. See :ref:`UTF-8 mode on Windows <win-utf8-mode>`. + +.. _io-encoding-warning: + +Opt-in EncodingWarning +^^^^^^^^^^^^^^^^^^^^^^ + +.. versionadded:: 3.10 + See :pep:`597` for more details. + +To find where the default locale encoding is used, you can enable +the ``-X warn_default_encoding`` command line option or set the +:envvar:`PYTHONWARNDEFAULTENCODING` environment variable, which will +emit an :exc:`EncodingWarning` when the default encoding is used. + +If you are providing an API that uses :func:`open` or +:class:`TextIOWrapper` and passes ``encoding=None`` as a parameter, you +can use :func:`text_encoding` so that callers of the API will emit an +:exc:`EncodingWarning` if they don't pass an ``encoding``. However, +please consider using UTF-8 by default (i.e. ``encoding="utf-8"``) for +new APIs. + + High-level Module Interface --------------------------- @@ -143,6 +193,32 @@ High-level Module Interface .. versionadded:: 3.8 +.. function:: text_encoding(encoding, stacklevel=2) + + This is a helper function for callables that use :func:`open` or + :class:`TextIOWrapper` and have an ``encoding=None`` parameter. + + This function returns *encoding* if it is not ``None`` and ``"locale"`` if + *encoding* is ``None``. + + This function emits an :class:`EncodingWarning` if + :data:`sys.flags.warn_default_encoding <sys.flags>` is true and *encoding* + is None. 
*stacklevel* specifies where the warning is emitted. + For example:: + + def read_text(path, encoding=None): + encoding = io.text_encoding(encoding) # stacklevel=2 + with open(path, encoding=encoding) as f: + return f.read() + + In this example, an :class:`EncodingWarning` is emitted for the caller of + ``read_text()``. + + See :ref:`io-text-encoding` for more information. + + .. versionadded:: 3.10 + + .. exception:: BlockingIOError This is a compatibility alias for the builtin :exc:`BlockingIOError` @@ -869,6 +945,8 @@ Text I/O *encoding* gives the name of the encoding that the stream will be decoded or encoded with. It defaults to :func:`locale.getpreferredencoding(False) <locale.getpreferredencoding>`. + ``encoding="locale"`` can be used to specify the current locale's encoding + explicitly. See :ref:`io-text-encoding` for more information. *errors* is an optional string that specifies how encoding and decoding errors are to be handled. Pass ``'strict'`` to raise a :exc:`ValueError` @@ -920,6 +998,9 @@ Text I/O locale encoding using :func:`locale.setlocale`, use the current locale encoding instead of the user preferred encoding. + .. versionchanged:: 3.10 + The *encoding* argument now supports the ``"locale"`` dummy encoding name. + :class:`TextIOWrapper` provides these data attributes and methods in addition to those from :class:`TextIOBase` and :class:`IOBase`: diff --git a/Doc/using/cmdline.rst b/Doc/using/cmdline.rst index 04e0f3267db..1493c7c9017 100644 --- a/Doc/using/cmdline.rst +++ b/Doc/using/cmdline.rst @@ -453,6 +453,9 @@ Miscellaneous options * ``-X pycache_prefix=PATH`` enables writing ``.pyc`` files to a parallel tree rooted at the given directory instead of to the code tree. See also :envvar:`PYTHONPYCACHEPREFIX`. + * ``-X warn_default_encoding`` issues an :class:`EncodingWarning` when the + locale-specific default encoding is used for opening files. + See also :envvar:`PYTHONWARNDEFAULTENCODING`. 
It also allows passing arbitrary values and retrieving them through the :data:`sys._xoptions` dictionary. @@ -482,6 +485,9 @@ Miscellaneous options The ``-X showalloccount`` option has been removed. + .. versionadded:: 3.10 + The ``-X warn_default_encoding`` option. + .. deprecated-removed:: 3.9 3.10 The ``-X oldparser`` option. @@ -907,6 +913,15 @@ conflict. .. versionadded:: 3.7 +.. envvar:: PYTHONWARNDEFAULTENCODING + + If this environment variable is set to a non-empty string, issue an + :class:`EncodingWarning` when the locale-specific default encoding is used. + + See :ref:`io-encoding-warning` for details. + + .. versionadded:: 3.10 + Debug-mode variables ~~~~~~~~~~~~~~~~~~~~ diff --git a/Doc/whatsnew/3.10.rst b/Doc/whatsnew/3.10.rst index 1c4e5c47fc6..3a563c10282 100644 --- a/Doc/whatsnew/3.10.rst +++ b/Doc/whatsnew/3.10.rst @@ -454,6 +454,30 @@ For the full specification see :pep:`634`. Motivation and rationale are in :pep:`635`, and a longer tutorial is in :pep:`636`. +.. _whatsnew310-pep597: + +Optional ``EncodingWarning`` and ``encoding="locale"`` option +------------------------------------------------------------- + +The default encoding of :class:`TextIOWrapper` and :func:`open` is +platform and locale dependent. Since UTF-8 is used on most Unix +platforms, omitting the ``encoding`` option when opening UTF-8 files +(e.g. JSON, YAML, TOML, Markdown) is a very common bug. For example:: + + # BUG: "rb" mode or encoding="utf-8" should be used. + with open("data.json") as f: + data = json.load(f) + +To find this type of bug, an optional ``EncodingWarning`` is added. +It is emitted when :data:`sys.flags.warn_default_encoding <sys.flags>` +is true and locale-specific default encoding is used. + +``-X warn_default_encoding`` option and :envvar:`PYTHONWARNDEFAULTENCODING` +are added to enable the warning. + +See :ref:`io-text-encoding` for more information. 
+ + New Features Related to Type Annotations ======================================== diff --git a/Include/cpython/initconfig.h b/Include/cpython/initconfig.h index 666c1e419ca..09f9a2947ef 100644 --- a/Include/cpython/initconfig.h +++ b/Include/cpython/initconfig.h @@ -153,6 +153,7 @@ typedef struct PyConfig { PyWideStringList warnoptions; int site_import; int bytes_warning; + int warn_default_encoding; int inspect; int interactive; int optimization_level; diff --git a/Include/internal/pycore_initconfig.h b/Include/internal/pycore_initconfig.h index 28cd57030e2..4b009e816b4 100644 --- a/Include/internal/pycore_initconfig.h +++ b/Include/internal/pycore_initconfig.h @@ -102,6 +102,7 @@ typedef struct { int isolated; /* -I option */ int use_environment; /* -E option */ int dev_mode; /* -X dev and PYTHONDEVMODE */ + int warn_default_encoding; /* -X warn_default_encoding and PYTHONWARNDEFAULTENCODING */ } _PyPreCmdline; #define _PyPreCmdline_INIT \ diff --git a/Include/pyerrors.h b/Include/pyerrors.h index 14129d3533c..f5d1c711577 100644 --- a/Include/pyerrors.h +++ b/Include/pyerrors.h @@ -146,6 +146,7 @@ PyAPI_DATA(PyObject *) PyExc_FutureWarning; PyAPI_DATA(PyObject *) PyExc_ImportWarning; PyAPI_DATA(PyObject *) PyExc_UnicodeWarning; PyAPI_DATA(PyObject *) PyExc_BytesWarning; +PyAPI_DATA(PyObject *) PyExc_EncodingWarning; PyAPI_DATA(PyObject *) PyExc_ResourceWarning; diff --git a/Lib/_pyio.py b/Lib/_pyio.py index 4804ed27cd1..0f182d42402 100644 --- a/Lib/_pyio.py +++ b/Lib/_pyio.py @@ -40,6 +40,29 @@ _IOBASE_EMITS_UNRAISABLE = (hasattr(sys, "gettotalrefcount") or sys.flags.dev_mo _CHECK_ERRORS = _IOBASE_EMITS_UNRAISABLE +def text_encoding(encoding, stacklevel=2): + """ + A helper function to choose the text encoding. + + When encoding is not None, just return it. + Otherwise, return the default text encoding (i.e. "locale"). + + This function emits an EncodingWarning if *encoding* is None and + sys.flags.warn_default_encoding is true. 
+ + This can be used in APIs with an encoding=None parameter + that pass it to TextIOWrapper or open. + However, please consider using encoding="utf-8" for new APIs. + """ + if encoding is None: + encoding = "locale" + if sys.flags.warn_default_encoding: + import warnings + warnings.warn("'encoding' argument not specified.", + EncodingWarning, stacklevel + 1) + return encoding + + def open(file, mode="r", buffering=-1, encoding=None, errors=None, newline=None, closefd=True, opener=None): @@ -248,6 +271,7 @@ def open(file, mode="r", buffering=-1, encoding=None, errors=None, result = buffer if binary: return result + encoding = text_encoding(encoding) text = TextIOWrapper(buffer, encoding, errors, newline, line_buffering) result = text text.mode = mode @@ -2004,19 +2028,22 @@ class TextIOWrapper(TextIOBase): def __init__(self, buffer, encoding=None, errors=None, newline=None, line_buffering=False, write_through=False): self._check_newline(newline) - if encoding is None: + encoding = text_encoding(encoding) + + if encoding == "locale": try: - encoding = os.device_encoding(buffer.fileno()) + encoding = os.device_encoding(buffer.fileno()) or "locale" except (AttributeError, UnsupportedOperation): pass - if encoding is None: - try: - import locale - except ImportError: - # Importing locale may fail if Python is being built - encoding = "ascii" - else: - encoding = locale.getpreferredencoding(False) + + if encoding == "locale": + try: + import locale + except ImportError: + # Importing locale may fail if Python is being built + encoding = "utf-8" + else: + encoding = locale.getpreferredencoding(False) if not isinstance(encoding, str): raise ValueError("invalid encoding: %r" % encoding) diff --git a/Lib/bz2.py b/Lib/bz2.py index ce07ebeb142..1da3ce65c81 100644 --- a/Lib/bz2.py +++ b/Lib/bz2.py @@ -311,6 +311,7 @@ def open(filename, mode="rb", compresslevel=9, binary_file = BZ2File(filename, bz_mode, compresslevel=compresslevel) if "t" in mode: + encoding = 
io.text_encoding(encoding) return io.TextIOWrapper(binary_file, encoding, errors, newline) else: return binary_file diff --git a/Lib/configparser.py b/Lib/configparser.py index 924cc56a3f1..3b4cb5e6b24 100644 --- a/Lib/configparser.py +++ b/Lib/configparser.py @@ -690,6 +690,7 @@ class RawConfigParser(MutableMapping): """ if isinstance(filenames, (str, bytes, os.PathLike)): filenames = [filenames] + encoding = io.text_encoding(encoding) read_ok = [] for filename in filenames: try: diff --git a/Lib/gzip.py b/Lib/gzip.py index 136915725ab..0a8993ba354 100644 --- a/Lib/gzip.py +++ b/Lib/gzip.py @@ -62,6 +62,7 @@ def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST, raise TypeError("filename must be a str or bytes object, or a file") if "t" in mode: + encoding = io.text_encoding(encoding) return io.TextIOWrapper(binary_file, encoding, errors, newline) else: return binary_file diff --git a/Lib/io.py b/Lib/io.py index fbce6efc010..01f1df80ded 100644 --- a/Lib/io.py +++ b/Lib/io.py @@ -54,7 +54,7 @@ import abc from _io import (DEFAULT_BUFFER_SIZE, BlockingIOError, UnsupportedOperation, open, open_code, FileIO, BytesIO, StringIO, BufferedReader, BufferedWriter, BufferedRWPair, BufferedRandom, - IncrementalNewlineDecoder, TextIOWrapper) + IncrementalNewlineDecoder, text_encoding, TextIOWrapper) OpenWrapper = _io.open # for compatibility with _pyio diff --git a/Lib/lzma.py b/Lib/lzma.py index 0817b872d20..c8b197055cd 100644 --- a/Lib/lzma.py +++ b/Lib/lzma.py @@ -302,6 +302,7 @@ def open(filename, mode="rb", *, preset=preset, filters=filters) if "t" in mode: + encoding = io.text_encoding(encoding) return io.TextIOWrapper(binary_file, encoding, errors, newline) else: return binary_file diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 531a699a40d..5c9284b331a 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -1241,6 +1241,8 @@ class Path(PurePath): Open the file pointed by this path and return a file object, as the built-in open() function does. 
""" + if "b" not in mode: + encoding = io.text_encoding(encoding) return io.open(self, mode, buffering, encoding, errors, newline, opener=self._opener) @@ -1255,6 +1257,7 @@ class Path(PurePath): """ Open the file in text mode, read it, and close the file. """ + encoding = io.text_encoding(encoding) with self.open(mode='r', encoding=encoding, errors=errors) as f: return f.read() @@ -1274,6 +1277,7 @@ class Path(PurePath): if not isinstance(data, str): raise TypeError('data must be str, not %s' % data.__class__.__name__) + encoding = io.text_encoding(encoding) with self.open(mode='w', encoding=encoding, errors=errors, newline=newline) as f: return f.write(data) diff --git a/Lib/site.py b/Lib/site.py index 5f1b31e73d9..939893eb5ee 100644 --- a/Lib/site.py +++ b/Lib/site.py @@ -170,7 +170,9 @@ def addpackage(sitedir, name, known_paths): fullname = os.path.join(sitedir, name) _trace(f"Processing .pth file: {fullname!r}") try: - f = io.TextIOWrapper(io.open_code(fullname)) + # locale encoding is not ideal especially on Windows. But we have used + # it for a long time. setuptools uses the locale encoding too. + f = io.TextIOWrapper(io.open_code(fullname), encoding="locale") except OSError: return with f: diff --git a/Lib/subprocess.py b/Lib/subprocess.py index 4b011e4ce55..2b785496e4f 100644 --- a/Lib/subprocess.py +++ b/Lib/subprocess.py @@ -693,7 +693,7 @@ def _use_posix_spawn(): _USE_POSIX_SPAWN = _use_posix_spawn() -class Popen(object): +class Popen: """ Execute a child program in a new process. For a complete description of the arguments see the Python documentation. @@ -844,6 +844,13 @@ class Popen(object): self.text_mode = encoding or errors or text or universal_newlines + # PEP 597: We suppress the EncodingWarning in subprocess module + # for now (at Python 3.10), because we focus on files for now. + # This will be changed to encoding = io.text_encoding(encoding) + # in the future. 
+ if self.text_mode and encoding is None: + self.encoding = encoding = "locale" + # How long to resume waiting on a child after the first ^C. # There is no right value for this. The purpose is to be polite # yet remain good for interactive users trying to exit a tool. diff --git a/Lib/tempfile.py b/Lib/tempfile.py index 4b2547c98f1..efcf7a7fb3b 100644 --- a/Lib/tempfile.py +++ b/Lib/tempfile.py @@ -543,6 +543,9 @@ def NamedTemporaryFile(mode='w+b', buffering=-1, encoding=None, if _os.name == 'nt' and delete: flags |= _os.O_TEMPORARY + if "b" not in mode: + encoding = _io.text_encoding(encoding) + (fd, name) = _mkstemp_inner(dir, prefix, suffix, flags, output_type) try: file = _io.open(fd, mode, buffering=buffering, @@ -583,6 +586,9 @@ else: """ global _O_TMPFILE_WORKS + if "b" not in mode: + encoding = _io.text_encoding(encoding) + prefix, suffix, dir, output_type = _sanitize_params(prefix, suffix, dir) flags = _bin_openflags @@ -638,6 +644,7 @@ class SpooledTemporaryFile: if 'b' in mode: self._file = _io.BytesIO() else: + encoding = _io.text_encoding(encoding) self._file = _io.TextIOWrapper(_io.BytesIO(), encoding=encoding, errors=errors, newline=newline) diff --git a/Lib/test/exception_hierarchy.txt b/Lib/test/exception_hierarchy.txt index 763a6c899b4..6c5e8213910 100644 --- a/Lib/test/exception_hierarchy.txt +++ b/Lib/test/exception_hierarchy.txt @@ -61,4 +61,5 @@ BaseException +-- ImportWarning +-- UnicodeWarning +-- BytesWarning + +-- EncodingWarning +-- ResourceWarning diff --git a/Lib/test/test_embed.py b/Lib/test/test_embed.py index 6833b2540d6..646cd0632ed 100644 --- a/Lib/test/test_embed.py +++ b/Lib/test/test_embed.py @@ -389,6 +389,7 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase): 'site_import': 1, 'bytes_warning': 0, + 'warn_default_encoding': 0, 'inspect': 0, 'interactive': 0, 'optimization_level': 0, diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py index 3768b625516..c731302a9f2 100644 --- a/Lib/test/test_io.py +++ 
b/Lib/test/test_io.py @@ -4249,6 +4249,29 @@ class MiscIOTest(unittest.TestCase): proc = assert_python_failure('-X', 'dev', '-c', code) self.assertEqual(proc.rc, 10, proc) + def test_check_encoding_warning(self): + # PEP 597: Raise warning when encoding is not specified + # and sys.flags.warn_default_encoding is set. + mod = self.io.__name__ + filename = __file__ + code = textwrap.dedent(f'''\ + import sys + from {mod} import open, TextIOWrapper + import pathlib + + with open({filename!r}) as f: # line 5 + pass + + pathlib.Path({filename!r}).read_text() # line 8 + ''') + proc = assert_python_ok('-X', 'warn_default_encoding', '-c', code) + warnings = proc.err.splitlines() + self.assertEqual(len(warnings), 2) + self.assertTrue( + warnings[0].startswith(b":5: EncodingWarning: ")) + self.assertTrue( + warnings[1].startswith(b":8: EncodingWarning: ")) + class CMiscIOTest(MiscIOTest): io = io diff --git a/Lib/test/test_pickle.py b/Lib/test/test_pickle.py index 1f5cb103933..23c7bd261e8 100644 --- a/Lib/test/test_pickle.py +++ b/Lib/test/test_pickle.py @@ -483,7 +483,8 @@ class CompatPickleTests(unittest.TestCase): if exc in (BlockingIOError, ResourceWarning, StopAsyncIteration, - RecursionError): + RecursionError, + EncodingWarning): continue if exc is not OSError and issubclass(exc, OSError): self.assertEqual(reverse_mapping('builtins', name), diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py index fca05e6f88f..5b004c2b52d 100644 --- a/Lib/test/test_sys.py +++ b/Lib/test/test_sys.py @@ -591,7 +591,8 @@ class SysModuleTest(unittest.TestCase): "inspect", "interactive", "optimize", "dont_write_bytecode", "no_user_site", "no_site", "ignore_environment", "verbose", "bytes_warning", "quiet", - "hash_randomization", "isolated", "dev_mode", "utf8_mode") + "hash_randomization", "isolated", "dev_mode", "utf8_mode", + "warn_default_encoding") for attr in attrs: self.assertTrue(hasattr(sys.flags, attr), attr) attr_type = bool if attr == "dev_mode" else int diff --git 
a/Misc/NEWS.d/next/Library/2021-03-16-17-20-33.bpo-43510.-BeQH_.rst b/Misc/NEWS.d/next/Library/2021-03-16-17-20-33.bpo-43510.-BeQH_.rst new file mode 100644 index 00000000000..b79a49c881b --- /dev/null +++ b/Misc/NEWS.d/next/Library/2021-03-16-17-20-33.bpo-43510.-BeQH_.rst @@ -0,0 +1,3 @@ +Implement :pep:`597`: Add ``EncodingWarning`` warning, ``-X +warn_default_encoding`` option, :envvar:`PYTHONWARNDEFAULTENCODING` +environment variable and ``encoding="locale"`` argument value. diff --git a/Modules/_io/_iomodule.c b/Modules/_io/_iomodule.c index 9147648b243..652c2ce5b0d 100644 --- a/Modules/_io/_iomodule.c +++ b/Modules/_io/_iomodule.c @@ -10,6 +10,7 @@ #define PY_SSIZE_T_CLEAN #include "Python.h" #include "_iomodule.h" +#include "pycore_pystate.h" // _PyInterpreterState_GET() #ifdef HAVE_SYS_TYPES_H #include @@ -33,6 +34,7 @@ PyObject *_PyIO_str_fileno = NULL; PyObject *_PyIO_str_flush = NULL; PyObject *_PyIO_str_getstate = NULL; PyObject *_PyIO_str_isatty = NULL; +PyObject *_PyIO_str_locale = NULL; PyObject *_PyIO_str_newlines = NULL; PyObject *_PyIO_str_nl = NULL; PyObject *_PyIO_str_peek = NULL; @@ -504,6 +506,43 @@ _io_open_impl(PyObject *module, PyObject *file, const char *mode, return NULL; } + +/*[clinic input] +_io.text_encoding + encoding: object + stacklevel: int = 2 + / + +A helper function to choose the text encoding. + +When encoding is not None, just return it. +Otherwise, return the default text encoding (i.e. "locale"). + +This function emits an EncodingWarning if encoding is None and +sys.flags.warn_default_encoding is true. + +This can be used in APIs with an encoding=None parameter. +However, please consider using encoding="utf-8" for new APIs. 
+[clinic start generated code]*/ + +static PyObject * +_io_text_encoding_impl(PyObject *module, PyObject *encoding, int stacklevel) +/*[clinic end generated code: output=91b2cfea6934cc0c input=bf70231213e2a7b4]*/ +{ + if (encoding == NULL || encoding == Py_None) { + PyInterpreterState *interp = _PyInterpreterState_GET(); + if (_PyInterpreterState_GetConfig(interp)->warn_default_encoding) { + PyErr_WarnEx(PyExc_EncodingWarning, + "'encoding' argument not specified", stacklevel); + } + Py_INCREF(_PyIO_str_locale); + return _PyIO_str_locale; + } + Py_INCREF(encoding); + return encoding; +} + + /*[clinic input] _io.open_code @@ -629,6 +668,7 @@ iomodule_free(PyObject *mod) { static PyMethodDef module_methods[] = { _IO_OPEN_METHODDEF + _IO_TEXT_ENCODING_METHODDEF _IO_OPEN_CODE_METHODDEF {NULL, NULL} }; @@ -747,6 +787,7 @@ PyInit__io(void) ADD_INTERNED(flush) ADD_INTERNED(getstate) ADD_INTERNED(isatty) + ADD_INTERNED(locale) ADD_INTERNED(newlines) ADD_INTERNED(peek) ADD_INTERNED(read) diff --git a/Modules/_io/clinic/_iomodule.c.h b/Modules/_io/clinic/_iomodule.c.h index dc7b5ff243a..91c55b1816c 100644 --- a/Modules/_io/clinic/_iomodule.c.h +++ b/Modules/_io/clinic/_iomodule.c.h @@ -272,6 +272,52 @@ exit: return return_value; } +PyDoc_STRVAR(_io_text_encoding__doc__, +"text_encoding($module, encoding, stacklevel=2, /)\n" +"--\n" +"\n" +"A helper function to choose the text encoding.\n" +"\n" +"When encoding is not None, just return it.\n" +"Otherwise, return the default text encoding (i.e. 
\"locale\").\n" +"\n" +"This function emits an EncodingWarning if encoding is None and\n" +"sys.flags.warn_default_encoding is true.\n" +"\n" +"This can be used in APIs with an encoding=None parameter.\n" +"However, please consider using encoding=\"utf-8\" for new APIs."); + +#define _IO_TEXT_ENCODING_METHODDEF \ + {"text_encoding", (PyCFunction)(void(*)(void))_io_text_encoding, METH_FASTCALL, _io_text_encoding__doc__}, + +static PyObject * +_io_text_encoding_impl(PyObject *module, PyObject *encoding, int stacklevel); + +static PyObject * +_io_text_encoding(PyObject *module, PyObject *const *args, Py_ssize_t nargs) +{ + PyObject *return_value = NULL; + PyObject *encoding; + int stacklevel = 2; + + if (!_PyArg_CheckPositional("text_encoding", nargs, 1, 2)) { + goto exit; + } + encoding = args[0]; + if (nargs < 2) { + goto skip_optional; + } + stacklevel = _PyLong_AsInt(args[1]); + if (stacklevel == -1 && PyErr_Occurred()) { + goto exit; + } +skip_optional: + return_value = _io_text_encoding_impl(module, encoding, stacklevel); + +exit: + return return_value; +} + PyDoc_STRVAR(_io_open_code__doc__, "open_code($module, /, path)\n" "--\n" @@ -313,4 +359,4 @@ _io_open_code(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObjec exit: return return_value; } -/*[clinic end generated code: output=5c0dd7a262c30ebc input=a9049054013a1b77]*/ +/*[clinic end generated code: output=06e055d1d80b835d input=a9049054013a1b77]*/ diff --git a/Modules/_io/textio.c b/Modules/_io/textio.c index 03001ecb0a5..6f89a879c9c 100644 --- a/Modules/_io/textio.c +++ b/Modules/_io/textio.c @@ -1123,6 +1123,17 @@ _io_TextIOWrapper___init___impl(textio *self, PyObject *buffer, self->encodefunc = NULL; self->b2cratio = 0.0; + if (encoding == NULL) { + PyInterpreterState *interp = _PyInterpreterState_GET(); + if (_PyInterpreterState_GetConfig(interp)->warn_default_encoding) { + PyErr_WarnEx(PyExc_EncodingWarning, + "'encoding' argument not specified", 1); + } + } + else if (strcmp(encoding, 
"locale") == 0) { + encoding = NULL; + } + if (encoding == NULL) { /* Try os.device_encoding(fileno) */ PyObject *fileno; diff --git a/Objects/exceptions.c b/Objects/exceptions.c index 88e2287b143..dfa069e01d9 100644 --- a/Objects/exceptions.c +++ b/Objects/exceptions.c @@ -2464,6 +2464,13 @@ SimpleExtendsException(PyExc_Warning, BytesWarning, "related to conversion from str or comparing to str."); +/* + * EncodingWarning extends Warning + */ +SimpleExtendsException(PyExc_Warning, EncodingWarning, + "Base class for warnings about encodings."); + + /* * ResourceWarning extends Warning */ @@ -2592,6 +2599,7 @@ _PyExc_Init(PyInterpreterState *interp) PRE_INIT(BufferError); PRE_INIT(Warning); PRE_INIT(UserWarning); + PRE_INIT(EncodingWarning); PRE_INIT(DeprecationWarning); PRE_INIT(PendingDeprecationWarning); PRE_INIT(SyntaxWarning); @@ -2731,6 +2739,7 @@ _PyBuiltins_AddExceptions(PyObject *bltinmod) POST_INIT(BufferError); POST_INIT(Warning); POST_INIT(UserWarning); + POST_INIT(EncodingWarning); POST_INIT(DeprecationWarning); POST_INIT(PendingDeprecationWarning); POST_INIT(SyntaxWarning); diff --git a/PC/python3dll.c b/PC/python3dll.c index ddbd1b1e8e4..1567ac15916 100644 --- a/PC/python3dll.c +++ b/PC/python3dll.c @@ -724,6 +724,7 @@ EXPORT_DATA(PyExc_BlockingIOError) EXPORT_DATA(PyExc_BrokenPipeError) EXPORT_DATA(PyExc_BufferError) EXPORT_DATA(PyExc_BytesWarning) +EXPORT_DATA(PyExc_EncodingWarning) EXPORT_DATA(PyExc_ChildProcessError) EXPORT_DATA(PyExc_ConnectionAbortedError) EXPORT_DATA(PyExc_ConnectionError) diff --git a/Python/initconfig.c b/Python/initconfig.c index 7886d09f7a0..27ae48dd3c9 100644 --- a/Python/initconfig.c +++ b/Python/initconfig.c @@ -94,6 +94,7 @@ static const char usage_3[] = "\ otherwise activate automatically)\n\ -X pycache_prefix=PATH: enable writing .pyc files to a parallel tree rooted at the\n\ given directory instead of to the code tree\n\ + -X warn_default_encoding: enable opt-in EncodingWarning for 'encoding=None'\n\ \n\ 
--check-hash-based-pycs always|default|never:\n\ control how Python invalidates hash-based .pyc files\n\ @@ -129,7 +130,8 @@ static const char usage_6[] = "PYTHONBREAKPOINT: if this variable is set to 0, it disables the default\n" " debugger. It can be set to the callable of your debugger of choice.\n" "PYTHONDEVMODE: enable the development mode.\n" -"PYTHONPYCACHEPREFIX: root directory for bytecode cache (pyc) files.\n"; +"PYTHONPYCACHEPREFIX: root directory for bytecode cache (pyc) files.\n" +"PYTHONWARNDEFAULTENCODING: enable opt-in EncodingWarning for 'encoding=None'.\n"; #if defined(MS_WINDOWS) # define PYTHONHOMEHELP "\\python{major}{minor}" @@ -600,6 +602,7 @@ config_check_consistency(const PyConfig *config) assert(config->malloc_stats >= 0); assert(config->site_import >= 0); assert(config->bytes_warning >= 0); + assert(config->warn_default_encoding >= 0); assert(config->inspect >= 0); assert(config->interactive >= 0); assert(config->optimization_level >= 0); @@ -698,6 +701,7 @@ _PyConfig_InitCompatConfig(PyConfig *config) config->parse_argv = 0; config->site_import = -1; config->bytes_warning = -1; + config->warn_default_encoding = 0; config->inspect = -1; config->interactive = -1; config->optimization_level = -1; @@ -906,6 +910,7 @@ _PyConfig_Copy(PyConfig *config, const PyConfig *config2) COPY_ATTR(site_import); COPY_ATTR(bytes_warning); + COPY_ATTR(warn_default_encoding); COPY_ATTR(inspect); COPY_ATTR(interactive); COPY_ATTR(optimization_level); @@ -1007,6 +1012,7 @@ _PyConfig_AsDict(const PyConfig *config) SET_ITEM_WSTR(platlibdir); SET_ITEM_INT(site_import); SET_ITEM_INT(bytes_warning); + SET_ITEM_INT(warn_default_encoding); SET_ITEM_INT(inspect); SET_ITEM_INT(interactive); SET_ITEM_INT(optimization_level); @@ -1271,6 +1277,7 @@ _PyConfig_FromDict(PyConfig *config, PyObject *dict) GET_WSTRLIST(warnoptions); GET_UINT(site_import); GET_UINT(bytes_warning); + GET_UINT(warn_default_encoding); GET_UINT(inspect); GET_UINT(interactive); 
GET_UINT(optimization_level); diff --git a/Python/preconfig.c b/Python/preconfig.c index b8b0c3a0775..ae1cc3f90fc 100644 --- a/Python/preconfig.c +++ b/Python/preconfig.c @@ -169,6 +169,7 @@ _PyPreCmdline_SetConfig(const _PyPreCmdline *cmdline, PyConfig *config) COPY_ATTR(isolated); COPY_ATTR(use_environment); COPY_ATTR(dev_mode); + COPY_ATTR(warn_default_encoding); return _PyStatus_OK(); #undef COPY_ATTR @@ -257,9 +258,17 @@ _PyPreCmdline_Read(_PyPreCmdline *cmdline, const PyPreConfig *preconfig) cmdline->dev_mode = 0; } + // warn_default_encoding + if (_Py_get_xoption(&cmdline->xoptions, L"warn_default_encoding") + || _Py_GetEnv(cmdline->use_environment, "PYTHONWARNDEFAULTENCODING")) + { + cmdline->warn_default_encoding = 1; + } + assert(cmdline->use_environment >= 0); assert(cmdline->isolated >= 0); assert(cmdline->dev_mode >= 0); + assert(cmdline->warn_default_encoding >= 0); return _PyStatus_OK(); } diff --git a/Python/sysmodule.c b/Python/sysmodule.c index 686b6cae3b2..54d70ef0569 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -2514,6 +2514,7 @@ static PyStructSequence_Field flags_fields[] = { {"isolated", "-I"}, {"dev_mode", "-X dev"}, {"utf8_mode", "-X utf8"}, + {"warn_default_encoding", "-X warn_default_encoding"}, {0} }; @@ -2521,7 +2522,7 @@ static PyStructSequence_Desc flags_desc = { "sys.flags", /* name */ flags__doc__, /* doc */ flags_fields, /* fields */ - 15 + 16 }; static int @@ -2560,6 +2561,7 @@ set_flags_from_config(PyInterpreterState *interp, PyObject *flags) SetFlag(config->isolated); SetFlagObj(PyBool_FromLong(config->dev_mode)); SetFlag(preconfig->utf8_mode); + SetFlag(config->warn_default_encoding); #undef SetFlagObj #undef SetFlag return 0;