gh-91156: Fix `encoding="locale"` in UTF-8 mode (GH-70056)

This commit is contained in:
Inada Naoki 2022-04-14 16:00:35 +09:00 committed by GitHub
parent 7b87e8af0c
commit 13b17e2a0a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 35 additions and 24 deletions

View File

@ -112,7 +112,7 @@ Text Encoding
-------------
The default encoding of :class:`TextIOWrapper` and :func:`open` is
locale-specific (:func:`locale.getpreferredencoding(False) <locale.getpreferredencoding>`).
locale-specific (:func:`locale.getencoding`).
However, many developers forget to specify the encoding when opening text files
encoded in UTF-8 (e.g. JSON, TOML, Markdown, etc.) since most Unix
@ -948,8 +948,7 @@ Text I/O
:class:`TextIOBase`.
*encoding* gives the name of the encoding that the stream will be decoded or
encoded with. It defaults to
:func:`locale.getpreferredencoding(False) <locale.getpreferredencoding>`.
encoded with. It defaults to :func:`locale.getencoding()`.
``encoding="locale"`` can be used to specify the current locale's encoding
explicitly. See :ref:`io-text-encoding` for more information.

View File

@ -618,7 +618,7 @@ UTF-8 mode
Windows still uses legacy encodings for the system encoding (the ANSI Code
Page). Python uses it for the default encoding of text files (e.g.
:func:`locale.getpreferredencoding`).
:func:`locale.getencoding`).
This may cause issues because UTF-8 is widely used on the internet
and most Unix systems, including WSL (Windows Subsystem for Linux).

View File

@ -1988,7 +1988,7 @@ class TextIOWrapper(TextIOBase):
r"""Character and line based layer over a BufferedIOBase object, buffer.
encoding gives the name of the encoding that the stream will be
decoded or encoded with. It defaults to locale.getpreferredencoding(False).
decoded or encoded with. It defaults to locale.getencoding().
errors determines the strictness of encoding and decoding (see the
codecs.register) and defaults to "strict".
@ -2021,7 +2021,9 @@ class TextIOWrapper(TextIOBase):
self._check_newline(newline)
encoding = text_encoding(encoding)
if encoding == "locale":
if encoding == "locale" and sys.platform == "win32":
# In UTF-8 mode on Unix, os.device_encoding() returns "utf-8" instead of the
# locale encoding, so we use os.device_encoding() only on Windows.
try:
encoding = os.device_encoding(buffer.fileno()) or "locale"
except (AttributeError, UnsupportedOperation):
@ -2034,7 +2036,7 @@ class TextIOWrapper(TextIOBase):
# Importing locale may fail if Python is being built
encoding = "utf-8"
else:
encoding = locale.getpreferredencoding(False)
encoding = locale.getencoding()
if not isinstance(encoding, str):
raise ValueError("invalid encoding: %r" % encoding)

View File

@ -557,7 +557,7 @@ def getdefaultlocale(envvars=('LC_ALL', 'LC_CTYPE', 'LANG', 'LANGUAGE')):
import warnings
warnings.warn(
"Use setlocale(), getpreferredencoding(False) and getlocale() instead",
"Use setlocale(), getencoding() and getlocale() instead",
DeprecationWarning, stacklevel=2
)

View File

@ -2737,6 +2737,7 @@ class TextIOWrapperTest(unittest.TestCase):
os.environ.update(old_environ)
@support.cpython_only
@unittest.skipIf(sys.platform != "win32", "Windows-only test")
@unittest.skipIf(sys.flags.utf8_mode, "utf-8 mode is enabled")
def test_device_encoding(self):
# Issue 15989

View File

@ -0,0 +1,2 @@
Make :class:`TextIOWrapper` use the locale encoding when ``encoding="locale"``
is specified, even in UTF-8 mode.

View File

@ -92,9 +92,9 @@ it already exists), 'x' for creating and writing to a new file, and
'a' for appending (which on some Unix systems, means that all writes
append to the end of the file regardless of the current seek position).
In text mode, if encoding is not specified the encoding used is platform
dependent: locale.getpreferredencoding(False) is called to get the
current locale encoding. (For reading and writing raw bytes use binary
mode and leave encoding unspecified.) The available modes are:
dependent: locale.getencoding() is called to get the current locale encoding.
(For reading and writing raw bytes use binary mode and leave encoding
unspecified.) The available modes are:
========= ===============================================================
Character Meaning
@ -196,7 +196,7 @@ static PyObject *
_io_open_impl(PyObject *module, PyObject *file, const char *mode,
int buffering, const char *encoding, const char *errors,
const char *newline, int closefd, PyObject *opener)
/*[clinic end generated code: output=aefafc4ce2b46dc0 input=1543f4511d2356a5]*/
/*[clinic end generated code: output=aefafc4ce2b46dc0 input=5bb37f174cb2fb11]*/
{
unsigned i;

View File

@ -22,9 +22,9 @@ PyDoc_STRVAR(_io_open__doc__,
"\'a\' for appending (which on some Unix systems, means that all writes\n"
"append to the end of the file regardless of the current seek position).\n"
"In text mode, if encoding is not specified the encoding used is platform\n"
"dependent: locale.getpreferredencoding(False) is called to get the\n"
"current locale encoding. (For reading and writing raw bytes use binary\n"
"mode and leave encoding unspecified.) The available modes are:\n"
"dependent: locale.getencoding() is called to get the current locale encoding.\n"
"(For reading and writing raw bytes use binary mode and leave encoding\n"
"unspecified.) The available modes are:\n"
"\n"
"========= ===============================================================\n"
"Character Meaning\n"
@ -355,4 +355,4 @@ _io_open_code(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObjec
exit:
return return_value;
}
/*[clinic end generated code: output=1a7fd7755c9a9609 input=a9049054013a1b77]*/
/*[clinic end generated code: output=e562f29e3c2533a6 input=a9049054013a1b77]*/

View File

@ -146,7 +146,7 @@ PyDoc_STRVAR(_io_TextIOWrapper___init____doc__,
"Character and line based layer over a BufferedIOBase object, buffer.\n"
"\n"
"encoding gives the name of the encoding that the stream will be\n"
"decoded or encoded with. It defaults to locale.getpreferredencoding(False).\n"
"decoded or encoded with. It defaults to locale.getencoding().\n"
"\n"
"errors determines the strictness of encoding and decoding (see\n"
"help(codecs.Codec) or the documentation for codecs.register) and\n"
@ -671,4 +671,4 @@ _io_TextIOWrapper_close(textio *self, PyObject *Py_UNUSED(ignored))
{
return _io_TextIOWrapper_close_impl(self);
}
/*[clinic end generated code: output=2604c8f3a45b9a03 input=a9049054013a1b77]*/
/*[clinic end generated code: output=e88abad34e31c0cb input=a9049054013a1b77]*/

View File

@ -1023,7 +1023,7 @@ _io.TextIOWrapper.__init__
Character and line based layer over a BufferedIOBase object, buffer.
encoding gives the name of the encoding that the stream will be
decoded or encoded with. It defaults to locale.getpreferredencoding(False).
decoded or encoded with. It defaults to locale.getencoding().
errors determines the strictness of encoding and decoding (see
help(codecs.Codec) or the documentation for codecs.register) and
@ -1055,12 +1055,12 @@ _io_TextIOWrapper___init___impl(textio *self, PyObject *buffer,
const char *encoding, PyObject *errors,
const char *newline, int line_buffering,
int write_through)
/*[clinic end generated code: output=72267c0c01032ed2 input=77d8696d1a1f460b]*/
/*[clinic end generated code: output=72267c0c01032ed2 input=72590963698f289b]*/
{
PyObject *raw, *codec_info = NULL;
_PyIO_State *state = NULL;
PyObject *res;
int r;
int use_locale_encoding = 0; // Use locale encoding even in UTF-8 mode.
self->ok = 0;
self->detached = 0;
@ -1076,6 +1076,7 @@ _io_TextIOWrapper___init___impl(textio *self, PyObject *buffer,
}
else if (strcmp(encoding, "locale") == 0) {
encoding = NULL;
use_locale_encoding = 1;
}
if (errors == Py_None) {
@ -1113,10 +1114,15 @@ _io_TextIOWrapper___init___impl(textio *self, PyObject *buffer,
self->encodefunc = NULL;
self->b2cratio = 0.0;
#ifdef MS_WINDOWS
// os.device_encoding() on Unix is the locale encoding or UTF-8
// according to UTF-8 Mode.
// Since UTF-8 mode shouldn't affect `encoding="locale"`, we call
// os.device_encoding() only on Windows.
if (encoding == NULL) {
/* Try os.device_encoding(fileno) */
PyObject *fileno;
state = IO_STATE();
_PyIO_State *state = IO_STATE();
if (state == NULL)
goto error;
fileno = PyObject_CallMethodNoArgs(buffer, &_Py_ID(fileno));
@ -1144,8 +1150,10 @@ _io_TextIOWrapper___init___impl(textio *self, PyObject *buffer,
Py_CLEAR(self->encoding);
}
}
#endif
if (encoding == NULL && self->encoding == NULL) {
if (_PyRuntime.preconfig.utf8_mode) {
if (_PyRuntime.preconfig.utf8_mode && !use_locale_encoding) {
_Py_DECLARE_STR(utf_8, "utf-8");
self->encoding = Py_NewRef(&_Py_STR(utf_8));
}

View File

@ -251,7 +251,6 @@ Modules/_io/textio.c:PyId_close _Py_IDENTIFIER(
Modules/_io/textio.c:PyId_decode _Py_IDENTIFIER(decode)
Modules/_io/textio.c:PyId_fileno _Py_IDENTIFIER(fileno)
Modules/_io/textio.c:PyId_flush _Py_IDENTIFIER(flush)
Modules/_io/textio.c:PyId_getpreferredencoding _Py_IDENTIFIER(getpreferredencoding)
Modules/_io/textio.c:PyId_isatty _Py_IDENTIFIER(isatty)
Modules/_io/textio.c:PyId_mode _Py_IDENTIFIER(mode)
Modules/_io/textio.c:PyId_name _Py_IDENTIFIER(name)