bpo-29240: PEP 540: Add a new UTF-8 Mode (#855)
* Add -X utf8 command line option, PYTHONUTF8 environment variable and a new sys.flags.utf8_mode flag. * If the LC_CTYPE locale is "C" at startup: enable automatically the UTF-8 mode. * Add _winapi.GetACP(). encodings._alias_mbcs() now calls _winapi.GetACP() to get the ANSI code page * locale.getpreferredencoding() now returns 'UTF-8' in the UTF-8 mode. As a side effect, open() now uses the UTF-8 encoding by default in this mode. * Py_DecodeLocale() and Py_EncodeLocale() now use the UTF-8 encoding in the UTF-8 Mode. * Update subprocess._args_from_interpreter_flags() to handle -X utf8 * Skip some tests relying on the current locale if the UTF-8 mode is enabled. * Add test_utf8mode.py. * _Py_DecodeUTF8_surrogateescape() gets a new optional parameter to return also the length (number of wide characters). * pymain_get_global_config() and pymain_set_global_config() now always copy flag values, rather than only copying if the new value is greater than the old value.
This commit is contained in:
parent
c3e070f849
commit
91106cd9ff
|
@ -127,6 +127,9 @@ Operating System Utilities
|
|||
|
||||
.. versionadded:: 3.5
|
||||
|
||||
.. versionchanged:: 3.7
|
||||
The function now uses the UTF-8 encoding in the UTF-8 mode.
|
||||
|
||||
|
||||
.. c:function:: char* Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
|
||||
|
||||
|
@ -138,12 +141,15 @@ Operating System Utilities
|
|||
to free the memory. Return ``NULL`` on encoding error or memory allocation
|
||||
error.
|
||||
|
||||
If error_pos is not ``NULL``, ``*error_pos`` is set to the index of the
|
||||
invalid character on encoding error, or set to ``(size_t)-1`` otherwise.
|
||||
If error_pos is not ``NULL``, ``*error_pos`` is set to ``(size_t)-1`` on
|
||||
success, or set to the index of the invalid character on encoding error.
|
||||
|
||||
Use the :c:func:`Py_DecodeLocale` function to decode the bytes string back
|
||||
to a wide character string.
|
||||
|
||||
.. versionchanged:: 3.7
|
||||
The function now uses the UTF-8 encoding in the UTF-8 mode.
|
||||
|
||||
.. seealso::
|
||||
|
||||
The :c:func:`PyUnicode_EncodeFSDefault` and
|
||||
|
@ -151,6 +157,9 @@ Operating System Utilities
|
|||
|
||||
.. versionadded:: 3.5
|
||||
|
||||
.. versionchanged:: 3.7
|
||||
The function now supports the UTF-8 mode.
|
||||
|
||||
|
||||
.. _systemfunctions:
|
||||
|
||||
|
|
|
@ -316,6 +316,13 @@ The :mod:`locale` module defines the following exception and functions:
|
|||
preferences, so this function is not thread-safe. If invoking setlocale is not
|
||||
necessary or desired, *do_setlocale* should be set to ``False``.
|
||||
|
||||
On Android or in the UTF-8 mode (:option:`-X` ``utf8`` option), always
|
||||
return ``'UTF-8'``; the locale and the *do_setlocale* argument are ignored.
|
||||
|
||||
.. versionchanged:: 3.7
|
||||
The function now always returns ``'UTF-8'`` on Android or if the UTF-8 mode
|
||||
is enabled.
|
||||
|
||||
|
||||
.. function:: normalize(localename)
|
||||
|
||||
|
|
|
@ -313,6 +313,9 @@ always available.
|
|||
has caught :exc:`SystemExit` (such as an error flushing buffered data
|
||||
in the standard streams), the exit status is changed to 120.
|
||||
|
||||
.. versionchanged:: 3.7
|
||||
Added ``utf8_mode`` attribute for the new :option:`-X` ``utf8`` flag.
|
||||
|
||||
|
||||
.. data:: flags
|
||||
|
||||
|
@ -335,6 +338,7 @@ always available.
|
|||
:const:`quiet` :option:`-q`
|
||||
:const:`hash_randomization` :option:`-R`
|
||||
:const:`dev_mode` :option:`-X` ``dev``
|
||||
:const:`utf8_mode` :option:`-X` ``utf8``
|
||||
============================= =============================
|
||||
|
||||
.. versionchanged:: 3.2
|
||||
|
@ -347,7 +351,8 @@ always available.
|
|||
Removed obsolete ``division_warning`` attribute.
|
||||
|
||||
.. versionchanged:: 3.7
|
||||
Added ``dev_mode`` attribute for the new :option:`-X` ``dev`` flag.
|
||||
Added ``dev_mode`` attribute for the new :option:`-X` ``dev`` flag
|
||||
and ``utf8_mode`` attribute for the new :option:`-X` ``utf8`` flag.
|
||||
|
||||
|
||||
.. data:: float_info
|
||||
|
@ -492,6 +497,8 @@ always available.
|
|||
:func:`os.fsencode` and :func:`os.fsdecode` should be used to ensure that
|
||||
the correct encoding and errors mode are used.
|
||||
|
||||
* In the UTF-8 mode, the encoding is ``'utf-8'`` on any platform.
|
||||
|
||||
* On Mac OS X, the encoding is ``'utf-8'``.
|
||||
|
||||
* On Unix, the encoding is the locale encoding.
|
||||
|
@ -506,6 +513,10 @@ always available.
|
|||
Windows is no longer guaranteed to return ``'mbcs'``. See :pep:`529`
|
||||
and :func:`_enablelegacywindowsfsencoding` for more information.
|
||||
|
||||
.. versionchanged:: 3.7
|
||||
Return ``'utf-8'`` in the UTF-8 mode.
|
||||
|
||||
|
||||
.. function:: getfilesystemencodeerrors()
|
||||
|
||||
Return the name of the error mode used to convert between Unicode filenames
|
||||
|
|
|
@ -439,6 +439,9 @@ Miscellaneous options
|
|||
* Set the :attr:`~sys.flags.dev_mode` attribute of :attr:`sys.flags` to
|
||||
``True``
|
||||
|
||||
* ``-X utf8`` enables the UTF-8 mode, whereas ``-X utf8=0`` disables the
|
||||
UTF-8 mode.
|
||||
|
||||
It also allows passing arbitrary values and retrieving them through the
|
||||
:data:`sys._xoptions` dictionary.
|
||||
|
||||
|
@ -455,7 +458,7 @@ Miscellaneous options
|
|||
The ``-X showalloccount`` option.
|
||||
|
||||
.. versionadded:: 3.7
|
||||
The ``-X importtime`` and ``-X dev`` options.
|
||||
The ``-X importtime``, ``-X dev`` and ``-X utf8`` options.
|
||||
|
||||
|
||||
Options you shouldn't use
|
||||
|
@ -816,6 +819,14 @@ conflict.
|
|||
|
||||
.. versionadded:: 3.7
|
||||
|
||||
.. envvar:: PYTHONUTF8
|
||||
|
||||
If set to ``1``, enable the UTF-8 mode. If set to ``0``, disable the UTF-8
|
||||
mode. Any other non-empty string causes an error.
|
||||
|
||||
.. versionadded:: 3.7
|
||||
|
||||
|
||||
Debug-mode variables
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
|
|
|
@ -185,6 +185,23 @@ resolution on Linux and Windows.
|
|||
PEP written and implemented by Victor Stinner
|
||||
|
||||
|
||||
PEP 540: Add a new UTF-8 mode
|
||||
-----------------------------
|
||||
|
||||
Add a new UTF-8 mode to ignore the locale, use the UTF-8 encoding, and change
|
||||
:data:`sys.stdin` and :data:`sys.stdout` error handlers to ``surrogateescape``.
|
||||
This mode is enabled by default in the POSIX locale, but otherwise disabled by
|
||||
default.
|
||||
|
||||
The new :option:`-X` ``utf8`` command line option and :envvar:`PYTHONUTF8`
|
||||
environment variable are added to control the UTF-8 mode.
|
||||
|
||||
.. seealso::
|
||||
|
||||
:pep:`540` -- Add a new UTF-8 mode
|
||||
PEP written and implemented by Victor Stinner
|
||||
|
||||
|
||||
New Development Mode: -X dev
|
||||
----------------------------
|
||||
|
||||
|
@ -353,6 +370,10 @@ Added another argument *monetary* in :meth:`format_string` of :mod:`locale`.
|
|||
If *monetary* is true, the conversion uses monetary thousands separator and
|
||||
grouping strings. (Contributed by Garvit in :issue:`10379`.)
|
||||
|
||||
The :func:`locale.getpreferredencoding` function now always returns ``'UTF-8'``
|
||||
on Android or in the UTF-8 mode (:option:`-X` ``utf8`` option); the locale and
|
||||
the *do_setlocale* argument are ignored.
|
||||
|
||||
math
|
||||
----
|
||||
|
||||
|
|
|
@ -28,6 +28,10 @@ PyAPI_DATA(const char *) Py_FileSystemDefaultEncodeErrors;
|
|||
#endif
|
||||
PyAPI_DATA(int) Py_HasFileSystemDefaultEncoding;
|
||||
|
||||
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03070000
|
||||
PyAPI_DATA(int) Py_UTF8Mode;
|
||||
#endif
|
||||
|
||||
/* Internal API
|
||||
|
||||
The std printer acts as a preliminary sys.stderr until the new io
|
||||
|
|
|
@ -38,6 +38,7 @@ typedef struct {
|
|||
int show_alloc_count; /* -X showalloccount */
|
||||
int dump_refs; /* PYTHONDUMPREFS */
|
||||
int malloc_stats; /* PYTHONMALLOCSTATS */
|
||||
int utf8_mode; /* -X utf8 or PYTHONUTF8 environment variable */
|
||||
} _PyCoreConfig;
|
||||
|
||||
#define _PyCoreConfig_INIT (_PyCoreConfig){.use_hash_seed = -1}
|
||||
|
|
|
@ -9,6 +9,8 @@ import _locale
|
|||
|
||||
if sys.platform.startswith("win"):
|
||||
def getpreferredencoding(do_setlocale=True):
|
||||
if sys.flags.utf8_mode:
|
||||
return 'UTF-8'
|
||||
return _locale._getdefaultlocale()[1]
|
||||
else:
|
||||
try:
|
||||
|
@ -21,6 +23,8 @@ else:
|
|||
return 'UTF-8'
|
||||
else:
|
||||
def getpreferredencoding(do_setlocale=True):
|
||||
if sys.flags.utf8_mode:
|
||||
return 'UTF-8'
|
||||
# This path for legacy systems needs the more complex
|
||||
# getdefaultlocale() function, import the full locale module.
|
||||
import locale
|
||||
|
@ -28,6 +32,8 @@ else:
|
|||
else:
|
||||
def getpreferredencoding(do_setlocale=True):
|
||||
assert not do_setlocale
|
||||
if sys.flags.utf8_mode:
|
||||
return 'UTF-8'
|
||||
result = _locale.nl_langinfo(_locale.CODESET)
|
||||
if not result and sys.platform == 'darwin':
|
||||
# nl_langinfo can return an empty string
|
||||
|
|
|
@ -158,8 +158,9 @@ codecs.register(search_function)
|
|||
if sys.platform == 'win32':
|
||||
def _alias_mbcs(encoding):
|
||||
try:
|
||||
import _bootlocale
|
||||
if encoding == _bootlocale.getpreferredencoding(False):
|
||||
import _winapi
|
||||
ansi_code_page = "cp%s" % _winapi.GetACP()
|
||||
if encoding == ansi_code_page:
|
||||
import encodings.mbcs
|
||||
return encodings.mbcs.getregentry()
|
||||
except ImportError:
|
||||
|
|
|
@ -617,6 +617,8 @@ if sys.platform.startswith("win"):
|
|||
# On Win32, this will return the ANSI code page
|
||||
def getpreferredencoding(do_setlocale = True):
|
||||
"""Return the charset that the user is likely using."""
|
||||
if sys.flags.utf8_mode:
|
||||
return 'UTF-8'
|
||||
import _bootlocale
|
||||
return _bootlocale.getpreferredencoding(False)
|
||||
else:
|
||||
|
@ -634,6 +636,8 @@ else:
|
|||
def getpreferredencoding(do_setlocale = True):
|
||||
"""Return the charset that the user is likely using,
|
||||
by looking at environment variables."""
|
||||
if sys.flags.utf8_mode:
|
||||
return 'UTF-8'
|
||||
res = getdefaultlocale()[1]
|
||||
if res is None:
|
||||
# LANG not set, default conservatively to ASCII
|
||||
|
@ -643,6 +647,8 @@ else:
|
|||
def getpreferredencoding(do_setlocale = True):
|
||||
"""Return the charset that the user is likely using,
|
||||
according to the system configuration."""
|
||||
if sys.flags.utf8_mode:
|
||||
return 'UTF-8'
|
||||
import _bootlocale
|
||||
if do_setlocale:
|
||||
oldloc = setlocale(LC_CTYPE)
|
||||
|
|
|
@ -280,7 +280,7 @@ def _args_from_interpreter_flags():
|
|||
if dev_mode:
|
||||
args.extend(('-X', 'dev'))
|
||||
for opt in ('faulthandler', 'tracemalloc', 'importtime',
|
||||
'showalloccount', 'showrefcount'):
|
||||
'showalloccount', 'showrefcount', 'utf8'):
|
||||
if opt in xoptions:
|
||||
value = xoptions[opt]
|
||||
if value is True:
|
||||
|
|
|
@ -1022,6 +1022,7 @@ class BuiltinTest(unittest.TestCase):
|
|||
self.assertRaises(ValueError, open, 'a\x00b')
|
||||
self.assertRaises(ValueError, open, b'a\x00b')
|
||||
|
||||
@unittest.skipIf(sys.flags.utf8_mode, "utf-8 mode is enabled")
|
||||
def test_open_default_encoding(self):
|
||||
old_environ = dict(os.environ)
|
||||
try:
|
||||
|
|
|
@ -130,7 +130,7 @@ class EncodingDetails(_EncodingDetails):
|
|||
that.
|
||||
"""
|
||||
result, py_cmd = run_python_until_end(
|
||||
"-c", cls.CHILD_PROCESS_SCRIPT,
|
||||
"-X", "utf8=0", "-c", cls.CHILD_PROCESS_SCRIPT,
|
||||
__isolated=True,
|
||||
**env_vars
|
||||
)
|
||||
|
|
|
@ -5,6 +5,7 @@ import locale
|
|||
import sys
|
||||
import unittest
|
||||
import encodings
|
||||
from unittest import mock
|
||||
|
||||
from test import support
|
||||
|
||||
|
@ -3180,16 +3181,9 @@ class CodePageTest(unittest.TestCase):
|
|||
def test_mbcs_alias(self):
|
||||
# Check that looking up our 'default' codepage will return
|
||||
# mbcs when we don't have a more specific one available
|
||||
import _bootlocale
|
||||
def _get_fake_codepage(*a):
|
||||
return 'cp123'
|
||||
old_getpreferredencoding = _bootlocale.getpreferredencoding
|
||||
_bootlocale.getpreferredencoding = _get_fake_codepage
|
||||
try:
|
||||
with mock.patch('_winapi.GetACP', return_value=123):
|
||||
codec = codecs.lookup('cp123')
|
||||
self.assertEqual(codec.name, 'mbcs')
|
||||
finally:
|
||||
_bootlocale.getpreferredencoding = old_getpreferredencoding
|
||||
|
||||
|
||||
class ASCIITest(unittest.TestCase):
|
||||
|
|
|
@ -2580,6 +2580,7 @@ class TextIOWrapperTest(unittest.TestCase):
|
|||
t.reconfigure(line_buffering=None)
|
||||
self.assertEqual(t.line_buffering, True)
|
||||
|
||||
@unittest.skipIf(sys.flags.utf8_mode, "utf-8 mode is enabled")
|
||||
def test_default_encoding(self):
|
||||
old_environ = dict(os.environ)
|
||||
try:
|
||||
|
@ -2599,6 +2600,7 @@ class TextIOWrapperTest(unittest.TestCase):
|
|||
os.environ.update(old_environ)
|
||||
|
||||
@support.cpython_only
|
||||
@unittest.skipIf(sys.flags.utf8_mode, "utf-8 mode is enabled")
|
||||
def test_device_encoding(self):
|
||||
# Issue 15989
|
||||
import _testcapi
|
||||
|
|
|
@ -527,7 +527,7 @@ class SysModuleTest(unittest.TestCase):
|
|||
"inspect", "interactive", "optimize", "dont_write_bytecode",
|
||||
"no_user_site", "no_site", "ignore_environment", "verbose",
|
||||
"bytes_warning", "quiet", "hash_randomization", "isolated",
|
||||
"dev_mode")
|
||||
"dev_mode", "utf8_mode")
|
||||
for attr in attrs:
|
||||
self.assertTrue(hasattr(sys.flags, attr), attr)
|
||||
attr_type = bool if attr == "dev_mode" else int
|
||||
|
@ -535,6 +535,8 @@ class SysModuleTest(unittest.TestCase):
|
|||
self.assertTrue(repr(sys.flags))
|
||||
self.assertEqual(len(sys.flags), len(attrs))
|
||||
|
||||
self.assertIn(sys.flags.utf8_mode, {0, 1, 2})
|
||||
|
||||
def assert_raise_on_new_sys_type(self, sys_attr):
|
||||
# Users are intentionally prevented from creating new instances of
|
||||
# sys.flags, sys.version_info, and sys.getwindowsversion.
|
||||
|
@ -710,8 +712,8 @@ class SysModuleTest(unittest.TestCase):
|
|||
# have no effect
|
||||
out = self.c_locale_get_error_handler(encoding=':')
|
||||
self.assertEqual(out,
|
||||
'stdin: surrogateescape\n'
|
||||
'stdout: surrogateescape\n'
|
||||
'stdin: strict\n'
|
||||
'stdout: strict\n'
|
||||
'stderr: backslashreplace\n')
|
||||
out = self.c_locale_get_error_handler(encoding='')
|
||||
self.assertEqual(out,
|
||||
|
|
|
@ -0,0 +1,206 @@
|
|||
"""
|
||||
Test the implementation of PEP 540: the UTF-8 Mode.
|
||||
"""
|
||||
|
||||
import locale
|
||||
import os
|
||||
import sys
|
||||
import textwrap
|
||||
import unittest
|
||||
from test.support.script_helper import assert_python_ok, assert_python_failure
|
||||
|
||||
|
||||
MS_WINDOWS = (sys.platform == 'win32')
|
||||
|
||||
|
||||
class UTF8ModeTests(unittest.TestCase):
|
||||
# Override PYTHONUTF8 and PYTHONLEGACYWINDOWSFSENCODING environment
|
||||
# variables by default
|
||||
DEFAULT_ENV = {'PYTHONUTF8': '', 'PYTHONLEGACYWINDOWSFSENCODING': ''}
|
||||
|
||||
def posix_locale(self):
|
||||
loc = locale.setlocale(locale.LC_CTYPE, None)
|
||||
return (loc == 'C')
|
||||
|
||||
def get_output(self, *args, failure=False, **kw):
|
||||
kw = dict(self.DEFAULT_ENV, **kw)
|
||||
if failure:
|
||||
out = assert_python_failure(*args, **kw)
|
||||
out = out[2]
|
||||
else:
|
||||
out = assert_python_ok(*args, **kw)
|
||||
out = out[1]
|
||||
return out.decode().rstrip("\n\r")
|
||||
|
||||
@unittest.skipIf(MS_WINDOWS, 'Windows has no POSIX locale')
|
||||
def test_posix_locale(self):
|
||||
code = 'import sys; print(sys.flags.utf8_mode)'
|
||||
|
||||
out = self.get_output('-c', code, LC_ALL='C')
|
||||
self.assertEqual(out, '1')
|
||||
|
||||
def test_xoption(self):
|
||||
code = 'import sys; print(sys.flags.utf8_mode)'
|
||||
|
||||
out = self.get_output('-X', 'utf8', '-c', code)
|
||||
self.assertEqual(out, '1')
|
||||
|
||||
# undocumented but accepted syntax: -X utf8=1
|
||||
out = self.get_output('-X', 'utf8=1', '-c', code)
|
||||
self.assertEqual(out, '1')
|
||||
|
||||
out = self.get_output('-X', 'utf8=0', '-c', code)
|
||||
self.assertEqual(out, '0')
|
||||
|
||||
if MS_WINDOWS:
|
||||
# PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8
|
||||
# and has the priority over -X utf8
|
||||
out = self.get_output('-X', 'utf8', '-c', code,
|
||||
PYTHONLEGACYWINDOWSFSENCODING='1')
|
||||
self.assertEqual(out, '0')
|
||||
|
||||
def test_env_var(self):
|
||||
code = 'import sys; print(sys.flags.utf8_mode)'
|
||||
|
||||
out = self.get_output('-c', code, PYTHONUTF8='1')
|
||||
self.assertEqual(out, '1')
|
||||
|
||||
out = self.get_output('-c', code, PYTHONUTF8='0')
|
||||
self.assertEqual(out, '0')
|
||||
|
||||
# -X utf8 has the priority over PYTHONUTF8
|
||||
out = self.get_output('-X', 'utf8=0', '-c', code, PYTHONUTF8='1')
|
||||
self.assertEqual(out, '0')
|
||||
|
||||
if MS_WINDOWS:
|
||||
# PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode
|
||||
# and has the priority over PYTHONUTF8
|
||||
out = self.get_output('-X', 'utf8', '-c', code, PYTHONUTF8='1',
|
||||
PYTHONLEGACYWINDOWSFSENCODING='1')
|
||||
self.assertEqual(out, '0')
|
||||
|
||||
# Cannot test with the POSIX locale, since the POSIX locale enables
|
||||
# the UTF-8 mode
|
||||
if not self.posix_locale():
|
||||
# PYTHONUTF8 should be ignored if -E is used
|
||||
out = self.get_output('-E', '-c', code, PYTHONUTF8='1')
|
||||
self.assertEqual(out, '0')
|
||||
|
||||
# invalid mode
|
||||
out = self.get_output('-c', code, PYTHONUTF8='xxx', failure=True)
|
||||
self.assertIn('invalid PYTHONUTF8 environment variable value',
|
||||
out.rstrip())
|
||||
|
||||
def test_filesystemencoding(self):
|
||||
code = textwrap.dedent('''
|
||||
import sys
|
||||
print("{}/{}".format(sys.getfilesystemencoding(),
|
||||
sys.getfilesystemencodeerrors()))
|
||||
''')
|
||||
|
||||
if MS_WINDOWS:
|
||||
expected = 'utf-8/surrogatepass'
|
||||
else:
|
||||
expected = 'utf-8/surrogateescape'
|
||||
|
||||
out = self.get_output('-X', 'utf8', '-c', code)
|
||||
self.assertEqual(out, expected)
|
||||
|
||||
if MS_WINDOWS:
|
||||
# PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode
|
||||
# and has the priority over -X utf8 and PYTHONUTF8
|
||||
out = self.get_output('-X', 'utf8', '-c', code,
|
||||
PYTHONUTF8='strict',
|
||||
PYTHONLEGACYWINDOWSFSENCODING='1')
|
||||
self.assertEqual(out, 'mbcs/replace')
|
||||
|
||||
def test_stdio(self):
|
||||
code = textwrap.dedent('''
|
||||
import sys
|
||||
print(f"stdin: {sys.stdin.encoding}/{sys.stdin.errors}")
|
||||
print(f"stdout: {sys.stdout.encoding}/{sys.stdout.errors}")
|
||||
print(f"stderr: {sys.stderr.encoding}/{sys.stderr.errors}")
|
||||
''')
|
||||
|
||||
out = self.get_output('-X', 'utf8', '-c', code,
|
||||
PYTHONIOENCODING='')
|
||||
self.assertEqual(out.splitlines(),
|
||||
['stdin: utf-8/surrogateescape',
|
||||
'stdout: utf-8/surrogateescape',
|
||||
'stderr: utf-8/backslashreplace'])
|
||||
|
||||
# PYTHONIOENCODING has the priority over PYTHONUTF8
|
||||
out = self.get_output('-X', 'utf8', '-c', code,
|
||||
PYTHONIOENCODING="latin1")
|
||||
self.assertEqual(out.splitlines(),
|
||||
['stdin: latin1/strict',
|
||||
'stdout: latin1/strict',
|
||||
'stderr: latin1/backslashreplace'])
|
||||
|
||||
out = self.get_output('-X', 'utf8', '-c', code,
|
||||
PYTHONIOENCODING=":namereplace")
|
||||
self.assertEqual(out.splitlines(),
|
||||
['stdin: UTF-8/namereplace',
|
||||
'stdout: UTF-8/namereplace',
|
||||
'stderr: UTF-8/backslashreplace'])
|
||||
|
||||
def test_io(self):
|
||||
code = textwrap.dedent('''
|
||||
import sys
|
||||
filename = sys.argv[1]
|
||||
with open(filename) as fp:
|
||||
print(f"{fp.encoding}/{fp.errors}")
|
||||
''')
|
||||
filename = __file__
|
||||
|
||||
out = self.get_output('-c', code, filename, PYTHONUTF8='1')
|
||||
self.assertEqual(out, 'UTF-8/strict')
|
||||
|
||||
def _check_io_encoding(self, module, encoding=None, errors=None):
|
||||
filename = __file__
|
||||
|
||||
# Encoding explicitly set
|
||||
args = []
|
||||
if encoding:
|
||||
args.append(f'encoding={encoding!r}')
|
||||
if errors:
|
||||
args.append(f'errors={errors!r}')
|
||||
code = textwrap.dedent('''
|
||||
import sys
|
||||
from %s import open
|
||||
filename = sys.argv[1]
|
||||
with open(filename, %s) as fp:
|
||||
print(f"{fp.encoding}/{fp.errors}")
|
||||
''') % (module, ', '.join(args))
|
||||
out = self.get_output('-c', code, filename,
|
||||
PYTHONUTF8='1')
|
||||
|
||||
if not encoding:
|
||||
encoding = 'UTF-8'
|
||||
if not errors:
|
||||
errors = 'strict'
|
||||
self.assertEqual(out, f'{encoding}/{errors}')
|
||||
|
||||
def check_io_encoding(self, module):
|
||||
self._check_io_encoding(module, encoding="latin1")
|
||||
self._check_io_encoding(module, errors="namereplace")
|
||||
self._check_io_encoding(module,
|
||||
encoding="latin1", errors="namereplace")
|
||||
|
||||
def test_io_encoding(self):
|
||||
self.check_io_encoding('io')
|
||||
|
||||
def test_io_encoding(self):
|
||||
self.check_io_encoding('_pyio')
|
||||
|
||||
def test_locale_getpreferredencoding(self):
|
||||
code = 'import locale; print(locale.getpreferredencoding(False), locale.getpreferredencoding(True))'
|
||||
out = self.get_output('-X', 'utf8', '-c', code)
|
||||
self.assertEqual(out, 'UTF-8 UTF-8')
|
||||
|
||||
out = self.get_output('-X', 'utf8', '-c', code, LC_ALL='C')
|
||||
self.assertEqual(out, 'UTF-8 UTF-8')
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
|
@ -0,0 +1 @@
|
|||
Add a new UTF-8 mode: implementation of :pep:`540`.
|
|
@ -1490,6 +1490,20 @@ _winapi_WriteFile_impl(PyObject *module, HANDLE handle, PyObject *buffer,
|
|||
}
|
||||
|
||||
|
||||
/*[clinic input]
|
||||
_winapi.GetACP
|
||||
|
||||
Get the current Windows ANSI code page identifier.
|
||||
[clinic start generated code]*/
|
||||
|
||||
static PyObject *
_winapi_GetACP_impl(PyObject *module)
/*[clinic end generated code: output=f7ee24bf705dbb88 input=1433c96d03a05229]*/
{
    /* Fetch the identifier from GetACP() and hand it back to Python as an
       int object built by PyLong_FromUnsignedLong(). */
    unsigned long code_page = GetACP();

    return PyLong_FromUnsignedLong(code_page);
}
|
||||
|
||||
|
||||
static PyMethodDef winapi_functions[] = {
|
||||
_WINAPI_CLOSEHANDLE_METHODDEF
|
||||
_WINAPI_CONNECTNAMEDPIPE_METHODDEF
|
||||
|
@ -1515,6 +1529,7 @@ static PyMethodDef winapi_functions[] = {
|
|||
_WINAPI_WAITFORMULTIPLEOBJECTS_METHODDEF
|
||||
_WINAPI_WAITFORSINGLEOBJECT_METHODDEF
|
||||
_WINAPI_WRITEFILE_METHODDEF
|
||||
_WINAPI_GETACP_METHODDEF
|
||||
{NULL, NULL}
|
||||
};
|
||||
|
||||
|
|
|
@ -889,4 +889,22 @@ _winapi_WriteFile(PyObject *module, PyObject **args, Py_ssize_t nargs, PyObject
|
|||
exit:
|
||||
return return_value;
|
||||
}
|
||||
/*[clinic end generated code: output=fba2ad7bf1a87e4a input=a9049054013a1b77]*/
|
||||
|
||||
PyDoc_STRVAR(_winapi_GetACP__doc__,
|
||||
"GetACP($module, /)\n"
|
||||
"--\n"
|
||||
"\n"
|
||||
"Get the current Windows ANSI code page identifier.");
|
||||
|
||||
#define _WINAPI_GETACP_METHODDEF \
|
||||
{"GetACP", (PyCFunction)_winapi_GetACP, METH_NOARGS, _winapi_GetACP__doc__},
|
||||
|
||||
static PyObject *
|
||||
_winapi_GetACP_impl(PyObject *module);
|
||||
|
||||
static PyObject *
|
||||
_winapi_GetACP(PyObject *module, PyObject *Py_UNUSED(ignored))
|
||||
{
|
||||
return _winapi_GetACP_impl(module);
|
||||
}
|
||||
/*[clinic end generated code: output=fd91c1ec286f0bf3 input=a9049054013a1b77]*/
|
||||
|
|
150
Modules/main.c
150
Modules/main.c
|
@ -1114,50 +1114,32 @@ pymain_set_argv(_PyMain *pymain)
|
|||
}
|
||||
|
||||
|
||||
static void
|
||||
pymain_get_flag(int flag, int *value)
|
||||
{
|
||||
if (flag) {
|
||||
*value = flag;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
pymain_set_flag(int *flag, int value)
|
||||
{
|
||||
/* Helper to set flag variables from command line options
|
||||
* - uses the higher of the two values if they're both set
|
||||
* - otherwise leaves the flag unset
|
||||
*/
|
||||
if (*flag < value) {
|
||||
*flag = value;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Get Py_xxx global configuration variables */
|
||||
static void
|
||||
pymain_get_global_config(_PyMain *pymain)
|
||||
{
|
||||
_Py_CommandLineDetails *cmdline = &pymain->cmdline;
|
||||
pymain_get_flag(Py_BytesWarningFlag, &cmdline->bytes_warning);
|
||||
pymain_get_flag(Py_DebugFlag, &cmdline->debug);
|
||||
pymain_get_flag(Py_InspectFlag, &cmdline->inspect);
|
||||
pymain_get_flag(Py_InteractiveFlag, &cmdline->interactive);
|
||||
pymain_get_flag(Py_IsolatedFlag, &cmdline->isolated);
|
||||
pymain_get_flag(Py_OptimizeFlag, &cmdline->optimization_level);
|
||||
pymain_get_flag(Py_DontWriteBytecodeFlag, &cmdline->dont_write_bytecode);
|
||||
pymain_get_flag(Py_NoUserSiteDirectory, &cmdline->no_user_site_directory);
|
||||
pymain_get_flag(Py_NoSiteFlag, &cmdline->no_site_import);
|
||||
pymain_get_flag(Py_UnbufferedStdioFlag, &cmdline->use_unbuffered_io);
|
||||
pymain_get_flag(Py_VerboseFlag, &cmdline->verbosity);
|
||||
pymain_get_flag(Py_QuietFlag, &cmdline->quiet_flag);
|
||||
#ifdef MS_WINDOWS
|
||||
pymain_get_flag(Py_LegacyWindowsFSEncodingFlag, &cmdline->legacy_windows_fs_encoding);
|
||||
pymain_get_flag(Py_LegacyWindowsStdioFlag, &cmdline->legacy_windows_stdio);
|
||||
#endif
|
||||
|
||||
pymain_get_flag(Py_IgnoreEnvironmentFlag, &pymain->core_config.ignore_environment);
|
||||
cmdline->bytes_warning = Py_BytesWarningFlag;
|
||||
cmdline->debug = Py_DebugFlag;
|
||||
cmdline->inspect = Py_InspectFlag;
|
||||
cmdline->interactive = Py_InteractiveFlag;
|
||||
cmdline->isolated = Py_IsolatedFlag;
|
||||
cmdline->optimization_level = Py_OptimizeFlag;
|
||||
cmdline->dont_write_bytecode = Py_DontWriteBytecodeFlag;
|
||||
cmdline->no_user_site_directory = Py_NoUserSiteDirectory;
|
||||
cmdline->no_site_import = Py_NoSiteFlag;
|
||||
cmdline->use_unbuffered_io = Py_UnbufferedStdioFlag;
|
||||
cmdline->verbosity = Py_VerboseFlag;
|
||||
cmdline->quiet_flag = Py_QuietFlag;
|
||||
#ifdef MS_WINDOWS
|
||||
cmdline->legacy_windows_fs_encoding = Py_LegacyWindowsFSEncodingFlag;
|
||||
cmdline->legacy_windows_stdio = Py_LegacyWindowsStdioFlag;
|
||||
#endif
|
||||
cmdline->check_hash_pycs_mode = _Py_CheckHashBasedPycsMode ;
|
||||
|
||||
pymain->core_config.ignore_environment = Py_IgnoreEnvironmentFlag;
|
||||
pymain->core_config.utf8_mode = Py_UTF8Mode;
|
||||
}
|
||||
|
||||
|
||||
|
@ -1166,26 +1148,27 @@ static void
|
|||
pymain_set_global_config(_PyMain *pymain)
|
||||
{
|
||||
_Py_CommandLineDetails *cmdline = &pymain->cmdline;
|
||||
pymain_set_flag(&Py_BytesWarningFlag, cmdline->bytes_warning);
|
||||
pymain_set_flag(&Py_DebugFlag, cmdline->debug);
|
||||
pymain_set_flag(&Py_InspectFlag, cmdline->inspect);
|
||||
pymain_set_flag(&Py_InteractiveFlag, cmdline->interactive);
|
||||
pymain_set_flag(&Py_IsolatedFlag, cmdline->isolated);
|
||||
pymain_set_flag(&Py_OptimizeFlag, cmdline->optimization_level);
|
||||
pymain_set_flag(&Py_DontWriteBytecodeFlag, cmdline->dont_write_bytecode);
|
||||
pymain_set_flag(&Py_NoUserSiteDirectory, cmdline->no_user_site_directory);
|
||||
pymain_set_flag(&Py_NoSiteFlag, cmdline->no_site_import);
|
||||
pymain_set_flag(&Py_UnbufferedStdioFlag, cmdline->use_unbuffered_io);
|
||||
pymain_set_flag(&Py_VerboseFlag, cmdline->verbosity);
|
||||
pymain_set_flag(&Py_QuietFlag, cmdline->quiet_flag);
|
||||
if (cmdline->check_hash_pycs_mode)
|
||||
_Py_CheckHashBasedPycsMode = cmdline->check_hash_pycs_mode;
|
||||
|
||||
Py_BytesWarningFlag = cmdline->bytes_warning;
|
||||
Py_DebugFlag = cmdline->debug;
|
||||
Py_InspectFlag = cmdline->inspect;
|
||||
Py_InteractiveFlag = cmdline->interactive;
|
||||
Py_IsolatedFlag = cmdline->isolated;
|
||||
Py_OptimizeFlag = cmdline->optimization_level;
|
||||
Py_DontWriteBytecodeFlag = cmdline->dont_write_bytecode;
|
||||
Py_NoUserSiteDirectory = cmdline->no_user_site_directory;
|
||||
Py_NoSiteFlag = cmdline->no_site_import;
|
||||
Py_UnbufferedStdioFlag = cmdline->use_unbuffered_io;
|
||||
Py_VerboseFlag = cmdline->verbosity;
|
||||
Py_QuietFlag = cmdline->quiet_flag;
|
||||
_Py_CheckHashBasedPycsMode = cmdline->check_hash_pycs_mode;
|
||||
#ifdef MS_WINDOWS
|
||||
pymain_set_flag(&Py_LegacyWindowsFSEncodingFlag, cmdline->legacy_windows_fs_encoding);
|
||||
pymain_set_flag(&Py_LegacyWindowsStdioFlag, cmdline->legacy_windows_stdio);
|
||||
Py_LegacyWindowsFSEncodingFlag = cmdline->legacy_windows_fs_encoding;
|
||||
Py_LegacyWindowsStdioFlag = cmdline->legacy_windows_stdio;
|
||||
#endif
|
||||
|
||||
pymain_set_flag(&Py_IgnoreEnvironmentFlag, pymain->core_config.ignore_environment);
|
||||
Py_IgnoreEnvironmentFlag = pymain->core_config.ignore_environment;
|
||||
Py_UTF8Mode = pymain->core_config.utf8_mode;
|
||||
}
|
||||
|
||||
|
||||
|
@ -1609,6 +1592,57 @@ _PyMainInterpreterConfig_ReadEnv(_PyMainInterpreterConfig *config)
|
|||
}
|
||||
|
||||
|
||||
static int
|
||||
pymain_init_utf8_mode(_PyMain *pymain)
|
||||
{
|
||||
_PyCoreConfig *core_config = &pymain->core_config;
|
||||
|
||||
#ifdef MS_WINDOWS
|
||||
if (pymain->cmdline.legacy_windows_fs_encoding) {
|
||||
core_config->utf8_mode = 0;
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
wchar_t *xopt = pymain_get_xoption(pymain, L"utf8");
|
||||
if (xopt) {
|
||||
wchar_t *sep = wcschr(xopt, L'=');
|
||||
if (sep) {
|
||||
xopt = sep + 1;
|
||||
if (wcscmp(xopt, L"1") == 0) {
|
||||
core_config->utf8_mode = 1;
|
||||
}
|
||||
else if (wcscmp(xopt, L"0") == 0) {
|
||||
core_config->utf8_mode = 0;
|
||||
}
|
||||
else {
|
||||
pymain->err = _Py_INIT_USER_ERR("invalid -X utf8 option value");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
else {
|
||||
core_config->utf8_mode = 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
char *opt = pymain_get_env_var("PYTHONUTF8");
|
||||
if (opt) {
|
||||
if (strcmp(opt, "1") == 0) {
|
||||
core_config->utf8_mode = 1;
|
||||
}
|
||||
else if (strcmp(opt, "0") == 0) {
|
||||
core_config->utf8_mode = 0;
|
||||
}
|
||||
else {
|
||||
pymain->err = _Py_INIT_USER_ERR("invalid PYTHONUTF8 environment "
|
||||
"variable value");
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
|
@ -1674,6 +1708,9 @@ pymain_parse_envvars(_PyMain *pymain)
|
|||
pymain->core_config.malloc_stats = 1;
|
||||
}
|
||||
|
||||
if (pymain_init_utf8_mode(pymain) < 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -1702,6 +1739,7 @@ pymain_parse_cmdline_envvars_impl(_PyMain *pymain)
|
|||
if (pymain_parse_envvars(pymain) < 0) {
|
||||
return -1;
|
||||
}
|
||||
/* FIXME: if the utf8_mode value changed, parse the command line again */
|
||||
|
||||
_PyInitError err = _PyMainInterpreterConfig_Read(&pymain->config);
|
||||
if (_Py_INIT_FAILED(err)) {
|
||||
|
@ -1730,6 +1768,7 @@ pymain_parse_cmdline_envvars(_PyMain *pymain)
|
|||
static int
|
||||
pymain_init_python(_PyMain *pymain)
|
||||
{
|
||||
|
||||
pymain_set_global_config(pymain);
|
||||
|
||||
pymain_init_stdio(pymain);
|
||||
|
@ -1788,6 +1827,7 @@ pymain_init(_PyMain *pymain)
|
|||
return -1;
|
||||
}
|
||||
|
||||
pymain->core_config.utf8_mode = Py_UTF8Mode;
|
||||
pymain->core_config._disable_importlib = 0;
|
||||
pymain->config.install_signal_handlers = 1;
|
||||
|
||||
|
|
|
@ -5079,16 +5079,17 @@ onError:
|
|||
return NULL;
|
||||
}
|
||||
|
||||
#if defined(__APPLE__) || defined(__ANDROID__)
|
||||
|
||||
/* Simplified UTF-8 decoder using surrogateescape error handler,
|
||||
used to decode the command line arguments on Mac OS X and Android.
|
||||
/* UTF-8 decoder using the surrogateescape error handler.
|
||||
|
||||
Return a pointer to a newly allocated wide character string (use
|
||||
PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
|
||||
On success, return a pointer to a newly allocated wide character string (use
|
||||
PyMem_RawFree() to free the memory) and write the output length (in number
|
||||
of wchar_t units) into *p_wlen (if p_wlen is set).
|
||||
|
||||
On memory allocation failure, return NULL and write (size_t)-1 into *p_wlen
|
||||
(if p_wlen is set). */
|
||||
wchar_t*
|
||||
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
|
||||
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
|
||||
{
|
||||
const char *e;
|
||||
wchar_t *unicode;
|
||||
|
@ -5096,11 +5097,20 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
|
|||
|
||||
/* Note: size will always be longer than the resulting Unicode
|
||||
character count */
|
||||
if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
|
||||
if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
|
||||
if (p_wlen) {
|
||||
*p_wlen = (size_t)-1;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
|
||||
if (!unicode)
|
||||
if (!unicode) {
|
||||
if (p_wlen) {
|
||||
*p_wlen = (size_t)-1;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Unpack UTF-8 encoded data */
|
||||
e = s + size;
|
||||
|
@ -5130,10 +5140,12 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
|
|||
}
|
||||
}
|
||||
unicode[outpos] = L'\0';
|
||||
if (p_wlen) {
|
||||
*p_wlen = outpos;
|
||||
}
|
||||
return unicode;
|
||||
}
|
||||
|
||||
#endif /* __APPLE__ or __ANDROID__ */
|
||||
|
||||
/* Primary internal function which creates utf8 encoded bytes objects.
|
||||
|
||||
|
|
|
@ -17,6 +17,15 @@ wmain(int argc, wchar_t **argv)
|
|||
#else
|
||||
|
||||
|
||||
static void _Py_NO_RETURN
|
||||
fatal_error(const char *msg)
|
||||
{
|
||||
fprintf(stderr, "Fatal Python error: %s\n", msg);
|
||||
fflush(stderr);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
main(int argc, char **argv)
|
||||
{
|
||||
|
@ -28,9 +37,7 @@ main(int argc, char **argv)
|
|||
|
||||
_PyInitError err = _PyRuntime_Initialize();
|
||||
if (_Py_INIT_FAILED(err)) {
|
||||
fprintf(stderr, "Fatal Python error: %s\n", err.msg);
|
||||
fflush(stderr);
|
||||
exit(1);
|
||||
fatal_error(err.msg);
|
||||
}
|
||||
|
||||
/* Force default allocator, to be able to release memory above
|
||||
|
@ -40,7 +47,7 @@ main(int argc, char **argv)
|
|||
argv_copy = (wchar_t **)PyMem_RawMalloc(sizeof(wchar_t*) * (argc+1));
|
||||
argv_copy2 = (wchar_t **)PyMem_RawMalloc(sizeof(wchar_t*) * (argc+1));
|
||||
if (!argv_copy || !argv_copy2) {
|
||||
fprintf(stderr, "out of memory\n");
|
||||
fatal_error("out of memory");
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
@ -55,7 +62,7 @@ main(int argc, char **argv)
|
|||
|
||||
oldloc = _PyMem_RawStrdup(setlocale(LC_ALL, NULL));
|
||||
if (!oldloc) {
|
||||
fprintf(stderr, "out of memory\n");
|
||||
fatal_error("out of memory");
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
@ -73,6 +80,7 @@ main(int argc, char **argv)
|
|||
* details.
|
||||
*/
|
||||
if (_Py_LegacyLocaleDetected()) {
|
||||
Py_UTF8Mode = 1;
|
||||
_Py_CoerceLegacyLocale();
|
||||
}
|
||||
|
||||
|
@ -81,10 +89,7 @@ main(int argc, char **argv)
|
|||
argv_copy[i] = Py_DecodeLocale(argv[i], NULL);
|
||||
if (!argv_copy[i]) {
|
||||
PyMem_RawFree(oldloc);
|
||||
fprintf(stderr, "Fatal Python error: "
|
||||
"unable to decode the command line argument #%i\n",
|
||||
i + 1);
|
||||
return 1;
|
||||
fatal_error("unable to decode the command line arguments");
|
||||
}
|
||||
argv_copy2[i] = argv_copy[i];
|
||||
}
|
||||
|
|
|
@ -29,6 +29,9 @@ const char *Py_FileSystemDefaultEncoding = NULL; /* set by initfsencoding() */
|
|||
int Py_HasFileSystemDefaultEncoding = 0;
|
||||
#endif
|
||||
const char *Py_FileSystemDefaultEncodeErrors = "surrogateescape";
|
||||
/* UTF-8 mode (PEP 540): if non-zero, use the UTF-8 encoding, and change stdin
|
||||
and stdout error handler to "surrogateescape". */
|
||||
int Py_UTF8Mode = 0;
|
||||
|
||||
_Py_IDENTIFIER(__builtins__);
|
||||
_Py_IDENTIFIER(__dict__);
|
||||
|
|
|
@ -20,9 +20,8 @@ extern int winerror_to_errno(int);
|
|||
#include <fcntl.h>
|
||||
#endif /* HAVE_FCNTL_H */
|
||||
|
||||
#if defined(__APPLE__) || defined(__ANDROID__)
|
||||
extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size);
|
||||
#endif
|
||||
extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size,
|
||||
size_t *p_wlen);
|
||||
|
||||
#ifdef O_CLOEXEC
|
||||
/* Does open() support the O_CLOEXEC flag? Possible values:
|
||||
|
@ -250,40 +249,9 @@ decode_ascii_surrogateescape(const char *arg, size_t *size)
|
|||
}
|
||||
#endif
|
||||
|
||||
|
||||
/* Decode a byte string from the locale encoding with the
|
||||
surrogateescape error handler: undecodable bytes are decoded as characters
|
||||
in range U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate
|
||||
character, escape the bytes using the surrogateescape error handler instead
|
||||
of decoding them.
|
||||
|
||||
Return a pointer to a newly allocated wide character string, use
|
||||
PyMem_RawFree() to free the memory. If size is not NULL, write the number of
|
||||
wide characters excluding the null character into *size
|
||||
|
||||
Return NULL on decoding error or memory allocation error. If *size* is not
|
||||
NULL, *size is set to (size_t)-1 on memory error or set to (size_t)-2 on
|
||||
decoding error.
|
||||
|
||||
Decoding errors should never happen, unless there is a bug in the C
|
||||
library.
|
||||
|
||||
Use the Py_EncodeLocale() function to encode the character string back to a
|
||||
byte string. */
|
||||
wchar_t*
|
||||
Py_DecodeLocale(const char* arg, size_t *size)
|
||||
static wchar_t*
|
||||
decode_locale(const char* arg, size_t *size)
|
||||
{
|
||||
#if defined(__APPLE__) || defined(__ANDROID__)
|
||||
wchar_t *wstr;
|
||||
wstr = _Py_DecodeUTF8_surrogateescape(arg, strlen(arg));
|
||||
if (size != NULL) {
|
||||
if (wstr != NULL)
|
||||
*size = wcslen(wstr);
|
||||
else
|
||||
*size = (size_t)-1;
|
||||
}
|
||||
return wstr;
|
||||
#else
|
||||
wchar_t *res;
|
||||
size_t argsize;
|
||||
size_t count;
|
||||
|
@ -293,19 +261,6 @@ Py_DecodeLocale(const char* arg, size_t *size)
|
|||
mbstate_t mbs;
|
||||
#endif
|
||||
|
||||
#ifndef MS_WINDOWS
|
||||
if (force_ascii == -1)
|
||||
force_ascii = check_force_ascii();
|
||||
|
||||
if (force_ascii) {
|
||||
/* force ASCII encoding to workaround mbstowcs() issue */
|
||||
res = decode_ascii_surrogateescape(arg, size);
|
||||
if (res == NULL)
|
||||
goto oom;
|
||||
return res;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_BROKEN_MBSTOWCS
|
||||
/* Some platforms have a broken implementation of
|
||||
* mbstowcs which does not count the characters that
|
||||
|
@ -402,43 +357,84 @@ Py_DecodeLocale(const char* arg, size_t *size)
|
|||
goto oom;
|
||||
#endif /* HAVE_MBRTOWC */
|
||||
return res;
|
||||
|
||||
oom:
|
||||
if (size != NULL)
|
||||
if (size != NULL) {
|
||||
*size = (size_t)-1;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
/* Decode a byte string from the locale encoding with the
|
||||
surrogateescape error handler: undecodable bytes are decoded as characters
|
||||
in range U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate
|
||||
character, escape the bytes using the surrogateescape error handler instead
|
||||
of decoding them.
|
||||
|
||||
Return a pointer to a newly allocated wide character string, use
|
||||
PyMem_RawFree() to free the memory. If size is not NULL, write the number of
|
||||
wide characters excluding the null character into *size.
|
||||
|
||||
Return NULL on decoding error or memory allocation error. If *size* is not
|
||||
NULL, *size is set to (size_t)-1 on memory error or set to (size_t)-2 on
|
||||
decoding error.
|
||||
|
||||
Decoding errors should never happen, unless there is a bug in the C
|
||||
library.
|
||||
|
||||
Use the Py_EncodeLocale() function to encode the character string back to a
|
||||
byte string. */
|
||||
wchar_t*
|
||||
Py_DecodeLocale(const char* arg, size_t *size)
|
||||
{
|
||||
#if defined(__APPLE__) || defined(__ANDROID__)
|
||||
return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size);
|
||||
#else
|
||||
if (Py_UTF8Mode) {
|
||||
return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size);
|
||||
}
|
||||
|
||||
#ifndef MS_WINDOWS
|
||||
if (force_ascii == -1)
|
||||
force_ascii = check_force_ascii();
|
||||
|
||||
if (force_ascii) {
|
||||
/* force ASCII encoding to workaround mbstowcs() issue */
|
||||
wchar_t *wstr = decode_ascii_surrogateescape(arg, size);
|
||||
if (wstr == NULL) {
|
||||
if (size != NULL) {
|
||||
*size = (size_t)-1;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
return wstr;
|
||||
}
|
||||
#endif
|
||||
|
||||
return decode_locale(arg, size);
|
||||
#endif /* __APPLE__ or __ANDROID__ */
|
||||
}
|
||||
|
||||
/* Encode a wide character string to the locale encoding with the
|
||||
surrogateescape error handler: surrogate characters in the range
|
||||
U+DC80..U+DCFF are converted to bytes 0x80..0xFF.
|
||||
|
||||
Return a pointer to a newly allocated byte string, use PyMem_Free() to free
|
||||
the memory. Return NULL on encoding or memory allocation error.
|
||||
|
||||
If error_pos is not NULL, *error_pos is set to the index of the invalid
|
||||
character on encoding error, or set to (size_t)-1 otherwise.
|
||||
|
||||
Use the Py_DecodeLocale() function to decode the bytes string back to a wide
|
||||
character string. */
|
||||
char*
|
||||
Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
|
||||
static char*
|
||||
_Py_EncodeLocaleUTF8(const wchar_t *text, size_t *error_pos)
|
||||
{
|
||||
#if defined(__APPLE__) || defined(__ANDROID__)
|
||||
Py_ssize_t len;
|
||||
PyObject *unicode, *bytes = NULL;
|
||||
char *cpath;
|
||||
|
||||
unicode = PyUnicode_FromWideChar(text, wcslen(text));
|
||||
if (unicode == NULL)
|
||||
if (unicode == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
bytes = _PyUnicode_AsUTF8String(unicode, "surrogateescape");
|
||||
Py_DECREF(unicode);
|
||||
if (bytes == NULL) {
|
||||
PyErr_Clear();
|
||||
if (error_pos != NULL)
|
||||
if (error_pos != NULL) {
|
||||
*error_pos = (size_t)-1;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
@ -447,27 +443,24 @@ Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
|
|||
if (cpath == NULL) {
|
||||
PyErr_Clear();
|
||||
Py_DECREF(bytes);
|
||||
if (error_pos != NULL)
|
||||
if (error_pos != NULL) {
|
||||
*error_pos = (size_t)-1;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
memcpy(cpath, PyBytes_AsString(bytes), len + 1);
|
||||
Py_DECREF(bytes);
|
||||
return cpath;
|
||||
#else /* __APPLE__ */
|
||||
}
|
||||
|
||||
static char*
|
||||
encode_locale(const wchar_t *text, size_t *error_pos)
|
||||
{
|
||||
const size_t len = wcslen(text);
|
||||
char *result = NULL, *bytes = NULL;
|
||||
size_t i, size, converted;
|
||||
wchar_t c, buf[2];
|
||||
|
||||
#ifndef MS_WINDOWS
|
||||
if (force_ascii == -1)
|
||||
force_ascii = check_force_ascii();
|
||||
|
||||
if (force_ascii)
|
||||
return encode_ascii_surrogateescape(text, error_pos);
|
||||
#endif
|
||||
|
||||
/* The function works in two steps:
|
||||
1. compute the length of the output buffer in bytes (size)
|
||||
2. outputs the bytes */
|
||||
|
@ -522,6 +515,39 @@ Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
|
|||
bytes = result;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Encode a wide character string to the locale encoding with the
|
||||
surrogateescape error handler: surrogate characters in the range
|
||||
U+DC80..U+DCFF are converted to bytes 0x80..0xFF.
|
||||
|
||||
Return a pointer to a newly allocated byte string, use PyMem_Free() to free
|
||||
the memory. Return NULL on encoding or memory allocation error.
|
||||
|
||||
If error_pos is not NULL, *error_pos is set to (size_t)-1 on success, or set
|
||||
to the index of the invalid character on encoding error.
|
||||
|
||||
Use the Py_DecodeLocale() function to decode the bytes string back to a wide
|
||||
character string. */
|
||||
char*
|
||||
Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
|
||||
{
|
||||
#if defined(__APPLE__) || defined(__ANDROID__)
|
||||
return _Py_EncodeLocaleUTF8(text, error_pos);
|
||||
#else /* __APPLE__ */
|
||||
if (Py_UTF8Mode) {
|
||||
return _Py_EncodeLocaleUTF8(text, error_pos);
|
||||
}
|
||||
|
||||
#ifndef MS_WINDOWS
|
||||
if (force_ascii == -1)
|
||||
force_ascii = check_force_ascii();
|
||||
|
||||
if (force_ascii)
|
||||
return encode_ascii_surrogateescape(text, error_pos);
|
||||
#endif
|
||||
|
||||
return encode_locale(text, error_pos);
|
||||
#endif /* __APPLE__ or __ANDROID__ */
|
||||
}
|
||||
|
||||
|
|
|
@ -54,7 +54,7 @@ extern grammar _PyParser_Grammar; /* From graminit.c */
|
|||
static _PyInitError add_main_module(PyInterpreterState *interp);
|
||||
static _PyInitError initfsencoding(PyInterpreterState *interp);
|
||||
static _PyInitError initsite(void);
|
||||
static _PyInitError init_sys_streams(void);
|
||||
static _PyInitError init_sys_streams(PyInterpreterState *interp);
|
||||
static _PyInitError initsigs(void);
|
||||
static void call_py_exitfuncs(void);
|
||||
static void wait_for_thread_shutdown(void);
|
||||
|
@ -925,7 +925,7 @@ _Py_InitializeMainInterpreter(const _PyMainInterpreterConfig *config)
|
|||
return err;
|
||||
}
|
||||
|
||||
err = init_sys_streams();
|
||||
err = init_sys_streams(interp);
|
||||
if (_Py_INIT_FAILED(err)) {
|
||||
return err;
|
||||
}
|
||||
|
@ -1410,7 +1410,7 @@ new_interpreter(PyThreadState **tstate_p)
|
|||
return err;
|
||||
}
|
||||
|
||||
err = init_sys_streams();
|
||||
err = init_sys_streams(interp);
|
||||
if (_Py_INIT_FAILED(err)) {
|
||||
return err;
|
||||
}
|
||||
|
@ -1558,7 +1558,13 @@ initfsencoding(PyInterpreterState *interp)
|
|||
Py_FileSystemDefaultEncodeErrors = "surrogatepass";
|
||||
}
|
||||
#else
|
||||
if (Py_FileSystemDefaultEncoding == NULL) {
|
||||
if (Py_FileSystemDefaultEncoding == NULL &&
|
||||
interp->core_config.utf8_mode)
|
||||
{
|
||||
Py_FileSystemDefaultEncoding = "utf-8";
|
||||
Py_HasFileSystemDefaultEncoding = 1;
|
||||
}
|
||||
else if (Py_FileSystemDefaultEncoding == NULL) {
|
||||
Py_FileSystemDefaultEncoding = get_locale_encoding();
|
||||
if (Py_FileSystemDefaultEncoding == NULL) {
|
||||
return _Py_INIT_ERR("Unable to get the locale encoding");
|
||||
|
@ -1749,7 +1755,7 @@ error:
|
|||
|
||||
/* Initialize sys.stdin, stdout, stderr and builtins.open */
|
||||
static _PyInitError
|
||||
init_sys_streams(void)
|
||||
init_sys_streams(PyInterpreterState *interp)
|
||||
{
|
||||
PyObject *iomod = NULL, *wrapper;
|
||||
PyObject *bimod = NULL;
|
||||
|
@ -1794,10 +1800,10 @@ init_sys_streams(void)
|
|||
encoding = _Py_StandardStreamEncoding;
|
||||
errors = _Py_StandardStreamErrors;
|
||||
if (!encoding || !errors) {
|
||||
pythonioencoding = Py_GETENV("PYTHONIOENCODING");
|
||||
if (pythonioencoding) {
|
||||
char *opt = Py_GETENV("PYTHONIOENCODING");
|
||||
if (opt && opt[0] != '\0') {
|
||||
char *err;
|
||||
pythonioencoding = _PyMem_Strdup(pythonioencoding);
|
||||
pythonioencoding = _PyMem_Strdup(opt);
|
||||
if (pythonioencoding == NULL) {
|
||||
PyErr_NoMemory();
|
||||
goto error;
|
||||
|
@ -1814,7 +1820,12 @@ init_sys_streams(void)
|
|||
encoding = pythonioencoding;
|
||||
}
|
||||
}
|
||||
if (!errors && !(pythonioencoding && *pythonioencoding)) {
|
||||
else if (interp->core_config.utf8_mode) {
|
||||
encoding = "utf-8";
|
||||
errors = "surrogateescape";
|
||||
}
|
||||
|
||||
if (!errors && !pythonioencoding) {
|
||||
/* Choose the default error handler based on the current locale */
|
||||
errors = get_default_standard_stream_error_handler();
|
||||
}
|
||||
|
|
|
@ -1814,6 +1814,7 @@ static PyStructSequence_Field flags_fields[] = {
|
|||
{"hash_randomization", "-R"},
|
||||
{"isolated", "-I"},
|
||||
{"dev_mode", "-X dev"},
|
||||
{"utf8_mode", "-X utf8"},
|
||||
{0}
|
||||
};
|
||||
|
||||
|
@ -1821,7 +1822,7 @@ static PyStructSequence_Desc flags_desc = {
|
|||
"sys.flags", /* name */
|
||||
flags__doc__, /* doc */
|
||||
flags_fields, /* fields */
|
||||
14
|
||||
15
|
||||
};
|
||||
|
||||
static PyObject*
|
||||
|
@ -1853,8 +1854,9 @@ make_flags(void)
|
|||
SetFlag(Py_QuietFlag);
|
||||
SetFlag(Py_HashRandomizationFlag);
|
||||
SetFlag(Py_IsolatedFlag);
|
||||
#undef SetFlag
|
||||
PyStructSequence_SET_ITEM(seq, pos++, PyBool_FromLong(core_config->dev_mode));
|
||||
SetFlag(Py_UTF8Mode);
|
||||
#undef SetFlag
|
||||
|
||||
if (PyErr_Occurred()) {
|
||||
Py_DECREF(seq);
|
||||
|
|
Loading…
Reference in New Issue