2017-12-13 07:29:09 -04:00
|
|
|
"""
|
|
|
|
Test the implementation of PEP 540: the UTF-8 Mode.
|
|
|
|
"""
|
|
|
|
|
|
|
|
import locale
|
2020-11-04 06:20:10 -04:00
|
|
|
import subprocess
|
2017-12-13 07:29:09 -04:00
|
|
|
import sys
|
|
|
|
import textwrap
|
|
|
|
import unittest
|
2017-12-15 23:54:22 -04:00
|
|
|
from test import support
|
2017-12-13 07:29:09 -04:00
|
|
|
from test.support.script_helper import assert_python_ok, assert_python_failure
|
2023-10-05 21:37:28 -03:00
|
|
|
from test.support import os_helper, MS_WINDOWS
|
2017-12-13 07:29:09 -04:00
|
|
|
|
|
|
|
|
2018-08-28 07:35:44 -03:00
|
|
|
# Locale names that these tests treat as the POSIX locale.
POSIX_LOCALES = ('C', 'POSIX')
|
2019-03-27 12:11:12 -03:00
|
|
|
# True when sys.platform reports "vxworks".
VXWORKS = (sys.platform == "vxworks")
|
2018-06-25 21:11:06 -03:00
|
|
|
|
2017-12-13 07:29:09 -04:00
|
|
|
class UTF8ModeTests(unittest.TestCase):
    """Tests for the UTF-8 Mode, run in child interpreters."""

    # Baseline environment for every child interpreter started through
    # get_output(); keyword arguments given by a test override these keys.
    DEFAULT_ENV = dict(
        PYTHONUTF8='',
        PYTHONLEGACYWINDOWSFSENCODING='',
        PYTHONCOERCECLOCALE='0',
    )
|
2017-12-13 07:29:09 -04:00
|
|
|
|
|
|
|
def posix_locale(self):
    """Return True if the current LC_CTYPE locale is one of POSIX_LOCALES.

    setlocale() is called with None, which only queries the current
    setting and does not modify it.
    """
    current = locale.setlocale(locale.LC_CTYPE, None)
    return current in POSIX_LOCALES
|
2017-12-13 07:29:09 -04:00
|
|
|
|
|
|
|
def get_output(self, *args, failure=False, **kw):
    """Run a child interpreter and return one of its output streams as text.

    *args* are the interpreter command-line arguments.  Keyword arguments
    are merged over DEFAULT_ENV and forwarded to the script helper.

    When *failure* is true the child is run with assert_python_failure()
    and item 2 of its result is used; otherwise assert_python_ok() is used
    and item 1 is taken.  The selected bytes are decoded with the default
    codec and trailing newline / carriage-return characters are removed.
    """
    env = dict(self.DEFAULT_ENV, **kw)
    if failure:
        runner, stream = assert_python_failure, 2
    else:
        runner, stream = assert_python_ok, 1
    result = runner(*args, **env)
    return result[stream].decode().rstrip("\n\r")
|
|
|
|
|
2018-06-25 21:11:06 -03:00
|
|
|
@unittest.skipIf(MS_WINDOWS, 'Windows has no POSIX locale')
def test_posix_locale(self):
    # With LC_ALL set to each POSIX locale name, the child interpreter
    # must report sys.flags.utf8_mode == 1.
    script = 'import sys; print(sys.flags.utf8_mode)'

    for name in POSIX_LOCALES:
        with self.subTest(LC_ALL=name):
            self.assertEqual(
                self.get_output('-c', script, LC_ALL=name), '1')
|
2017-12-13 07:29:09 -04:00
|
|
|
|
|
|
|
def test_xoption(self):
    # The -X utf8 command line option drives sys.flags.utf8_mode.
    script = 'import sys; print(sys.flags.utf8_mode)'

    # (value given to -X, expected flag).  "utf8=1" is an undocumented
    # but accepted syntax.
    cases = (
        ('utf8', '1'),
        ('utf8=1', '1'),
        ('utf8=0', '0'),
    )
    for option, expected in cases:
        out = self.get_output('-X', option, '-c', script)
        self.assertEqual(out, expected)

    if MS_WINDOWS:
        # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 Mode
        # and takes priority over -X utf8.
        legacy = self.get_output('-X', 'utf8', '-c', script,
                                 PYTHONLEGACYWINDOWSFSENCODING='1')
        self.assertEqual(legacy, '0')
|
|
|
|
|
|
|
|
def test_env_var(self):
    # The PYTHONUTF8 environment variable drives sys.flags.utf8_mode.
    script = 'import sys; print(sys.flags.utf8_mode)'

    def run(*options, **kw):
        # Interpreter options go before "-c"; keywords go to get_output().
        return self.get_output(*options, '-c', script, **kw)

    self.assertEqual(run(PYTHONUTF8='1'), '1')
    self.assertEqual(run(PYTHONUTF8='0'), '0')

    # -X utf8 takes priority over PYTHONUTF8
    self.assertEqual(run('-X', 'utf8=0', PYTHONUTF8='1'), '0')

    if MS_WINDOWS:
        # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode
        # and takes priority over PYTHONUTF8
        legacy = run('-X', 'utf8', PYTHONUTF8='1',
                     PYTHONLEGACYWINDOWSFSENCODING='1')
        self.assertEqual(legacy, '0')

    # This cannot be checked under the POSIX locale, because that locale
    # enables the UTF-8 mode by itself.
    if not self.posix_locale():
        # PYTHONUTF8 should be ignored if -E is used
        self.assertEqual(run('-E', PYTHONUTF8='1'), '0')

    # invalid mode
    message = run(PYTHONUTF8='xxx', failure=True)
    self.assertIn('invalid PYTHONUTF8 environment variable value',
                  message.rstrip())
|
|
|
|
|
|
|
|
def test_filesystemencoding(self):
    # In UTF-8 Mode the filesystem encoding is utf-8; the expected error
    # handler depends on the platform.
    script = textwrap.dedent('''
        import sys
        print("{}/{}".format(sys.getfilesystemencoding(),
                             sys.getfilesystemencodeerrors()))
    ''')

    expected = ('utf-8/surrogatepass' if MS_WINDOWS
                else 'utf-8/surrogateescape')
    self.assertEqual(self.get_output('-X', 'utf8', '-c', script), expected)

    if MS_WINDOWS:
        # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode
        # and takes priority over -X utf8 and PYTHONUTF8.
        # NOTE(review): PYTHONUTF8 is set to 'strict' here while the other
        # tests only use '0'/'1' -- confirm this value is intentional.
        legacy = self.get_output('-X', 'utf8', '-c', script,
                                 PYTHONUTF8='strict',
                                 PYTHONLEGACYWINDOWSFSENCODING='1')
        self.assertEqual(legacy, 'mbcs/replace')
|
|
|
|
|
|
|
|
def test_stdio(self):
    # Encoding and error handler of the standard streams under -X utf8,
    # for several PYTHONIOENCODING values.
    script = textwrap.dedent('''
        import sys
        print(f"stdin: {sys.stdin.encoding}/{sys.stdin.errors}")
        print(f"stdout: {sys.stdout.encoding}/{sys.stdout.errors}")
        print(f"stderr: {sys.stderr.encoding}/{sys.stderr.errors}")
    ''')

    # (PYTHONIOENCODING value, expected lines printed by the child)
    cases = (
        # Empty value: the UTF-8 Mode defaults apply.
        ('',
         ['stdin: utf-8/surrogateescape',
          'stdout: utf-8/surrogateescape',
          'stderr: utf-8/backslashreplace']),
        # PYTHONIOENCODING takes priority over -X utf8.
        ("latin1",
         ['stdin: iso8859-1/strict',
          'stdout: iso8859-1/strict',
          'stderr: iso8859-1/backslashreplace']),
        # Only the error handler is given.
        (":namereplace",
         ['stdin: utf-8/namereplace',
          'stdout: utf-8/namereplace',
          'stderr: utf-8/backslashreplace']),
    )
    for io_encoding, expected in cases:
        out = self.get_output('-X', 'utf8', '-c', script,
                              PYTHONIOENCODING=io_encoding)
        self.assertEqual(out.splitlines(), expected)
|
2017-12-13 07:29:09 -04:00
|
|
|
|
|
|
|
def test_io(self):
    # With PYTHONUTF8=1, open() called without an explicit encoding must
    # report utf-8/strict (compared case-insensitively).
    script = textwrap.dedent('''
        import sys
        filename = sys.argv[1]
        with open(filename) as fp:
            print(f"{fp.encoding}/{fp.errors}")
    ''')

    # This test file itself is the file opened by the child.
    reported = self.get_output('-c', script, __file__, PYTHONUTF8='1')
    self.assertEqual(reported.lower(), 'utf-8/strict')
|
2017-12-13 07:29:09 -04:00
|
|
|
|
|
|
|
def _check_io_encoding(self, module, encoding=None, errors=None):
    """Check what open() from *module* reports for the given arguments.

    A child interpreter started with PYTHONUTF8=1 imports open() from
    *module*, opens this test file with the truthy *encoding* / *errors*
    arguments, and prints "<encoding>/<errors>".  An argument that is not
    given is expected to be reported as 'utf-8' / 'strict' respectively.
    The comparison is done on the lower-cased output.
    """
    # Source text of the keyword arguments spliced into the open() call.
    open_kwargs = ', '.join(
        f'{name}={value!r}'
        for name, value in (('encoding', encoding), ('errors', errors))
        if value
    )
    template = textwrap.dedent('''
        import sys
        from %s import open
        filename = sys.argv[1]
        with open(filename, %s) as fp:
            print(f"{fp.encoding}/{fp.errors}")
    ''')
    script = template % (module, open_kwargs)
    reported = self.get_output('-c', script, __file__,
                               PYTHONUTF8='1')

    expected = f"{encoding or 'utf-8'}/{errors or 'strict'}"
    self.assertEqual(reported.lower(), expected)
|
2017-12-13 07:29:09 -04:00
|
|
|
|
|
|
|
def check_io_encoding(self, module):
    """Run _check_io_encoding() for *module* with three argument sets.

    The sets are: explicit encoding only, explicit errors only, and both.
    """
    combinations = (
        {'encoding': "latin1"},
        {'errors': "namereplace"},
        {'encoding': "latin1", 'errors': "namereplace"},
    )
    for kwargs in combinations:
        self._check_io_encoding(module, **kwargs)
|
|
|
|
|
|
|
|
def test_io_encoding(self):
    # Run the shared open() checks against the io module.
    module_name = 'io'
    self.check_io_encoding(module_name)
|
|
|
|
|
2019-04-22 15:46:27 -03:00
|
|
|
def test_pyio_encoding(self):
    # Run the shared open() checks against the _pyio module.
    module_name = '_pyio'
    self.check_io_encoding(module_name)
|
|
|
|
|
|
|
|
def test_locale_getpreferredencoding(self):
    # Under -X utf8, locale.getpreferredencoding() must print utf-8 for
    # both do_setlocale=False and do_setlocale=True, including when LC_ALL
    # is a POSIX locale.
    script = ('import locale; '
              'print(locale.getpreferredencoding(False), '
              'locale.getpreferredencoding(True))')
    expected = 'utf-8 utf-8'

    self.assertEqual(self.get_output('-X', 'utf8', '-c', script), expected)

    for name in POSIX_LOCALES:
        with self.subTest(LC_ALL=name):
            out = self.get_output('-X', 'utf8', '-c', script, LC_ALL=name)
            self.assertEqual(out, expected)
|
2017-12-13 07:29:09 -04:00
|
|
|
|
2018-06-25 21:11:06 -03:00
|
|
|
@unittest.skipIf(MS_WINDOWS, 'test specific to Unix')
def test_cmd_line(self):
    # A non-ASCII command line argument is passed as bytes; the child
    # prints ascii(sys.argv[1:]), which is compared with the expected
    # decoding.
    raw = 'h\xe9\u20ac'.encode('utf-8')
    as_utf8 = raw.decode('utf-8')
    as_ascii = raw.decode('ascii', 'surrogateescape')
    script = ('import locale, sys; '
              'print("%s:%s" % (locale.getpreferredencoding(), '
              'ascii(sys.argv[1:])))')

    def check(utf8_opt, expected, **kw):
        out = self.get_output('-X', utf8_opt, '-c', script, raw, **kw)
        # Output is "<preferred encoding>:<ascii(argv)>"; only the part
        # after the first colon is compared.
        _, _, shown = out.partition(':')
        self.assertEqual(shown.rstrip(), ascii(expected), out)

    check('utf8', [as_utf8])
    for name in POSIX_LOCALES:
        with self.subTest(LC_ALL=name):
            check('utf8', [as_utf8], LC_ALL=name)

    # Expected decoding when the UTF-8 Mode is disabled under a POSIX
    # locale; it differs by platform.
    if sys.platform == 'darwin' or support.is_android or VXWORKS:
        without_utf8 = as_utf8
    elif sys.platform.startswith("aix"):
        without_utf8 = raw.decode('iso-8859-1')
    else:
        without_utf8 = as_ascii

    for name in POSIX_LOCALES:
        with self.subTest(LC_ALL=name):
            check('utf8=0', [without_utf8], LC_ALL=name)
|
2017-12-15 23:54:22 -04:00
|
|
|
|
2018-01-25 04:18:36 -04:00
|
|
|
def test_optim_level(self):
    # CPython: -X utf8 may require the configuration to be parsed twice
    # (when the encoding changes after the configuration is read, it is
    # read again with the new encoding).  Check that Py_Main() does not
    # increment Py_OptimizeFlag twice in that case.
    optimize = 'import sys; print(sys.flags.optimize)'
    for option, level in (('-O', '1'), ('-OO', '2')):
        out = self.get_output('-X', 'utf8', option, '-c', optimize)
        self.assertEqual(out, level)

    # Same check for the -E flag.
    ignore_env = 'import sys; print(sys.flags.ignore_environment)'
    out = self.get_output('-X', 'utf8', '-E', '-c', ignore_env)
    self.assertEqual(out, '1')
|
|
|
|
|
2020-11-04 06:20:10 -04:00
|
|
|
@unittest.skipIf(MS_WINDOWS,
                 "os.device_encoding() doesn't implement the UTF-8 Mode "
                 "on Windows")
@support.requires_subprocess()
def test_device_encoding(self):
    # The child must see stdout as a TTY, so the parent's stdout has to
    # be one.
    if not sys.stdout.isatty():
        self.skipTest("sys.stdout is not a TTY")

    filename = 'out.txt'
    self.addCleanup(os_helper.unlink, filename)

    # The child's stdout stays attached to the TTY, so it writes its
    # result to a file instead.
    statements = (
        'import os, sys',
        'fd = sys.stdout.fileno()',
        f'out = open({filename!r}, "w", encoding="utf-8")',
        'print(os.isatty(fd), os.device_encoding(fd), file=out)',
        'out.close()',
    )
    command = [sys.executable, '-X', 'utf8', '-c', '; '.join(statements)]
    # stdout is not redirected: the child inherits the TTY.
    proc = subprocess.run(command, text=True)
    self.assertEqual(proc.returncode, 0, proc)

    # In UTF-8 Mode, device_encoding(fd) returns "utf-8" if fd is a TTY
    with open(filename, encoding="utf8") as fp:
        written = fp.read().rstrip()
    self.assertEqual(written, 'True utf-8')
|
2020-11-04 06:20:10 -04:00
|
|
|
|
2017-12-13 07:29:09 -04:00
|
|
|
|
|
|
|
# Run the tests when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()
|