mirror of https://github.com/python/cpython
[3.11] gh-91810: ElementTree: Use text file's encoding by default in XML declaration (GH-91903) (GH-92663)
The ElementTree method write() and the function tostring() now use the text
file's encoding ("UTF-8" if not available) instead of the locale encoding in
the XML declaration when encoding="unicode" is specified.
(cherry picked from commit 707839b0fe)
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
Automerge-Triggered-By: GH:serhiy-storchaka
This commit is contained in:
parent
5ea8a93e1a
commit
20fec2c265
|
@ -10,7 +10,6 @@ import functools
|
||||||
import html
|
import html
|
||||||
import io
|
import io
|
||||||
import itertools
|
import itertools
|
||||||
import locale
|
|
||||||
import operator
|
import operator
|
||||||
import os
|
import os
|
||||||
import pickle
|
import pickle
|
||||||
|
@ -978,15 +977,13 @@ class ElementTreeTest(unittest.TestCase):
|
||||||
|
|
||||||
def test_tostring_xml_declaration_unicode_encoding(self):
|
def test_tostring_xml_declaration_unicode_encoding(self):
|
||||||
elem = ET.XML('<body><tag/></body>')
|
elem = ET.XML('<body><tag/></body>')
|
||||||
preferredencoding = locale.getpreferredencoding()
|
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
f"<?xml version='1.0' encoding='{preferredencoding}'?>\n<body><tag /></body>",
|
ET.tostring(elem, encoding='unicode', xml_declaration=True),
|
||||||
ET.tostring(elem, encoding='unicode', xml_declaration=True)
|
"<?xml version='1.0' encoding='utf-8'?>\n<body><tag /></body>"
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_tostring_xml_declaration_cases(self):
|
def test_tostring_xml_declaration_cases(self):
|
||||||
elem = ET.XML('<body><tag>ø</tag></body>')
|
elem = ET.XML('<body><tag>ø</tag></body>')
|
||||||
preferredencoding = locale.getpreferredencoding()
|
|
||||||
TESTCASES = [
|
TESTCASES = [
|
||||||
# (expected_retval, encoding, xml_declaration)
|
# (expected_retval, encoding, xml_declaration)
|
||||||
# ... xml_declaration = None
|
# ... xml_declaration = None
|
||||||
|
@ -1013,7 +1010,7 @@ class ElementTreeTest(unittest.TestCase):
|
||||||
b"<body><tag>ø</tag></body>", 'US-ASCII', True),
|
b"<body><tag>ø</tag></body>", 'US-ASCII', True),
|
||||||
(b"<?xml version='1.0' encoding='ISO-8859-1'?>\n"
|
(b"<?xml version='1.0' encoding='ISO-8859-1'?>\n"
|
||||||
b"<body><tag>\xf8</tag></body>", 'ISO-8859-1', True),
|
b"<body><tag>\xf8</tag></body>", 'ISO-8859-1', True),
|
||||||
(f"<?xml version='1.0' encoding='{preferredencoding}'?>\n"
|
("<?xml version='1.0' encoding='utf-8'?>\n"
|
||||||
"<body><tag>ø</tag></body>", 'unicode', True),
|
"<body><tag>ø</tag></body>", 'unicode', True),
|
||||||
|
|
||||||
]
|
]
|
||||||
|
@ -1051,11 +1048,10 @@ class ElementTreeTest(unittest.TestCase):
|
||||||
b"<?xml version='1.0' encoding='us-ascii'?>\n<body><tag /></body>"
|
b"<?xml version='1.0' encoding='us-ascii'?>\n<body><tag /></body>"
|
||||||
)
|
)
|
||||||
|
|
||||||
preferredencoding = locale.getpreferredencoding()
|
|
||||||
stringlist = ET.tostringlist(elem, encoding='unicode', xml_declaration=True)
|
stringlist = ET.tostringlist(elem, encoding='unicode', xml_declaration=True)
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
''.join(stringlist),
|
''.join(stringlist),
|
||||||
f"<?xml version='1.0' encoding='{preferredencoding}'?>\n<body><tag /></body>"
|
"<?xml version='1.0' encoding='utf-8'?>\n<body><tag /></body>"
|
||||||
)
|
)
|
||||||
self.assertRegex(stringlist[0], r"^<\?xml version='1.0' encoding='.+'?>")
|
self.assertRegex(stringlist[0], r"^<\?xml version='1.0' encoding='.+'?>")
|
||||||
self.assertEqual(['<body', '>', '<tag', ' />', '</body>'], stringlist[1:])
|
self.assertEqual(['<body', '>', '<tag', ' />', '</body>'], stringlist[1:])
|
||||||
|
@ -3740,17 +3736,16 @@ class IOTest(unittest.TestCase):
|
||||||
encoding = f.encoding
|
encoding = f.encoding
|
||||||
os_helper.unlink(TESTFN)
|
os_helper.unlink(TESTFN)
|
||||||
|
|
||||||
try:
|
|
||||||
'\xf8'.encode(encoding)
|
|
||||||
except UnicodeEncodeError:
|
|
||||||
self.skipTest(f'default file encoding {encoding} not supported')
|
|
||||||
|
|
||||||
tree = ET.ElementTree(ET.XML('''<site>\xf8</site>'''))
|
tree = ET.ElementTree(ET.XML('''<site>\xf8</site>'''))
|
||||||
tree.write(TESTFN, encoding='unicode')
|
tree.write(TESTFN, encoding='unicode')
|
||||||
with open(TESTFN, 'rb') as f:
|
with open(TESTFN, 'rb') as f:
|
||||||
data = f.read()
|
data = f.read()
|
||||||
expected = "<site>\xf8</site>".encode(encoding, 'xmlcharrefreplace')
|
expected = "<site>\xf8</site>".encode(encoding, 'xmlcharrefreplace')
|
||||||
self.assertEqual(data, expected)
|
if encoding.lower() in ('utf-8', 'ascii'):
|
||||||
|
self.assertEqual(data, expected)
|
||||||
|
else:
|
||||||
|
self.assertIn(b"<?xml version='1.0' encoding=", data)
|
||||||
|
self.assertIn(expected, data)
|
||||||
|
|
||||||
def test_write_to_text_file(self):
|
def test_write_to_text_file(self):
|
||||||
self.addCleanup(os_helper.unlink, TESTFN)
|
self.addCleanup(os_helper.unlink, TESTFN)
|
||||||
|
@ -3765,13 +3760,17 @@ class IOTest(unittest.TestCase):
|
||||||
tree.write(f, encoding='unicode')
|
tree.write(f, encoding='unicode')
|
||||||
self.assertFalse(f.closed)
|
self.assertFalse(f.closed)
|
||||||
with open(TESTFN, 'rb') as f:
|
with open(TESTFN, 'rb') as f:
|
||||||
self.assertEqual(f.read(), b'''<site>ø</site>''')
|
self.assertEqual(f.read(), convlinesep(
|
||||||
|
b'''<?xml version='1.0' encoding='ascii'?>\n'''
|
||||||
|
b'''<site>ø</site>'''))
|
||||||
|
|
||||||
with open(TESTFN, 'w', encoding='ISO-8859-1') as f:
|
with open(TESTFN, 'w', encoding='ISO-8859-1') as f:
|
||||||
tree.write(f, encoding='unicode')
|
tree.write(f, encoding='unicode')
|
||||||
self.assertFalse(f.closed)
|
self.assertFalse(f.closed)
|
||||||
with open(TESTFN, 'rb') as f:
|
with open(TESTFN, 'rb') as f:
|
||||||
self.assertEqual(f.read(), b'''<site>\xf8</site>''')
|
self.assertEqual(f.read(), convlinesep(
|
||||||
|
b'''<?xml version='1.0' encoding='ISO-8859-1'?>\n'''
|
||||||
|
b'''<site>\xf8</site>'''))
|
||||||
|
|
||||||
def test_write_to_binary_file(self):
|
def test_write_to_binary_file(self):
|
||||||
self.addCleanup(os_helper.unlink, TESTFN)
|
self.addCleanup(os_helper.unlink, TESTFN)
|
||||||
|
|
|
@ -728,16 +728,10 @@ class ElementTree:
|
||||||
encoding = "utf-8"
|
encoding = "utf-8"
|
||||||
else:
|
else:
|
||||||
encoding = "us-ascii"
|
encoding = "us-ascii"
|
||||||
enc_lower = encoding.lower()
|
with _get_writer(file_or_filename, encoding) as (write, declared_encoding):
|
||||||
with _get_writer(file_or_filename, enc_lower) as write:
|
|
||||||
if method == "xml" and (xml_declaration or
|
if method == "xml" and (xml_declaration or
|
||||||
(xml_declaration is None and
|
(xml_declaration is None and
|
||||||
enc_lower not in ("utf-8", "us-ascii", "unicode"))):
|
declared_encoding.lower() not in ("utf-8", "us-ascii"))):
|
||||||
declared_encoding = encoding
|
|
||||||
if enc_lower == "unicode":
|
|
||||||
# Retrieve the default encoding for the xml declaration
|
|
||||||
import locale
|
|
||||||
declared_encoding = locale.getpreferredencoding()
|
|
||||||
write("<?xml version='1.0' encoding='%s'?>\n" % (
|
write("<?xml version='1.0' encoding='%s'?>\n" % (
|
||||||
declared_encoding,))
|
declared_encoding,))
|
||||||
if method == "text":
|
if method == "text":
|
||||||
|
@ -762,19 +756,20 @@ def _get_writer(file_or_filename, encoding):
|
||||||
write = file_or_filename.write
|
write = file_or_filename.write
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
# file_or_filename is a file name
|
# file_or_filename is a file name
|
||||||
if encoding == "unicode":
|
if encoding.lower() == "unicode":
|
||||||
file = open(file_or_filename, "w")
|
file = open(file_or_filename, "w",
|
||||||
|
errors="xmlcharrefreplace")
|
||||||
else:
|
else:
|
||||||
file = open(file_or_filename, "w", encoding=encoding,
|
file = open(file_or_filename, "w", encoding=encoding,
|
||||||
errors="xmlcharrefreplace")
|
errors="xmlcharrefreplace")
|
||||||
with file:
|
with file:
|
||||||
yield file.write
|
yield file.write, file.encoding
|
||||||
else:
|
else:
|
||||||
# file_or_filename is a file-like object
|
# file_or_filename is a file-like object
|
||||||
# encoding determines if it is a text or binary writer
|
# encoding determines if it is a text or binary writer
|
||||||
if encoding == "unicode":
|
if encoding.lower() == "unicode":
|
||||||
# use a text writer as is
|
# use a text writer as is
|
||||||
yield write
|
yield write, getattr(file_or_filename, "encoding", None) or "utf-8"
|
||||||
else:
|
else:
|
||||||
# wrap a binary writer with TextIOWrapper
|
# wrap a binary writer with TextIOWrapper
|
||||||
with contextlib.ExitStack() as stack:
|
with contextlib.ExitStack() as stack:
|
||||||
|
@ -805,7 +800,7 @@ def _get_writer(file_or_filename, encoding):
|
||||||
# Keep the original file open when the TextIOWrapper is
|
# Keep the original file open when the TextIOWrapper is
|
||||||
# destroyed
|
# destroyed
|
||||||
stack.callback(file.detach)
|
stack.callback(file.detach)
|
||||||
yield file.write
|
yield file.write, encoding
|
||||||
|
|
||||||
def _namespaces(elem, default_namespace=None):
|
def _namespaces(elem, default_namespace=None):
|
||||||
# identify namespaces used in this tree
|
# identify namespaces used in this tree
|
||||||
|
|
|
@ -0,0 +1,5 @@
|
||||||
|
:class:`~xml.etree.ElementTree.ElementTree` method
|
||||||
|
:meth:`~xml.etree.ElementTree.ElementTree.write` and function
|
||||||
|
:func:`~xml.etree.ElementTree.tostring` now use the text file's encoding
|
||||||
|
("UTF-8" if not available) instead of the locale encoding in the XML declaration
|
||||||
|
when ``encoding="unicode"`` is specified.
|
Loading…
Reference in New Issue