From 20fec2c265f7f47304041abcafdffe7de9937682 Mon Sep 17 00:00:00 2001 From: "Miss Islington (bot)" <31488909+miss-islington@users.noreply.github.com> Date: Wed, 11 May 2022 10:36:52 -0700 Subject: [PATCH] [3.11] gh-91810: ElementTree: Use text file's encoding by default in XML declaration (GH-91903) (GH-92663) ElementTree method write() and function tostring() now use the text file's encoding ("UTF-8" if not available) instead of locale encoding in XML declaration when encoding="unicode" is specified. (cherry picked from commit 707839b0fe02ba2c891a40f40e7a869d84c2c9c5) Co-authored-by: Serhiy Storchaka Automerge-Triggered-By: GH:serhiy-storchaka --- Lib/test/test_xml_etree.py | 31 +++++++++---------- Lib/xml/etree/ElementTree.py | 23 ++++++-------- ...2-04-25-10-23-01.gh-issue-91810.DOHa6B.rst | 5 +++ 3 files changed, 29 insertions(+), 30 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2022-04-25-10-23-01.gh-issue-91810.DOHa6B.rst diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py index db25eaba127..aea77b192c1 100644 --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -10,7 +10,6 @@ import functools import html import io import itertools -import locale import operator import os import pickle @@ -978,15 +977,13 @@ class ElementTreeTest(unittest.TestCase): def test_tostring_xml_declaration_unicode_encoding(self): elem = ET.XML('') - preferredencoding = locale.getpreferredencoding() self.assertEqual( - f"\n", - ET.tostring(elem, encoding='unicode', xml_declaration=True) + ET.tostring(elem, encoding='unicode', xml_declaration=True), + "\n" ) def test_tostring_xml_declaration_cases(self): elem = ET.XML('ø') - preferredencoding = locale.getpreferredencoding() TESTCASES = [ # (expected_retval, encoding, xml_declaration) # ... xml_declaration = None @@ -1013,7 +1010,7 @@ class ElementTreeTest(unittest.TestCase): b"ø", 'US-ASCII', True), (b"\n" b"\xf8", 'ISO-8859-1', True), - (f"\n" + ("\n" "ø", 'unicode', True), ] @@ -1051,11 +1048,10 @@ class ElementTreeTest(unittest.TestCase): b"\n" ) - preferredencoding = locale.getpreferredencoding() stringlist = ET.tostringlist(elem, encoding='unicode', xml_declaration=True) self.assertEqual( ''.join(stringlist), - f"\n" + "\n" ) self.assertRegex(stringlist[0], r"^<\?xml version='1.0' encoding='.+'?>") self.assertEqual(['', '', ''], stringlist[1:]) @@ -3740,17 +3736,16 @@ class IOTest(unittest.TestCase): encoding = f.encoding os_helper.unlink(TESTFN) - try: - '\xf8'.encode(encoding) - except UnicodeEncodeError: - self.skipTest(f'default file encoding {encoding} not supported') - tree = ET.ElementTree(ET.XML('''\xf8''')) tree.write(TESTFN, encoding='unicode') with open(TESTFN, 'rb') as f: data = f.read() expected = "\xf8".encode(encoding, 'xmlcharrefreplace') - self.assertEqual(data, expected) + if encoding.lower() in ('utf-8', 'ascii'): + self.assertEqual(data, expected) + else: + self.assertIn(b"ø''') + self.assertEqual(f.read(), convlinesep( + b'''\n''' + b'''ø''')) with open(TESTFN, 'w', encoding='ISO-8859-1') as f: tree.write(f, encoding='unicode') self.assertFalse(f.closed) with open(TESTFN, 'rb') as f: - self.assertEqual(f.read(), b'''\xf8''') + self.assertEqual(f.read(), convlinesep( + b'''\n''' + b'''\xf8''')) def test_write_to_binary_file(self): self.addCleanup(os_helper.unlink, TESTFN) diff --git a/Lib/xml/etree/ElementTree.py b/Lib/xml/etree/ElementTree.py index 5249c7ab82b..a5cc65e789c 100644 --- a/Lib/xml/etree/ElementTree.py +++ b/Lib/xml/etree/ElementTree.py @@ -728,16 +728,10 @@ class ElementTree: encoding = "utf-8" else: encoding = "us-ascii" - enc_lower = encoding.lower() - with _get_writer(file_or_filename, enc_lower) as write: + with _get_writer(file_or_filename, encoding) as (write, declared_encoding): if method == "xml" and (xml_declaration or (xml_declaration is None and - enc_lower not in ("utf-8", "us-ascii", "unicode"))): - declared_encoding = encoding - if enc_lower == "unicode": - # Retrieve the default encoding for the xml declaration - import locale - declared_encoding = locale.getpreferredencoding() + declared_encoding.lower() not in ("utf-8", "us-ascii"))): write("\n" % ( declared_encoding,)) if method == "text": @@ -762,19 +756,20 @@ def _get_writer(file_or_filename, encoding): write = file_or_filename.write except AttributeError: # file_or_filename is a file name - if encoding == "unicode": - file = open(file_or_filename, "w") + if encoding.lower() == "unicode": + file = open(file_or_filename, "w", + errors="xmlcharrefreplace") else: file = open(file_or_filename, "w", encoding=encoding, errors="xmlcharrefreplace") with file: - yield file.write + yield file.write, file.encoding else: # file_or_filename is a file-like object # encoding determines if it is a text or binary writer - if encoding == "unicode": + if encoding.lower() == "unicode": # use a text writer as is - yield write + yield write, getattr(file_or_filename, "encoding", None) or "utf-8" else: # wrap a binary writer with TextIOWrapper with contextlib.ExitStack() as stack: @@ -805,7 +800,7 @@ def _get_writer(file_or_filename, encoding): # Keep the original file open when the TextIOWrapper is # destroyed stack.callback(file.detach) - yield file.write + yield file.write, encoding def _namespaces(elem, default_namespace=None): # identify namespaces used in this tree diff --git a/Misc/NEWS.d/next/Library/2022-04-25-10-23-01.gh-issue-91810.DOHa6B.rst b/Misc/NEWS.d/next/Library/2022-04-25-10-23-01.gh-issue-91810.DOHa6B.rst new file mode 100644 index 00000000000..0711f8466b8 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-04-25-10-23-01.gh-issue-91810.DOHa6B.rst @@ -0,0 +1,5 @@ +:class:`~xml.etree.ElementTree.ElementTree` method +:meth:`~xml.etree.ElementTree.ElementTree.write` and function +:func:`~xml.etree.ElementTree.tostring` now use the text file's encoding +("UTF-8" if not available) instead of locale encoding in XML declaration +when ``encoding="unicode"`` is specified.