diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py
index baa4e1f5342..661ad8b9d4d 100644
--- a/Lib/test/test_xml_etree.py
+++ b/Lib/test/test_xml_etree.py
@@ -34,6 +34,7 @@ try:
 except UnicodeEncodeError:
     raise unittest.SkipTest("filename is not encodable to utf8")
 SIMPLE_NS_XMLFILE = findfile("simple-ns.xml", subdir="xmltestdata")
+UTF8_BUG_XMLFILE = findfile("expat224_utf8_bug.xml", subdir="xmltestdata")
 
 SAMPLE_XML = """\
 <body>
@@ -1739,6 +1740,37 @@ class BugsTest(unittest.TestCase):
         self.assertIsInstance(e[0].tag, str)
         self.assertEqual(e[0].tag, 'changed')
 
+    def check_expat224_utf8_bug(self, text):
+        xml = b"<a b='%s'/>" % text
+        root = ET.XML(xml)
+        self.assertEqual(root.get('b'), text.decode('utf-8'))
+
+    def test_expat224_utf8_bug(self):
+        # bpo-31170: Expat 2.2.3 had a bug in its UTF-8 decoder.
+        # Check that Expat 2.2.4 fixed the bug.
+        #
+        # Test buffer bounds at odd and even positions.
+
+        text = b'\xc3\xa0' * 1024
+        self.check_expat224_utf8_bug(text)
+
+        text = b'x' + b'\xc3\xa0' * 1024
+        self.check_expat224_utf8_bug(text)
+
+    def test_expat224_utf8_bug_file(self):
+        with open(UTF8_BUG_XMLFILE, 'rb') as fp:
+            raw = fp.read()
+        root = ET.fromstring(raw)
+        xmlattr = root.get('b')
+
+        # "Parse" manually the XML file to extract the value of the 'b'
+        # attribute of the <a b='xxx' /> XML element
+        text = raw.decode('utf-8').strip()
+        text = text.replace('\r\n', ' ')
+        text = text[6:-4]
+        self.assertEqual(root.get('b'), text)
+
+
 
 # --------------------------------------------------------------------
 
diff --git a/Lib/test/xmltestdata/expat224_utf8_bug.xml b/Lib/test/xmltestdata/expat224_utf8_bug.xml
new file mode 100644
index 00000000000..d66a8e6b50f
--- /dev/null
+++ b/Lib/test/xmltestdata/expat224_utf8_bug.xml
@@ -0,0 +1,2 @@
+