gh-121188: Sanitize invalid XML characters in regrtest (#121195)

When creating the JUnit XML file, regrtest now escapes characters
which are invalid in XML, such as the chr(27) control character used
in ANSI escape sequences.
This commit is contained in:
Victor Stinner 2024-07-01 10:30:33 +02:00 committed by GitHub
parent f80376b129
commit af8c3d7a26
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 91 additions and 5 deletions

View File

@ -9,6 +9,7 @@ import time
import traceback
import unittest
from test import support
from test.libregrtest.utils import sanitize_xml
class RegressionTestResult(unittest.TextTestResult):
USE_XML = False
@ -65,23 +66,24 @@ class RegressionTestResult(unittest.TextTestResult):
if capture:
if self._stdout_buffer is not None:
stdout = self._stdout_buffer.getvalue().rstrip()
ET.SubElement(e, 'system-out').text = stdout
ET.SubElement(e, 'system-out').text = sanitize_xml(stdout)
if self._stderr_buffer is not None:
stderr = self._stderr_buffer.getvalue().rstrip()
ET.SubElement(e, 'system-err').text = stderr
ET.SubElement(e, 'system-err').text = sanitize_xml(stderr)
for k, v in args.items():
if not k or not v:
continue
e2 = ET.SubElement(e, k)
if hasattr(v, 'items'):
for k2, v2 in v.items():
if k2:
e2.set(k2, str(v2))
e2.set(k2, sanitize_xml(str(v2)))
else:
e2.text = str(v2)
e2.text = sanitize_xml(str(v2))
else:
e2.text = str(v)
e2.text = sanitize_xml(str(v))
@classmethod
def __makeErrorDict(cls, err_type, err_value, err_tb):

View File

@ -5,6 +5,7 @@ import math
import os.path
import platform
import random
import re
import shlex
import signal
import subprocess
@ -712,3 +713,24 @@ def get_signal_name(exitcode):
pass
return None
# Matches a run of one or more consecutive characters which are invalid in
# XML, so that a whole run can be replaced in a single substitution.
ILLEGAL_XML_CHARS_RE = re.compile(
    '['
    # Control characters below U+0020, except the three which are legal in
    # XML: TAB (\x09), line feed (\x0A) and carriage return (\x0D)
    '\x00-\x08'
    '\x0B\x0C'
    '\x0E-\x1F'
    # Surrogate characters
    '\uD800-\uDFFF'
    # Special Unicode characters
    '\uFFFE\uFFFF'
    # Match multiple sequential invalid characters for better efficiency
    ']+'
)
def _sanitize_xml_replace(regs):
text = regs[0]
return ''.join(f'\\x{ord(ch):02x}' if ch <= '\xff' else ascii(ch)[1:-1]
for ch in text)
def sanitize_xml(text):
    """Return *text* with the characters which are invalid in XML escaped.

    Every run of characters matched by ILLEGAL_XML_CHARS_RE is replaced
    with the string returned by _sanitize_xml_replace(); the rest of the
    text is returned unchanged.
    """
    substitute = ILLEGAL_XML_CHARS_RE.sub
    return substitute(_sanitize_xml_replace, text)

View File

@ -21,6 +21,8 @@ import sysconfig
import tempfile
import textwrap
import unittest
from xml.etree import ElementTree
from test import support
from test.support import import_helper
from test.support import os_helper
@ -2254,6 +2256,44 @@ class ArgsTestCase(BaseTestCase):
self.check_executed_tests(output, testname, stats=1, parallel=True)
self.assertNotIn('SPAM SPAM SPAM', output)
# Check the JUnit XML file written by "--junit-xml" for a failing test whose
# output contains the \x1b character: the file must be parseable and the
# captured output must contain the escaped text.
def test_xml(self):
# Source of the test module to run: its single test prints a string
# containing the \x1b escape and then fails.
code = textwrap.dedent(r"""
import unittest
from test import support
class VerboseTests(unittest.TestCase):
def test_failed(self):
print("abc \x1b def")
self.fail()
""")
testname = self.create_test(code=code)
# Run sequentially
# The XML report is written to os_helper.TESTFN, which is removed by the
# cleanup registered below.
filename = os_helper.TESTFN
self.addCleanup(os_helper.unlink, filename)
output = self.run_tests(testname, "--junit-xml", filename,
exitcode=EXITCODE_BAD_TEST)
self.check_executed_tests(output, testname,
failed=testname,
stats=TestStats(1, 1, 0))
# Test generated XML
# ElementTree.fromstring() has to parse the whole file content.
with open(filename, encoding="utf8") as fp:
content = fp.read()
testsuite = ElementTree.fromstring(content)
# Counters stored as attributes of the root element: one test, which failed.
self.assertEqual(int(testsuite.get('tests')), 1)
self.assertEqual(int(testsuite.get('errors')), 0)
self.assertEqual(int(testsuite.get('failures')), 1)
# First child of the first child of the root element
# (presumably the <testcase> element of the failed test -- TODO confirm).
testcase = testsuite[0][0]
self.assertEqual(testcase.get('status'), 'run')
self.assertEqual(testcase.get('result'), 'completed')
self.assertGreater(float(testcase.get('time')), 0)
# The expected text is a raw string: a backslash followed by "x1b".
# NOTE(review): if no 'system-out' element is found, this loop checks nothing.
for out in testcase.iter('system-out'):
self.assertEqual(out.text, r"abc \x1b def")
class TestUtils(unittest.TestCase):
def test_format_duration(self):
@ -2437,6 +2477,25 @@ class TestUtils(unittest.TestCase):
self.assertTrue(match_test(test_chdir))
self.assertFalse(match_test(test_copy))
def test_sanitize_xml(self):
    """Check which characters utils.sanitize_xml() escapes and which it keeps."""
    escape = utils.sanitize_xml
    cases = (
        # escape invalid XML characters
        ('abc \x1b\x1f def', r'abc \x1b\x1f def'),
        ('nul:\x00, bell:\x07', r'nul:\x00, bell:\x07'),
        ('surrogate:\uDC80', r'surrogate:\udc80'),
        ('illegal \uFFFE and \uFFFF', r'illegal \ufffe and \uffff'),
        # no escape for valid XML characters
        ('a\n\tb', 'a\n\tb'),
        ('valid t\xe9xt \u20ac', 'valid t\xe9xt \u20ac'),
    )
    # Cases are checked in order; the first mismatch fails the test.
    for text, expected in cases:
        self.assertEqual(escape(text), expected)
if __name__ == '__main__':
unittest.main()

View File

@ -0,0 +1,3 @@
When creating the JUnit XML file, regrtest now escapes characters which are
invalid in XML, such as the chr(27) control character used in ANSI escape
sequences. Patch by Victor Stinner.