[3.13] gh-121188: Sanitize invalid XML characters in regrtest (GH-121195) (#121204)

gh-121188: Sanitize invalid XML characters in regrtest (GH-121195)

When creating the JUnit XML file, regrtest now escapes characters
which are invalid in XML, such as the chr(27) control character used
in ANSI escape sequences.
(cherry picked from commit af8c3d7a26)

Co-authored-by: Victor Stinner <vstinner@python.org>
This commit is contained in:
Miss Islington (bot) 2024-07-01 10:55:38 +02:00 committed by GitHub
parent 82777cd024
commit 1f2f9c4ff5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 91 additions and 5 deletions

View File

@ -9,6 +9,7 @@ import time
import traceback import traceback
import unittest import unittest
from test import support from test import support
from test.libregrtest.utils import sanitize_xml
class RegressionTestResult(unittest.TextTestResult): class RegressionTestResult(unittest.TextTestResult):
USE_XML = False USE_XML = False
@ -65,23 +66,24 @@ class RegressionTestResult(unittest.TextTestResult):
if capture: if capture:
if self._stdout_buffer is not None: if self._stdout_buffer is not None:
stdout = self._stdout_buffer.getvalue().rstrip() stdout = self._stdout_buffer.getvalue().rstrip()
ET.SubElement(e, 'system-out').text = stdout ET.SubElement(e, 'system-out').text = sanitize_xml(stdout)
if self._stderr_buffer is not None: if self._stderr_buffer is not None:
stderr = self._stderr_buffer.getvalue().rstrip() stderr = self._stderr_buffer.getvalue().rstrip()
ET.SubElement(e, 'system-err').text = stderr ET.SubElement(e, 'system-err').text = sanitize_xml(stderr)
for k, v in args.items(): for k, v in args.items():
if not k or not v: if not k or not v:
continue continue
e2 = ET.SubElement(e, k) e2 = ET.SubElement(e, k)
if hasattr(v, 'items'): if hasattr(v, 'items'):
for k2, v2 in v.items(): for k2, v2 in v.items():
if k2: if k2:
e2.set(k2, str(v2)) e2.set(k2, sanitize_xml(str(v2)))
else: else:
e2.text = str(v2) e2.text = sanitize_xml(str(v2))
else: else:
e2.text = str(v) e2.text = sanitize_xml(str(v))
@classmethod @classmethod
def __makeErrorDict(cls, err_type, err_value, err_tb): def __makeErrorDict(cls, err_type, err_value, err_tb):

View File

@ -5,6 +5,7 @@ import math
import os.path import os.path
import platform import platform
import random import random
import re
import shlex import shlex
import signal import signal
import subprocess import subprocess
@ -712,3 +713,24 @@ def get_signal_name(exitcode):
pass pass
return None return None
# Matches every run of consecutive characters that are invalid in XML.
ILLEGAL_XML_CHARS_RE = re.compile(
    '['
    # Control characters, except the legal TAB (\x09), LF (\x0A) and
    # CR (\x0D)
    '\x00-\x08\x0B\x0C\x0E-\x1F'
    # Surrogate characters
    '\uD800-\uDFFF'
    # Special Unicode characters
    '\uFFFE'
    '\uFFFF'
    # The trailing "+" matches multiple sequential invalid characters at
    # once, for better efficiency
    ']+')


def _sanitize_xml_replace(regs):
    r"""Return a printable, escaped form of the text matched by *regs*.

    *regs* is the match object passed by ILLEGAL_XML_CHARS_RE.sub();
    regs[0] is the whole run of invalid characters.  Characters up to
    U+00FF are written as "\xNN" (two lowercase hex digits); any other
    character is written the way ascii() spells it, without the
    surrounding quotes (for example "\udc80").
    """
    escaped = []
    for char in regs[0]:
        if char <= '\xff':
            escaped.append(f'\\x{ord(char):02x}')
        else:
            # ascii() returns a quoted literal: strip the two quotes.
            escaped.append(ascii(char)[1:-1])
    return ''.join(escaped)


def sanitize_xml(text):
    """Return *text* with characters that are invalid in XML escaped.

    Every run matched by ILLEGAL_XML_CHARS_RE is replaced by its
    backslash-escaped spelling; all other characters are kept unchanged.
    """
    return ILLEGAL_XML_CHARS_RE.sub(_sanitize_xml_replace, text)

View File

@ -21,6 +21,8 @@ import sysconfig
import tempfile import tempfile
import textwrap import textwrap
import unittest import unittest
from xml.etree import ElementTree
from test import support from test import support
from test.support import os_helper, without_optimizer from test.support import os_helper, without_optimizer
from test.libregrtest import cmdline from test.libregrtest import cmdline
@ -2243,6 +2245,44 @@ class ArgsTestCase(BaseTestCase):
self.check_executed_tests(output, testname, stats=1, parallel=True) self.check_executed_tests(output, testname, stats=1, parallel=True)
self.assertNotIn('SPAM SPAM SPAM', output) self.assertNotIn('SPAM SPAM SPAM', output)
def test_xml(self):
    # The generated test fails after printing chr(27) (ESC) to stdout.
    # The template is a raw string, so the "\x1b" escape is written
    # as-is into the generated test file.
    code = textwrap.dedent(r"""
        import unittest
        from test import support
        class VerboseTests(unittest.TestCase):
            def test_failed(self):
                print("abc \x1b def")
                self.fail()
    """)
    testname = self.create_test(code=code)

    # Run sequentially
    xml_path = os_helper.TESTFN
    self.addCleanup(os_helper.unlink, xml_path)
    output = self.run_tests(testname, "--junit-xml", xml_path,
                            exitcode=EXITCODE_BAD_TEST)
    self.check_executed_tests(output, testname,
                              failed=testname,
                              stats=TestStats(1, 1, 0))

    # Test generated XML: parsing fails if the file is not valid XML.
    with open(xml_path, encoding="utf8") as xml_file:
        testsuite = ElementTree.fromstring(xml_file.read())

    # Counters stored as attributes of the root element.
    for attr, expected in (('tests', 1), ('errors', 0), ('failures', 1)):
        self.assertEqual(int(testsuite.get(attr)), expected)

    testcase = testsuite[0][0]
    self.assertEqual(testcase.get('status'), 'run')
    self.assertEqual(testcase.get('result'), 'completed')
    self.assertGreater(float(testcase.get('time')), 0)

    # The ESC character must be stored in its escaped, printable form.
    for out in testcase.iter('system-out'):
        self.assertEqual(out.text, r"abc \x1b def")
class TestUtils(unittest.TestCase): class TestUtils(unittest.TestCase):
def test_format_duration(self): def test_format_duration(self):
@ -2426,6 +2466,25 @@ class TestUtils(unittest.TestCase):
self.assertTrue(match_test(test_chdir)) self.assertTrue(match_test(test_chdir))
self.assertFalse(match_test(test_copy)) self.assertFalse(match_test(test_copy))
def test_sanitize_xml(self):
    sanitize = utils.sanitize_xml

    # Each entry is (input text, expected sanitized text).
    cases = (
        # escape invalid XML characters
        ('abc \x1b\x1f def', r'abc \x1b\x1f def'),
        ('nul:\x00, bell:\x07', r'nul:\x00, bell:\x07'),
        ('surrogate:\uDC80', r'surrogate:\udc80'),
        ('illegal \uFFFE and \uFFFF', r'illegal \ufffe and \uffff'),
        # no escape for valid XML characters
        ('a\n\tb', 'a\n\tb'),
        ('valid t\xe9xt \u20ac', 'valid t\xe9xt \u20ac'),
    )
    for raw, expected in cases:
        self.assertEqual(sanitize(raw), expected)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View File

@ -0,0 +1,3 @@
When creating the JUnit XML file, regrtest now escapes characters which are
invalid in XML, such as the chr(27) control character used in ANSI escape
sequences. Patch by Victor Stinner.