bpo-37531: regrtest ignores output on timeout (GH-16659)
bpo-37531, bpo-38207: On timeout, regrtest no longer attempts to call
`popen.communicate() again: it can hang until all child processes
using stdout and stderr pipes completes. Kill the worker process and
ignores its output.
Reenable test_regrtest.test_multiprocessing_timeout().
bpo-37531: Change also the faulthandler timeout of the main process
from 1 minute to 5 minutes, for Python slowest buildbots.
(cherry picked from commit 0ec618af98
)
Co-authored-by: Victor Stinner <vstinner@python.org>
This commit is contained in:
parent
d004a5b082
commit
359a1975cb
|
@ -22,6 +22,12 @@ from test.libregrtest.utils import format_duration, print_warning
|
||||||
PROGRESS_UPDATE = 30.0 # seconds
|
PROGRESS_UPDATE = 30.0 # seconds
|
||||||
assert PROGRESS_UPDATE >= PROGRESS_MIN_TIME
|
assert PROGRESS_UPDATE >= PROGRESS_MIN_TIME
|
||||||
|
|
||||||
|
# Kill the main process after 5 minutes. It is supposed to write an update
|
||||||
|
# every PROGRESS_UPDATE seconds. Tolerate 5 minutes for Python slowest
|
||||||
|
# buildbot workers.
|
||||||
|
MAIN_PROCESS_TIMEOUT = 5 * 60.0
|
||||||
|
assert MAIN_PROCESS_TIMEOUT >= PROGRESS_UPDATE
|
||||||
|
|
||||||
# Time to wait until a worker completes: should be immediate
|
# Time to wait until a worker completes: should be immediate
|
||||||
JOIN_TIMEOUT = 30.0 # seconds
|
JOIN_TIMEOUT = 30.0 # seconds
|
||||||
|
|
||||||
|
@ -160,20 +166,6 @@ class TestWorkerProcess(threading.Thread):
|
||||||
result = TestResult(test_name, error_type, test_time, None)
|
result = TestResult(test_name, error_type, test_time, None)
|
||||||
return MultiprocessResult(result, stdout, stderr, err_msg)
|
return MultiprocessResult(result, stdout, stderr, err_msg)
|
||||||
|
|
||||||
def _timedout(self, test_name):
|
|
||||||
self._kill()
|
|
||||||
|
|
||||||
stdout = stderr = ''
|
|
||||||
popen = self._popen
|
|
||||||
try:
|
|
||||||
stdout, stderr = popen.communicate(timeout=JOIN_TIMEOUT)
|
|
||||||
except (subprocess.TimeoutExpired, OSError) as exc:
|
|
||||||
print_warning(f"Failed to read {self} output "
|
|
||||||
f"(timeout={format_duration(JOIN_TIMEOUT)}): "
|
|
||||||
f"{exc!r}")
|
|
||||||
|
|
||||||
return self.mp_result_error(test_name, TIMEOUT, stdout, stderr)
|
|
||||||
|
|
||||||
def _run_process(self, test_name):
|
def _run_process(self, test_name):
|
||||||
self.start_time = time.monotonic()
|
self.start_time = time.monotonic()
|
||||||
|
|
||||||
|
@ -197,23 +189,32 @@ class TestWorkerProcess(threading.Thread):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
stdout, stderr = popen.communicate(timeout=self.timeout)
|
stdout, stderr = popen.communicate(timeout=self.timeout)
|
||||||
|
retcode = popen.returncode
|
||||||
|
assert retcode is not None
|
||||||
except subprocess.TimeoutExpired:
|
except subprocess.TimeoutExpired:
|
||||||
if self._stopped:
|
if self._stopped:
|
||||||
# kill() has been called: communicate() fails
|
# kill() has been called: communicate() fails
|
||||||
# on reading closed stdout/stderr
|
# on reading closed stdout/stderr
|
||||||
raise ExitThread
|
raise ExitThread
|
||||||
|
|
||||||
return self._timedout(test_name)
|
# On timeout, kill the process
|
||||||
|
self._kill()
|
||||||
|
|
||||||
|
# None means TIMEOUT for the caller
|
||||||
|
retcode = None
|
||||||
|
# bpo-38207: Don't attempt to call communicate() again: on it
|
||||||
|
# can hang until all child processes using stdout and stderr
|
||||||
|
# pipes completes.
|
||||||
|
stdout = stderr = ''
|
||||||
except OSError:
|
except OSError:
|
||||||
if self._stopped:
|
if self._stopped:
|
||||||
# kill() has been called: communicate() fails
|
# kill() has been called: communicate() fails
|
||||||
# on reading closed stdout/stderr
|
# on reading closed stdout/stderr
|
||||||
raise ExitThread
|
raise ExitThread
|
||||||
raise
|
raise
|
||||||
|
else:
|
||||||
retcode = popen.returncode
|
stdout = stdout.strip()
|
||||||
stdout = stdout.strip()
|
stderr = stderr.rstrip()
|
||||||
stderr = stderr.rstrip()
|
|
||||||
|
|
||||||
return (retcode, stdout, stderr)
|
return (retcode, stdout, stderr)
|
||||||
except:
|
except:
|
||||||
|
@ -225,13 +226,10 @@ class TestWorkerProcess(threading.Thread):
|
||||||
self.current_test_name = None
|
self.current_test_name = None
|
||||||
|
|
||||||
def _runtest(self, test_name):
|
def _runtest(self, test_name):
|
||||||
result = self._run_process(test_name)
|
retcode, stdout, stderr = self._run_process(test_name)
|
||||||
|
|
||||||
if isinstance(result, MultiprocessResult):
|
if retcode is None:
|
||||||
# _timedout() case
|
return self.mp_result_error(test_name, TIMEOUT, stdout, stderr)
|
||||||
return result
|
|
||||||
|
|
||||||
retcode, stdout, stderr = result
|
|
||||||
|
|
||||||
err_msg = None
|
err_msg = None
|
||||||
if retcode != 0:
|
if retcode != 0:
|
||||||
|
@ -250,7 +248,8 @@ class TestWorkerProcess(threading.Thread):
|
||||||
err_msg = "Failed to parse worker JSON: %s" % exc
|
err_msg = "Failed to parse worker JSON: %s" % exc
|
||||||
|
|
||||||
if err_msg is not None:
|
if err_msg is not None:
|
||||||
return self.mp_result_error(test_name, CHILD_ERROR, stdout, stderr, err_msg)
|
return self.mp_result_error(test_name, CHILD_ERROR,
|
||||||
|
stdout, stderr, err_msg)
|
||||||
|
|
||||||
return MultiprocessResult(result, stdout, stderr, err_msg)
|
return MultiprocessResult(result, stdout, stderr, err_msg)
|
||||||
|
|
||||||
|
@ -289,6 +288,16 @@ class TestWorkerProcess(threading.Thread):
|
||||||
f"{exc!r}")
|
f"{exc!r}")
|
||||||
|
|
||||||
def wait_stopped(self, start_time):
|
def wait_stopped(self, start_time):
|
||||||
|
# bpo-38207: MultiprocessTestRunner.stop_workers() called self.stop()
|
||||||
|
# which killed the process. Sometimes, killing the process from the
|
||||||
|
# main thread does not interrupt popen.communicate() in
|
||||||
|
# TestWorkerProcess thread. This loop with a timeout is a workaround
|
||||||
|
# for that.
|
||||||
|
#
|
||||||
|
# Moreover, if this method fails to join the thread, it is likely
|
||||||
|
# that Python will hang at exit while calling threading._shutdown()
|
||||||
|
# which tries again to join the blocked thread. Regrtest.main()
|
||||||
|
# uses EXIT_TIMEOUT to workaround this second bug.
|
||||||
while True:
|
while True:
|
||||||
# Write a message every second
|
# Write a message every second
|
||||||
self.join(1.0)
|
self.join(1.0)
|
||||||
|
@ -355,7 +364,8 @@ class MultiprocessTestRunner:
|
||||||
timeout = PROGRESS_UPDATE
|
timeout = PROGRESS_UPDATE
|
||||||
while True:
|
while True:
|
||||||
if use_faulthandler:
|
if use_faulthandler:
|
||||||
faulthandler.dump_traceback_later(timeout * 2.0, exit=True)
|
faulthandler.dump_traceback_later(MAIN_PROCESS_TIMEOUT,
|
||||||
|
exit=True)
|
||||||
|
|
||||||
# wait for a thread
|
# wait for a thread
|
||||||
try:
|
try:
|
||||||
|
|
|
@ -0,0 +1,5 @@
|
||||||
|
On timeout, regrtest no longer attempts to call ``popen.communicate()``
|
||||||
|
again: it can hang until all child processes using stdout and stderr pipes
|
||||||
|
completes. Kill the worker process and ignores its output. Change also the
|
||||||
|
faulthandler timeout of the main process from 1 minute to 5 minutes, for Python
|
||||||
|
slowest buildbots.
|
Loading…
Reference in New Issue