gh-118332: Fix deadlock involving stop the world (#118412)

Avoid detaching thread state when stopping the world. When re-attaching
the thread state, the thread would attempt to resume the top-most
critical section, which might now be held by a thread paused for our
stop-the-world request.
This commit is contained in:
Sam Gross 2024-04-30 15:01:28 -04:00 committed by GitHub
parent 4a1cf66c5c
commit b2c3b70c71
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 96 additions and 7 deletions

View File

@ -150,8 +150,10 @@ PyAPI_FUNC(void) PyEvent_Wait(PyEvent *evt);
// Wait for the event to be set, or until the timeout expires. If the event is
// already set, then this returns immediately. Returns 1 if the event was set,
// and 0 if the timeout expired or thread was interrupted.
PyAPI_FUNC(int) PyEvent_WaitTimed(PyEvent *evt, PyTime_t timeout_ns);
// and 0 if the timeout expired or thread was interrupted. If `detach` is
// true, then the thread will detach/release the GIL while waiting.
PyAPI_FUNC(int)
PyEvent_WaitTimed(PyEvent *evt, PyTime_t timeout_ns, int detach);
// _PyRawMutex implements a word-sized mutex that that does not depend on the
// parking lot API, and therefore can be used in the parking lot

View File

@ -204,6 +204,90 @@ test_critical_sections_threads(PyObject *self, PyObject *Py_UNUSED(args))
Py_DECREF(test_data.obj1);
Py_RETURN_NONE;
}
static void
pysleep(int ms)
{
#ifdef MS_WINDOWS
Sleep(ms);
#else
usleep(ms * 1000);
#endif
}
struct test_data_gc {
PyObject *obj;
Py_ssize_t num_threads;
Py_ssize_t id;
Py_ssize_t countdown;
PyEvent done_event;
PyEvent ready;
};
static void
thread_gc(void *arg)
{
struct test_data_gc *test_data = arg;
PyGILState_STATE gil = PyGILState_Ensure();
Py_ssize_t id = _Py_atomic_add_ssize(&test_data->id, 1);
if (id == test_data->num_threads - 1) {
_PyEvent_Notify(&test_data->ready);
}
else {
// wait for all test threads to more reliably reproduce the issue.
PyEvent_Wait(&test_data->ready);
}
if (id == 0) {
Py_BEGIN_CRITICAL_SECTION(test_data->obj);
// pause long enough that the lock would be handed off directly to
// a waiting thread.
pysleep(5);
PyGC_Collect();
Py_END_CRITICAL_SECTION();
}
else if (id == 1) {
pysleep(1);
Py_BEGIN_CRITICAL_SECTION(test_data->obj);
pysleep(1);
Py_END_CRITICAL_SECTION();
}
else if (id == 2) {
// sleep long enough so that thread 0 is waiting to stop the world
pysleep(6);
Py_BEGIN_CRITICAL_SECTION(test_data->obj);
pysleep(1);
Py_END_CRITICAL_SECTION();
}
PyGILState_Release(gil);
if (_Py_atomic_add_ssize(&test_data->countdown, -1) == 1) {
// last thread to finish sets done_event
_PyEvent_Notify(&test_data->done_event);
}
}
static PyObject *
test_critical_sections_gc(PyObject *self, PyObject *Py_UNUSED(args))
{
// gh-118332: Contended critical sections should not deadlock with GC
const Py_ssize_t NUM_THREADS = 3;
struct test_data_gc test_data = {
.obj = PyDict_New(),
.countdown = NUM_THREADS,
.num_threads = NUM_THREADS,
};
assert(test_data.obj != NULL);
for (int i = 0; i < NUM_THREADS; i++) {
PyThread_start_new_thread(&thread_gc, &test_data);
}
PyEvent_Wait(&test_data.done_event);
Py_DECREF(test_data.obj);
Py_RETURN_NONE;
}
#endif
static PyMethodDef test_methods[] = {
@ -212,6 +296,7 @@ static PyMethodDef test_methods[] = {
{"test_critical_sections_suspend", test_critical_sections_suspend, METH_NOARGS},
#ifdef Py_CAN_START_THREADS
{"test_critical_sections_threads", test_critical_sections_threads, METH_NOARGS},
{"test_critical_sections_gc", test_critical_sections_gc, METH_NOARGS},
#endif
{NULL, NULL} /* sentinel */
};

View File

@ -501,7 +501,8 @@ ThreadHandle_join(ThreadHandle *self, PyTime_t timeout_ns)
// Wait until the deadline for the thread to exit.
PyTime_t deadline = timeout_ns != -1 ? _PyDeadline_Init(timeout_ns) : 0;
while (!PyEvent_WaitTimed(&self->thread_is_exiting, timeout_ns)) {
int detach = 1;
while (!PyEvent_WaitTimed(&self->thread_is_exiting, timeout_ns, detach)) {
if (deadline) {
// _PyDeadline_Get will return a negative value if the deadline has
// been exceeded.

View File

@ -277,12 +277,12 @@ _PyEvent_Notify(PyEvent *evt)
void
PyEvent_Wait(PyEvent *evt)
{
while (!PyEvent_WaitTimed(evt, -1))
while (!PyEvent_WaitTimed(evt, -1, /*detach=*/1))
;
}
int
PyEvent_WaitTimed(PyEvent *evt, PyTime_t timeout_ns)
PyEvent_WaitTimed(PyEvent *evt, PyTime_t timeout_ns, int detach)
{
for (;;) {
uint8_t v = _Py_atomic_load_uint8(&evt->v);
@ -298,7 +298,7 @@ PyEvent_WaitTimed(PyEvent *evt, PyTime_t timeout_ns)
uint8_t expected = _Py_HAS_PARKED;
(void) _PyParkingLot_Park(&evt->v, &expected, sizeof(evt->v),
timeout_ns, NULL, 1);
timeout_ns, NULL, detach);
return _Py_atomic_load_uint8(&evt->v) == _Py_LOCKED;
}

View File

@ -2238,7 +2238,8 @@ stop_the_world(struct _stoptheworld_state *stw)
}
PyTime_t wait_ns = 1000*1000; // 1ms (arbitrary, may need tuning)
if (PyEvent_WaitTimed(&stw->stop_event, wait_ns)) {
int detach = 0;
if (PyEvent_WaitTimed(&stw->stop_event, wait_ns, detach)) {
assert(stw->thread_countdown == 0);
break;
}