From 4d8fc1d2a0e7385bb27cf38795cce91e8bd5729e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 27 Jun 2014 16:24:52 +0200
Subject: [PATCH 325/352] workqueue: Prevent deadlock/stall on RT

Austin reported an XFS deadlock/stall on RT where scheduled work never
gets executed and tasks wait for each other forever.

The underlying problem is the RT modification of the code which
handles workers that are about to go to sleep. In mainline a worker
thread which goes to sleep wakes an idle worker if there is more work
to do. This happens from the guts of the schedule() function. On RT
this must happen outside of schedule(), and the accessed data
structures are not protected against scheduling due to the spinlock
to rtmutex conversion. So the naive solution to this was to move the
code outside of the scheduler and protect the data structures with
the pool lock. That approach turned out to be a little naive, as we
cannot call into that code when the thread blocks on a lock: it is
not allowed to block on two locks in parallel. So we don't call into
the worker wakeup magic when the worker is blocked on a lock, which
causes the deadlock/stall observed by Austin and Mike.

Looking deeper into that worker code, it turns out that the only
relevant data structure which needs to be protected is the list of
idle workers which can be woken up.

So the solution is to protect the list manipulation operations with
preempt_enable/disable pairs on RT and call unconditionally into the
worker code even when the worker is blocked on a lock. The preemption
protection is safe as there is nothing which can fiddle with the list
outside of thread context.
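
For clarity, the resulting usage pattern looks like this (an
illustrative summary only, condensed from the workqueue.c hunks
below; it is not part of the applied diff):

    /* Write side, e.g. worker_enter_idle()/worker_leave_idle():
     * the list manipulation runs with preemption disabled on RT,
     * and under pool->lock as before on !RT.
     */
    rt_lock_idle_list(pool);
    list_add(&worker->entry, &pool->idle_list);
    rt_unlock_idle_list(pool);

    /* Wakeup side, wq_worker_sleeping(): pool->lock is only taken on
     * !RT; on RT the preempt disabled section inside wake_up_worker()
     * is sufficient and can be entered even when the worker is
     * blocked on a lock.
     */
    sched_lock_idle_list(pool);
    wake_up_worker(pool);
    sched_unlock_idle_list(pool);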

Reported-and-tested-by: Austin Schuh <austin@peloton-tech.com>
Reported-and-tested-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: http://vger.kernel.org/r/alpine.DEB.2.10.1406271249510.5170@nanos
Cc: Richard Weinberger <richard.weinberger@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/sched/core.c |  7 +++++--
 kernel/workqueue.c  | 60 +++++++++++++++++++++++++++++++++++++++++------------
 2 files changed, 52 insertions(+), 15 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d252c77..ec9a8e8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3768,9 +3768,8 @@ void __noreturn do_task_dead(void)
 
 static inline void sched_submit_work(struct task_struct *tsk)
 {
-	if (!tsk->state || tsk_is_pi_blocked(tsk))
+	if (!tsk->state)
 		return;
-
 	/*
 	 * If a worker went to sleep, notify and ask workqueue whether
 	 * it wants to wake up a task to maintain concurrency.
@@ -3778,6 +3777,10 @@ static inline void sched_submit_work(struct task_struct *tsk)
 	if (tsk->flags & PF_WQ_WORKER)
 		wq_worker_sleeping(tsk);
 
+
+	if (tsk_is_pi_blocked(tsk))
+		return;
+
 	/*
 	 * If we are going to sleep and we have plugged IO queued,
 	 * make sure to submit it to avoid deadlocks.
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b69456e..67eb847 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -125,6 +125,11 @@ enum {
  *    cpu or grabbing pool->lock is enough for read access. If
  *    POOL_DISASSOCIATED is set, it's identical to L.
  *
+ *    On RT we need the extra protection via rt_lock_idle_list() for
+ *    the list manipulations against read access from
+ *    wq_worker_sleeping(). All other places are nicely serialized via
+ *    pool->lock.
+ *
  * A: pool->attach_mutex protected.
  *
  * PL: wq_pool_mutex protected.
@@ -430,6 +435,31 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
 	if (({ assert_rcu_or_wq_mutex(wq); false; })) { }		\
 	else
 
+#ifdef CONFIG_PREEMPT_RT_BASE
+static inline void rt_lock_idle_list(struct worker_pool *pool)
+{
+	preempt_disable();
+}
+static inline void rt_unlock_idle_list(struct worker_pool *pool)
+{
+	preempt_enable();
+}
+static inline void sched_lock_idle_list(struct worker_pool *pool) { }
+static inline void sched_unlock_idle_list(struct worker_pool *pool) { }
+#else
+static inline void rt_lock_idle_list(struct worker_pool *pool) { }
+static inline void rt_unlock_idle_list(struct worker_pool *pool) { }
+static inline void sched_lock_idle_list(struct worker_pool *pool)
+{
+	spin_lock_irq(&pool->lock);
+}
+static inline void sched_unlock_idle_list(struct worker_pool *pool)
+{
+	spin_unlock_irq(&pool->lock);
+}
+#endif
+
+
 #ifdef CONFIG_DEBUG_OBJECTS_WORK
 
 static struct debug_obj_descr work_debug_descr;
@@ -836,10 +866,16 @@ static struct worker *first_idle_worker(struct worker_pool *pool)
  */
 static void wake_up_worker(struct worker_pool *pool)
 {
-	struct worker *worker = first_idle_worker(pool);
+	struct worker *worker;
+
+	rt_lock_idle_list(pool);
+
+	worker = first_idle_worker(pool);
 
 	if (likely(worker))
 		wake_up_process(worker->task);
+
+	rt_unlock_idle_list(pool);
 }
 
 /**
@@ -868,7 +904,7 @@ void wq_worker_running(struct task_struct *task)
  */
 void wq_worker_sleeping(struct task_struct *task)
 {
-	struct worker *next, *worker = kthread_data(task);
+	struct worker *worker = kthread_data(task);
 	struct worker_pool *pool;
 
 	/*
@@ -885,26 +921,18 @@ void wq_worker_sleeping(struct task_struct *task)
 		return;
 
 	worker->sleeping = 1;
-	spin_lock_irq(&pool->lock);
 
 	/*
 	 * The counterpart of the following dec_and_test, implied mb,
 	 * worklist not empty test sequence is in insert_work().
 	 * Please read comment there.
-	 *
-	 * NOT_RUNNING is clear. This means that we're bound to and
-	 * running on the local cpu w/ rq lock held and preemption
-	 * disabled, which in turn means that none else could be
-	 * manipulating idle_list, so dereferencing idle_list without pool
-	 * lock is safe.
	 */
 	if (atomic_dec_and_test(&pool->nr_running) &&
 	    !list_empty(&pool->worklist)) {
-		next = first_idle_worker(pool);
-		if (next)
-			wake_up_process(next->task);
+		sched_lock_idle_list(pool);
+		wake_up_worker(pool);
+		sched_unlock_idle_list(pool);
 	}
-	spin_unlock_irq(&pool->lock);
 }
 
 /**
@@ -1634,7 +1662,9 @@ static void worker_enter_idle(struct worker *worker)
 	worker->last_active = jiffies;
 
 	/* idle_list is LIFO */
+	rt_lock_idle_list(pool);
 	list_add(&worker->entry, &pool->idle_list);
+	rt_unlock_idle_list(pool);
 
 	if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
 		mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
@@ -1667,7 +1697,9 @@ static void worker_leave_idle(struct worker *worker)
 		return;
 	worker_clr_flags(worker, WORKER_IDLE);
 	pool->nr_idle--;
+	rt_lock_idle_list(pool);
 	list_del_init(&worker->entry);
+	rt_unlock_idle_list(pool);
 }
 
 static struct worker *alloc_worker(int node)
@@ -1833,7 +1865,9 @@ static void destroy_worker(struct worker *worker)
 	pool->nr_workers--;
 	pool->nr_idle--;
 
+	rt_lock_idle_list(pool);
 	list_del_init(&worker->entry);
+	rt_unlock_idle_list(pool);
 	worker->flags |= WORKER_DIE;
 	wake_up_process(worker->task);
 }
--
2.7.4