On Tue, 2011-08-23 at 16:32 -0500, James Bottomley wrote:
> > scsi_request_fn(), which does:
> >
> > 	/*
> > 	 * XXX(hch): This is rather suboptimal, scsi_dispatch_cmd will
> > 	 * take the lock again.
> > 	 */
> > 	spin_unlock_irq(shost->host_lock);
>
> Um, that one looks like a mistake missed in cleanup (probably years
> ago). The other host_lock acquisitions aren't _irq, so this one
> shouldn't be either. I can patch it up (or you can).

I would much appreciate it if someone who knows that whole stack would
audit it. My preferred solution is removing that blk plug stuff from
the scheduler guts, since clearly nobody bothered making sure it's
sane. Something like the three patches below, lifted from -rt:

sched-separate-the-scheduler-entry-for-preemption.patch
sched-move-blk_schedule_flush_plug-out-of-__schedule.patch
block-shorten-interrupt-disabled-regions.patch
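For reference, the hazard under discussion: spin_unlock_irq() enables
interrupts unconditionally, so calling it inside a local_irq_save()
section silently discards the saved state. A minimal illustration
(hypothetical lock, not the actual SCSI code):

	static DEFINE_SPINLOCK(inner_lock);

	static void broken_pattern(void)
	{
		unsigned long flags;

		local_irq_save(flags);		/* caller wants irqs off */

		spin_lock(&inner_lock);
		spin_unlock_irq(&inner_lock);	/* re-enables irqs no matter
						 * what 'flags' says */

		local_irq_restore(flags);	/* too late: irqs were on
						 * for a window above */
	}

The SCSI path only ends up with interrupts off again because it takes
spin_lock_irq(q->queue_lock) before returning to the block layer, not
because anything restores the saved flags.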
Subject: block: Shorten interrupt disabled regions
From: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Date: Wed, 22 Jun 2011 19:47:02 +0200

Moving the blk_schedule_flush_plug() call out of the interrupt/preempt
disabled region in the scheduler allows us to replace
local_irq_save/restore(flags) by local_irq_disable/enable() in
blk_flush_plug().

Now instead of doing this we disable interrupts explicitly when we
lock the request_queue and reenable them when we drop the lock. That
allows interrupts to be handled when the plug list contains requests
for more than one queue.

Aside from that, this change makes the scope of the irq disabled
region more obvious. The current code confused the hell out of me when
looking at:

 local_irq_save(flags);
   spin_lock(q->queue_lock);
     ...
     queue_unplugged(q...);
       scsi_request_fn();
         spin_unlock(q->queue_lock);
         spin_lock(shost->host_lock);
         spin_unlock_irq(shost->host_lock);
         -------------------^^^ ????
         spin_lock_irq(q->queue_lock);
         spin_unlock(q->queue_lock);
 local_irq_restore(flags);

Also add a comment to __blk_run_queue() documenting that
q->request_fn() can drop q->queue_lock and reenable interrupts, but
must return with q->queue_lock held and interrupts disabled.

Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Tejun Heo <tj@xxxxxxxxxx>
Cc: Jens Axboe <axboe@xxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Link: http://lkml.kernel.org/r/20110622174919.025446432@xxxxxxxxxxxxx
---
 block/blk-core.c |   20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

Index: linux-2.6/block/blk-core.c
===================================================================
--- linux-2.6.orig/block/blk-core.c
+++ linux-2.6/block/blk-core.c
@@ -301,7 +301,11 @@ void __blk_run_queue(struct request_queu
 {
 	if (unlikely(blk_queue_stopped(q)))
 		return;
-
+	/*
+	 * q->request_fn() can drop q->queue_lock and reenable
+	 * interrupts, but must return with q->queue_lock held and
+	 * interrupts disabled.
+	 */
 	q->request_fn(q);
 }
 EXPORT_SYMBOL(__blk_run_queue);
@@ -2667,11 +2671,11 @@ static void queue_unplugged(struct reque
 	 * this lock).
 	 */
 	if (from_schedule) {
-		spin_unlock(q->queue_lock);
+		spin_unlock_irq(q->queue_lock);
 		blk_run_queue_async(q);
 	} else {
 		__blk_run_queue(q);
-		spin_unlock(q->queue_lock);
+		spin_unlock_irq(q->queue_lock);
 	}
 }
 
@@ -2697,7 +2701,6 @@ static void flush_plug_callbacks(struct
 void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 {
 	struct request_queue *q;
-	unsigned long flags;
 	struct request *rq;
 	LIST_HEAD(list);
 	unsigned int depth;
@@ -2718,11 +2721,6 @@ void blk_flush_plug_list(struct blk_plug
 	q = NULL;
 	depth = 0;
 
-	/*
-	 * Save and disable interrupts here, to avoid doing it for every
-	 * queue lock we have to take.
-	 */
-	local_irq_save(flags);
 	while (!list_empty(&list)) {
 		rq = list_entry_rq(list.next);
 		list_del_init(&rq->queuelist);
@@ -2735,7 +2733,7 @@ void blk_flush_plug_list(struct blk_plug
 			queue_unplugged(q, depth, from_schedule);
 			q = rq->q;
 			depth = 0;
-			spin_lock(q->queue_lock);
+			spin_lock_irq(q->queue_lock);
 		}
 		/*
 		 * rq is already accounted, so use raw insert
@@ -2753,8 +2751,6 @@ void blk_flush_plug_list(struct blk_plug
 	 */
 	if (q)
 		queue_unplugged(q, depth, from_schedule);
-
-	local_irq_restore(flags);
 }
 
 void blk_finish_plug(struct blk_plug *plug)
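The pattern of the conversion, abstracted (a sketch assuming the
function is now always entered with interrupts enabled, which the
scheduler-side patches below guarantee):

	/* Before: one irq-off region spans every queue we touch. */
	local_irq_save(flags);
	while (!list_empty(&list)) {
		spin_lock(q->queue_lock);
		/* ... insert/dispatch requests ... */
		spin_unlock(q->queue_lock);
	}
	local_irq_restore(flags);

	/* After: irqs are off only while a queue_lock is actually
	 * held, so pending interrupts get serviced between queues. */
	while (!list_empty(&list)) {
		spin_lock_irq(q->queue_lock);
		/* ... insert/dispatch requests ... */
		spin_unlock_irq(q->queue_lock);
	}

Note that spin_lock_irq()/spin_unlock_irq() are only correct when the
caller is known to run with interrupts enabled; that is exactly what
moving the plug flush out of the irq-disabled scheduler path buys us.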
Subject: sched: Move blk_schedule_flush_plug() out of __schedule()
From: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Date: Wed, 22 Jun 2011 19:47:01 +0200

There is no real reason to run blk_schedule_flush_plug() with
interrupts and preemption disabled. Move it into schedule() and call
it when the task is going voluntarily to sleep. There might be false
positives when the task is woken between that call and actually
scheduling, but that's not really different from being woken
immediately after switching away.

Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Tejun Heo <tj@xxxxxxxxxx>
Cc: Jens Axboe <axboe@xxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Link: http://lkml.kernel.org/r/20110622174918.917236198@xxxxxxxxxxxxx
---
 kernel/sched.c |   25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -4253,16 +4253,6 @@ need_resched:
 				if (to_wakeup)
 					try_to_wake_up_local(to_wakeup);
 			}
-
-			/*
-			 * If we are going to sleep and we have plugged IO
-			 * queued, make sure to submit it to avoid deadlocks.
-			 */
-			if (blk_needs_flush_plug(prev)) {
-				raw_spin_unlock(&rq->lock);
-				blk_schedule_flush_plug(prev);
-				raw_spin_lock(&rq->lock);
-			}
 		}
 		switch_count = &prev->nvcsw;
 	}
@@ -4301,8 +4291,23 @@ need_resched:
 		goto need_resched;
 }
 
+static inline void sched_submit_work(struct task_struct *tsk)
+{
+	if (!tsk->state)
+		return;
+	/*
+	 * If we are going to sleep and we have plugged IO queued,
+	 * make sure to submit it to avoid deadlocks.
+	 */
+	if (blk_needs_flush_plug(tsk))
+		blk_schedule_flush_plug(tsk);
+}
+
 asmlinkage void schedule(void)
 {
+	struct task_struct *tsk = current;
+
+	sched_submit_work(tsk);
 	__schedule();
 }
 EXPORT_SYMBOL(schedule);
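The !tsk->state check means only a task that has already marked itself
as sleeping triggers the flush. A typical wait loop (a generic sketch,
not code from this patch) shows where that happens:

	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (condition)
			break;
		schedule();	/* state != TASK_RUNNING here, so
				 * sched_submit_work() flushes the plug */
	}
	__set_current_state(TASK_RUNNING);

A wakeup slipping in between sched_submit_work() and __schedule() just
means we flushed for nothing; that is the harmless false positive
mentioned in the changelog.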
Subject: sched: Separate the scheduler entry for preemption
From: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Date: Wed, 22 Jun 2011 19:47:00 +0200

Block-IO and workqueues call into notifier functions from the
scheduler core code with interrupts and preemption disabled. These
calls should be made before entering the scheduler core.

To simplify this, separate the scheduler core code into __schedule().
__schedule() is directly called from the places which set
PREEMPT_ACTIVE and from schedule(). This allows us to add the work
checks into schedule(), so they are only called when a task
voluntarily goes to sleep.

Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Tejun Heo <tj@xxxxxxxxxx>
Cc: Jens Axboe <axboe@xxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Link: http://lkml.kernel.org/r/20110622174918.813258321@xxxxxxxxxxxxx
---
 kernel/sched.c |   15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -4210,9 +4210,9 @@ pick_next_task(struct rq *rq)
 }
 
 /*
- * schedule() is the main scheduler function.
+ * __schedule() is the main scheduler function.
  */
-asmlinkage void __sched schedule(void)
+static void __sched __schedule(void)
 {
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
@@ -4300,6 +4300,11 @@ need_resched:
 	if (need_resched())
 		goto need_resched;
 }
+
+asmlinkage void schedule(void)
+{
+	__schedule();
+}
 EXPORT_SYMBOL(schedule);
 
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
@@ -4373,7 +4378,7 @@ asmlinkage void __sched notrace preempt_
 
 	do {
 		add_preempt_count_notrace(PREEMPT_ACTIVE);
-		schedule();
+		__schedule();
 		sub_preempt_count_notrace(PREEMPT_ACTIVE);
 
 		/*
@@ -4401,7 +4406,7 @@ asmlinkage void __sched preempt_schedule
 
 	do {
 		add_preempt_count(PREEMPT_ACTIVE);
 		local_irq_enable();
-		schedule();
+		__schedule();
 		local_irq_disable();
 		sub_preempt_count(PREEMPT_ACTIVE);
@@ -5526,7 +5531,7 @@ static inline int should_resched(void)
 
 static void __cond_resched(void)
 {
 	add_preempt_count(PREEMPT_ACTIVE);
-	schedule();
+	__schedule();
 	sub_preempt_count(PREEMPT_ACTIVE);
 }
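Taken together with the previous patch, the two entry points end up
with the following shape (a condensed sketch of the result, not a
verbatim copy of the code):

	asmlinkage void schedule(void)		/* voluntary entry */
	{
		sched_submit_work(current);	/* may flush plugged IO */
		__schedule();
	}

	asmlinkage void __sched notrace preempt_schedule(void)
	{
		do {			/* involuntary entry */
			add_preempt_count_notrace(PREEMPT_ACTIVE);
			__schedule();	/* no work submission: the
					 * preempted task can be at an
					 * arbitrary point, e.g. holding
					 * locks */
			sub_preempt_count_notrace(PREEMPT_ACTIVE);
			barrier();
		} while (need_resched());
	}

Only the voluntary path does the extra work; every PREEMPT_ACTIVE
caller bypasses it by entering through __schedule() directly.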