On Tue, 2011-08-23 at 16:32 -0500, James Bottomley wrote:
> > scsi_request_fn(), which does:
> >
> > 	/*
> > 	 * XXX(hch): This is rather suboptimal, scsi_dispatch_cmd will
> > 	 * take the lock again.
> > 	 */
> > 	spin_unlock_irq(shost->host_lock);
>
> Um, that one looks like a mistake missed in cleanup (probably years
> ago). The other host_lock acquisitions aren't _irq, so this one
> shouldn't be either. I can patch it up (or you can).

I would much appreciate it if someone who knows that whole stack would
audit it. My preferred solution is removing that blk plug stuff from
the scheduler guts, since clearly nobody bothered making sure it's
sane. Something like the three patches below, lifted from -rt:

sched-separate-the-scheduler-entry-for-preemption.patch
sched-move-blk_schedule_flush_plug-out-of-__schedule.patch
block-shorten-interrupt-disabled-regions.patch
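For reference, the hazard under discussion: spin_unlock_irq() enables
interrupts unconditionally, so calling it inside a local_irq_save()
section silently discards the saved state. A minimal illustration
(hypothetical lock, not the actual SCSI code):

	static DEFINE_SPINLOCK(inner_lock);

	static void broken_pattern(void)
	{
		unsigned long flags;

		local_irq_save(flags);		/* caller wants irqs off */

		spin_lock(&inner_lock);
		spin_unlock_irq(&inner_lock);	/* re-enables irqs no matter
						 * what 'flags' says */

		local_irq_restore(flags);	/* too late: irqs were on
						 * for a window above */
	}

The SCSI path only ends up with interrupts off again because it takes
spin_lock_irq(q->queue_lock) before returning to the block layer, not
because anything restores the saved flags.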
Subject: block: Shorten interrupt disabled regions
From: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Date: Wed, 22 Jun 2011 19:47:02 +0200

Moving the blk_schedule_flush_plug() call out of the interrupt/preempt
disabled region in the scheduler allows us to replace
local_irq_save/restore(flags) by local_irq_disable/enable() in
blk_flush_plug().

Now instead of doing this we disable interrupts explicitly when we
lock the request_queue and reenable them when we drop the lock. That
allows interrupts to be handled when the plug list contains requests
for more than one queue.

Aside from that, this change makes the scope of the irq disabled
region more obvious. The current code confused the hell out of me when
looking at:

 local_irq_save(flags);
   spin_lock(q->queue_lock);
     ...
     queue_unplugged(q...);
       scsi_request_fn();
         spin_unlock(q->queue_lock);
         spin_lock(shost->host_lock);
         spin_unlock_irq(shost->host_lock);
         -------------------^^^ ????
         spin_lock_irq(q->queue_lock);
         spin_unlock(q->queue_lock);
 local_irq_restore(flags);

Also add a comment to __blk_run_queue() documenting that
q->request_fn() can drop q->queue_lock and reenable interrupts, but
must return with q->queue_lock held and interrupts disabled.

Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Tejun Heo <tj@xxxxxxxxxx>
Cc: Jens Axboe <axboe@xxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Link: http://lkml.kernel.org/r/20110622174919.025446432@xxxxxxxxxxxxx
---
 block/blk-core.c |   20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

Index: linux-2.6/block/blk-core.c
===================================================================
--- linux-2.6.orig/block/blk-core.c
+++ linux-2.6/block/blk-core.c
@@ -301,7 +301,11 @@ void __blk_run_queue(struct request_queu
 {
 	if (unlikely(blk_queue_stopped(q)))
 		return;
-
+	/*
+	 * q->request_fn() can drop q->queue_lock and reenable
+	 * interrupts, but must return with q->queue_lock held and
+	 * interrupts disabled.
+	 */
 	q->request_fn(q);
 }
 EXPORT_SYMBOL(__blk_run_queue);
@@ -2667,11 +2671,11 @@ static void queue_unplugged(struct reque
 	 * this lock).
 	 */
 	if (from_schedule) {
-		spin_unlock(q->queue_lock);
+		spin_unlock_irq(q->queue_lock);
 		blk_run_queue_async(q);
 	} else {
 		__blk_run_queue(q);
-		spin_unlock(q->queue_lock);
+		spin_unlock_irq(q->queue_lock);
 	}
 }
 
@@ -2697,7 +2701,6 @@ static void flush_plug_callbacks(struct
 void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 {
 	struct request_queue *q;
-	unsigned long flags;
 	struct request *rq;
 	LIST_HEAD(list);
 	unsigned int depth;
@@ -2718,11 +2721,6 @@ void blk_flush_plug_list(struct blk_plug
 	q = NULL;
 	depth = 0;
 
-	/*
-	 * Save and disable interrupts here, to avoid doing it for every
-	 * queue lock we have to take.
-	 */
-	local_irq_save(flags);
 	while (!list_empty(&list)) {
 		rq = list_entry_rq(list.next);
 		list_del_init(&rq->queuelist);
@@ -2735,7 +2733,7 @@ void blk_flush_plug_list(struct blk_plug
 			queue_unplugged(q, depth, from_schedule);
 			q = rq->q;
 			depth = 0;
-			spin_lock(q->queue_lock);
+			spin_lock_irq(q->queue_lock);
 		}
 		/*
 		 * rq is already accounted, so use raw insert
@@ -2753,8 +2751,6 @@ void blk_flush_plug_list(struct blk_plug
 	 */
 	if (q)
 		queue_unplugged(q, depth, from_schedule);
-
-	local_irq_restore(flags);
 }
 
 void blk_finish_plug(struct blk_plug *plug)
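The pattern of the conversion, abstracted (a sketch assuming the
function is now always entered with interrupts enabled, which the
scheduler-side patches below guarantee):

	/* Before: one irq-off region spans every queue we touch. */
	local_irq_save(flags);
	while (!list_empty(&list)) {
		spin_lock(q->queue_lock);
		/* ... insert/dispatch requests ... */
		spin_unlock(q->queue_lock);
	}
	local_irq_restore(flags);

	/* After: irqs are off only while a queue_lock is actually
	 * held, so pending interrupts get serviced between queues. */
	while (!list_empty(&list)) {
		spin_lock_irq(q->queue_lock);
		/* ... insert/dispatch requests ... */
		spin_unlock_irq(q->queue_lock);
	}

Note that spin_lock_irq()/spin_unlock_irq() are only correct when the
caller is known to run with interrupts enabled; that is exactly what
moving the plug flush out of the irq-disabled scheduler path buys us.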
Subject: sched: Move blk_schedule_flush_plug() out of __schedule()
From: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Date: Wed, 22 Jun 2011 19:47:01 +0200

There is no real reason to run blk_schedule_flush_plug() with
interrupts and preemption disabled. Move it into schedule() and call
it when the task is going voluntarily to sleep. There might be false
positives when the task is woken between that call and actually
scheduling, but that's not really different from being woken
immediately after switching away.

Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Tejun Heo <tj@xxxxxxxxxx>
Cc: Jens Axboe <axboe@xxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Link: http://lkml.kernel.org/r/20110622174918.917236198@xxxxxxxxxxxxx
---
 kernel/sched.c |   25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -4253,16 +4253,6 @@ need_resched:
 				if (to_wakeup)
 					try_to_wake_up_local(to_wakeup);
 			}
-
-			/*
-			 * If we are going to sleep and we have plugged IO
-			 * queued, make sure to submit it to avoid deadlocks.
-			 */
-			if (blk_needs_flush_plug(prev)) {
-				raw_spin_unlock(&rq->lock);
-				blk_schedule_flush_plug(prev);
-				raw_spin_lock(&rq->lock);
-			}
 		}
 		switch_count = &prev->nvcsw;
 	}
@@ -4301,8 +4291,23 @@ need_resched:
 		goto need_resched;
 }
 
+static inline void sched_submit_work(struct task_struct *tsk)
+{
+	if (!tsk->state)
+		return;
+	/*
+	 * If we are going to sleep and we have plugged IO queued,
+	 * make sure to submit it to avoid deadlocks.
+	 */
+	if (blk_needs_flush_plug(tsk))
+		blk_schedule_flush_plug(tsk);
+}
+
 asmlinkage void schedule(void)
 {
+	struct task_struct *tsk = current;
+
+	sched_submit_work(tsk);
 	__schedule();
 }
 EXPORT_SYMBOL(schedule);
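The !tsk->state check means only a task that has already marked itself
as sleeping triggers the flush. A typical wait loop (a generic sketch,
not code from this patch) shows where that happens:

	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (condition)
			break;
		schedule();	/* state != TASK_RUNNING here, so
				 * sched_submit_work() flushes the plug */
	}
	__set_current_state(TASK_RUNNING);

A wakeup slipping in between sched_submit_work() and __schedule() just
means we flushed for nothing; that is the harmless false positive
mentioned in the changelog.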
Subject: sched: Separate the scheduler entry for preemption
From: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Date: Wed, 22 Jun 2011 19:47:00 +0200

Block-IO and workqueues call into notifier functions from the
scheduler core code with interrupts and preemption disabled. These
calls should be made before entering the scheduler core.

To simplify this, separate the scheduler core code into __schedule().
__schedule() is directly called from the places which set
PREEMPT_ACTIVE and from schedule(). This allows us to add the work
checks into schedule(), so they are only called when a task
voluntarily goes to sleep.

Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Tejun Heo <tj@xxxxxxxxxx>
Cc: Jens Axboe <axboe@xxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Link: http://lkml.kernel.org/r/20110622174918.813258321@xxxxxxxxxxxxx
---
 kernel/sched.c |   15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -4210,9 +4210,9 @@ pick_next_task(struct rq *rq)
 }
 
 /*
- * schedule() is the main scheduler function.
+ * __schedule() is the main scheduler function.
  */
-asmlinkage void __sched schedule(void)
+static void __sched __schedule(void)
 {
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
@@ -4300,6 +4300,11 @@ need_resched:
 	if (need_resched())
 		goto need_resched;
 }
+
+asmlinkage void schedule(void)
+{
+	__schedule();
+}
 EXPORT_SYMBOL(schedule);
 
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
@@ -4373,7 +4378,7 @@ asmlinkage void __sched notrace preempt_
 
 	do {
 		add_preempt_count_notrace(PREEMPT_ACTIVE);
-		schedule();
+		__schedule();
 		sub_preempt_count_notrace(PREEMPT_ACTIVE);
 
 		/*
@@ -4401,7 +4406,7 @@ asmlinkage void __sched preempt_schedule
 
 	do {
 		add_preempt_count(PREEMPT_ACTIVE);
 		local_irq_enable();
-		schedule();
+		__schedule();
 		local_irq_disable();
 		sub_preempt_count(PREEMPT_ACTIVE);
@@ -5526,7 +5531,7 @@ static inline int should_resched(void)
 
 static void __cond_resched(void)
 {
 	add_preempt_count(PREEMPT_ACTIVE);
-	schedule();
+	__schedule();
 	sub_preempt_count(PREEMPT_ACTIVE);
 }
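Taken together with the previous patch, the two entry points end up
with the following shape (a condensed sketch of the result, not a
verbatim copy of the code):

	asmlinkage void schedule(void)		/* voluntary entry */
	{
		sched_submit_work(current);	/* may flush plugged IO */
		__schedule();
	}

	asmlinkage void __sched notrace preempt_schedule(void)
	{
		do {			/* involuntary entry */
			add_preempt_count_notrace(PREEMPT_ACTIVE);
			__schedule();	/* no work submission: the
					 * preempted task can be at an
					 * arbitrary point, e.g. holding
					 * locks */
			sub_preempt_count_notrace(PREEMPT_ACTIVE);
			barrier();
		} while (need_resched());
	}

Only the voluntary path does the extra work; every PREEMPT_ACTIVE
caller bypasses it by entering through __schedule() directly.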