On Fri, 2017-11-03 at 23:18 +0800, Ming Lei wrote: > Subject: [PATCH] SCSI_MQ: fix IO hang in case of queue busy > > We have to insert the rq back before checking .device_busy, > otherwise When IO completes just after the check and before > this req is added to hctx->dispatch, this queue may never get > chance to be run, then this IO may hang forever. > > This patch introduces BLK_STS_RESOURCE_OK for handling this > issue. > > Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxx> > --- > block/blk-mq.c | 17 +++++++++++++++++ > drivers/scsi/scsi_lib.c | 8 ++++++++ > include/linux/blk-mq.h | 1 + > include/linux/blk_types.h | 1 + > 4 files changed, 27 insertions(+) > > diff --git a/block/blk-mq.c b/block/blk-mq.c > index e4d2490f4e7e..e1e03576edca 100644 > --- a/block/blk-mq.c > +++ b/block/blk-mq.c > @@ -660,6 +660,16 @@ static void __blk_mq_requeue_request(struct request *rq) > } > } > > +void blk_mq_reinsert_request_hctx(struct blk_mq_hw_ctx *hctx, struct request *rq) > +{ > + __blk_mq_requeue_request(rq); > + > + spin_lock(&hctx->lock); > + list_add_tail(&rq->queuelist, &hctx->dispatch); > + spin_unlock(&hctx->lock); > +} > +EXPORT_SYMBOL(blk_mq_reinsert_request_hctx); > + > void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list) > { > __blk_mq_requeue_request(rq); > @@ -1165,6 +1175,12 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, > list_add(&rq->queuelist, list); > __blk_mq_requeue_request(rq); > break; > + } else if (ret == BLK_STS_RESOURCE_OK) { > + /* > + * BLK_STS_RESOURCE_OK means driver handled this > + * STS_RESOURCE already, we just need to stop dispatch. > + */ > + break; > } > > fail_rq: > @@ -1656,6 +1672,7 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, > ret = q->mq_ops->queue_rq(hctx, &bd); > switch (ret) { > case BLK_STS_OK: > + case BLK_STS_RESOURCE_OK: > *cookie = new_cookie; > return; > case BLK_STS_RESOURCE: > diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c > index 7f218ef61900..0165c1caed82 100644 > --- a/drivers/scsi/scsi_lib.c > +++ b/drivers/scsi/scsi_lib.c > @@ -2030,9 +2030,17 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, > case BLK_STS_OK: > break; > case BLK_STS_RESOURCE: > + /* > + * We have to insert the rq back before checking .device_busy, > + * otherwise when IO completes just after the check and before > + * this req is added to hctx->dispatch, this queue may never get > + * chance to be run, then this IO may hang forever. > + */ > + blk_mq_reinsert_request_hctx(hctx, req); > if (atomic_read(&sdev->device_busy) == 0 && > !scsi_device_blocked(sdev)) > blk_mq_delay_run_hw_queue(hctx, SCSI_QUEUE_DELAY); > + ret = BLK_STS_RESOURCE_OK; > break; > default: > /* > diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h > index e5e6becd57d3..4740f643d8c5 100644 > --- a/include/linux/blk-mq.h > +++ b/include/linux/blk-mq.h > @@ -244,6 +244,7 @@ void blk_mq_start_request(struct request *rq); > void blk_mq_end_request(struct request *rq, blk_status_t error); > void __blk_mq_end_request(struct request *rq, blk_status_t error); > > +void blk_mq_reinsert_request_hctx(struct blk_mq_hw_ctx *hctx, struct request *rq); > void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list); > void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, > bool kick_requeue_list); > diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h > index 3385c89f402e..b630cc026a93 100644 > --- a/include/linux/blk_types.h > +++ b/include/linux/blk_types.h > @@ -32,6 +32,7 @@ typedef u8 __bitwise blk_status_t; > #define BLK_STS_PROTECTION ((__force blk_status_t)8) > #define BLK_STS_RESOURCE ((__force blk_status_t)9) > #define BLK_STS_IOERR ((__force blk_status_t)10) > +#define BLK_STS_RESOURCE_OK ((__force blk_status_t)11) > > /* hack for device mapper, don't use elsewhere: */ > #define BLK_STS_DM_REQUEUE ((__force blk_status_t)11) Running test one with this patch applied on top of Jens' for-next branch yields the same result as without this patch: [ ... ] Error: unloading kernel module ib_srp failed Test /home/bart/software/infiniband/srp-test/tests/01 failed # dmesg -c >/dev/null; echo w > /proc/sysrq-trigger; dmesg [ 325.167295] sysrq: SysRq : Show Blocked State [ 325.173612] task PC stack pid father [ 325.181903] kworker/10:1 D 0 165 2 0x80000000 [ 325.189396] Workqueue: srp_remove srp_remove_work [ib_srp] [ 325.196222] Call Trace: [ 325.199622] __schedule+0x2fa/0xbb0 [ 325.204227] schedule+0x36/0x90 [ 325.208390] async_synchronize_cookie_domain+0x88/0x130 [ 325.214915] ? finish_wait+0x90/0x90 [ 325.219574] async_synchronize_full_domain+0x18/0x20 [ 325.225816] sd_remove+0x4d/0xc0 [sd_mod] [ 325.230949] device_release_driver_internal+0x160/0x210 [ 325.237479] device_release_driver+0x12/0x20 [ 325.242904] bus_remove_device+0x100/0x180 [ 325.248163] device_del+0x1d8/0x340 [ 325.252735] __scsi_remove_device+0xfc/0x130 [ 325.258192] scsi_forget_host+0x25/0x70 [ 325.263173] scsi_remove_host+0x79/0x120 [ 325.268227] srp_remove_work+0x90/0x1d0 [ib_srp] [ 325.274059] process_one_work+0x20a/0x660 [ 325.279243] worker_thread+0x3d/0x3b0 [ 325.284004] kthread+0x13a/0x150 [ 325.288232] ? process_one_work+0x660/0x660 [ 325.293575] ? kthread_create_on_node+0x40/0x40 [ 325.299334] ret_from_fork+0x27/0x40 [ 325.304002] kworker/5:1 D 0 180 2 0x80000000 [ 325.310768] Workqueue: kaluad alua_rtpg_work [scsi_dh_alua] [ 325.317674] Call Trace: [ 325.321047] __schedule+0x2fa/0xbb0 [ 325.325615] ? trace_hardirqs_on_caller+0xfb/0x190 [ 325.331612] schedule+0x36/0x90 [ 325.335797] schedule_timeout+0x22c/0x570 [ 325.340917] ? call_timer_fn+0x330/0x330 [ 325.345972] ? wait_for_completion_io_timeout+0xf7/0x180 [ 325.352558] io_schedule_timeout+0x1e/0x50 [ 325.357808] ? io_schedule_timeout+0x1e/0x50 [ 325.363262] wait_for_completion_io_timeout+0x11f/0x180 [ 325.369793] ? wake_up_q+0x80/0x80 [ 325.374267] blk_execute_rq+0x86/0xc0 [ 325.379012] scsi_execute+0xdb/0x1f0 [ 325.383678] alua_rtpg_work+0x2b3/0xe8a [scsi_dh_alua] [ 325.390130] process_one_work+0x20a/0x660 [ 325.395818] worker_thread+0x3d/0x3b0 [ 325.401248] kthread+0x13a/0x150 [ 325.406056] ? process_one_work+0x660/0x660 [ 325.412070] ? kthread_create_on_node+0x40/0x40 [ 325.418474] ret_from_fork+0x27/0x40 [ 325.423874] kworker/u66:1 D 0 331 2 0x80000000 [ 325.431374] Workqueue: events_unbound async_run_entry_fn [ 325.438651] Call Trace: [ 325.442662] __schedule+0x2fa/0xbb0 [ 325.447835] ? trace_hardirqs_on_caller+0xfb/0x190 [ 325.454522] schedule+0x36/0x90 [ 325.459339] schedule_timeout+0x22c/0x570 [ 325.465097] ? call_timer_fn+0x330/0x330 [ 325.470800] ? wait_for_completion_io_timeout+0xf7/0x180 [ 325.478017] io_schedule_timeout+0x1e/0x50 [ 325.483935] ? io_schedule_timeout+0x1e/0x50 [ 325.490015] wait_for_completion_io_timeout+0x11f/0x180 [ 325.497134] ? wake_up_q+0x80/0x80 [ 325.502272] blk_execute_rq+0x86/0xc0 [ 325.507652] scsi_execute+0xdb/0x1f0 [ 325.512960] sd_revalidate_disk+0xed/0x1c70 [sd_mod] [ 325.519846] sd_probe_async+0xc3/0x1d0 [sd_mod] [ 325.526199] async_run_entry_fn+0x38/0x160 [ 325.532121] process_one_work+0x20a/0x660 [ 325.537897] worker_thread+0x3d/0x3b0 [ 325.543319] kthread+0x13a/0x150 [ 325.548200] ? process_one_work+0x660/0x660 [ 325.554204] ? kthread_create_on_node+0x40/0x40 [ 325.560605] ret_from_fork+0x27/0x40