On Thu, Nov 09, 2017 at 11:41:40AM +0800, Ming Lei wrote:
> On Wed, Nov 08, 2017 at 03:48:51PM -0700, Jens Axboe wrote:
> > This patch attempts to make the case of hctx re-running on driver tag
> > failure more robust. Without this patch, it's pretty easy to trigger a
> > stall condition with shared tags. An example is using null_blk like
> > this:
> > 
> > modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1
> > 
> > which sets up 4 devices, sharing the same tag set with a depth of 1.
> > Running a fio job ala:
> > 
> > [global]
> > bs=4k
> > rw=randread
> > norandommap
> > direct=1
> > ioengine=libaio
> > iodepth=4
> > 
> > [nullb0]
> > filename=/dev/nullb0
> > [nullb1]
> > filename=/dev/nullb1
> > [nullb2]
> > filename=/dev/nullb2
> > [nullb3]
> > filename=/dev/nullb3
> > 
> > will inevitably end with one or more threads being stuck waiting for a
> > scheduler tag. That IO is then stuck forever, until someone else
> > triggers a run of the queue.
> > 
> > Ensure that we always re-run the hardware queue, if the driver tag we
> > were waiting for got freed before we added our leftover request entries
> > back on the dispatch list.
> > 
> > Signed-off-by: Jens Axboe <axboe@xxxxxxxxx>
> > 
> > ---
> > 
> > diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
> > index 7f4a1ba532af..bb7f08415203 100644
> > --- a/block/blk-mq-debugfs.c
> > +++ b/block/blk-mq-debugfs.c
> > @@ -179,7 +179,6 @@ static const char *const hctx_state_name[] = {
> >  	HCTX_STATE_NAME(STOPPED),
> >  	HCTX_STATE_NAME(TAG_ACTIVE),
> >  	HCTX_STATE_NAME(SCHED_RESTART),
> > -	HCTX_STATE_NAME(TAG_WAITING),
> >  	HCTX_STATE_NAME(START_ON_RUN),
> >  };
> >  #undef HCTX_STATE_NAME
> > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > index 3d759bb8a5bb..8dc5db40df9d 100644
> > --- a/block/blk-mq.c
> > +++ b/block/blk-mq.c
> > @@ -998,49 +998,64 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
> >  	return rq->tag != -1;
> >  }
> >  
> > -static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
> > -				void *key)
> > +static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
> > +				int flags, void *key)
> >  {
> >  	struct blk_mq_hw_ctx *hctx;
> >  
> >  	hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
> >  
> > -	list_del(&wait->entry);
> > -	clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state);
> > +	list_del_init(&wait->entry);
> >  	blk_mq_run_hw_queue(hctx, true);
> >  	return 1;
> >  }
> >  
> > -static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx)
> > +static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx **hctx,
> > +				     struct request *rq)
> >  {
> > +	struct blk_mq_hw_ctx *this_hctx = *hctx;
> > +	wait_queue_entry_t *wait = &this_hctx->dispatch_wait;
> >  	struct sbq_wait_state *ws;
> >  
> > +	if (!list_empty_careful(&wait->entry))
> > +		return false;
> > +
> > +	spin_lock(&this_hctx->lock);
> > +	if (!list_empty(&wait->entry)) {
> > +		spin_unlock(&this_hctx->lock);
> > +		return false;
> > +	}
> > +
> > +	ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
> > +	add_wait_queue(&ws->wait, wait);
> > +
> >  	/*
> > -	 * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait.
> > -	 * The thread which wins the race to grab this bit adds the hardware
> > -	 * queue to the wait queue.
> > +	 * It's possible that a tag was freed in the window between the
> > +	 * allocation failure and adding the hardware queue to the wait
> > +	 * queue.
> >  	 */
> > -	if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) ||
> > -	    test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state))
> > +	if (!blk_mq_get_driver_tag(rq, hctx, false)) {
> > +		spin_unlock(&this_hctx->lock);
> >  		return false;
> > -
> > -	init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
> > -	ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx);
> > +	}
> >  
> >  	/*
> > -	 * As soon as this returns, it's no longer safe to fiddle with
> > -	 * hctx->dispatch_wait, since a completion can wake up the wait queue
> > -	 * and unlock the bit.
> > +	 * We got a tag, remove ourselves from the wait queue to ensure
> > +	 * someone else gets the wakeup.
> >  	 */
> > -	add_wait_queue(&ws->wait, &hctx->dispatch_wait);
> > +	spin_lock_irq(&ws->wait.lock);
> > +	list_del_init(&wait->entry);
> > +	spin_unlock_irq(&ws->wait.lock);
> > +	spin_unlock(&this_hctx->lock);
> >  	return true;
> >  }
> >  
> >  bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
> > -		bool got_budget)
> > +			     bool got_budget)
> >  {
> >  	struct blk_mq_hw_ctx *hctx;
> >  	struct request *rq, *nxt;
> > +	bool no_tag = false;
> >  	int errors, queued;
> >  
> >  	if (list_empty(list))
> > @@ -1060,22 +1075,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
> >  		if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
> >  			/*
> >  			 * The initial allocation attempt failed, so we need to
> > -			 * rerun the hardware queue when a tag is freed.
> > +			 * rerun the hardware queue when a tag is freed. The
> > +			 * waitqueue takes care of that. If the queue is run
> > +			 * before we add this entry back on the dispatch list,
> > +			 * we'll re-run it below.
> >  			 */
> > -			if (!blk_mq_dispatch_wait_add(hctx)) {
> > -				if (got_budget)
> > -					blk_mq_put_dispatch_budget(hctx);
> > -				break;
> > -			}
> > -
> > -			/*
> > -			 * It's possible that a tag was freed in the window
> > -			 * between the allocation failure and adding the
> > -			 * hardware queue to the wait queue.
> > -			 */
> > -			if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
> > +			if (!blk_mq_dispatch_wait_add(&hctx, rq)) {
> >  				if (got_budget)
> >  					blk_mq_put_dispatch_budget(hctx);
> > +				no_tag = true;
> >  				break;
> >  			}
> >  		}
> > @@ -1140,10 +1148,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
> >  	 * it is no longer set that means that it was cleared by another
> >  	 * thread and hence that a queue rerun is needed.
> >  	 *
> > -	 * If TAG_WAITING is set that means that an I/O scheduler has
> > -	 * been configured and another thread is waiting for a driver
> > -	 * tag. To guarantee fairness, do not rerun this hardware queue
> > -	 * but let the other thread grab the driver tag.
> > +	 * If 'no_tag' is set, that means that we failed getting
> > +	 * a driver tag with an I/O scheduler attached. If our dispatch
> > +	 * waitqueue is no longer active, ensure that we run the queue
> > +	 * AFTER adding our entries back to the list.
> >  	 *
> >  	 * If no I/O scheduler has been configured it is possible that
> >  	 * the hardware queue got stopped and restarted before requests
> > @@ -1156,7 +1164,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
> >  	 * and dm-rq.
> >  	 */
> >  	if (!blk_mq_sched_needs_restart(hctx) &&
> > -	    !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state))
> > +	    (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
> >  		blk_mq_run_hw_queue(hctx, true);
> 
> If one rq is just completed after the check on list_empty_careful(&hctx->dispatch_wait.entry),
> the queue may not be run any more.
> 
> May that be an issue?

Looks like that can be an issue. If I revert "Revert "blk-mq: don't handle
TAG_SHARED in restart"" and apply 'blk-mq: put driver tag if dispatch budget
can't be got' against for-4.15/block, I can still trigger the IO hang easily
within one or two minutes:

1) script

#!/bin/sh
modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1

RUNTIME=10
while true; do
	fio --bs=4k --size=512G --rw=randread --norandommap --direct=1 \
	    --ioengine=libaio --iodepth=4 --runtime=$RUNTIME --group_reporting=1 \
	    --name=nullb0 --filename=/dev/nullb0 \
	    --name=nullb1 --filename=/dev/nullb1 \
	    --name=nullb2 --filename=/dev/nullb2 \
	    --name=nullb3 --filename=/dev/nullb3
done

2) debugfs log (similar to your previous log)

[ming@VM]$ sudo ./debug/dump-blk-info /dev/nullb0
=============nullb0/hctx0==================
active 0
busy
/sys/kernel/debug/block/nullb0//hctx0/cpu0
  completed 252195 0
  dispatched 252197 0
  merged 0
  rq_list
/sys/kernel/debug/block/nullb0//hctx0/cpu1
  completed 135710 0
  dispatched 135710 0
  merged 0
  rq_list
/sys/kernel/debug/block/nullb0//hctx0/cpu2
  completed 277611 0
  dispatched 277611 0
  merged 0
  rq_list
/sys/kernel/debug/block/nullb0//hctx0/cpu3
  completed 145017 0
  dispatched 145017 0
  merged 0
  rq_list
ctx_map
00000000: 00
dispatch
ffff8802605c8000 {.op=READ, .cmd_flags=, .rq_flags=STARTED|IO_STAT, .atomic_flags=COMPLETE, .tag=-1, .internal_tag=0}
ffff8802605c8240 {.op=READ, .cmd_flags=, .rq_flags=STARTED|IO_STAT, .atomic_flags=COMPLETE, .tag=-1, .internal_tag=1}
dispatched
       0	733616
       1	807709
       2	1412
       4	0
       8	0
      16	0
     32+	0
flags
alloc_policy=FIFO SHOULD_MERGE|TAG_SHARED
io_poll
considered=0 invoked=0 success=0
queued 810535
run 1478298
sched_tags
nr_tags=2
nr_reserved_tags=0
active_queues=0
bitmap_tags:
depth=2
busy=2
bits_per_word=64
map_nr=1
alloc_hint={0, 0, 0, 0, 1, 1, 0, 0}
wake_batch=1
wake_index=1
ws={
	{.wait_cnt=1, .wait=inactive},
	{.wait_cnt=1, .wait=inactive},
	{.wait_cnt=1, .wait=inactive},
	{.wait_cnt=1, .wait=inactive},
	{.wait_cnt=1, .wait=inactive},
	{.wait_cnt=1, .wait=inactive},
	{.wait_cnt=1, .wait=inactive},
	{.wait_cnt=1, .wait=active},
}
round_robin=0
sched_tags_bitmap
00000000: 03
state SCHED_RESTART
tags
nr_tags=1
nr_reserved_tags=0
active_queues=0
bitmap_tags:
depth=1
busy=0
bits_per_word=64
map_nr=1
alloc_hint={0, 0, 0, 0, 0, 0, 0, 0}
wake_batch=1
wake_index=1
ws={
	{.wait_cnt=1, .wait=inactive},
	{.wait_cnt=1, .wait=inactive},
	{.wait_cnt=1, .wait=inactive},
	{.wait_cnt=1, .wait=inactive},
	{.wait_cnt=1, .wait=inactive},
	{.wait_cnt=1, .wait=inactive},
	{.wait_cnt=1, .wait=inactive},
	{.wait_cnt=1, .wait=inactive},
}
round_robin=0
tags_bitmap
00000000: 00

-- 
Ming
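
[Editorial note: Ming's dump-blk-info helper is not included in this thread. For readers who want to capture a similar snapshot, below is a minimal sketch of an equivalent script. Everything in it other than the debugfs paths visible in the log above (/sys/kernel/debug/block/<disk>/hctx*/...) is an assumption, including the script name and output layout, so treat it as a sketch rather than Ming's actual tool.]

#!/bin/sh
# Hypothetical dump-blk-info style helper: walk the blk-mq debugfs entries
# of the given disk and print every attribute with its contents.  Assumes
# debugfs is mounted at /sys/kernel/debug and CONFIG_BLK_DEBUG_FS is enabled.

disk=$(basename "$1")                 # e.g. /dev/nullb0 -> nullb0
root=/sys/kernel/debug/block/$disk

for hctx in "$root"/hctx*; do
	echo "=============$disk/$(basename "$hctx")=================="
	# Per-hctx attributes (state, flags, dispatch, ctx_map, tags,
	# sched_tags, ...) plus the per-cpu ctx directories underneath.
	find "$hctx" -type f | sort | while read -r f; do
		echo "${f#$root/}"
		sed 's/^/    /' "$f" 2>/dev/null
	done
done

With debugfs mounted, something like "sh dump-blk-info /dev/nullb0" should produce output in the same spirit as the log above, although the exact formatting of Ming's version is unknown.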