On Thu, Nov 09, 2017 at 11:41:40AM +0800, Ming Lei wrote:
> On Wed, Nov 08, 2017 at 03:48:51PM -0700, Jens Axboe wrote:
> > This patch attempts to make the case of hctx re-running on driver tag
> > failure more robust. Without this patch, it's pretty easy to trigger a
> > stall condition with shared tags. An example is using null_blk like
> > this:
> > 
> > modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1
> > 
> > which sets up 4 devices, sharing the same tag set with a depth of 1.
> > Running a fio job ala:
> > 
> > [global]
> > bs=4k
> > rw=randread
> > norandommap
> > direct=1
> > ioengine=libaio
> > iodepth=4
> > 
> > [nullb0]
> > filename=/dev/nullb0
> > [nullb1]
> > filename=/dev/nullb1
> > [nullb2]
> > filename=/dev/nullb2
> > [nullb3]
> > filename=/dev/nullb3
> > 
> > will inevitably end with one or more threads being stuck waiting for a
> > scheduler tag. That IO is then stuck forever, until someone else
> > triggers a run of the queue.
> > 
> > Ensure that we always re-run the hardware queue, if the driver tag we
> > were waiting for got freed before we added our leftover request entries
> > back on the dispatch list.
> > 
> > Signed-off-by: Jens Axboe <axboe@xxxxxxxxx>
> > 
> > ---
> > 
> > diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
> > index 7f4a1ba532af..bb7f08415203 100644
> > --- a/block/blk-mq-debugfs.c
> > +++ b/block/blk-mq-debugfs.c
> > @@ -179,7 +179,6 @@ static const char *const hctx_state_name[] = {
> >  	HCTX_STATE_NAME(STOPPED),
> >  	HCTX_STATE_NAME(TAG_ACTIVE),
> >  	HCTX_STATE_NAME(SCHED_RESTART),
> > -	HCTX_STATE_NAME(TAG_WAITING),
> >  	HCTX_STATE_NAME(START_ON_RUN),
> >  };
> >  #undef HCTX_STATE_NAME
> > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > index 3d759bb8a5bb..8dc5db40df9d 100644
> > --- a/block/blk-mq.c
> > +++ b/block/blk-mq.c
> > @@ -998,49 +998,64 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
> >  	return rq->tag != -1;
> >  }
> >  
> > -static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
> > -				void *key)
> > +static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
> > +				int flags, void *key)
> >  {
> >  	struct blk_mq_hw_ctx *hctx;
> >  
> >  	hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
> >  
> > -	list_del(&wait->entry);
> > -	clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state);
> > +	list_del_init(&wait->entry);
> >  	blk_mq_run_hw_queue(hctx, true);
> >  	return 1;
> >  }
> >  
> > -static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx)
> > +static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx **hctx,
> > +				     struct request *rq)
> >  {
> > +	struct blk_mq_hw_ctx *this_hctx = *hctx;
> > +	wait_queue_entry_t *wait = &this_hctx->dispatch_wait;
> >  	struct sbq_wait_state *ws;
> >  
> > +	if (!list_empty_careful(&wait->entry))
> > +		return false;
> > +
> > +	spin_lock(&this_hctx->lock);
> > +	if (!list_empty(&wait->entry)) {
> > +		spin_unlock(&this_hctx->lock);
> > +		return false;
> > +	}
> > +
> > +	ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
> > +	add_wait_queue(&ws->wait, wait);
> > +
> >  	/*
> > -	 * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait.
> > -	 * The thread which wins the race to grab this bit adds the hardware
> > -	 * queue to the wait queue.
> > +	 * It's possible that a tag was freed in the window between the
> > +	 * allocation failure and adding the hardware queue to the wait
> > +	 * queue.
> >  	 */
> > -	if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) ||
> > -	    test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state))
> > +	if (!blk_mq_get_driver_tag(rq, hctx, false)) {
> > +		spin_unlock(&this_hctx->lock);
> >  		return false;
> > -
> > -	init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
> > -	ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx);
> > +	}
> >  
> >  	/*
> > -	 * As soon as this returns, it's no longer safe to fiddle with
> > -	 * hctx->dispatch_wait, since a completion can wake up the wait queue
> > -	 * and unlock the bit.
> > +	 * We got a tag, remove ourselves from the wait queue to ensure
> > +	 * someone else gets the wakeup.
> >  	 */
> > -	add_wait_queue(&ws->wait, &hctx->dispatch_wait);
> > +	spin_lock_irq(&ws->wait.lock);
> > +	list_del_init(&wait->entry);
> > +	spin_unlock_irq(&ws->wait.lock);
> > +	spin_unlock(&this_hctx->lock);
> >  	return true;
> >  }
> >  
> >  bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
> > -		bool got_budget)
> > +			     bool got_budget)
> >  {
> >  	struct blk_mq_hw_ctx *hctx;
> >  	struct request *rq, *nxt;
> > +	bool no_tag = false;
> >  	int errors, queued;
> >  
> >  	if (list_empty(list))
> > @@ -1060,22 +1075,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
> >  		if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
> >  			/*
> >  			 * The initial allocation attempt failed, so we need to
> > -			 * rerun the hardware queue when a tag is freed.
> > +			 * rerun the hardware queue when a tag is freed. The
> > +			 * waitqueue takes care of that. If the queue is run
> > +			 * before we add this entry back on the dispatch list,
> > +			 * we'll re-run it below.
> >  			 */
> > -			if (!blk_mq_dispatch_wait_add(hctx)) {
> > -				if (got_budget)
> > -					blk_mq_put_dispatch_budget(hctx);
> > -				break;
> > -			}
> > -
> > -			/*
> > -			 * It's possible that a tag was freed in the window
> > -			 * between the allocation failure and adding the
> > -			 * hardware queue to the wait queue.
> > -			 */
> > -			if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
> > +			if (!blk_mq_dispatch_wait_add(&hctx, rq)) {
> >  				if (got_budget)
> >  					blk_mq_put_dispatch_budget(hctx);
> > +				no_tag = true;
> >  				break;
> >  			}
> >  		}
> > @@ -1140,10 +1148,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
> >  	 * it is no longer set that means that it was cleared by another
> >  	 * thread and hence that a queue rerun is needed.
> >  	 *
> > -	 * If TAG_WAITING is set that means that an I/O scheduler has
> > -	 * been configured and another thread is waiting for a driver
> > -	 * tag. To guarantee fairness, do not rerun this hardware queue
> > -	 * but let the other thread grab the driver tag.
> > +	 * If 'no_tag' is set, that means that we failed getting
> > +	 * a driver tag with an I/O scheduler attached. If our dispatch
> > +	 * waitqueue is no longer active, ensure that we run the queue
> > +	 * AFTER adding our entries back to the list.
> >  	 *
> >  	 * If no I/O scheduler has been configured it is possible that
> >  	 * the hardware queue got stopped and restarted before requests
> > @@ -1156,7 +1164,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
> >  	 * and dm-rq.
> >  	 */
> >  	if (!blk_mq_sched_needs_restart(hctx) &&
> > -	    !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state))
> > +	    (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
> >  		blk_mq_run_hw_queue(hctx, true);
> 
> If one rq is just completed after the check on list_empty_careful(&hctx->dispatch_wait.entry),
> the queue may not be run any more.
> 
> May that be an issue?

Looks like that can be an issue. If I revert "Revert "blk-mq: don't handle
TAG_SHARED in restart"" and apply 'blk-mq: put driver tag if dispatch budget
can't be got' against for-4.15/block, I can still trigger the IO hang easily
within one or two minutes:

1) script

#!/bin/sh
modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1

RUNTIME=10
while true; do
	fio --bs=4k --size=512G --rw=randread --norandommap --direct=1 \
	    --ioengine=libaio --iodepth=4 --runtime=$RUNTIME --group_reporting=1 \
	    --name=nullb0 --filename=/dev/nullb0 \
	    --name=nullb1 --filename=/dev/nullb1 \
	    --name=nullb2 --filename=/dev/nullb2 \
	    --name=nullb3 --filename=/dev/nullb3
done

2) debugfs log (similar to your previous log)

[ming@VM]$ sudo ./debug/dump-blk-info /dev/nullb0
=============nullb0/hctx0==================
active 0
busy
/sys/kernel/debug/block/nullb0//hctx0/cpu0
  completed 252195 0
  dispatched 252197 0
  merged 0
  rq_list
/sys/kernel/debug/block/nullb0//hctx0/cpu1
  completed 135710 0
  dispatched 135710 0
  merged 0
  rq_list
/sys/kernel/debug/block/nullb0//hctx0/cpu2
  completed 277611 0
  dispatched 277611 0
  merged 0
  rq_list
/sys/kernel/debug/block/nullb0//hctx0/cpu3
  completed 145017 0
  dispatched 145017 0
  merged 0
  rq_list
ctx_map
00000000: 00
dispatch
ffff8802605c8000 {.op=READ, .cmd_flags=, .rq_flags=STARTED|IO_STAT, .atomic_flags=COMPLETE, .tag=-1, .internal_tag=0}
ffff8802605c8240 {.op=READ, .cmd_flags=, .rq_flags=STARTED|IO_STAT, .atomic_flags=COMPLETE, .tag=-1, .internal_tag=1}
dispatched
       0	733616
       1	807709
       2	1412
       4	0
       8	0
      16	0
     32+	0
flags
alloc_policy=FIFO SHOULD_MERGE|TAG_SHARED
io_poll
considered=0 invoked=0 success=0
queued 810535
run 1478298
sched_tags
nr_tags=2
nr_reserved_tags=0
active_queues=0
bitmap_tags:
depth=2
busy=2
bits_per_word=64
map_nr=1
alloc_hint={0, 0, 0, 0, 1, 1, 0, 0}
wake_batch=1
wake_index=1
ws={
	{.wait_cnt=1, .wait=inactive},
	{.wait_cnt=1, .wait=inactive},
	{.wait_cnt=1, .wait=inactive},
	{.wait_cnt=1, .wait=inactive},
	{.wait_cnt=1, .wait=inactive},
	{.wait_cnt=1, .wait=inactive},
	{.wait_cnt=1, .wait=inactive},
	{.wait_cnt=1, .wait=active},
}
round_robin=0
sched_tags_bitmap
00000000: 03
state SCHED_RESTART
tags
nr_tags=1
nr_reserved_tags=0
active_queues=0
bitmap_tags:
depth=1
busy=0
bits_per_word=64
map_nr=1
alloc_hint={0, 0, 0, 0, 0, 0, 0, 0}
wake_batch=1
wake_index=1
ws={
	{.wait_cnt=1, .wait=inactive},
	{.wait_cnt=1, .wait=inactive},
	{.wait_cnt=1, .wait=inactive},
	{.wait_cnt=1, .wait=inactive},
	{.wait_cnt=1, .wait=inactive},
	{.wait_cnt=1, .wait=inactive},
	{.wait_cnt=1, .wait=inactive},
	{.wait_cnt=1, .wait=inactive},
}
round_robin=0
tags_bitmap
00000000: 00

-- 
Ming
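
[Editorial note: Ming's dump-blk-info helper is not included in this thread. For readers who want to capture a similar snapshot, below is a minimal sketch of an equivalent script. Everything in it other than the debugfs paths visible in the log above (/sys/kernel/debug/block/<disk>/hctx*/...) is an assumption, including the script name and output layout, so treat it as a sketch rather than Ming's actual tool.]

#!/bin/sh
# Hypothetical dump-blk-info style helper: walk the blk-mq debugfs entries
# of the given disk and print every attribute with its contents.  Assumes
# debugfs is mounted at /sys/kernel/debug and CONFIG_BLK_DEBUG_FS is enabled.

disk=$(basename "$1")                 # e.g. /dev/nullb0 -> nullb0
root=/sys/kernel/debug/block/$disk

for hctx in "$root"/hctx*; do
	echo "=============$disk/$(basename "$hctx")=================="
	# Per-hctx attributes (state, flags, dispatch, ctx_map, tags,
	# sched_tags, ...) plus the per-cpu ctx directories underneath.
	find "$hctx" -type f | sort | while read -r f; do
		echo "${f#$root/}"
		sed 's/^/    /' "$f" 2>/dev/null
	done
done

With debugfs mounted, something like "sh dump-blk-info /dev/nullb0" should produce output in the same spirit as the log above, although the exact formatting of Ming's version is unknown.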