On Sat, Apr 18, 2020 at 11:09:24AM +0800, Ming Lei wrote:
> If one hctx becomes inactive when its CPUs are all offline, all in-queue
> requests aimed at this hctx have to be re-submitted.
>
> Re-submit requests from both the sw queue and the scheduler queue when
> the hctx is found to be inactive.
>
> Cc: John Garry <john.garry@xxxxxxxxxx>
> Cc: Bart Van Assche <bvanassche@xxxxxxx>
> Cc: Hannes Reinecke <hare@xxxxxxxx>
> Cc: Christoph Hellwig <hch@xxxxxx>
> Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxx>
> ---
>  block/blk-mq.c | 100 ++++++++++++++++++++++++++++++-------------------
>  1 file changed, 62 insertions(+), 38 deletions(-)
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index ae1e57c64ca1..54ba8a9c3c93 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2456,6 +2456,52 @@ static void blk_mq_resubmit_io(struct request *rq)
>  		blk_mq_resubmit_fs_io(rq);
>  }
>
> +static void blk_mq_hctx_deactivate(struct blk_mq_hw_ctx *hctx)
> +{
> +	LIST_HEAD(sched_tmp);
> +	LIST_HEAD(re_submit);
> +	LIST_HEAD(flush_in);
> +	LIST_HEAD(flush_out);
> +	struct request *rq, *nxt;
> +	struct elevator_queue *e = hctx->queue->elevator;
> +
> +	if (!e) {
> +		blk_mq_flush_busy_ctxs(hctx, &re_submit);
> +	} else {
> +		while ((rq = e->type->ops.dispatch_request(hctx))) {
> +			if (rq->mq_hctx != hctx)
> +				list_add(&rq->queuelist, &sched_tmp);
> +			else
> +				list_add(&rq->queuelist, &re_submit);
> +		}
> +	}
> +	while (!list_empty(&sched_tmp)) {
> +		rq = list_entry(sched_tmp.next, struct request,
> +				queuelist);
> +		list_del_init(&rq->queuelist);
> +		blk_mq_sched_insert_request(rq, true, true, true);
> +	}
> +
> +	/* requests in dispatch list have to be re-submitted too */
> +	spin_lock(&hctx->lock);
> +	list_splice_tail_init(&hctx->dispatch, &re_submit);
> +	spin_unlock(&hctx->lock);
> +
> +	/* blk_end_flush_machinery will cover flush request */
> +	list_for_each_entry_safe(rq, nxt, &re_submit, queuelist) {
> +		if (rq->rq_flags & RQF_FLUSH_SEQ)
> +			list_move(&rq->queuelist, &flush_in);
> +	}
> +	blk_end_flush_machinery(hctx, &flush_in, &flush_out);
> +	list_splice_tail(&flush_out, &re_submit);
> +
> +	while (!list_empty(&re_submit)) {
> +		rq = list_first_entry(&re_submit, struct request, queuelist);
> +		list_del_init(&rq->queuelist);
> +		blk_mq_resubmit_io(rq);
> +	}
> +}
> +
>  /*
>   * 'cpu' has gone away. If this hctx is inactive, we can't dispatch request
>   * to the hctx any more, so steal bios from requests of this hctx, and
> @@ -2463,54 +2509,32 @@ static void blk_mq_resubmit_io(struct request *rq)
>   */
>  static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
>  {
> +	struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
> +			struct blk_mq_hw_ctx, cpuhp_dead);
>
>  	if (!cpumask_test_cpu(cpu, hctx->cpumask))
>  		return 0;
>
> +	if (test_bit(BLK_MQ_S_INACTIVE, &hctx->state)) {
> +		blk_mq_hctx_deactivate(hctx);

Actually, it probably also makes sense to introduce the blk_mq_hctx_deactivate
helper in the previous patch to avoid some churn here.
> +	} else if (!hctx->queue->elevator) {
> +		struct blk_mq_ctx *ctx = __blk_mq_get_ctx(hctx->queue, cpu);
> +		enum hctx_type type = hctx->type;
> +		LIST_HEAD(tmp);
> +
> +		spin_lock(&ctx->lock);
> +		if (!list_empty(&ctx->rq_lists[type])) {
> +			list_splice_init(&ctx->rq_lists[type], &tmp);
> +			blk_mq_hctx_clear_pending(hctx, ctx);
> +		}
> +		spin_unlock(&ctx->lock);
>
>  		if (!list_empty(&tmp)) {
>  			spin_lock(&hctx->lock);
>  			list_splice_tail_init(&tmp, &hctx->dispatch);
>  			spin_unlock(&hctx->lock);
>
> +			blk_mq_run_hw_queue(hctx, true);
>  		}

And another helper for the !inactive case.
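Something like the below, completely untested sketch; blk_mq_hctx_drain_sw_queue
is just a made-up name, and the body is simply the !elevator branch above pulled
out into its own function:

static void blk_mq_hctx_drain_sw_queue(struct blk_mq_hw_ctx *hctx,
				       unsigned int cpu)
{
	struct blk_mq_ctx *ctx = __blk_mq_get_ctx(hctx->queue, cpu);
	enum hctx_type type = hctx->type;
	LIST_HEAD(tmp);

	/* steal any requests still sitting in this dead CPU's sw queue */
	spin_lock(&ctx->lock);
	if (!list_empty(&ctx->rq_lists[type])) {
		list_splice_init(&ctx->rq_lists[type], &tmp);
		blk_mq_hctx_clear_pending(hctx, ctx);
	}
	spin_unlock(&ctx->lock);

	if (list_empty(&tmp))
		return;

	/* move them to the hctx dispatch list and kick the queue */
	spin_lock(&hctx->lock);
	list_splice_tail_init(&tmp, &hctx->dispatch);
	spin_unlock(&hctx->lock);

	blk_mq_run_hw_queue(hctx, true);
}

blk_mq_hctx_notify_dead() would then just pick between blk_mq_hctx_deactivate()
and blk_mq_hctx_drain_sw_queue() in the two branches.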