On Sat, Apr 18, 2020 at 11:09:24AM +0800, Ming Lei wrote:
> If one hctx becomes inactive when its CPUs are all offline, all in-queue
> requests aimed at this hctx have to be re-submitted.
>
> Re-submit requests from both the sw queue and the scheduler queue when
> the hctx is found to be inactive.
>
> Cc: John Garry <john.garry@xxxxxxxxxx>
> Cc: Bart Van Assche <bvanassche@xxxxxxx>
> Cc: Hannes Reinecke <hare@xxxxxxxx>
> Cc: Christoph Hellwig <hch@xxxxxx>
> Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxx>
> ---
>  block/blk-mq.c | 100 ++++++++++++++++++++++++++++++-------------------
>  1 file changed, 62 insertions(+), 38 deletions(-)
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index ae1e57c64ca1..54ba8a9c3c93 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2456,6 +2456,52 @@ static void blk_mq_resubmit_io(struct request *rq)
>  		blk_mq_resubmit_fs_io(rq);
>  }
>
> +static void blk_mq_hctx_deactivate(struct blk_mq_hw_ctx *hctx)
> +{
> +	LIST_HEAD(sched_tmp);
> +	LIST_HEAD(re_submit);
> +	LIST_HEAD(flush_in);
> +	LIST_HEAD(flush_out);
> +	struct request *rq, *nxt;
> +	struct elevator_queue *e = hctx->queue->elevator;
> +
> +	if (!e) {
> +		blk_mq_flush_busy_ctxs(hctx, &re_submit);
> +	} else {
> +		while ((rq = e->type->ops.dispatch_request(hctx))) {
> +			if (rq->mq_hctx != hctx)
> +				list_add(&rq->queuelist, &sched_tmp);
> +			else
> +				list_add(&rq->queuelist, &re_submit);
> +		}
> +	}
> +	while (!list_empty(&sched_tmp)) {
> +		rq = list_entry(sched_tmp.next, struct request,
> +				queuelist);
> +		list_del_init(&rq->queuelist);
> +		blk_mq_sched_insert_request(rq, true, true, true);
> +	}
> +
> +	/* requests in dispatch list have to be re-submitted too */
> +	spin_lock(&hctx->lock);
> +	list_splice_tail_init(&hctx->dispatch, &re_submit);
> +	spin_unlock(&hctx->lock);
> +
> +	/* blk_end_flush_machinery will cover flush request */
> +	list_for_each_entry_safe(rq, nxt, &re_submit, queuelist) {
> +		if (rq->rq_flags & RQF_FLUSH_SEQ)
> +			list_move(&rq->queuelist, &flush_in);
> +	}
> +	blk_end_flush_machinery(hctx, &flush_in, &flush_out);
> +	list_splice_tail(&flush_out, &re_submit);
> +
> +	while (!list_empty(&re_submit)) {
> +		rq = list_first_entry(&re_submit, struct request, queuelist);
> +		list_del_init(&rq->queuelist);
> +		blk_mq_resubmit_io(rq);
> +	}
> +}
> +
>  /*
>   * 'cpu' has gone away. If this hctx is inactive, we can't dispatch request
>   * to the hctx any more, so steal bios from requests of this hctx, and
> @@ -2463,54 +2509,32 @@ static void blk_mq_resubmit_io(struct request *rq)
>   */
>  static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
>  {
> +	struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
> +			struct blk_mq_hw_ctx, cpuhp_dead);
>
>  	if (!cpumask_test_cpu(cpu, hctx->cpumask))
>  		return 0;
>
> +	if (test_bit(BLK_MQ_S_INACTIVE, &hctx->state)) {
> +		blk_mq_hctx_deactivate(hctx);

Actually, it probably also makes sense to introduce the blk_mq_hctx_deactivate
helper in the previous patch to avoid some churn here.
> +	} else if (!hctx->queue->elevator) {
> +		struct blk_mq_ctx *ctx = __blk_mq_get_ctx(hctx->queue, cpu);
> +		enum hctx_type type = hctx->type;
> +		LIST_HEAD(tmp);
> +
> +		spin_lock(&ctx->lock);
> +		if (!list_empty(&ctx->rq_lists[type])) {
> +			list_splice_init(&ctx->rq_lists[type], &tmp);
> +			blk_mq_hctx_clear_pending(hctx, ctx);
> +		}
> +		spin_unlock(&ctx->lock);
>
>  		if (!list_empty(&tmp)) {
>  			spin_lock(&hctx->lock);
>  			list_splice_tail_init(&tmp, &hctx->dispatch);
>  			spin_unlock(&hctx->lock);
>
> +			blk_mq_run_hw_queue(hctx, true);
>  		}

And another helper for the !inactive case.
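Something like the below, completely untested sketch; blk_mq_hctx_drain_sw_queue
is just a made-up name, and the body is simply the !elevator branch above pulled
out into its own function:

static void blk_mq_hctx_drain_sw_queue(struct blk_mq_hw_ctx *hctx,
				       unsigned int cpu)
{
	struct blk_mq_ctx *ctx = __blk_mq_get_ctx(hctx->queue, cpu);
	enum hctx_type type = hctx->type;
	LIST_HEAD(tmp);

	/* steal any requests still sitting in this dead CPU's sw queue */
	spin_lock(&ctx->lock);
	if (!list_empty(&ctx->rq_lists[type])) {
		list_splice_init(&ctx->rq_lists[type], &tmp);
		blk_mq_hctx_clear_pending(hctx, ctx);
	}
	spin_unlock(&ctx->lock);

	if (list_empty(&tmp))
		return;

	/* move them to the hctx dispatch list and kick the queue */
	spin_lock(&hctx->lock);
	list_splice_tail_init(&tmp, &hctx->dispatch);
	spin_unlock(&hctx->lock);

	blk_mq_run_hw_queue(hctx, true);
}

blk_mq_hctx_notify_dead() would then just pick between blk_mq_hctx_deactivate()
and blk_mq_hctx_drain_sw_queue() in the two branches.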