On 8/12/19 3:43 PM, Ming Lei wrote: > When all CPUs in one hctx are offline, we shouldn't run this hw queue > for completing request any more. > > So steal bios from the request, and resubmit them, and finally free > the request in blk_mq_hctx_notify_dead(). > > Cc: Bart Van Assche <bvanassche@xxxxxxx> > Cc: Hannes Reinecke <hare@xxxxxxxx> > Cc: Christoph Hellwig <hch@xxxxxx> > Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx> > Cc: Keith Busch <keith.busch@xxxxxxxxx> > Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxx> > --- > block/blk-mq.c | 48 +++++++++++++++++++++++++++++++++++++++++------- > 1 file changed, 41 insertions(+), 7 deletions(-) > > diff --git a/block/blk-mq.c b/block/blk-mq.c > index 6931b2ba2776..ed334fd867c4 100644 > --- a/block/blk-mq.c > +++ b/block/blk-mq.c > @@ -2261,10 +2261,30 @@ static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node) > return 0; > } > > +static void blk_mq_resubmit_io(struct request *rq) > +{ > + struct bio_list list; > + struct bio *bio; > + > + bio_list_init(&list); > + blk_steal_bios(&list, rq); > + > + while (true) { > + bio = bio_list_pop(&list); > + if (!bio) > + break; > + > + generic_make_request(bio); > + } > + > + blk_mq_cleanup_rq(rq); > + blk_mq_end_request(rq, 0); > +} > + > /* > - * 'cpu' is going away. splice any existing rq_list entries from this > - * software queue to the hw queue dispatch list, and ensure that it > - * gets run. > + * 'cpu' has gone away. If this hctx is dead, we can't dispatch request > + * to the hctx any more, so steal bios from requests of this hctx, and > + * re-submit them to the request queue, and free these requests finally. 
> */ > static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) > { > @@ -2272,6 +2292,8 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) > struct blk_mq_ctx *ctx; > LIST_HEAD(tmp); > enum hctx_type type; > + bool hctx_dead; > + struct request *rq; > > hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); > ctx = __blk_mq_get_ctx(hctx->queue, cpu); > @@ -2279,6 +2301,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) > > clear_bit(BLK_MQ_S_INTERNAL_STOPPED, &hctx->state); > > + hctx_dead = cpumask_first_and(hctx->cpumask, cpu_online_mask) >= > + nr_cpu_ids; > + > spin_lock(&ctx->lock); > if (!list_empty(&ctx->rq_lists[type])) { > list_splice_init(&ctx->rq_lists[type], &tmp); > @@ -2289,11 +2314,20 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) > if (list_empty(&tmp)) > return 0; > > - spin_lock(&hctx->lock); > - list_splice_tail_init(&tmp, &hctx->dispatch); > - spin_unlock(&hctx->lock); > + if (!hctx_dead) { > + spin_lock(&hctx->lock); > + list_splice_tail_init(&tmp, &hctx->dispatch); > + spin_unlock(&hctx->lock); > + blk_mq_run_hw_queue(hctx, true); > + return 0; > + } > + > + while (!list_empty(&tmp)) { > + rq = list_entry(tmp.next, struct request, queuelist); > + list_del_init(&rq->queuelist); > + blk_mq_resubmit_io(rq); > + } > > - blk_mq_run_hw_queue(hctx, true); > return 0; > } > > So what happens when all CPUs assigned to a hardware queue go offline? Wouldn't blk_steal_bios() etc. resend the I/O to the same hw queue, causing an infinite loop? Don't we have to rearrange the hardware queues here? Cheers, Hannes -- Dr. Hannes Reinecke Teamlead Storage & Networking hare@xxxxxxx +49 911 74053 688 SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg GF: Felix Imendörffer, Mary Higgins, Sri Rasiah HRB 21284 (AG Nürnberg)