> +static void blk_mq_resubmit_passthrough_io(struct request *rq) > +{ > + struct request *nrq; > + unsigned int flags = 0, cmd_flags = 0; > + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; > + struct blk_mq_tags *tags = rq->q->elevator ? hctx->sched_tags : > + hctx->tags; > + bool reserved = blk_mq_tag_is_reserved(tags, rq->internal_tag); > + > + if (rq->rq_flags & RQF_PREEMPT) > + flags |= BLK_MQ_REQ_PREEMPT; > + if (reserved) > + flags |= BLK_MQ_REQ_RESERVED; > + > + /* avoid allocation failure & IO merge */ > + cmd_flags = (rq->cmd_flags & ~REQ_NOWAIT) | REQ_NOMERGE; > + > + nrq = blk_get_request(rq->q, cmd_flags, flags); > + if (!nrq) > + return; > + > + nrq->__sector = blk_rq_pos(rq); > + nrq->__data_len = blk_rq_bytes(rq); > + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) { > + nrq->rq_flags |= RQF_SPECIAL_PAYLOAD; > + nrq->special_vec = rq->special_vec; > + } > +#if defined(CONFIG_BLK_DEV_INTEGRITY) > + nrq->nr_integrity_segments = rq->nr_integrity_segments; > +#endif > + nrq->nr_phys_segments = rq->nr_phys_segments; > + nrq->ioprio = rq->ioprio; > + nrq->extra_len = rq->extra_len; > + nrq->rq_disk = rq->rq_disk; > + nrq->part = rq->part; > + nrq->write_hint = rq->write_hint; > + nrq->timeout = rq->timeout; This should share code with blk_rq_prep_clone() using a helper. Note that blk_rq_prep_clone seems to miss things like the write_hint and timeout, which we should fix as well. > +static void blk_mq_resubmit_fs_io(struct request *rq) > +{ > + struct bio_list list; > + struct bio *bio; > + > + bio_list_init(&list); > + blk_steal_bios(&list, rq); > + > + while (true) { > + bio = bio_list_pop(&list); > + if (!bio) > + break; > + > + generic_make_request(bio); > + } This could be simplified to: while ((bio = bio_list_pop(&list))) generic_make_request(bio); but then again the generic_make_request seems weird. Do we need actually need any of the checks in generic_make_request? Shouldn't we call into blk_mq_make_request directly? Then again I wonder why the passthrough case doesn't work for FS requests? > static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) > { > @@ -2394,14 +2482,38 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) > } > spin_unlock(&ctx->lock); > > + if (!test_bit(BLK_MQ_S_INACTIVE, &hctx->state)) { > + if (!list_empty(&tmp)) { > + spin_lock(&hctx->lock); > + list_splice_tail_init(&tmp, &hctx->dispatch); > + spin_unlock(&hctx->lock); > + blk_mq_run_hw_queue(hctx, true); > + } > + } else { What about an early return or two here to save a level of indentation later? if (!test_bit(BLK_MQ_S_INACTIVE, &hctx->state)) { if (list_empty(&tmp)) return 0; spin_lock(&hctx->lock); list_splice_tail_init(&tmp, &hctx->dispatch); spin_unlock(&hctx->lock); blk_mq_run_hw_queue(hctx, true); return 0; } ...