On Thu, Apr 23, 2020 at 09:50:11AM +0200, Christoph Hellwig wrote: > > +static void blk_mq_resubmit_passthrough_io(struct request *rq) > > +{ > > + struct request *nrq; > > + unsigned int flags = 0, cmd_flags = 0; > > + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; > > + struct blk_mq_tags *tags = rq->q->elevator ? hctx->sched_tags : > > + hctx->tags; > > + bool reserved = blk_mq_tag_is_reserved(tags, rq->internal_tag); > > + > > + if (rq->rq_flags & RQF_PREEMPT) > > + flags |= BLK_MQ_REQ_PREEMPT; > > + if (reserved) > > + flags |= BLK_MQ_REQ_RESERVED; > > + > > + /* avoid allocation failure & IO merge */ > > + cmd_flags = (rq->cmd_flags & ~REQ_NOWAIT) | REQ_NOMERGE; > > + > > + nrq = blk_get_request(rq->q, cmd_flags, flags); > > + if (!nrq) > > + return; > > + > > + nrq->__sector = blk_rq_pos(rq); > > + nrq->__data_len = blk_rq_bytes(rq); > > + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) { > > + nrq->rq_flags |= RQF_SPECIAL_PAYLOAD; > > + nrq->special_vec = rq->special_vec; > > + } > > +#if defined(CONFIG_BLK_DEV_INTEGRITY) > > + nrq->nr_integrity_segments = rq->nr_integrity_segments; > > +#endif > > + nrq->nr_phys_segments = rq->nr_phys_segments; > > + nrq->ioprio = rq->ioprio; > > + nrq->extra_len = rq->extra_len; > > + nrq->rq_disk = rq->rq_disk; > > + nrq->part = rq->part; > > + nrq->write_hint = rq->write_hint; > > + nrq->timeout = rq->timeout; > > This should share code with blk_rq_prep_clone() using a helper. > Note that blk_rq_prep_clone seems to miss things like the > write_hint and timeout, which we should fix as well. Looks like requests in both cases are inserted directly, so it is reasonable to share a similar clone helper. Will do that in the next version. 
> > > +static void blk_mq_resubmit_fs_io(struct request *rq) > > +{ > > + struct bio_list list; > > + struct bio *bio; > > + > > + bio_list_init(&list); > > + blk_steal_bios(&list, rq); > > + > > + while (true) { > > + bio = bio_list_pop(&list); > > + if (!bio) > > + break; > > + > > + generic_make_request(bio); > > + } > > This could be simplified to: > > while ((bio = bio_list_pop(&list))) > generic_make_request(bio); > > but then again the generic_make_request seems weird. Do we need > actually need any of the checks in generic_make_request? Shouldn't > we call into blk_mq_make_request directly? Good catch. I think we should call into blk_mq_make_request() directly, to avoid the duplicated checks in generic_make_request(). > > Then again I wonder why the passthrough case doesn't work for > FS requests? Good question; I just didn't think of it this way, because re-submitting the passthrough request is done a bit late. I believe we can do it this way, which is very similar to dm-rq's usage. > > > static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) > > { > > @@ -2394,14 +2482,38 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) > > } > > spin_unlock(&ctx->lock); > > > > + if (!test_bit(BLK_MQ_S_INACTIVE, &hctx->state)) { > > + if (!list_empty(&tmp)) { > > + spin_lock(&hctx->lock); > > + list_splice_tail_init(&tmp, &hctx->dispatch); > > + spin_unlock(&hctx->lock); > > + blk_mq_run_hw_queue(hctx, true); > > + } > > + } else { > > What about an early return or two here to save a level of indentation > later? > > if (!test_bit(BLK_MQ_S_INACTIVE, &hctx->state)) { > if (list_empty(&tmp)) > return 0; > > spin_lock(&hctx->lock); > list_splice_tail_init(&tmp, &hctx->dispatch); > spin_unlock(&hctx->lock); > blk_mq_run_hw_queue(hctx, true); > return 0; > } OK. Thanks, Ming