On 11/19/24 09:28, Bart Van Assche wrote:
> Zoned writes may be requeued, e.g. if a block driver returns
> BLK_STS_RESOURCE. Requests may be requeued in another order than
> submitted. Restore the request order if requests are requeued.
>
> Signed-off-by: Bart Van Assche <bvanassche@xxxxxxx>
> ---
>  block/bfq-iosched.c    |  2 ++
>  block/blk-mq.c         | 20 +++++++++++++++++++-
>  block/blk-mq.h         |  2 ++
>  block/kyber-iosched.c  |  2 ++
>  block/mq-deadline.c    |  7 ++++++-
>  include/linux/blk-mq.h |  2 +-
>  6 files changed, 32 insertions(+), 3 deletions(-)
>
> diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
> index 0747d9d0e48c..13bedbf03bd2 100644
> --- a/block/bfq-iosched.c
> +++ b/block/bfq-iosched.c
> @@ -6265,6 +6265,8 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
>
>  	if (flags & BLK_MQ_INSERT_AT_HEAD) {
>  		list_add(&rq->queuelist, &bfqd->dispatch);
> +	} else if (flags & BLK_MQ_INSERT_ORDERED) {
> +		blk_mq_insert_ordered(rq, &bfqd->dispatch);
>  	} else if (!bfqq) {
>  		list_add_tail(&rq->queuelist, &bfqd->dispatch);
>  	} else {
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index f134d5e1c4a1..1302ccbf2a7d 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -1564,7 +1564,9 @@ static void blk_mq_requeue_work(struct work_struct *work)
>  		 * already. Insert it into the hctx dispatch list to avoid
>  		 * block layer merges for the request.
>  		 */
> -		if (rq->rq_flags & RQF_DONTPREP)
> +		if (blk_rq_is_seq_zoned_write(rq))
> +			blk_mq_insert_request(rq, BLK_MQ_INSERT_ORDERED);

Is this OK to do without any starvation prevention? A high LBA write that
constantly gets requeued behind low LBA writes could end up in a timeout
situation, no?

> +		else if (rq->rq_flags & RQF_DONTPREP)
>  			blk_mq_request_bypass_insert(rq, 0);
>  		else
>  			blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD);
> @@ -2599,6 +2601,20 @@ static void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx,
>  	blk_mq_run_hw_queue(hctx, run_queue_async);
>  }
>
> +void blk_mq_insert_ordered(struct request *rq, struct list_head *list)
> +{
> +	struct request_queue *q = rq->q;
> +	struct request *rq2;
> +
> +	list_for_each_entry(rq2, list, queuelist)
> +		if (rq2->q == q && blk_rq_pos(rq2) > blk_rq_pos(rq))
> +			break;
> +
> +	/* Insert rq before rq2. If rq2 is the list head, append at the end. */
> +	list_add_tail(&rq->queuelist, &rq2->queuelist);
> +}
> +EXPORT_SYMBOL_GPL(blk_mq_insert_ordered);
> +
>  static void blk_mq_insert_request(struct request *rq, blk_insert_t flags)
>  {
>  	struct request_queue *q = rq->q;
> @@ -2653,6 +2669,8 @@ static void blk_mq_insert_request(struct request *rq, blk_insert_t flags)
>  		spin_lock(&ctx->lock);
>  		if (flags & BLK_MQ_INSERT_AT_HEAD)
>  			list_add(&rq->queuelist, &ctx->rq_lists[hctx->type]);
> +		else if (flags & BLK_MQ_INSERT_ORDERED)
> +			blk_mq_insert_ordered(rq, &ctx->rq_lists[hctx->type]);
>  		else
>  			list_add_tail(&rq->queuelist,
>  				      &ctx->rq_lists[hctx->type]);
> diff --git a/block/blk-mq.h b/block/blk-mq.h
> index 309db553aba6..10b9fb3ca762 100644
> --- a/block/blk-mq.h
> +++ b/block/blk-mq.h
> @@ -40,8 +40,10 @@ enum {
>
>  typedef unsigned int __bitwise blk_insert_t;
>  #define BLK_MQ_INSERT_AT_HEAD	((__force blk_insert_t)0x01)
> +#define BLK_MQ_INSERT_ORDERED	((__force blk_insert_t)0x02)
>
>  void blk_mq_submit_bio(struct bio *bio);
> +void blk_mq_insert_ordered(struct request *rq, struct list_head *list);
>  int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
>  		unsigned int flags);
>  void blk_mq_exit_queue(struct request_queue *q);
> diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
> index 4155594aefc6..77bb41bab68d 100644
> --- a/block/kyber-iosched.c
> +++ b/block/kyber-iosched.c
> @@ -603,6 +603,8 @@ static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
>  		trace_block_rq_insert(rq);
>  		if (flags & BLK_MQ_INSERT_AT_HEAD)
>  			list_move(&rq->queuelist, head);
> +		else if (flags & BLK_MQ_INSERT_ORDERED)
> +			blk_mq_insert_ordered(rq, head);
>  		else
>  			list_move_tail(&rq->queuelist, head);
>  		sbitmap_set_bit(&khd->kcq_map[sched_domain],
> diff --git a/block/mq-deadline.c b/block/mq-deadline.c
> index 2edf84b1bc2a..200e5a2928ce 100644
> --- a/block/mq-deadline.c
> +++ b/block/mq-deadline.c
> @@ -711,7 +711,12 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
>  		 * set expire time and add to fifo list
>  		 */
>  		rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
> -		list_add_tail(&rq->queuelist, &per_prio->fifo_list[data_dir]);
> +		if (flags & BLK_MQ_INSERT_ORDERED)
> +			blk_mq_insert_ordered(rq,
> +					      &per_prio->fifo_list[data_dir]);
> +		else
> +			list_add_tail(&rq->queuelist,
> +				      &per_prio->fifo_list[data_dir]);
>  	}
>  }
>
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index ac05974f08f9..f7514eefccfd 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -85,7 +85,7 @@ enum {
>
>  /* flags that prevent us from merging requests: */
>  #define RQF_NOMERGE_FLAGS \
> -	(RQF_STARTED | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)
> +	(RQF_STARTED | RQF_FLUSH_SEQ | RQF_DONTPREP | RQF_SPECIAL_PAYLOAD)
>
>  enum mq_rq_state {
>  	MQ_RQ_IDLE = 0,

-- 
Damien Le Moal
Western Digital Research
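
For illustration, here is a minimal userspace sketch of the ordered insertion
that blk_mq_insert_ordered() performs. The struct req, the list helpers and
insert_ordered() below are simplified stand-ins for struct request,
<linux/list.h> and the new helper, not the real kernel API; the sector values
are made up. The sketch only shows the ordering behaviour behind the
starvation question above: a high-LBA write that is requeued first ends up
behind every lower-LBA write requeued after it.

#include <stdio.h>
#include <stddef.h>

struct list_head {
	struct list_head *prev, *next;
};

static void list_init(struct list_head *head)
{
	head->prev = head;
	head->next = head;
}

/* Insert @entry immediately before @pos (kernel list_add_tail() semantics). */
static void list_add_tail(struct list_head *entry, struct list_head *pos)
{
	entry->prev = pos->prev;
	entry->next = pos;
	pos->prev->next = entry;
	pos->prev = entry;
}

#define list_entry(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Simplified stand-in for struct request: only the start sector matters here. */
struct req {
	unsigned long long pos;
	struct list_head queuelist;
};

/* Same walk as blk_mq_insert_ordered(): stop at the first larger sector. */
static void insert_ordered(struct req *rq, struct list_head *list)
{
	struct list_head *p;

	for (p = list->next; p != list; p = p->next)
		if (list_entry(p, struct req, queuelist)->pos > rq->pos)
			break;

	/* Insert before p; if p is the list head this appends at the end. */
	list_add_tail(&rq->queuelist, p);
}

int main(void)
{
	struct list_head requeue_list;
	struct req high = { .pos = 1 << 20 };	/* high-LBA write, requeued first */
	struct req low[3] = { { .pos = 8 }, { .pos = 16 }, { .pos = 24 } };
	struct list_head *p;
	int i;

	list_init(&requeue_list);

	/* The high-LBA write is requeued first and is alone in the list. */
	insert_ordered(&high, &requeue_list);

	/* Lower-LBA writes requeued later are all placed ahead of it. */
	for (i = 0; i < 3; i++)
		insert_ordered(&low[i], &requeue_list);

	/* Prints sectors 8, 16, 24 and only then 1048576. */
	for (p = requeue_list.next; p != &requeue_list; p = p->next)
		printf("sector %llu\n",
		       list_entry(p, struct req, queuelist)->pos);

	return 0;
}

Whether this can actually turn into a request timeout depends on how often the
requeue list is drained and dispatched; the sketch demonstrates only the
insertion order, not the dispatch policy.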