If a queue is run before all requeued requests have been sent to the I/O
scheduler, the I/O scheduler may dispatch the wrong request. Fix this by
making blk_mq_run_hw_queue() process the requeue_list instead of
blk_mq_requeue_work().

Cc: Christoph Hellwig <hch@xxxxxx>
Cc: Damien Le Moal <dlemoal@xxxxxxxxxx>
Cc: Ming Lei <ming.lei@xxxxxxxxxx>
Cc: Mike Snitzer <snitzer@xxxxxxxxxx>
Signed-off-by: Bart Van Assche <bvanassche@xxxxxxx>
---
 block/blk-mq.c         | 63 +++++++++++++++++++++---------------------
 include/linux/blkdev.h |  1 -
 2 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9ef6fa5d7471..52dffdc70480 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -68,6 +68,8 @@ static inline blk_qc_t blk_rq_to_qc(struct request *rq)
 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
 {
 	return !list_empty_careful(&hctx->dispatch) ||
+		!list_empty_careful(&hctx->queue->requeue_list) ||
+		!list_empty_careful(&hctx->queue->flush_list) ||
 		sbitmap_any_bit_set(&hctx->ctx_map) ||
 			blk_mq_sched_has_work(hctx);
 }
@@ -1432,52 +1434,52 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
 }
 EXPORT_SYMBOL(blk_mq_requeue_request);
 
-static void blk_mq_requeue_work(struct work_struct *work)
+static void blk_mq_process_requeue_list(struct blk_mq_hw_ctx *hctx)
 {
-	struct request_queue *q =
-		container_of(work, struct request_queue, requeue_work.work);
-	LIST_HEAD(requeue_list);
-	LIST_HEAD(flush_list);
+	struct request_queue *q = hctx->queue;
 	struct request *rq, *next;
+	LIST_HEAD(at_head);
+	LIST_HEAD(at_tail);
 
-	spin_lock_irq(&q->requeue_lock);
-	list_splice_init(&q->requeue_list, &requeue_list);
-	list_splice_init(&q->flush_list, &flush_list);
-	spin_unlock_irq(&q->requeue_lock);
+	if (list_empty_careful(&q->requeue_list) &&
+	    list_empty_careful(&q->flush_list))
+		return;
 
-	list_for_each_entry_safe(rq, next, &requeue_list, queuelist) {
-		if (!(rq->rq_flags & RQF_DONTPREP)) {
+	spin_lock_irq(&q->requeue_lock);
+	list_for_each_entry_safe(rq, next, &q->requeue_list, queuelist) {
+		if (!blk_queue_sq_sched(q) && rq->mq_hctx != hctx)
+			continue;
+		if (rq->rq_flags & RQF_DONTPREP) {
+			list_move_tail(&rq->queuelist, &at_tail);
+		} else {
 			list_del_init(&rq->queuelist);
-			blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD);
+			list_move_tail(&rq->queuelist, &at_head);
 		}
 	}
-
-	while (!list_empty(&requeue_list)) {
-		rq = list_entry(requeue_list.next, struct request, queuelist);
-		list_del_init(&rq->queuelist);
-		blk_mq_insert_request(rq, 0);
+	list_for_each_entry_safe(rq, next, &q->flush_list, queuelist) {
+		if (!blk_queue_sq_sched(q) && rq->mq_hctx != hctx)
+			continue;
+		list_move_tail(&rq->queuelist, &at_tail);
 	}
+	spin_unlock_irq(&q->requeue_lock);
 
-	while (!list_empty(&flush_list)) {
-		rq = list_entry(flush_list.next, struct request, queuelist);
-		list_del_init(&rq->queuelist);
-		blk_mq_insert_request(rq, 0);
-	}
+	list_for_each_entry_safe(rq, next, &at_head, queuelist)
+		blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD);
 
-	blk_mq_run_hw_queues(q, false);
+	list_for_each_entry_safe(rq, next, &at_tail, queuelist)
+		blk_mq_insert_request(rq, 0);
 }
 
 void blk_mq_kick_requeue_list(struct request_queue *q)
 {
-	kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
+	blk_mq_run_hw_queues(q, true);
 }
 EXPORT_SYMBOL(blk_mq_kick_requeue_list);
 
 void blk_mq_delay_kick_requeue_list(struct request_queue *q,
 				    unsigned long msecs)
 {
-	kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work,
-				    msecs_to_jiffies(msecs));
+	blk_mq_delay_run_hw_queues(q, msecs);
 }
 EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
 
@@ -2244,6 +2246,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 		return;
 	}
 
+	blk_mq_process_requeue_list(hctx);
 	blk_mq_run_dispatch_ops(hctx->queue,
 			blk_mq_sched_dispatch_requests(hctx));
 }
@@ -2292,7 +2295,7 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async)
 		 * scheduler.
 		 */
 		if (!sq_hctx || sq_hctx == hctx ||
-		    !list_empty_careful(&hctx->dispatch))
+		    blk_mq_hctx_has_pending(hctx))
 			blk_mq_run_hw_queue(hctx, async);
 	}
 }
@@ -2328,7 +2331,7 @@ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
 		 * scheduler.
 		 */
 		if (!sq_hctx || sq_hctx == hctx ||
-		    !list_empty_careful(&hctx->dispatch))
+		    blk_mq_hctx_has_pending(hctx))
 			blk_mq_delay_run_hw_queue(hctx, msecs);
 	}
 }
@@ -2413,6 +2416,7 @@ static void blk_mq_run_work_fn(struct work_struct *work)
 	struct blk_mq_hw_ctx *hctx =
 		container_of(work, struct blk_mq_hw_ctx, run_work.work);
 
+	blk_mq_process_requeue_list(hctx);
 	blk_mq_run_dispatch_ops(hctx->queue,
 				blk_mq_sched_dispatch_requests(hctx));
 }
@@ -4237,7 +4241,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
 	blk_mq_update_poll_flag(q);
 
-	INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
 	INIT_LIST_HEAD(&q->flush_list);
 	INIT_LIST_HEAD(&q->requeue_list);
 	spin_lock_init(&q->requeue_lock);
@@ -4786,8 +4789,6 @@ void blk_mq_cancel_work_sync(struct request_queue *q)
 	struct blk_mq_hw_ctx *hctx;
 	unsigned long i;
 
-	cancel_delayed_work_sync(&q->requeue_work);
-
 	queue_for_each_hw_ctx(q, hctx, i)
 		cancel_delayed_work_sync(&hctx->run_work);
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index fe99948688df..f410cce7289b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -491,7 +491,6 @@ struct request_queue {
 
 	struct list_head	requeue_list;
 	spinlock_t		requeue_lock;
-	struct delayed_work	requeue_work;
 
 	struct mutex		sysfs_lock;
 	struct mutex		sysfs_dir_lock;
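
For reviewers, a minimal sketch (not part of the patch) of the driver-side
pattern that feeds q->requeue_list. my_driver_complete_rq() is hypothetical;
blk_mq_requeue_request(), blk_mq_kick_requeue_list() and blk_mq_end_request()
are the existing blk-mq API affected here. With this change, kicking the
requeue list runs the hardware queues, and blk_mq_run_hw_queue() drains the
requeue_list before dispatching to the I/O scheduler:

#include <linux/blk-mq.h>

/*
 * Hypothetical driver completion path; shows how a request ends up on
 * q->requeue_list, which blk_mq_run_hw_queue() now processes directly.
 */
static void my_driver_complete_rq(struct request *rq, blk_status_t sts)
{
	if (sts == BLK_STS_RESOURCE) {
		/*
		 * Requeue the request; kick_requeue_list=true runs the
		 * hardware queues, which now drain the requeue_list
		 * before dispatch instead of waiting for requeue_work.
		 */
		blk_mq_requeue_request(rq, true);
		return;
	}
	blk_mq_end_request(rq, sts);
}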