Flushes bypass the I/O scheduler and get added to hctx->dispatch in blk_mq_sched_bypass_insert. This can happen while a kworker is running hctx->run_work work item and is past the point in blk_mq_sched_dispatch_requests where hctx->dispatch is checked. The blk_mq_do_dispatch_sched call is not guaranteed to end in bounded time, because the I/O scheduler can feed an arbitrary number of commands. Since we have only one hctx->run_work, the commands waiting in hctx->dispatch will wait an arbitrary length of time for run_work to be rerun. A similar phenomenon exists with dispatches from the software queue. The solution is to poll hctx->dispatch in blk_mq_do_dispatch_sched and blk_mq_do_dispatch_ctx and return from the run_work handler and let it rerun. Signed-off-by: Salman Qazi <sqazi@xxxxxxxxxx> Reviewed-by: Ming Lei <ming.lei@xxxxxxxxxx> --- block/blk-mq-sched.c | 49 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 6 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index ca22afd47b3d..3e78c5bbb4d9 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -84,12 +84,17 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) * Only SCSI implements .get_budget and .put_budget, and SCSI restarts * its queue by itself in its completion handler, so we don't need to * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE. + * + * Returns true if hctx->dispatch was found non-empty and + * run_work has to be run again. This is necessary to avoid + * starving flushes. */ -static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) +static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; LIST_HEAD(rq_list); + bool ret = false; do { struct request *rq; @@ -97,6 +102,11 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) if (e->type->ops.has_work && !e->type->ops.has_work(hctx)) break; + if (!list_empty_careful(&hctx->dispatch)) { + ret = true; + break; + } + if (!blk_mq_get_dispatch_budget(hctx)) break; @@ -113,6 +123,8 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) */ list_add(&rq->queuelist, &rq_list); } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); + + return ret; } static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, @@ -130,16 +142,26 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, * Only SCSI implements .get_budget and .put_budget, and SCSI restarts * its queue by itself in its completion handler, so we don't need to * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE. + * + * Returns true if hctx->dispatch was found non-empty and + * run_work has to be run again. This is necessary to avoid + * starving flushes. */ -static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) +static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; LIST_HEAD(rq_list); struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from); + bool ret = false; do { struct request *rq; + if (!list_empty_careful(&hctx->dispatch)) { + ret = true; + break; + } + if (!sbitmap_any_bit_set(&hctx->ctx_map)) break; @@ -165,6 +187,7 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); WRITE_ONCE(hctx->dispatch_from, ctx); + return ret; } void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) @@ -172,6 +195,8 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; const bool has_sched_dispatch = e && e->type->ops.dispatch_request; + bool run_again; + bool restarted = false; LIST_HEAD(rq_list); /* RCU or SRCU read lock is needed before checking quiesced flag */ @@ -180,6 +205,9 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) hctx->run++; +again: + run_again = false; + /* * If we have previous entries on our dispatch list, grab them first for * more fair dispatch. @@ -208,19 +236,28 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) blk_mq_sched_mark_restart_hctx(hctx); if (blk_mq_dispatch_rq_list(q, &rq_list, false)) { if (has_sched_dispatch) - blk_mq_do_dispatch_sched(hctx); + run_again = blk_mq_do_dispatch_sched(hctx); else - blk_mq_do_dispatch_ctx(hctx); + run_again = blk_mq_do_dispatch_ctx(hctx); } } else if (has_sched_dispatch) { - blk_mq_do_dispatch_sched(hctx); + run_again = blk_mq_do_dispatch_sched(hctx); } else if (hctx->dispatch_busy) { /* dequeue request one by one from sw queue if queue is busy */ - blk_mq_do_dispatch_ctx(hctx); + run_again = blk_mq_do_dispatch_ctx(hctx); } else { blk_mq_flush_busy_ctxs(hctx, &rq_list); blk_mq_dispatch_rq_list(q, &rq_list, false); } + + if (run_again) { + if (!restarted) { + restarted = true; + goto again; + } + + blk_mq_run_hw_queue(hctx, true); + } } bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, -- 2.25.0.341.g760bfbb309-goog