On Wed, Feb 05, 2020 at 11:57:06AM -0800, Salman Qazi wrote: > Flushes bypass the I/O scheduler and get added to hctx->dispatch > in blk_mq_sched_bypass_insert. This can happen while a kworker is running > hctx->run_work work item and is past the point in > blk_mq_sched_dispatch_requests where hctx->dispatch is checked. > > The blk_mq_do_dispatch_sched call is not guaranteed to end in bounded time, > because the I/O scheduler can feed an arbitrary number of commands. > > Since we have only one hctx->run_work, the commands waiting in > hctx->dispatch will wait an arbitrary length of time for run_work to be > rerun. > > A similar phenomenon exists with dispatches from the software queue. > > The solution is to poll hctx->dispatch in blk_mq_do_dispatch_sched and > blk_mq_do_dispatch_ctx and return from the run_work handler and let it > rerun. > > Signed-off-by: Salman Qazi <sqazi@xxxxxxxxxx> > --- > block/blk-mq-sched.c | 37 +++++++++++++++++++++++++++++++------ > 1 file changed, 31 insertions(+), 6 deletions(-) > > diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c > index ca22afd47b3d..52249fddeb66 100644 > --- a/block/blk-mq-sched.c > +++ b/block/blk-mq-sched.c > @@ -84,12 +84,16 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) > * Only SCSI implements .get_budget and .put_budget, and SCSI restarts > * its queue by itself in its completion handler, so we don't need to > * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE. > + * > + * Returns true if hctx->dispatch was found non-empty and > + * run_work has to be run again. > */ > -static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) > +static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) > { > struct request_queue *q = hctx->queue; > struct elevator_queue *e = q->elevator; > LIST_HEAD(rq_list); > + bool ret = false; > > do { > struct request *rq; > @@ -97,6 +101,11 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) > if (e->type->ops.has_work && !e->type->ops.has_work(hctx)) > break; > > + if (!list_empty_careful(&hctx->dispatch)) { > + ret = true; > + break; > + } > + > if (!blk_mq_get_dispatch_budget(hctx)) > break; > > @@ -113,6 +122,8 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) > */ > list_add(&rq->queuelist, &rq_list); > } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); > + > + return ret; > } > > static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, > @@ -130,16 +141,25 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, > * Only SCSI implements .get_budget and .put_budget, and SCSI restarts > * its queue by itself in its completion handler, so we don't need to > * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE. > + * > + * Returns true if hctx->dispatch was found non-empty and > + * run_work has to be run again. > */ > -static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) > +static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) > { > struct request_queue *q = hctx->queue; > LIST_HEAD(rq_list); > struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from); > + bool ret = false; > > do { > struct request *rq; > > + if (!list_empty_careful(&hctx->dispatch)) { > + ret = true; > + break; > + } > + > if (!sbitmap_any_bit_set(&hctx->ctx_map)) > break; > > @@ -165,6 +185,7 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) > } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); > > WRITE_ONCE(hctx->dispatch_from, ctx); > + return ret; > } > > void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) > @@ -172,6 +193,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) > struct request_queue *q = hctx->queue; > struct elevator_queue *e = q->elevator; > const bool has_sched_dispatch = e && e->type->ops.dispatch_request; > + bool run_again = false; > LIST_HEAD(rq_list); > > /* RCU or SRCU read lock is needed before checking quiesced flag */ > @@ -208,19 +230,22 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) > blk_mq_sched_mark_restart_hctx(hctx); > if (blk_mq_dispatch_rq_list(q, &rq_list, false)) { > if (has_sched_dispatch) > - blk_mq_do_dispatch_sched(hctx); > + run_again = blk_mq_do_dispatch_sched(hctx); > else > - blk_mq_do_dispatch_ctx(hctx); > + run_again = blk_mq_do_dispatch_ctx(hctx); > } > } else if (has_sched_dispatch) { > - blk_mq_do_dispatch_sched(hctx); > + run_again = blk_mq_do_dispatch_sched(hctx); > } else if (hctx->dispatch_busy) { > /* dequeue request one by one from sw queue if queue is busy */ > - blk_mq_do_dispatch_ctx(hctx); > + run_again = blk_mq_do_dispatch_ctx(hctx); > } else { > blk_mq_flush_busy_ctxs(hctx, &rq_list); > blk_mq_dispatch_rq_list(q, &rq_list, false); > } > + > + if (run_again) > + blk_mq_run_hw_queue(hctx, true); One improvement may be to run again locally in this function by limited times(such as 1) first, then switch to blk_mq_run_hw_queue() if run again is still needed. This way may save one async run hw queue. Thanks, Ming