On Mon, Jun 29, 2020 at 06:32:39PM +0800, Ming Lei wrote: > On Mon, Jun 29, 2020 at 10:04:52AM +0100, Christoph Hellwig wrote: > > > +/* > > > + * We know bfq and deadline apply single scheduler queue instead of multi > > > + * queue. However, the two are often used on single queue devices, also > > > + * the current @hctx should affect the real device status most of times > > > + * because of locality principle. > > > + * > > > + * So use current hctx->dispatch_busy directly for figuring out batching > > > + * dispatch count. > > > + */ > > > > I don't really understand this comment. Also I think the code might > > be cleaner if this function is inlined as an if/else in the only > > caller. > > > > > +static inline bool blk_mq_do_dispatch_rq_lists(struct blk_mq_hw_ctx *hctx, > > > + struct list_head *lists, bool multi_hctxs, unsigned count) > > > +{ > > > + bool ret; > > > + > > > + if (!count) > > > + return false; > > > + > > > + if (likely(!multi_hctxs)) > > > + return blk_mq_dispatch_rq_list(hctx, lists, count); > > > > Keeping these checks in the callers would keep things a little cleaner, > > especially as the multi hctx case only really needs the lists argument. > > > > > + LIST_HEAD(list); > > > + struct request *new, *rq = list_first_entry(lists, > > > + struct request, queuelist); > > > + unsigned cnt = 0; > > > + > > > + list_for_each_entry(new, lists, queuelist) { > > > + if (new->mq_hctx != rq->mq_hctx) > > > + break; > > > + cnt++; > > > + } > > > + > > > + if (new->mq_hctx == rq->mq_hctx) > > > + list_splice_tail_init(lists, &list); > > > + else > > > + list_cut_before(&list, lists, &new->queuelist); > > > + > > > + ret = blk_mq_dispatch_rq_list(rq->mq_hctx, &list, cnt); > > > + } > > > > I think this has two issues: for one ret should be ORed as any dispatch > > or error should leaave ret set. Also we need to splice the dispatch > > OK. > > > list back onto the main list here, otherwise we can lose those requests. > > The dispatch list always becomes empty after blk_mq_dispatch_rq_list() > returns, so no need to splice it back. > > > > > FYI, while reviewing this I ended up editing things into a shape I > > could better understand. Let me know what you think of this version? > > > > > > diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c > > index 4c72073830f3cb..466dce99699ae4 100644 > > --- a/block/blk-mq-sched.c > > +++ b/block/blk-mq-sched.c > > @@ -7,6 +7,7 @@ > > #include <linux/kernel.h> > > #include <linux/module.h> > > #include <linux/blk-mq.h> > > +#include <linux/list_sort.h> > > > > #include <trace/events/block.h> > > > > @@ -80,6 +81,38 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) > > blk_mq_run_hw_queue(hctx, true); > > } > > > > +static int sched_rq_cmp(void *priv, struct list_head *a, struct list_head *b) > > +{ > > + struct request *rqa = container_of(a, struct request, queuelist); > > + struct request *rqb = container_of(b, struct request, queuelist); > > + > > + return rqa->mq_hctx > rqb->mq_hctx; > > +} > > + > > +static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list) > > +{ > > + struct blk_mq_hw_ctx *hctx = > > + list_first_entry(rq_list, struct request, queuelist)->mq_hctx; > > + struct request *rq; > > + LIST_HEAD(hctx_list); > > + unsigned int count = 0; > > + bool ret; > > + > > + list_for_each_entry(rq, rq_list, queuelist) { > > + if (rq->mq_hctx != hctx) { > > + list_cut_before(&hctx_list, rq_list, &rq->queuelist); > > + goto dispatch; > > + } > > + count++; > > + } > > + list_splice_tail_init(rq_list, &hctx_list); > > + > > +dispatch: > > + ret = blk_mq_dispatch_rq_list(hctx, &hctx_list, count); > > + list_splice(&hctx_list, rq_list); > > The above line isn't needed. > > > + return ret; > > +} > > + > > #define BLK_MQ_BUDGET_DELAY 3 /* ms units */ > > > > /* > > @@ -90,20 +123,29 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) > > * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to > > * be run again. This is necessary to avoid starving flushes. > > */ > > -static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) > > +static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) > > The return type can be changed to 'bool'. > > > { > > struct request_queue *q = hctx->queue; > > struct elevator_queue *e = q->elevator; > > + bool multi_hctxs = false, run_queue = false; > > + bool dispatched = false, busy = false; > > + unsigned int max_dispatch; > > LIST_HEAD(rq_list); > > - int ret = 0; > > - struct request *rq; > > + int count = 0; > > + > > + if (hctx->dispatch_busy) > > + max_dispatch = 1; > > + else > > + max_dispatch = hctx->queue->nr_requests; > > > > do { > > + struct request *rq; > > + > > if (e->type->ops.has_work && !e->type->ops.has_work(hctx)) > > break; > > > > if (!list_empty_careful(&hctx->dispatch)) { > > - ret = -EAGAIN; > > + busy = true; > > break; > > } > > > > @@ -120,7 +162,7 @@ static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) > > * no guarantee anyone will kick the queue. Kick it > > * ourselves. > > */ > > - blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY); > > + run_queue = true; > > break; > > } > > > > @@ -130,7 +172,43 @@ static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) > > * in blk_mq_dispatch_rq_list(). > > */ > > list_add(&rq->queuelist, &rq_list); > > The above should change to list_add_tail(&rq->queuelist, &rq_list). Hi Christoph, Follows the revised patch, and will post it as V7 if you are fine: diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 4c72073830f3..1c52e56a19b1 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -7,6 +7,7 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/blk-mq.h> +#include <linux/list_sort.h> #include <trace/events/block.h> @@ -80,6 +81,37 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) blk_mq_run_hw_queue(hctx, true); } +static int sched_rq_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct request *rqa = container_of(a, struct request, queuelist); + struct request *rqb = container_of(b, struct request, queuelist); + + return rqa->mq_hctx > rqb->mq_hctx; +} + +static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list) +{ + struct blk_mq_hw_ctx *hctx = + list_first_entry(rq_list, struct request, queuelist)->mq_hctx; + struct request *rq; + LIST_HEAD(hctx_list); + unsigned int count = 0; + bool ret; + + list_for_each_entry(rq, rq_list, queuelist) { + if (rq->mq_hctx != hctx) { + list_cut_before(&hctx_list, rq_list, &rq->queuelist); + goto dispatch; + } + count++; + } + list_splice_tail_init(rq_list, &hctx_list); + +dispatch: + ret = blk_mq_dispatch_rq_list(hctx, &hctx_list, count); + return ret; +} + #define BLK_MQ_BUDGET_DELAY 3 /* ms units */ /* @@ -90,20 +122,29 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to * be run again. This is necessary to avoid starving flushes. */ -static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) +static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; + bool multi_hctxs = false, run_queue = false; + bool dispatched = false, busy = false; + unsigned int max_dispatch; LIST_HEAD(rq_list); - int ret = 0; - struct request *rq; + int count = 0; + + if (hctx->dispatch_busy) + max_dispatch = 1; + else + max_dispatch = hctx->queue->nr_requests; do { + struct request *rq; + if (e->type->ops.has_work && !e->type->ops.has_work(hctx)) break; if (!list_empty_careful(&hctx->dispatch)) { - ret = -EAGAIN; + busy = true; break; } @@ -120,7 +161,7 @@ static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) * no guarantee anyone will kick the queue. Kick it * ourselves. */ - blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY); + run_queue = true; break; } @@ -129,8 +170,42 @@ static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) * if this rq won't be queued to driver via .queue_rq() * in blk_mq_dispatch_rq_list(). */ - list_add(&rq->queuelist, &rq_list); - } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, 1)); + list_add_tail(&rq->queuelist, &rq_list); + if (rq->mq_hctx != hctx) + multi_hctxs = true; + } while (++count < max_dispatch); + + if (!count) { + if (run_queue) + blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY); + } else if (multi_hctxs) { + /* + * Requests from different hctx may be dequeued from some + * schedulers, such as bfq and deadline. + * + * Sort the requests in the list according to their hctx, + * dispatch batching requests from same hctx at a time. + */ + list_sort(NULL, &rq_list, sched_rq_cmp); + do { + dispatched |= blk_mq_dispatch_hctx_list(&rq_list); + } while (!list_empty(&rq_list)); + } else { + dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, count); + } + + if (busy) + return -EAGAIN; + return !!dispatched; +} + +static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) +{ + int ret; + + do { + ret = __blk_mq_do_dispatch_sched(hctx); + } while (ret == 1); return ret; } diff --git a/block/blk-mq.c b/block/blk-mq.c index d273a56f11c0..57ae018d5cc8 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1323,8 +1323,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, if (list_empty(list)) return false; - WARN_ON(!list_is_singular(list) && nr_budgets); - /* * Now process all the entries, sending them to the driver. */ Thanks, Ming