On Tue, Sep 19, 2017 at 02:37:50PM -0600, Jens Axboe wrote:
> On 09/02/2017 09:17 AM, Ming Lei wrote:
> > @@ -142,18 +178,31 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
> >  	if (!list_empty(&rq_list)) {
> >  		blk_mq_sched_mark_restart_hctx(hctx);
> >  		do_sched_dispatch = blk_mq_dispatch_rq_list(q, &rq_list);
> > -	} else if (!has_sched_dispatch) {
> > +	} else if (!has_sched_dispatch && !q->queue_depth) {
> > +		/*
> > +		 * If there is no per-request_queue depth, we
> > +		 * flush all requests in this hw queue, otherwise
> > +		 * pick up request one by one from sw queue for
> > +		 * avoiding to mess up I/O merge when dispatch
> > +		 * is busy, which can be triggered easily by
> > +		 * per-request_queue queue depth
> > +		 */
> >  		blk_mq_flush_busy_ctxs(hctx, &rq_list);
> >  		blk_mq_dispatch_rq_list(q, &rq_list);
> >  	}
>
> I don't like this part at all. It's a heuristic, and on top of that,
> it's a heuristic based on a weak assumption that if q->queue_depth == 0,
> then we never run into BUSY constraints. I think that would be stronger
> if it was based on "is this using shared tags", but even that is not
> great at all. It's perfectly possible to have a shared tags setup and
> never run into resource constraints. The opposite condition is also true
> - you can run without shared tags, and yet hit resource constraints
> constantly. Hence this is NOT a good test for deciding whether to flush
> everything at once or not. In short, I think a much better test case
> would be "has this device ever returned BLK_STS_RESOURCE. As it so
> happens, that's easy enough to keep track of and base this test on.

Hi Jens,

The patch below follows your suggestion: it uses an EWMA to compute the
average length of hctx->dispatch, and only flushes all requests from the
ctxs when that average length is zero. What do you think of this
approach? Or do you have other suggestions?
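To illustrate how the weight-8 EWMA behaves, here is a minimal
standalone userspace sketch. It is illustrative only and not part of
the patch below: the sample lengths are made up, and READ_ONCE/WRITE_ONCE
are dropped since it runs single-threaded.

/*
 * Standalone sketch of the weight-8 EWMA used in the patch below.
 * The samples stand in for the number of requests left in
 * hctx->dispatch after each dispatch run.
 */
#include <stdio.h>

static void ewma_add(unsigned *ewma, unsigned val, unsigned weight)
{
	unsigned result = *ewma;

	if (val == 1) {
		result += 1;
	} else {
		result *= weight - 1;
		result += val;
		result /= weight;
	}
	*ewma = result;
}

int main(void)
{
	/* a few busy runs, then the dispatch list stays empty */
	unsigned samples[] = { 16, 16, 8, 0, 0, 0, 0, 0 };
	unsigned dispatch_len = 0;
	unsigned i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		ewma_add(&dispatch_len, samples[i], 8);
		printf("sample=%2u -> dispatch_len=%u\n",
		       samples[i], dispatch_len);
	}
	/* once dispatch_len decays back to zero, the hw queue would
	 * go back to flushing all ctxs at once */
	return 0;
}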
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 7a27f262c96a..c420c775b9c0 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -232,6 +232,14 @@ static int hctx_flags_show(void *data, struct seq_file *m)
 	return 0;
 }
 
+static int hctx_dispatch_len_show(void *data, struct seq_file *m)
+{
+	struct blk_mq_hw_ctx *hctx = data;
+
+	seq_printf(m, "%u\n", hctx->dispatch_len);
+	return 0;
+}
+
 #define REQ_OP_NAME(name) [REQ_OP_##name] = #name
 static const char *const op_name[] = {
 	REQ_OP_NAME(READ),
@@ -763,6 +771,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
 	{"state", 0400, hctx_state_show},
 	{"flags", 0400, hctx_flags_show},
 	{"dispatch", 0400, .seq_ops = &hctx_dispatch_seq_ops},
+	{"dispatch_length", 0400, hctx_dispatch_len_show},
 	{"busy", 0400, hctx_busy_show},
 	{"ctx_map", 0400, hctx_ctx_map_show},
 	{"tags", 0400, hctx_tags_show},
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index a5712dd67783..91a6eeaadaf1 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -102,7 +102,7 @@ static void blk_mq_do_dispatch_sched(struct request_queue *q,
 		if (!rq)
 			break;
 		list_add(&rq->queuelist, &rq_list);
-	} while (blk_mq_dispatch_rq_list(q, &rq_list));
+	} while (blk_mq_dispatch_rq_list(q, &rq_list, 1));
 }
 
 static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
@@ -134,19 +134,51 @@ static void blk_mq_do_dispatch_ctx(struct request_queue *q,
 		/* round robin for fair dispatch */
 		ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
 
-		dispatched = blk_mq_dispatch_rq_list(q, &rq_list);
+		dispatched = blk_mq_dispatch_rq_list(q, &rq_list, 1);
 	} while (dispatched);
 
 	if (!dispatched)
 		WRITE_ONCE(hctx->dispatch_from, ctx);
 }
 
+/* borrowed from bcache */
+void ewma_add(unsigned *ewma, unsigned val, unsigned weight)
+{
+	unsigned result = READ_ONCE(*ewma);
+
+	if (val == 1) {
+		result += 1;
+	} else {
+		result *= weight - 1;
+		result += val;
+		result /= weight;
+	}
+	WRITE_ONCE(*ewma, result);
+}
+
+/*
+ * We use an EWMA to compute the average length of the dispatch
+ * list. It is completely lockless, but that is fine since it
+ * is only a hint.
+ *
+ * We only flush requests from all ctxs if the average length
+ * of the dispatch list is zero; otherwise the hw queue is
+ * considered busy and we dequeue requests from the ctxs one
+ * by one.
+ */
+static void blk_mq_update_dispatch_length(struct blk_mq_hw_ctx *hctx,
+					  unsigned cnt)
+{
+	ewma_add(&hctx->dispatch_len, cnt, 8);
+}
+
 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
 	struct elevator_queue *e = q->elevator;
 	const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
 	LIST_HEAD(rq_list);
+	unsigned rq_cnt = 0;
 
 	/* RCU or SRCU read lock is needed before checking quiesced flag */
 	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
@@ -162,9 +194,14 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 		spin_lock(&hctx->lock);
 		if (!list_empty(&hctx->dispatch))
 			list_splice_init(&hctx->dispatch, &rq_list);
+		rq_cnt = hctx->cnt;
+		hctx->cnt = 0;
 		spin_unlock(&hctx->lock);
 	}
 
+	if (!has_sched_dispatch)
+		blk_mq_update_dispatch_length(hctx, rq_cnt);
+
 	/*
 	 * Only ask the scheduler for requests, if we didn't have residual
 	 * requests from the dispatch list. This is to avoid the case where
@@ -176,7 +213,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 	 */
 	if (!list_empty(&rq_list)) {
 		blk_mq_sched_mark_restart_hctx(hctx);
-		blk_mq_dispatch_rq_list(q, &rq_list);
+		blk_mq_dispatch_rq_list(q, &rq_list, rq_cnt);
 
 		/*
 		 * We may clear DISPATCH_BUSY just after it
@@ -204,16 +241,16 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 
 	if (!has_sched_dispatch) {
 		/*
-		 * If there is no per-request_queue depth, we
+		 * If dispatch doesn't run out of resource, we
 		 * flush all requests in this hw queue, otherwise
 		 * pick up request one by one from sw queue for
 		 * avoiding to mess up I/O merge when dispatch
 		 * run out of resource, which can be triggered
 		 * easily by per-request_queue queue depth
 		 */
-		if (!q->queue_depth) {
-			blk_mq_flush_busy_ctxs(hctx, &rq_list);
-			blk_mq_dispatch_rq_list(q, &rq_list);
+		if (!hctx->dispatch_len) {
+			rq_cnt = blk_mq_flush_busy_ctxs(hctx, &rq_list);
+			blk_mq_dispatch_rq_list(q, &rq_list, rq_cnt);
 		} else {
 			blk_mq_do_dispatch_ctx(q, hctx);
 		}
@@ -346,6 +383,7 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
 	 * the dispatch list.
 	 */
 	spin_lock(&hctx->lock);
+	hctx->cnt++;
 	list_add(&rq->queuelist, &hctx->dispatch);
 	set_bit(BLK_MQ_S_DISPATCH_BUSY, &hctx->state);
 	spin_unlock(&hctx->lock);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 1bb45e995da3..b2db5d6d568e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -850,6 +850,7 @@ static void blk_mq_timeout_work(struct work_struct *work)
 struct flush_busy_ctx_data {
 	struct blk_mq_hw_ctx *hctx;
 	struct list_head *list;
+	unsigned cnt;
 };
 
 static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
@@ -857,11 +858,16 @@ static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
 	struct flush_busy_ctx_data *flush_data = data;
 	struct blk_mq_hw_ctx *hctx = flush_data->hctx;
 	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
+	unsigned cnt = 0;
 
 	sbitmap_clear_bit(sb, bitnr);
 	spin_lock(&ctx->lock);
 	list_splice_tail_init(&ctx->rq_list, flush_data->list);
+	cnt = ctx->cnt;
+	ctx->cnt = 0;
 	spin_unlock(&ctx->lock);
+	flush_data->cnt += cnt;
+
 	return true;
 }
 
@@ -869,14 +875,17 @@ static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
  * Process software queues that have been marked busy, splicing them
  * to the for-dispatch
  */
-void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
+unsigned blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 {
 	struct flush_busy_ctx_data data = {
 		.hctx = hctx,
 		.list = list,
+		.cnt = 0,
 	};
 
 	sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
+
+	return data.cnt;
 }
 EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
 
@@ -897,6 +906,7 @@ static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr, void *d
 		list_del_init(&dispatch_data->rq->queuelist);
 		if (list_empty(&ctx->rq_list))
 			sbitmap_clear_bit(sb, bitnr);
+		ctx->cnt--;
 	}
 	spin_unlock(&ctx->lock);
 
@@ -1052,7 +1062,9 @@ static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx)
 	return true;
 }
 
-bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
+/* @cnt represents the number of requests in @list */
+bool blk_mq_dispatch_rq_list(struct request_queue *q,
+		struct list_head *list, unsigned cnt)
 {
 	struct blk_mq_hw_ctx *hctx;
 	struct request *rq;
@@ -1139,6 +1151,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
 		blk_mq_put_driver_tag(rq);
 
 		spin_lock(&hctx->lock);
+		hctx->cnt += cnt - queued - errors;
 		list_splice_init(list, &hctx->dispatch);
 		/*
 		 * DISPATCH_BUSY won't be cleared until all requests
@@ -1431,6 +1444,7 @@ static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
 		list_add(&rq->queuelist, &ctx->rq_list);
 	else
 		list_add_tail(&rq->queuelist, &ctx->rq_list);
+	ctx->cnt++;
 }
 
 void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
@@ -1454,6 +1468,7 @@ void blk_mq_request_bypass_insert(struct request *rq)
 	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
 
 	spin_lock(&hctx->lock);
+	hctx->cnt++;
 	list_add_tail(&rq->queuelist, &hctx->dispatch);
 	set_bit(BLK_MQ_S_DISPATCH_BUSY, &hctx->state);
 	spin_unlock(&hctx->lock);
@@ -1941,6 +1956,7 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
 		if (!list_empty(&ctx->rq_list)) {
 			list_splice_init(&ctx->rq_list, &tmp);
 			blk_mq_hctx_clear_pending(hctx, ctx);
+			ctx->cnt = 0;
 		}
 		spin_unlock(&ctx->lock);
 
@@ -1950,6 +1966,7 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
 	spin_lock(&hctx->lock);
 	list_splice_tail_init(&tmp, &hctx->dispatch);
 	set_bit(BLK_MQ_S_DISPATCH_BUSY, &hctx->state);
+	hctx->cnt++;
 	spin_unlock(&hctx->lock);
 
 	blk_mq_run_hw_queue(hctx, true);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 231cfb0d973b..0da5a81cc41f 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -9,6 +9,7 @@ struct blk_mq_ctx {
 	struct {
 		spinlock_t		lock;
 		struct list_head	rq_list;
+		unsigned		cnt;
 	} ____cacheline_aligned_in_smp;
 
 	unsigned int		cpu;
@@ -30,8 +31,9 @@ void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_free_queue(struct request_queue *q);
 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
 void blk_mq_wake_waiters(struct request_queue *q);
-bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *);
-void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
+bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *,
+		unsigned);
+unsigned blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
 bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx);
 bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
 		bool wait);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 13f6c25fa461..778dbdd596f3 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -11,6 +11,7 @@ struct blk_flush_queue;
 struct blk_mq_hw_ctx {
 	struct {
 		spinlock_t		lock;
+		unsigned		cnt;
 		struct list_head	dispatch;
 		unsigned long		state;		/* BLK_MQ_S_* flags */
 	} ____cacheline_aligned_in_smp;
@@ -31,6 +32,7 @@ struct blk_mq_hw_ctx {
 	struct sbitmap		ctx_map;
 
 	struct blk_mq_ctx	*dispatch_from;
+	unsigned		dispatch_len;
 
 	struct blk_mq_ctx	**ctxs;
 	unsigned int		nr_ctx;

Thanks,
Ming