On 12/17/18 5:55 AM, Jens Axboe wrote: > On 12/17/18 5:08 AM, Ming Lei wrote: >> When requst is added to rq list of sw queue(ctx), the rq may be from >> different type of hctx, especially after multi queue mapping is introduced. >> >> So when dispach request from sw queue via blk_mq_flush_busy_ctxs() or >> blk_mq_dequeue_from_ctx(), one request belonging to other queue type of >> hctx can be dispatch to current hctx in case that read queue or poll queue >> is enabled. >> >> This patch fixes this issue by introducing per-queue-type list. > > Looks good, just one comment: > >> diff --git a/block/blk-mq.h b/block/blk-mq.h >> index d1ed096723fb..0973a91eb1dd 100644 >> --- a/block/blk-mq.h >> +++ b/block/blk-mq.h >> @@ -12,14 +12,16 @@ struct blk_mq_ctxs { >> struct blk_mq_ctx __percpu *queue_ctx; >> }; >> >> +struct blk_mq_ctx_list { >> + spinlock_t lock; >> + struct list_head rq_list; >> +} ____cacheline_aligned_in_smp; >> + >> /** >> * struct blk_mq_ctx - State for a software queue facing the submitting CPUs >> */ >> struct blk_mq_ctx { >> - struct { >> - spinlock_t lock; >> - struct list_head rq_list; >> - } ____cacheline_aligned_in_smp; >> + struct blk_mq_ctx_list list[HCTX_MAX_TYPES]; > > Let's not make that use 3 cachelines. There is no good reason to split > these across cachelines, if we have heavy traffic to multiple of these, > then we're not going very fast anyway. So just make it: > > struct { > spinlock_t lock; > struct list_head rq_list[HCTX_MAX_TYPES]; > } ____cacheline_aligned_in_smp; Did it just to check, turns out fine and is of course less changes. Let me know. diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 1e12033be9ea..90d68760af08 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -652,36 +652,43 @@ static int hctx_dispatch_busy_show(void *data, struct seq_file *m) return 0; } -static void *ctx_rq_list_start(struct seq_file *m, loff_t *pos) - __acquires(&ctx->lock) -{ - struct blk_mq_ctx *ctx = m->private; - - spin_lock(&ctx->lock); - return seq_list_start(&ctx->rq_list, *pos); -} - -static void *ctx_rq_list_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct blk_mq_ctx *ctx = m->private; - - return seq_list_next(v, &ctx->rq_list, pos); -} +#define CTX_RQ_SEQ_OPS(name, type) \ +static void *ctx_##name##_rq_list_start(struct seq_file *m, loff_t *pos) \ + __acquires(&ctx->lock) \ +{ \ + struct blk_mq_ctx *ctx = m->private; \ + \ + spin_lock(&ctx->lock); \ + return seq_list_start(&ctx->rq_lists[type], *pos); \ +} \ + \ +static void *ctx_##name##_rq_list_next(struct seq_file *m, void *v, \ + loff_t *pos) \ +{ \ + struct blk_mq_ctx *ctx = m->private; \ + \ + return seq_list_next(v, &ctx->rq_lists[type], pos); \ +} \ + \ +static void ctx_##name##_rq_list_stop(struct seq_file *m, void *v) \ + __releases(&ctx->lock) \ +{ \ + struct blk_mq_ctx *ctx = m->private; \ + \ + spin_unlock(&ctx->lock); \ +} \ + \ +static const struct seq_operations ctx_##name##_rq_list_seq_ops = { \ + .start = ctx_##name##_rq_list_start, \ + .next = ctx_##name##_rq_list_next, \ + .stop = ctx_##name##_rq_list_stop, \ + .show = blk_mq_debugfs_rq_show, \ +} + +CTX_RQ_SEQ_OPS(default, HCTX_TYPE_DEFAULT); +CTX_RQ_SEQ_OPS(read, HCTX_TYPE_READ); +CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL); -static void ctx_rq_list_stop(struct seq_file *m, void *v) - __releases(&ctx->lock) -{ - struct blk_mq_ctx *ctx = m->private; - - spin_unlock(&ctx->lock); -} - -static const struct seq_operations ctx_rq_list_seq_ops = { - .start = ctx_rq_list_start, - .next = ctx_rq_list_next, - .stop = ctx_rq_list_stop, - .show = blk_mq_debugfs_rq_show, -}; static int ctx_dispatched_show(void *data, struct seq_file *m) { struct blk_mq_ctx *ctx = data; @@ -819,7 +826,9 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { }; static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { - {"rq_list", 0400, .seq_ops = &ctx_rq_list_seq_ops}, + {"default_rq_list", 0400, .seq_ops = &ctx_default_rq_list_seq_ops}, + {"read_rq_list", 0400, .seq_ops = &ctx_read_rq_list_seq_ops}, + {"poll_rq_list", 0400, .seq_ops = &ctx_poll_rq_list_seq_ops}, {"dispatched", 0600, ctx_dispatched_show, ctx_dispatched_write}, {"merged", 0600, ctx_merged_show, ctx_merged_write}, {"completed", 0600, ctx_completed_show, ctx_completed_write}, diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 1efd781fcdea..5af40bbf4fc6 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -301,11 +301,14 @@ EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge); * too much time checking for merges. */ static bool blk_mq_attempt_merge(struct request_queue *q, + struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct bio *bio) { + enum hctx_type type = hctx->type; + lockdep_assert_held(&ctx->lock); - if (blk_mq_bio_list_merge(q, &ctx->rq_list, bio)) { + if (blk_mq_bio_list_merge(q, &ctx->rq_lists[type], bio)) { ctx->rq_merged++; return true; } @@ -319,17 +322,19 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx->cpu); bool ret = false; + enum hctx_type type; if (e && e->type->ops.bio_merge) { blk_mq_put_ctx(ctx); return e->type->ops.bio_merge(hctx, bio); } + type = hctx->type; if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && - !list_empty_careful(&ctx->rq_list)) { + !list_empty_careful(&ctx->rq_lists[type])) { /* default per sw-queue merge */ spin_lock(&ctx->lock); - ret = blk_mq_attempt_merge(q, ctx, bio); + ret = blk_mq_attempt_merge(q, hctx, ctx, bio); spin_unlock(&ctx->lock); } diff --git a/block/blk-mq.c b/block/blk-mq.c index 313f28b2d079..1546e88fe59c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -958,9 +958,10 @@ static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) struct flush_busy_ctx_data *flush_data = data; struct blk_mq_hw_ctx *hctx = flush_data->hctx; struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; + enum hctx_type type = hctx->type; spin_lock(&ctx->lock); - list_splice_tail_init(&ctx->rq_list, flush_data->list); + list_splice_tail_init(&ctx->rq_lists[type], flush_data->list); sbitmap_clear_bit(sb, bitnr); spin_unlock(&ctx->lock); return true; @@ -992,12 +993,13 @@ static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr, struct dispatch_rq_data *dispatch_data = data; struct blk_mq_hw_ctx *hctx = dispatch_data->hctx; struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; + enum hctx_type type = hctx->type; spin_lock(&ctx->lock); - if (!list_empty(&ctx->rq_list)) { - dispatch_data->rq = list_entry_rq(ctx->rq_list.next); + if (!list_empty(&ctx->rq_lists[type])) { + dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next); list_del_init(&dispatch_data->rq->queuelist); - if (list_empty(&ctx->rq_list)) + if (list_empty(&ctx->rq_lists[type])) sbitmap_clear_bit(sb, bitnr); } spin_unlock(&ctx->lock); @@ -1608,15 +1610,16 @@ static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, bool at_head) { struct blk_mq_ctx *ctx = rq->mq_ctx; + enum hctx_type type = hctx->type; lockdep_assert_held(&ctx->lock); trace_block_rq_insert(hctx->queue, rq); if (at_head) - list_add(&rq->queuelist, &ctx->rq_list); + list_add(&rq->queuelist, &ctx->rq_lists[type]); else - list_add_tail(&rq->queuelist, &ctx->rq_list); + list_add_tail(&rq->queuelist, &ctx->rq_lists[type]); } void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, @@ -1651,6 +1654,7 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, { struct request *rq; + enum hctx_type type = hctx->type; /* * preemption doesn't flush plug list, so it's possible ctx->cpu is @@ -1662,7 +1666,7 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, } spin_lock(&ctx->lock); - list_splice_tail_init(list, &ctx->rq_list); + list_splice_tail_init(list, &ctx->rq_lists[type]); blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); } @@ -2200,13 +2204,15 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; LIST_HEAD(tmp); + enum hctx_type type; hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); ctx = __blk_mq_get_ctx(hctx->queue, cpu); + type = hctx->type; spin_lock(&ctx->lock); - if (!list_empty(&ctx->rq_list)) { - list_splice_init(&ctx->rq_list, &tmp); + if (!list_empty(&ctx->rq_lists[type])) { + list_splice_init(&ctx->rq_lists[type], &tmp); blk_mq_hctx_clear_pending(hctx, ctx); } spin_unlock(&ctx->lock); @@ -2343,10 +2349,14 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, for_each_possible_cpu(i) { struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); struct blk_mq_hw_ctx *hctx; + int k; __ctx->cpu = i; + spin_lock_init(&__ctx->lock); - INIT_LIST_HEAD(&__ctx->rq_list); + for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++) { + INIT_LIST_HEAD(&__ctx->rq_lists[k]); + } __ctx->queue = q; /* diff --git a/block/blk-mq.h b/block/blk-mq.h index d1ed096723fb..d943d46b0785 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -18,8 +18,8 @@ struct blk_mq_ctxs { struct blk_mq_ctx { struct { spinlock_t lock; - struct list_head rq_list; - } ____cacheline_aligned_in_smp; + struct list_head rq_lists[HCTX_MAX_TYPES]; + } ____cacheline_aligned_in_smp; unsigned int cpu; unsigned short index_hw[HCTX_MAX_TYPES]; -- Jens Axboe