From: Yu Kuai <yukuai3@xxxxxxxxxx>

Starting tag fair sharing as soon as a device starts to issue IO wastes
resources: the same number of tags is assigned to each disk/hctx, and
those tags can't be used by any other disk/hctx, which means a disk/hctx
can't use more than its assigned share even if lots of the tags assigned
to other disks are unused.

Add a new API, blk_mq_driver_tag_busy(), that is called when getting a
driver tag fails, and move tag sharing from blk_mq_tag_busy() to
blk_mq_driver_tag_busy().

This approach works well as long as the total tags are not exhausted;
follow-up patches will refactor how tags are shared to handle that case.

Signed-off-by: Yu Kuai <yukuai3@xxxxxxxxxx>
---
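Illustration only, not part of the patch: the standalone C sketch below
models the fair-sharing limit that hctx_may_queue() enforces once this
change is applied. may_queue(), its parameters and the numbers in main()
are hypothetical stand-ins for hctx_may_queue(), bt->sb.depth,
tags->ctl.share_queues and the per-hctx in-flight count; the real helper
also guarantees a small minimum number of tags per queue. The point is
that no limit applies while share_queues is 0, i.e. until
blk_mq_driver_tag_busy() has observed a failed driver-tag allocation.

#include <stdbool.h>
#include <stdio.h>

/*
 * Simplified model of hctx_may_queue() with the new share_queues counter:
 * until a driver-tag allocation fails, share_queues stays 0 and no limit
 * is applied; afterwards each queue is limited to roughly its fair share
 * of the tag depth.
 */
static bool may_queue(unsigned int depth, unsigned int share_queues,
		      unsigned int in_flight)
{
	unsigned int limit;

	if (!share_queues)		/* tag sharing not started yet: no limit */
		return true;

	limit = depth / share_queues;	/* fair share of the tag space */
	if (!limit)
		limit = 1;

	return in_flight < limit;
}

int main(void)
{
	/* 256 shared tags, 4 active queues, 100 requests already in flight */
	printf("before any tag failure: %d\n", may_queue(256, 0, 100)); /* 1 */
	printf("after tag contention:   %d\n", may_queue(256, 4, 100)); /* 0 */
	return 0;
}

With the old behaviour the limit would apply as soon as a second queue
became active; with this patch all tags stay available to a single busy
disk until another queue actually fails to get a driver tag.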
 block/blk-mq-debugfs.c |  4 ++-
 block/blk-mq-tag.c     | 60 ++++++++++++++++++++++++++++++++++--------
 block/blk-mq.c         |  4 ++-
 block/blk-mq.h         | 13 ++++++---
 include/linux/blk-mq.h |  6 +++--
 include/linux/blkdev.h |  1 +
 6 files changed, 70 insertions(+), 18 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 431aaa3eb181..de5a911b07c2 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -400,8 +400,10 @@ static void blk_mq_debugfs_tags_show(struct seq_file *m,
 {
 	seq_printf(m, "nr_tags=%u\n", tags->nr_tags);
 	seq_printf(m, "nr_reserved_tags=%u\n", tags->nr_reserved_tags);
-	seq_printf(m, "active_queues=%d\n",
+	seq_printf(m, "active_queues=%u\n",
 		   READ_ONCE(tags->ctl.active_queues));
+	seq_printf(m, "share_queues=%u\n",
+		   READ_ONCE(tags->ctl.share_queues));
 
 	seq_puts(m, "\nbitmap_tags:\n");
 	sbitmap_queue_show(&tags->bitmap_tags, m);
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index fe41a0d34fc0..1c2bde917195 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -29,6 +29,32 @@ static void blk_mq_update_wake_batch(struct blk_mq_tags *tags,
 			users);
 }
 
+void __blk_mq_driver_tag_busy(struct blk_mq_hw_ctx *hctx)
+{
+	struct blk_mq_tags *tags = hctx->tags;
+
+	/*
+	 * calling test_bit() prior to test_and_set_bit() is intentional,
+	 * it avoids dirtying the cacheline if the queue is already active.
+	 */
+	if (blk_mq_is_shared_tags(hctx->flags)) {
+		struct request_queue *q = hctx->queue;
+
+		if (test_bit(QUEUE_FLAG_HCTX_BUSY, &q->queue_flags) ||
+		    test_and_set_bit(QUEUE_FLAG_HCTX_BUSY, &q->queue_flags))
+			return;
+	} else {
+		if (test_bit(BLK_MQ_S_DTAG_BUSY, &hctx->state) ||
+		    test_and_set_bit(BLK_MQ_S_DTAG_BUSY, &hctx->state))
+			return;
+	}
+
+	spin_lock_irq(&tags->lock);
+	WRITE_ONCE(tags->ctl.share_queues, tags->ctl.active_queues);
+	blk_mq_update_wake_batch(tags, tags->ctl.share_queues);
+	spin_unlock_irq(&tags->lock);
+}
+
 /*
  * If a previously inactive queue goes active, bump the active user count.
  * We need to do this before try to allocate driver tag, then even if fail
@@ -37,7 +63,6 @@ static void blk_mq_update_wake_batch(struct blk_mq_tags *tags,
  */
 void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
 {
-	unsigned int users;
 	struct blk_mq_tags *tags = hctx->tags;
 
 	/*
@@ -57,9 +82,7 @@ void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
 	}
 
 	spin_lock_irq(&tags->lock);
-	users = tags->ctl.active_queues + 1;
-	WRITE_ONCE(tags->ctl.active_queues, users);
-	blk_mq_update_wake_batch(tags, users);
+	WRITE_ONCE(tags->ctl.active_queues, tags->ctl.active_queues + 1);
 	spin_unlock_irq(&tags->lock);
 }
 
@@ -73,6 +96,14 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
 		sbitmap_queue_wake_all(&tags->breserved_tags);
 }
 
+static void __blk_mq_driver_tag_idle(struct blk_mq_hw_ctx *hctx)
+{
+	if (blk_mq_is_shared_tags(hctx->flags))
+		clear_bit(QUEUE_FLAG_HCTX_BUSY, &hctx->queue->queue_flags);
+	else
+		clear_bit(BLK_MQ_S_DTAG_BUSY, &hctx->state);
+}
+
 /*
  * If a previously busy queue goes inactive, potential waiters could now
  * be allowed to queue. Wake them up and check.
  */
@@ -80,7 +111,6 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
 void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 {
 	struct blk_mq_tags *tags = hctx->tags;
-	unsigned int users;
 
 	if (blk_mq_is_shared_tags(hctx->flags)) {
 		struct request_queue *q = hctx->queue;
@@ -94,9 +124,10 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 	}
 
 	spin_lock_irq(&tags->lock);
-	users = tags->ctl.active_queues - 1;
-	WRITE_ONCE(tags->ctl.active_queues, users);
-	blk_mq_update_wake_batch(tags, users);
+	__blk_mq_driver_tag_idle(hctx);
+	WRITE_ONCE(tags->ctl.active_queues, tags->ctl.active_queues - 1);
+	WRITE_ONCE(tags->ctl.share_queues, tags->ctl.active_queues);
+	blk_mq_update_wake_batch(tags, tags->ctl.share_queues);
 	spin_unlock_irq(&tags->lock);
 
 	blk_mq_tag_wakeup_all(tags, false);
@@ -105,14 +136,21 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
 			    struct sbitmap_queue *bt)
 {
+	int ret = BLK_MQ_NO_TAG;
+
 	if (!data->q->elevator && !(data->flags & BLK_MQ_REQ_RESERVED) &&
 	    !hctx_may_queue(data->hctx, bt))
-		return BLK_MQ_NO_TAG;
+		goto out;
 
+	/* shallow_depth is only used for elevator */
 	if (data->shallow_depth)
 		return sbitmap_queue_get_shallow(bt, data->shallow_depth);
-	else
-		return __sbitmap_queue_get(bt);
+
+	ret = __sbitmap_queue_get(bt);
+out:
+	if (ret == BLK_MQ_NO_TAG && !(data->rq_flags & RQF_SCHED_TAGS))
+		blk_mq_driver_tag_busy(data->hctx);
+	return ret;
 }
 
 unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index da650a2c4ca1..171ee4ac97ef 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1753,8 +1753,10 @@ static bool __blk_mq_alloc_driver_tag(struct request *rq)
 
 bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq)
 {
-	if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq))
+	if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq)) {
+		blk_mq_driver_tag_busy(hctx);
 		return false;
+	}
 
 	if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
 			!(rq->rq_flags & RQF_MQ_INFLIGHT)) {
diff --git a/block/blk-mq.h b/block/blk-mq.h
index ca1c13127868..01441a5e9910 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -193,8 +193,9 @@ static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
 	return sbq_wait_ptr(bt, &hctx->wait_index);
 }
 
-void __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
-void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
+void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx);
+void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx);
+void __blk_mq_driver_tag_busy(struct blk_mq_hw_ctx *hctx);
 
 static inline void blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
 {
@@ -208,6 +209,12 @@ static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 		__blk_mq_tag_idle(hctx);
 }
 
+static inline void blk_mq_driver_tag_busy(struct blk_mq_hw_ctx *hctx)
+{
+	if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+		__blk_mq_driver_tag_busy(hctx);
+}
+
 static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
 					  unsigned int tag)
 {
@@ -412,7 +419,7 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
 			return true;
 	}
 
-	users = READ_ONCE(hctx->tags->ctl.active_queues);
+	users = READ_ONCE(hctx->tags->ctl.share_queues);
 	if (!users)
 		return true;
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 8d2cd6b9d305..bc3ac22edb07 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -675,10 +675,11 @@ enum {
 	BLK_MQ_S_STOPPED	= 0,
 	BLK_MQ_S_TAG_ACTIVE	= 1,
-	BLK_MQ_S_SCHED_RESTART	= 2,
+	BLK_MQ_S_DTAG_BUSY	= 2,
+	BLK_MQ_S_SCHED_RESTART	= 3,
 
 	/* hw queue is inactive after all its CPUs become offline */
-	BLK_MQ_S_INACTIVE	= 3,
+	BLK_MQ_S_INACTIVE	= 4,
 
 	BLK_MQ_MAX_DEPTH	= 10240,
 
@@ -735,6 +736,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 
 struct tag_sharing_ctl {
 	unsigned int active_queues;
+	unsigned int share_queues;
 };
 
 /*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ed44a997f629..0994707f6a68 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -546,6 +546,7 @@ struct request_queue {
 #define QUEUE_FLAG_DAX		19	/* device supports DAX */
 #define QUEUE_FLAG_STATS	20	/* track IO start and completion times */
 #define QUEUE_FLAG_REGISTERED	22	/* queue has been registered to a disk */
+#define QUEUE_FLAG_HCTX_BUSY	23	/* at least one blk-mq hctx failed to get driver tag */
 #define QUEUE_FLAG_QUIESCED	24	/* queue has been quiesced */
 #define QUEUE_FLAG_PCI_P2PDMA	25	/* device supports PCI p2p requests */
 #define QUEUE_FLAG_ZONE_RESETALL 26	/* supports Zone Reset All */
-- 
2.39.2