Once pending_queues has been increased, it is only decreased again when
nr_active reaches zero. This leads to under-utilization of the host
tags: while pending_queues is non-zero, the tags available to a queue
are capped at max(host tags / active_queues, 4) rather than the number
of tags the queue actually needs.

Fix this by recording an expiration time when pending_queues is
increased and decreasing it once that time has passed. If no further
tag allocation failures occur, pending_queues decays back to zero and
the queue can once again use the whole host tag space.

Signed-off-by: Hou Tao <houtao1@xxxxxxxxxx>
Signed-off-by: Yu Kuai <yukuai3@xxxxxxxxxx>
---
 block/blk-mq-tag.c     | 29 ++++++++++++++++++++++++++---
 block/blk-mq-tag.h     |  6 +++---
 block/blk-mq.c         |  5 +++--
 block/blk-mq.h         |  2 +-
 include/linux/blk-mq.h |  6 +++++-
 5 files changed, 38 insertions(+), 10 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 6dcd6dd9123a..e3ffe0fde052 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -46,12 +46,16 @@ void __blk_mq_dtag_busy(struct blk_mq_hw_ctx *hctx)
 		struct blk_mq_tag_set *set = q->tag_set;
 
 		if (!test_bit(QUEUE_FLAG_HCTX_WAIT, &q->queue_flags) &&
-		    !test_and_set_bit(QUEUE_FLAG_HCTX_WAIT, &q->queue_flags))
+		    !test_and_set_bit(QUEUE_FLAG_HCTX_WAIT, &q->queue_flags)) {
+			hctx->dtag_wait_time = jiffies;
 			atomic_inc(&set->pending_queues_shared_sbitmap);
+		}
 	} else {
 		if (!test_bit(BLK_MQ_S_DTAG_WAIT, &hctx->state) &&
-		    !test_and_set_bit(BLK_MQ_S_DTAG_WAIT, &hctx->state))
+		    !test_and_set_bit(BLK_MQ_S_DTAG_WAIT, &hctx->state)) {
+			hctx->dtag_wait_time = jiffies;
 			atomic_inc(&hctx->tags->pending_queues);
+		}
 	}
 }
 
@@ -89,12 +93,28 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 	blk_mq_tag_wakeup_all(tags, false);
 }
 
-void __blk_mq_dtag_idle(struct blk_mq_hw_ctx *hctx)
+#define BLK_MQ_DTAG_WAIT_EXPIRE (5 * HZ)
+
+void __blk_mq_dtag_idle(struct blk_mq_hw_ctx *hctx, bool force)
 {
 	struct blk_mq_tags *tags = hctx->tags;
 	struct request_queue *q = hctx->queue;
 	struct blk_mq_tag_set *set = q->tag_set;
 
+	if (!force) {
+		if (blk_mq_is_sbitmap_shared(hctx->flags)) {
+			if (!(test_bit(QUEUE_FLAG_HCTX_WAIT, &q->queue_flags) &&
+			      time_after(jiffies, hctx->dtag_wait_time +
+					 BLK_MQ_DTAG_WAIT_EXPIRE)))
+				return;
+		} else {
+			if (!(test_bit(BLK_MQ_S_DTAG_WAIT, &hctx->state) &&
+			      time_after(jiffies, hctx->dtag_wait_time +
+					 BLK_MQ_DTAG_WAIT_EXPIRE)))
+				return;
+		}
+	}
+
 	if (blk_mq_is_sbitmap_shared(hctx->flags) &&
 	    test_and_clear_bit(QUEUE_FLAG_HCTX_WAIT, &q->queue_flags))
 		atomic_dec(&set->pending_queues_shared_sbitmap);
@@ -202,6 +222,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	sbitmap_finish_wait(bt, ws, &wait);
 
 found_tag:
+	if (!data->q->elevator)
+		blk_mq_dtag_idle(data->hctx, false);
+
 	/*
 	 * Give up this allocation if the hctx is inactive. The caller will
 	 * retry on an active hctx.
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 52d08a92f683..888692498ef2 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -68,7 +68,7 @@ enum {
 extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
 extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
 extern void __blk_mq_dtag_busy(struct blk_mq_hw_ctx *);
-extern void __blk_mq_dtag_idle(struct blk_mq_hw_ctx *);
+extern void __blk_mq_dtag_idle(struct blk_mq_hw_ctx *, bool);
 
 static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
 {
@@ -94,12 +94,12 @@ static inline void blk_mq_dtag_busy(struct blk_mq_hw_ctx *hctx)
 	__blk_mq_dtag_busy(hctx);
 }
 
-static inline void blk_mq_dtag_idle(struct blk_mq_hw_ctx *hctx)
+static inline void blk_mq_dtag_idle(struct blk_mq_hw_ctx *hctx, bool force)
 {
 	if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
 		return;
 
-	__blk_mq_dtag_idle(hctx);
+	__blk_mq_dtag_idle(hctx, force);
 }
 
 static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2b8fa49bccb4..9ac976107154 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1015,7 +1015,7 @@ static void blk_mq_timeout_work(struct work_struct *work)
 		/* the hctx may be unmapped, so check it here */
 		if (blk_mq_hw_queue_mapped(hctx)) {
 			blk_mq_tag_idle(hctx);
-			blk_mq_dtag_idle(hctx);
+			blk_mq_dtag_idle(hctx, true);
 		}
 	}
 }
@@ -2568,7 +2568,7 @@ static void blk_mq_exit_hctx(struct request_queue *q,
 {
 	if (blk_mq_hw_queue_mapped(hctx)) {
 		blk_mq_tag_idle(hctx);
-		blk_mq_dtag_idle(hctx);
+		blk_mq_dtag_idle(hctx, true);
 	}
 
 	if (set->ops->exit_request)
@@ -2667,6 +2667,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
 	INIT_LIST_HEAD(&hctx->dispatch);
 	hctx->queue = q;
 	hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED;
+	hctx->dtag_wait_time = jiffies;
 
 	INIT_LIST_HEAD(&hctx->hctx_list);
 
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 228c5c442be4..93ede498f5e6 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -214,7 +214,7 @@ static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx)
 		atomic_dec(&hctx->queue->nr_active_requests_shared_sbitmap);
 	else if (!atomic_dec_return(&hctx->nr_active)) {
 		blk_mq_tag_idle(hctx);
-		blk_mq_dtag_idle(hctx);
+		blk_mq_dtag_idle(hctx, true);
 	}
 }
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2a473865ee7f..82591c2f76cc 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -158,7 +158,11 @@ struct blk_mq_hw_ctx {
 	unsigned long poll_invoked;
 	/** @poll_success: Count how many polled requests were completed. */
 	unsigned long poll_success;
-
+	/**
+	 * @dtag_wait_time: when the hardware queue went pending, i.e. when
+	 * BLK_MQ_S_DTAG_WAIT was set in @state.
+	 */
+	unsigned long dtag_wait_time;
 #ifdef CONFIG_BLK_DEBUG_FS
 	/**
 	 * @debugfs_dir: debugfs directory for this hardware queue. Named
-- 
2.25.4
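
As an illustration of the under-utilization described in the commit
message: below is a standalone userspace sketch (not kernel code; the
function and its names are assumptions for illustration, not the
kernel's actual symbols) of the max(host tags / active_queues, 4) cap
that applies while pending_queues is non-zero.

#include <stdio.h>

static unsigned int shared_tag_depth(unsigned int host_tags,
				     unsigned int active_queues,
				     unsigned int pending_queues)
{
	unsigned int depth;

	/* No queue marked pending: the whole host tag space is usable. */
	if (!pending_queues || !active_queues)
		return host_tags;

	/* max(host tags / active_queues, 4), rounding the share up */
	depth = (host_tags + active_queues - 1) / active_queues;
	return depth < 4 ? 4 : depth;
}

int main(void)
{
	/* Stuck non-zero pending_queues: each of 2 queues capped at 128. */
	printf("capped: %u\n", shared_tag_depth(256, 2, 1));

	/* pending_queues decayed back to zero: full 256 host tags again. */
	printf("full:   %u\n", shared_tag_depth(256, 2, 0));
	return 0;
}

Compiled with a plain C compiler this prints 128 and then 256: the
queue only regains the full tag space once pending_queues has dropped
back to zero, which is what the expiration added by this patch makes
happen in the absence of further tag allocation failures.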