On Thu, Jan 13, 2022 at 10:55:36AM +0800, Laibin Qiu wrote:
> In case of shared tags, there might be more than one hctx which
> allocates from the same tags, and each hctx is limited to allocate at
> most:
>
>         hctx_max_depth = max((bt->sb.depth + users - 1) / users, 4U);
>
> Tag idle detection is lazy and may be delayed for 30 seconds, so there
> could be just one really active hctx (queue) while all the others are
> actually idle, yet still accounted as active because of the lazy idle
> detection. Then, if wake_batch is greater than hctx_max_depth, driver
> tag allocation may wait forever on this really active hctx.
>
> Fix this by recalculating wake_batch whenever active_queues is
> incremented or decremented.

FWIW,
Reviewed-by: Andy Shevchenko <andriy.shevchenko@xxxxxxxxxxxxxxx>

Thanks!
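To make the failure mode concrete, here is a worked example (the numbers
are purely illustrative, not taken from a real report): suppose a shared
tag set of depth 64 with 32 queues still accounted as active. Each hctx
may then allocate at most

        hctx_max_depth = max((64 + 32 - 1) / 32, 4U) = 4

tags, while the wake batch is still derived from the whole depth,
roughly (ignoring the shallow-depth adjustment in sbq_calc_wake_batch()):

        wake_batch = clamp(64 / SBQ_WAIT_QUEUES, 1, SBQ_WAKE_BATCH)
                   = clamp(64 / 8, 1, 8) = 8

The one really active hctx can hold at most 4 tags at a time, fewer than
the 8 completions needed to trigger a wakeup, so waiters may sleep
forever.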
> Fixes: 0d2602ca30e41 ("blk-mq: improve support for shared tags maps")
> Suggested-by: Ming Lei <ming.lei@xxxxxxxxxx>
> Suggested-by: John Garry <john.garry@xxxxxxxxxx>
> Signed-off-by: Laibin Qiu <qiulaibin@xxxxxxxxxx>
> ---
>  block/blk-mq-tag.c      | 40 +++++++++++++++++++++++++++++++++-------
>  include/linux/sbitmap.h | 11 +++++++++++
>  lib/sbitmap.c           | 25 ++++++++++++++++++++++---
>  3 files changed, 66 insertions(+), 10 deletions(-)
>
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index e55a6834c9a6..845f74e8dd7b 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -16,6 +16,21 @@
>  #include "blk-mq-sched.h"
>  #include "blk-mq-tag.h"
>
> +/*
> + * Recalculate wakeup batch when tag is shared by hctx.
> + */
> +static void blk_mq_update_wake_batch(struct blk_mq_tags *tags,
> +				     unsigned int users)
> +{
> +	if (!users)
> +		return;
> +
> +	sbitmap_queue_recalculate_wake_batch(&tags->bitmap_tags,
> +			users);
> +	sbitmap_queue_recalculate_wake_batch(&tags->breserved_tags,
> +			users);
> +}
> +
>  /*
>   * If a previously inactive queue goes active, bump the active user count.
>   * We need to do this before try to allocate driver tag, then even if fail
> @@ -24,18 +39,26 @@
>   */
>  bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
>  {
> +	unsigned int users;
> +
>  	if (blk_mq_is_shared_tags(hctx->flags)) {
>  		struct request_queue *q = hctx->queue;
>
> -		if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) &&
> -		    !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
> -			atomic_inc(&hctx->tags->active_queues);
> +		if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) ||
> +		    test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) {
> +			return true;
> +		}
>  	} else {
> -		if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
> -		    !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
> -			atomic_inc(&hctx->tags->active_queues);
> +		if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) ||
> +		    test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) {
> +			return true;
> +		}
>  	}
>
> +	users = atomic_inc_return(&hctx->tags->active_queues);
> +
> +	blk_mq_update_wake_batch(hctx->tags, users);
> +
>  	return true;
>  }
>
> @@ -56,6 +79,7 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
>  void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
>  {
>  	struct blk_mq_tags *tags = hctx->tags;
> +	unsigned int users;
>
>  	if (blk_mq_is_shared_tags(hctx->flags)) {
>  		struct request_queue *q = hctx->queue;
> @@ -68,7 +92,9 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
>  		return;
>  	}
>
> -	atomic_dec(&tags->active_queues);
> +	users = atomic_dec_return(&tags->active_queues);
> +
> +	blk_mq_update_wake_batch(tags, users);
>
>  	blk_mq_tag_wakeup_all(tags, false);
>  }
> diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
> index fc0357a6e19b..95df357ec009 100644
> --- a/include/linux/sbitmap.h
> +++ b/include/linux/sbitmap.h
> @@ -415,6 +415,17 @@ static inline void sbitmap_queue_free(struct sbitmap_queue *sbq)
>  	sbitmap_free(&sbq->sb);
>  }
>
> +/**
> + * sbitmap_queue_recalculate_wake_batch() - Recalculate wake batch
> + * @sbq: Bitmap queue to recalculate wake batch.
> + * @users: Number of shares.
> + *
> + * Like sbitmap_queue_update_wake_batch(), this will calculate wake batch
> + * by depth. This interface is for HCTX shared tags or queue shared tags.
> + */
> +void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq,
> +					    unsigned int users);
> +
>  /**
>   * sbitmap_queue_resize() - Resize a &struct sbitmap_queue.
>   * @sbq: Bitmap queue to resize.
> diff --git a/lib/sbitmap.c b/lib/sbitmap.c
> index 2709ab825499..6220fa67fb7e 100644
> --- a/lib/sbitmap.c
> +++ b/lib/sbitmap.c
> @@ -457,10 +457,9 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
>  }
>  EXPORT_SYMBOL_GPL(sbitmap_queue_init_node);
>
> -static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
> -					    unsigned int depth)
> +static inline void __sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
> +					    unsigned int wake_batch)
>  {
> -	unsigned int wake_batch = sbq_calc_wake_batch(sbq, depth);
>  	int i;
>
>  	if (sbq->wake_batch != wake_batch) {
> @@ -476,6 +475,26 @@ static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
>  	}
>  }
>
> +static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
> +					    unsigned int depth)
> +{
> +	unsigned int wake_batch;
> +
> +	wake_batch = sbq_calc_wake_batch(sbq, depth);
> +	__sbitmap_queue_update_wake_batch(sbq, wake_batch);
> +}
> +
> +void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq,
> +					    unsigned int users)
> +{
> +	unsigned int wake_batch;
> +
> +	wake_batch = clamp_val((sbq->sb.depth + users - 1) /
> +			users, 4, SBQ_WAKE_BATCH);
> +	__sbitmap_queue_update_wake_batch(sbq, wake_batch);
> +}
> +EXPORT_SYMBOL_GPL(sbitmap_queue_recalculate_wake_batch);
> +
>  void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth)
>  {
>  	sbitmap_queue_update_wake_batch(sbq, depth);
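With the same illustrative numbers as above (depth 64, 32 accounted-active
users), the new sbitmap_queue_recalculate_wake_batch() would give

        wake_batch = clamp_val((64 + 32 - 1) / 32, 4, SBQ_WAKE_BATCH)
                   = clamp_val(2, 4, 8) = 4

so wake_batch can no longer exceed the per-hctx allocation limit computed
from the same ceiling division, and the remaining active hctx can always
complete enough requests to reach the batch.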
> --
> 2.22.0

--
With Best Regards,
Andy Shevchenko