Re: [PATCH RFC V2 3/5] blk-mq: Facilitate a shared sbitmap per tagset

Ming Lei <ming.lei@xxxxxxxxxx> · Mon, 25 Nov 2019 11:00:39 +0800

On Thu, Nov 21, 2019 at 10:24:16AM +0000, John Garry wrote:
> > >   int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
> > >   {
> > > +	struct blk_mq_tag_set *tag_set = q->tag_set;
> > >   	struct blk_mq_hw_ctx *hctx;
> > >   	struct elevator_queue *eq;
> > >   	unsigned int i;
> > > @@ -537,6 +538,19 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
> > >   		blk_mq_debugfs_register_sched_hctx(q, hctx);
> > >   	}
> > > +	if (blk_mq_is_sbitmap_shared(tag_set)) {
> > > +		if (!blk_mq_init_sched_shared_sbitmap(tag_set, q->nr_requests)) {
> > > +			ret = -ENOMEM;
> > > +			goto err;
> > > +		}
> > > +		queue_for_each_hw_ctx(q, hctx, i) {
> > > +			struct blk_mq_tags *tags = hctx->sched_tags;
> > > +
> > > +			tags->pbitmap_tags = &tag_set->sched_shared_bitmap_tags;
> > > +			tags->pbreserved_tags = &tag_set->sched_shared_breserved_tags;
> > 
> > This kind of sharing is wrong, sched tags should be request queue wide
> > instead of tagset wide, and each request queue has its own & independent
> > scheduler queue.
> 
> Right, so if we get get a scheduler tag we still need to get a driver tag,
> and this would be the "shared" tag.
> 
> That makes things simpler then.
> 
> > 
> > > +		}
> > > +	}
> > > +
> > >   	return 0;
> > >   err:
> > > diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> > > index 42792942b428..6625bebb46c3 100644
> > > --- a/block/blk-mq-tag.c
> > > +++ b/block/blk-mq-tag.c
> > > @@ -35,9 +35,9 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
> > >    */
> > >   void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
> > >   {
> > > -	sbitmap_queue_wake_all(&tags->bitmap_tags);
> > > +	sbitmap_queue_wake_all(tags->pbitmap_tags);
> > >   	if (include_reserve)
> > > -		sbitmap_queue_wake_all(&tags->breserved_tags);
> > > +		sbitmap_queue_wake_all(tags->pbreserved_tags);
> > >   }
> 
> [...]
> 
> 
> > >   	mutex_init(&set->tag_list_lock);
> > >   	INIT_LIST_HEAD(&set->tag_list);
> > > @@ -3137,6 +3151,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
> > >   {
> > >   	struct blk_mq_tag_set *set = q->tag_set;
> > >   	struct blk_mq_hw_ctx *hctx;
> > > +	bool sched_tags = false;
> > >   	int i, ret;
> > >   	if (!set)
> > > @@ -3160,6 +3175,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
> > >   			ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
> > >   							false);
> > >   		} else {
> > > +			sched_tags = true;
> > >   			ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
> > >   							nr, true);
> > >   		}
> > > @@ -3169,8 +3185,41 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
> > >   			q->elevator->type->ops.depth_updated(hctx);
> > >   	}
> > > -	if (!ret)
> > > +	/*
> > > +	 * if ret is 0, all queues should have been updated to the same depth
> > > +	 * if not, then maybe some have been updated - yuk, need to handle this for shared sbitmap...
> > > +	 * if some are updated, we should probably roll back the change altogether. FIXME
> > > +	 */
> > > +	if (!ret) {
> > > +		if (blk_mq_is_sbitmap_shared(set)) {
> > > +			if (sched_tags) {
> > > +				sbitmap_queue_free(&set->sched_shared_bitmap_tags);
> > > +				sbitmap_queue_free(&set->sched_shared_breserved_tags);
> > > +				if (!blk_mq_init_sched_shared_sbitmap(set, nr))
> > > +					return -ENOMEM; /* fixup error handling */
> > > +
> > > +				queue_for_each_hw_ctx(q, hctx, i) {
> > > +					hctx->sched_tags->pbitmap_tags = &set->sched_shared_bitmap_tags;
> > > +					hctx->sched_tags->pbreserved_tags = &set->sched_shared_breserved_tags;
> > > +				}
> > > +			} else {
> > > +				sbitmap_queue_free(&set->shared_bitmap_tags);
> > > +				sbitmap_queue_free(&set->shared_breserved_tags);
> > > +				if (!blk_mq_init_shared_sbitmap(set))
> > > +					return -ENOMEM; /* fixup error handling */
> > 
> > No, we can't re-allocate driver tags here which are shared by all LUNs. > And you should see that 'can_grow' is set as false for driver tags
> > in blk_mq_update_nr_requests(), which can only touch per-request-queue
> > data, not tagset wide data.
> 
> Yeah, I see that. We should just resize for driver tags bitmap.
> 
> Personally I think the mainline code is a little loose here, as if we could
> grow driver tags, then blk_mq_tagset.tags would be out-of-sync with the
> hctx->tags. Maybe that should be made more explicit in the code.
> 
> BTW, do you have anything to say about this (modified slightly) comment:
> 
> /*
>  * if ret != 0, q->nr_requests would not be updated, yet the depth
>  * for some hctx sched tags may have changed - is that the right thing
>  * to do?
>  */

In theory, your concern is right, but so far we only support same
depth of hctx for either sched tags or driver tags, so not an issue
so far.

Thanks,
Ming