On Wed, Apr 28, 2021 at 10:30:13AM -0400, David Jeffery wrote:
> On Tue, Apr 27, 2021 at 11:10:58PM +0800, Ming Lei wrote:
> >
> > refcount_inc_not_zero() in bt_tags_iter() still may read one freed
> > request.
> >
> > Fix the issue by the following approach:
> >
> > 1) hold a per-tags spinlock when reading ->rqs[tag] and calling
> > refcount_inc_not_zero in bt_tags_iter()
> >
>
> This method of closing the race still in my original patch is very nice.
> It's a great improvement.
>
> > 2) clearing stale request referred via ->rqs[tag] before freeing
> > request pool, the per-tags spinlock is held for clearing stale
> > ->rqs[tag]
> >
> > So after we cleared stale requests, bt_tags_iter() won't observe
> > freed request any more, also the clearing will wait for pending
> > request reference.
> >
> > The idea of clearing ->rqs[] is borrowed from John Garry's previous
> > patch and one recent David's patch.
> >
>
> However, when you took my original cmpxchg patch and merged my separate
> function to do the cmpxchg cleaning into blk_mq_clear_rq_mapping, you
> missed why it was a separate function. Your patch will clean out the
> static_rqs requests which are being freed, but it doesn't clean out the
> special flush request that gets allocated individually by a
> request_queue. The flush request can be put directly into the rqs[]
> array so it also needs to be cleaned when a request_queue is being
> torn down. This was the second caller of my separated cleaning function.

That case can be covered with just one call to the cleaning function; see
the following patch:

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index e1e997af89a0..fb9eeb03d6a0 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -500,7 +500,8 @@ static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
 	unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
 
 	if (hctx->sched_tags) {
-		blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
+		blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx,
+				hctx->fq->flush_rq);
 		blk_mq_free_rq_map(hctx->sched_tags, flags);
 		hctx->sched_tags = NULL;
 	}
@@ -611,7 +612,8 @@ void blk_mq_sched_free_requests(struct request_queue *q)
 
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (hctx->sched_tags)
-			blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
+			blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i,
+					hctx->fq->flush_rq);
 	}
 }
 
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 9329b94a9743..757e0e652ce7 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -277,9 +277,15 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 	if (iter_static_rqs)
 		rq = tags->static_rqs[bitnr];
 	else {
+		unsigned long flags;
+
+		spin_lock_irqsave(&tags->lock, flags);
 		rq = tags->rqs[bitnr];
-		if (!rq || !refcount_inc_not_zero(&rq->ref))
+		if (!rq || !refcount_inc_not_zero(&rq->ref)) {
+			spin_unlock_irqrestore(&tags->lock, flags);
 			return true;
+		}
+		spin_unlock_irqrestore(&tags->lock, flags);
 	}
 	if ((iter_data->flags & BT_TAG_ITER_STARTED) &&
 	    !blk_mq_request_started(rq))
@@ -526,6 +532,7 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
 
 	tags->nr_tags = total_tags;
 	tags->nr_reserved_tags = reserved_tags;
+	spin_lock_init(&tags->lock);
 
 	if (blk_mq_is_sbitmap_shared(flags))
 		return tags;
@@ -586,7 +593,8 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
 			return -ENOMEM;
 		}
 
-		blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);
+		blk_mq_free_rqs(set, *tagsptr, hctx->queue_num,
+				hctx->fq->flush_rq);
 		blk_mq_free_rq_map(*tagsptr, flags);
 		*tagsptr = new;
 	} else {
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 7d3e6b333a4a..f942a601b5ef 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -20,6 +20,9 @@ struct blk_mq_tags {
 	struct request **rqs;
 	struct request **static_rqs;
 	struct list_head page_list;
+
+	/* used to clear rqs[] before one request pool is freed */
+	spinlock_t lock;
 };
 
 extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4bd6c11bd8bc..686f6b210114 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2291,8 +2291,42 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio)
 	return BLK_QC_T_NONE;
 }
 
+static size_t order_to_size(unsigned int order)
+{
+	return (size_t)PAGE_SIZE << order;
+}
+
+/* called before freeing request pool in @tags */
+static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set,
+		struct blk_mq_tags *tags, unsigned int hctx_idx,
+		struct request *flush_rq)
+{
+	struct blk_mq_tags *drv_tags = set->tags[hctx_idx];
+	struct page *page;
+	unsigned long flags;
+
+	spin_lock_irqsave(&drv_tags->lock, flags);
+	list_for_each_entry(page, &tags->page_list, lru) {
+		unsigned long start = (unsigned long)page_address(page);
+		unsigned long end = start + order_to_size(page->private);
+		int i;
+
+		for (i = 0; i < set->queue_depth; i++) {
+			struct request *rq = drv_tags->rqs[i];
+			unsigned long rq_addr = (unsigned long)rq;
+
+			if ((rq_addr >= start && rq_addr < end) ||
+			    rq == flush_rq) {
+				WARN_ON_ONCE(refcount_read(&rq->ref) != 0);
+				cmpxchg(&drv_tags->rqs[i], rq, NULL);
+			}
+		}
+	}
+	spin_unlock_irqrestore(&drv_tags->lock, flags);
+}
+
 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
-		     unsigned int hctx_idx)
+		     unsigned int hctx_idx, struct request *flush_rq)
 {
 	struct page *page;
 
@@ -2309,6 +2343,8 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 		}
 	}
 
+	blk_mq_clear_rq_mapping(set, tags, hctx_idx, flush_rq);
+
 	while (!list_empty(&tags->page_list)) {
 		page = list_first_entry(&tags->page_list, struct page, lru);
 		list_del_init(&page->lru);
@@ -2368,11 +2404,6 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
 	return tags;
 }
 
-static size_t order_to_size(unsigned int order)
-{
-	return (size_t)PAGE_SIZE << order;
-}
-
 static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
 			       unsigned int hctx_idx, int node)
 {
@@ -2461,7 +2492,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 	return 0;
 
 fail:
-	blk_mq_free_rqs(set, tags, hctx_idx);
+	blk_mq_free_rqs(set, tags, hctx_idx, NULL);
 	return -ENOMEM;
 }
 
@@ -2794,7 +2825,7 @@ static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
 	unsigned int flags = set->flags;
 
 	if (set->tags && set->tags[hctx_idx]) {
-		blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
+		blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx, NULL);
 		blk_mq_free_rq_map(set->tags[hctx_idx], flags);
 		set->tags[hctx_idx] = NULL;
 	}
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 143afe42c63a..08ef96c181f6 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -53,7 +53,7 @@ void blk_mq_put_rq_ref(struct request *rq);
  * Internal helpers for allocating/freeing the request map
  */
 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
-		     unsigned int hctx_idx);
+		     unsigned int hctx_idx, struct request *flush_rq);
 void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags);
 struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
 					unsigned int hctx_idx,


-- 
Ming