On 12/20/18 2:40 PM, Bart Van Assche wrote:
> On Thu, 2018-12-20 at 14:34 -0700, Jens Axboe wrote:
>> Yeah, I don't think it's bullet proof either, it just closes the gap.
>> I'm fine with fiddling with the tag iteration. On top of what I sent, we
>> could have tag iteration hold the RCU read lock, and then we just need
>> to ensure that the tags are freed with RCU.
>
> Do you mean using call_rcu() to free tags? Would that require to add a
> struct rcu_head to every request? Would it be acceptable to increase the
> size of struct request with an rcu_head? Additionally, could that reduce
> the queue depth if the time between grace periods is larger than the time
> between I/O submissions?

No no, the requests are allocated out of full pages, so we just need one
rcu_head per blk_mq_tags, not one per request. Something like the below...
The tag iteration then needs to grab the RCU read lock to prevent the page
freeing from happening underneath it. This should essentially be free, as
the RCU read lock is basically a no-op for tag iteration.

Need to handle the flush request on top of this as well.

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 2089c6c62f44..c39b58391ae8 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -228,13 +228,15 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 
 	if (!reserved)
 		bitnr += tags->nr_reserved_tags;
-	rq = tags->rqs[bitnr];
+	if (tags->rqs[bitnr].queue != hctx->queue)
+		return true;
 
 	/*
 	 * We can hit rq == NULL here, because the tagging functions
 	 * test and set the bit before assigning ->rqs[].
 	 */
-	if (rq && rq->q == hctx->queue)
+	rq = tags->rqs[bitnr].rq;
+	if (rq)
 		return iter_data->fn(hctx, rq, iter_data->data, reserved);
 	return true;
 }
@@ -263,7 +265,9 @@ static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt,
 		.reserved = reserved,
 	};
 
+	rcu_read_lock();
 	sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data);
+	rcu_read_unlock();
 }
 
 struct bt_tags_iter_data {
@@ -287,7 +291,7 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 	 * We can hit rq == NULL here, because the tagging functions
 	 * test and set the bit before assining ->rqs[].
 	 */
-	rq = tags->rqs[bitnr];
+	rq = tags->rqs[bitnr].rq;
 	if (rq && blk_mq_request_started(rq))
 		return iter_data->fn(rq, iter_data->data, reserved);
 
@@ -317,8 +321,11 @@ static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
 		.reserved = reserved,
 	};
 
-	if (tags->rqs)
+	if (tags->rqs) {
+		rcu_read_lock();
 		sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data);
+		rcu_read_unlock();
+	}
 }
 
 /**
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 61deab0b5a5a..bdd1bfc08c21 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -4,6 +4,11 @@
 
 #include "blk-mq.h"
 
+struct rq_tag_entry {
+	struct request_queue *queue;
+	struct request *rq;
+};
+
 /*
  * Tag address space map.
  */
@@ -16,9 +21,11 @@ struct blk_mq_tags {
 	struct sbitmap_queue bitmap_tags;
 	struct sbitmap_queue breserved_tags;
 
-	struct request **rqs;
+	struct rq_tag_entry *rqs;
 	struct request **static_rqs;
 	struct list_head page_list;
+
+	struct rcu_work rcu_work;
 };
 
 
@@ -78,7 +85,8 @@ static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 static inline void blk_mq_tag_set_rq(struct blk_mq_hw_ctx *hctx,
 		unsigned int tag, struct request *rq)
 {
-	hctx->tags->rqs[tag] = rq;
+	hctx->tags->rqs[tag].queue = hctx->queue;
+	hctx->tags->rqs[tag].rq = rq;
 }
 
 static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2de972857496..4decd1e7d2d9 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -298,13 +298,16 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 		rq->tag = -1;
 		rq->internal_tag = tag;
 	} else {
-		if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
+		struct blk_mq_hw_ctx *hctx = data->hctx;
+
+		if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
 			rq_flags = RQF_MQ_INFLIGHT;
-			atomic_inc(&data->hctx->nr_active);
+			atomic_inc(&hctx->nr_active);
 		}
 		rq->tag = tag;
 		rq->internal_tag = -1;
-		data->hctx->tags->rqs[rq->tag] = rq;
+		hctx->tags->rqs[rq->tag].queue = hctx->queue;
+		hctx->tags->rqs[rq->tag].rq = rq;
 	}
 
 	/* csd/requeue_work/fifo_time is initialized before use */
@@ -797,8 +800,8 @@ EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 {
 	if (tag < tags->nr_tags) {
-		prefetch(tags->rqs[tag]);
-		return tags->rqs[tag];
+		prefetch(tags->rqs[tag].rq);
+		return tags->rqs[tag].rq;
 	}
 
 	return NULL;
@@ -809,10 +812,11 @@ static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
 			       void *priv, bool reserved)
 {
 	/*
-	 * If we find a request that is inflight and the queue matches,
-	 * we know the queue is busy. Return false to stop the iteration.
+	 * We're only called here if the queue matches. If the rq state is
+	 * inflight, we know the queue is busy. Return false to stop the
+	 * iteration.
 	 */
-	if (rq->state == MQ_RQ_IN_FLIGHT && rq->q == hctx->queue) {
+	if (rq->state == MQ_RQ_IN_FLIGHT) {
 		bool *busy = priv;
 
 		*busy = true;
@@ -1049,11 +1053,14 @@ bool blk_mq_get_driver_tag(struct request *rq)
 	shared = blk_mq_tag_busy(data.hctx);
 	rq->tag = blk_mq_get_tag(&data);
 	if (rq->tag >= 0) {
+		struct blk_mq_hw_ctx *hctx = data.hctx;
+
 		if (shared) {
 			rq->rq_flags |= RQF_MQ_INFLIGHT;
 			atomic_inc(&data.hctx->nr_active);
 		}
-		data.hctx->tags->rqs[rq->tag] = rq;
+		hctx->tags->rqs[rq->tag].queue = hctx->queue;
+		hctx->tags->rqs[rq->tag].rq = rq;
 	}
 
 done:
@@ -2020,11 +2027,27 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	return cookie;
 }
 
-void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
-		     unsigned int hctx_idx)
+static void blk_mq_rcu_free_pages(struct work_struct *work)
 {
+	struct blk_mq_tags *tags = container_of(to_rcu_work(work),
+						struct blk_mq_tags, rcu_work);
 	struct page *page;
 
+	while (!list_empty(&tags->page_list)) {
+		page = list_first_entry(&tags->page_list, struct page, lru);
+		list_del_init(&page->lru);
+		/*
+		 * Remove kmemleak object previously allocated in
+		 * blk_mq_init_rq_map().
+		 */
+		kmemleak_free(page_address(page));
+		__free_pages(page, page->private);
+	}
+}
+
+void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
+		     unsigned int hctx_idx)
+{
 	if (tags->rqs && set->ops->exit_request) {
 		int i;
 
@@ -2038,16 +2061,9 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 		}
 	}
 
-	while (!list_empty(&tags->page_list)) {
-		page = list_first_entry(&tags->page_list, struct page, lru);
-		list_del_init(&page->lru);
-		/*
-		 * Remove kmemleak object previously allocated in
-		 * blk_mq_init_rq_map().
-		 */
-		kmemleak_free(page_address(page));
-		__free_pages(page, page->private);
-	}
+	/* Punt to RCU free, so we don't race with tag iteration */
+	INIT_RCU_WORK(&tags->rcu_work, blk_mq_rcu_free_pages);
+	queue_rcu_work(system_wq, &tags->rcu_work);
 }
 
 void blk_mq_free_rq_map(struct blk_mq_tags *tags)
@@ -2077,7 +2093,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
 	if (!tags)
 		return NULL;
 
-	tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *),
+	tags->rqs = kcalloc_node(nr_tags, sizeof(struct rq_tag_entry),
 				 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
 				 node);
 	if (!tags->rqs) {

-- 
Jens Axboe
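
For reference, below is a minimal, self-contained kernel-module-style sketch
of the queue_rcu_work() pattern the patch above relies on: iteration runs
inside rcu_read_lock()/rcu_read_unlock(), and teardown hands the actual page
freeing to a work item that only executes after an RCU grace period has
elapsed. All names here (demo_tags, demo_*) are made up for illustration and
are not part of the patch itself.

#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <linux/gfp.h>
#include <linux/mm.h>

/* Stand-in for struct blk_mq_tags: a page list plus a single rcu_work. */
struct demo_tags {
	struct list_head page_list;
	struct rcu_work rcu_work;
};

static struct demo_tags demo;

/* Reader side: walk the page-backed data under the RCU read lock. */
static void demo_iterate(struct demo_tags *tags)
{
	struct page *page;

	rcu_read_lock();
	/*
	 * Pages queued for freeing while we sit in this read-side
	 * critical section cannot actually be freed until we leave it,
	 * because the free below is deferred past a full grace period.
	 */
	list_for_each_entry(page, &tags->page_list, lru)
		pr_info("order-%lu page still live\n", page->private);
	rcu_read_unlock();
}

/* Deferred free, mirroring blk_mq_rcu_free_pages() in the patch. */
static void demo_rcu_free_pages(struct work_struct *work)
{
	struct demo_tags *tags = container_of(to_rcu_work(work),
					      struct demo_tags, rcu_work);
	struct page *page;

	while (!list_empty(&tags->page_list)) {
		page = list_first_entry(&tags->page_list, struct page, lru);
		list_del_init(&page->lru);
		__free_pages(page, page->private);
	}
}

static int __init demo_init(void)
{
	struct page *page;

	INIT_LIST_HEAD(&demo.page_list);

	page = alloc_pages(GFP_KERNEL, 0);
	if (!page)
		return -ENOMEM;
	page->private = 0;		/* remember the allocation order */
	list_add_tail(&page->lru, &demo.page_list);

	demo_iterate(&demo);

	/* Punt the free past the next grace period, as the patch does. */
	INIT_RCU_WORK(&demo.rcu_work, demo_rcu_free_pages);
	queue_rcu_work(system_wq, &demo.rcu_work);
	return 0;
}

static void __exit demo_exit(void)
{
	/* Make sure the deferred free has run before the module goes away. */
	flush_rcu_work(&demo.rcu_work);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("queue_rcu_work() page-freeing sketch");

Because the single rcu_work is embedded in the tags structure rather than in
each request, nothing is added to struct request, and the queue-depth concern
raised in the quoted question does not apply.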