On 12/20/18 2:44 PM, Jens Axboe wrote:
> On 12/20/18 2:40 PM, Bart Van Assche wrote:
>> On Thu, 2018-12-20 at 14:34 -0700, Jens Axboe wrote:
>>> Yeah, I don't think it's bullet proof either, it just closes the gap.
>>> I'm fine with fiddling with the tag iteration. On top of what I sent, we
>>> could have tag iteration hold the RCU read lock, and then we just need
>>> to ensure that the tags are freed with RCU.
>>
>> Do you mean using call_rcu() to free tags? Would that require to add a
>> struct rcu_head to every request? Would it be acceptable to increase the
>> size of struct request with an rcu_head? Additionally, could that reduce
>> the queue depth if the time between grace periods is larger than the time
>> between I/O submissions?
>
> No no, the requests are out of full pages, we just need one per blk_mq_tags.
> Something like the below... And then the tag iteration needs to grab the
> RCU read lock to prevent the page freeing from happening.
>
> This should essentially be free, as rcu read lock for tag iteration is
> basically a no-op.
>
> Need to handle the flush request on top of this as well.

With FQ handled as well.

diff --git a/block/blk-flush.c b/block/blk-flush.c
index a3fc7191c694..23e1f5fb091f 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -491,12 +491,22 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
 	return NULL;
 }
 
+static void blk_fq_rcu_free(struct work_struct *work)
+{
+	struct blk_flush_queue *fq = container_of(to_rcu_work(work),
+						  struct blk_flush_queue,
+						  rcu_work);
+
+	kfree(fq->flush_rq);
+	kfree(fq);
+}
+
 void blk_free_flush_queue(struct blk_flush_queue *fq)
 {
 	/* bio based request queue hasn't flush queue */
 	if (!fq)
 		return;
 
-	kfree(fq->flush_rq);
-	kfree(fq);
+	INIT_RCU_WORK(&fq->rcu_work, blk_fq_rcu_free);
+	queue_rcu_work(system_wq, &fq->rcu_work);
 }
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 2089c6c62f44..c39b58391ae8 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -228,13 +228,15 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 
 	if (!reserved)
 		bitnr += tags->nr_reserved_tags;
-	rq = tags->rqs[bitnr];
+	if (tags->rqs[bitnr].queue != hctx->queue)
+		return true;
 
 	/*
 	 * We can hit rq == NULL here, because the tagging functions
 	 * test and set the bit before assigning ->rqs[].
 	 */
-	if (rq && rq->q == hctx->queue)
+	rq = tags->rqs[bitnr].rq;
+	if (rq)
 		return iter_data->fn(hctx, rq, iter_data->data, reserved);
 	return true;
 }
@@ -263,7 +265,9 @@ static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt,
 		.reserved = reserved,
 	};
 
+	rcu_read_lock();
 	sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data);
+	rcu_read_unlock();
 }
 
 struct bt_tags_iter_data {
@@ -287,7 +291,7 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 	 * We can hit rq == NULL here, because the tagging functions
 	 * test and set the bit before assining ->rqs[].
 	 */
-	rq = tags->rqs[bitnr];
+	rq = tags->rqs[bitnr].rq;
 	if (rq && blk_mq_request_started(rq))
 		return iter_data->fn(rq, iter_data->data, reserved);
 
@@ -317,8 +321,11 @@ static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
 		.reserved = reserved,
 	};
 
-	if (tags->rqs)
+	if (tags->rqs) {
+		rcu_read_lock();
 		sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data);
+		rcu_read_unlock();
+	}
 }
 
 /**
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 61deab0b5a5a..bdd1bfc08c21 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -4,6 +4,11 @@
 
 #include "blk-mq.h"
 
+struct rq_tag_entry {
+	struct request_queue *queue;
+	struct request *rq;
+};
+
 /*
  * Tag address space map.
  */
@@ -16,9 +21,11 @@ struct blk_mq_tags {
 	struct sbitmap_queue bitmap_tags;
 	struct sbitmap_queue breserved_tags;
 
-	struct request **rqs;
+	struct rq_tag_entry *rqs;
 	struct request **static_rqs;
 	struct list_head page_list;
+
+	struct rcu_work rcu_work;
 };
@@ -78,7 +85,8 @@ static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 static inline void blk_mq_tag_set_rq(struct blk_mq_hw_ctx *hctx,
 		unsigned int tag, struct request *rq)
 {
-	hctx->tags->rqs[tag] = rq;
+	hctx->tags->rqs[tag].queue = hctx->queue;
+	hctx->tags->rqs[tag].rq = rq;
 }
 
 static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2de972857496..4decd1e7d2d9 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -298,13 +298,16 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 		rq->tag = -1;
 		rq->internal_tag = tag;
 	} else {
-		if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
+		struct blk_mq_hw_ctx *hctx = data->hctx;
+
+		if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
 			rq_flags = RQF_MQ_INFLIGHT;
-			atomic_inc(&data->hctx->nr_active);
+			atomic_inc(&hctx->nr_active);
 		}
 		rq->tag = tag;
 		rq->internal_tag = -1;
-		data->hctx->tags->rqs[rq->tag] = rq;
+		hctx->tags->rqs[rq->tag].queue = hctx->queue;
+		hctx->tags->rqs[rq->tag].rq = rq;
 	}
 
 	/* csd/requeue_work/fifo_time is initialized before use */
@@ -797,8 +800,8 @@ EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 {
 	if (tag < tags->nr_tags) {
-		prefetch(tags->rqs[tag]);
-		return tags->rqs[tag];
+		prefetch(tags->rqs[tag].rq);
+		return tags->rqs[tag].rq;
 	}
 
 	return NULL;
@@ -809,10 +812,11 @@ static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
 			       void *priv, bool reserved)
 {
 	/*
-	 * If we find a request that is inflight and the queue matches,
-	 * we know the queue is busy. Return false to stop the iteration.
+	 * We're only called here if the queue matches. If the rq state is
+	 * inflight, we know the queue is busy. Return false to stop the
+	 * iteration.
 	 */
-	if (rq->state == MQ_RQ_IN_FLIGHT && rq->q == hctx->queue) {
+	if (rq->state == MQ_RQ_IN_FLIGHT) {
 		bool *busy = priv;
 
 		*busy = true;
@@ -1049,11 +1053,14 @@ bool blk_mq_get_driver_tag(struct request *rq)
 		shared = blk_mq_tag_busy(data.hctx);
 	rq->tag = blk_mq_get_tag(&data);
 	if (rq->tag >= 0) {
+		struct blk_mq_hw_ctx *hctx = data.hctx;
+
 		if (shared) {
 			rq->rq_flags |= RQF_MQ_INFLIGHT;
 			atomic_inc(&data.hctx->nr_active);
 		}
-		data.hctx->tags->rqs[rq->tag] = rq;
+		hctx->tags->rqs[rq->tag].queue = hctx->queue;
+		hctx->tags->rqs[rq->tag].rq = rq;
 	}
 
 done:
@@ -2020,11 +2027,27 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	return cookie;
 }
 
-void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
-		     unsigned int hctx_idx)
+static void blk_mq_rcu_free_pages(struct work_struct *work)
 {
+	struct blk_mq_tags *tags = container_of(to_rcu_work(work),
+						struct blk_mq_tags, rcu_work);
 	struct page *page;
 
+	while (!list_empty(&tags->page_list)) {
+		page = list_first_entry(&tags->page_list, struct page, lru);
+		list_del_init(&page->lru);
+		/*
+		 * Remove kmemleak object previously allocated in
+		 * blk_mq_init_rq_map().
+		 */
+		kmemleak_free(page_address(page));
+		__free_pages(page, page->private);
+	}
+}
+
+void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
+		     unsigned int hctx_idx)
+{
 	if (tags->rqs && set->ops->exit_request) {
 		int i;
 
@@ -2038,16 +2061,9 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 		}
 	}
 
-	while (!list_empty(&tags->page_list)) {
-		page = list_first_entry(&tags->page_list, struct page, lru);
-		list_del_init(&page->lru);
-		/*
-		 * Remove kmemleak object previously allocated in
-		 * blk_mq_init_rq_map().
-		 */
-		kmemleak_free(page_address(page));
-		__free_pages(page, page->private);
-	}
+	/* Punt to RCU free, so we don't race with tag iteration */
+	INIT_RCU_WORK(&tags->rcu_work, blk_mq_rcu_free_pages);
+	queue_rcu_work(system_wq, &tags->rcu_work);
 }
 
 void blk_mq_free_rq_map(struct blk_mq_tags *tags)
@@ -2077,7 +2093,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
 	if (!tags)
 		return NULL;
 
-	tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *),
+	tags->rqs = kcalloc_node(nr_tags, sizeof(struct rq_tag_entry),
 				 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
 				 node);
 	if (!tags->rqs) {
diff --git a/block/blk.h b/block/blk.h
index 848278c52030..785207fc8a30 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -29,6 +29,8 @@ struct blk_flush_queue {
 	 */
 	struct request		*orig_rq;
 	spinlock_t		mq_flush_lock;
+
+	struct rcu_work		rcu_work;
 };
 
 extern struct kmem_cache *blk_requestq_cachep;

-- 
Jens Axboe
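
For readers who have not run into the rcu_work machinery the patch leans on, here is a
minimal sketch of the same pattern in isolation. It is only an illustration, not part of
the patch above; the names (tag_table, tag_table_alloc, tag_table_iter, tag_table_free)
are made up for the example. The point it demonstrates is the one made in the quoted
text: iteration runs under rcu_read_lock(), and a single struct rcu_work embedded in the
table, rather than an rcu_head per request, defers the actual freeing via
queue_rcu_work() until an RCU grace period has elapsed, so the backing storage cannot be
freed out from under a concurrent iterator.

/*
 * Illustrative sketch only (not from the patch): deferred free of a
 * lookup table with one rcu_work per table, readers under RCU.
 */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>
#include <linux/workqueue.h>

struct tag_table {
	unsigned int nr;
	void **entries;			/* storage that iterators dereference */
	struct rcu_work rcu_work;	/* one per table, not per entry */
};

static struct tag_table *tag_table_alloc(unsigned int nr)
{
	struct tag_table *tt = kzalloc(sizeof(*tt), GFP_KERNEL);

	if (!tt)
		return NULL;
	tt->entries = kcalloc(nr, sizeof(*tt->entries), GFP_KERNEL);
	if (!tt->entries) {
		kfree(tt);
		return NULL;
	}
	tt->nr = nr;
	return tt;
}

/* Reader side: the whole walk sits in an RCU read-side critical section. */
static void tag_table_iter(struct tag_table *tt,
			   void (*fn)(void *entry, void *data), void *data)
{
	unsigned int i;

	rcu_read_lock();
	for (i = 0; i < tt->nr; i++) {
		if (tt->entries[i])
			fn(tt->entries[i], data);
	}
	rcu_read_unlock();
}

/* Runs from a workqueue, but only after an RCU grace period has passed. */
static void tag_table_rcu_free(struct work_struct *work)
{
	struct tag_table *tt = container_of(to_rcu_work(work),
					    struct tag_table, rcu_work);

	kfree(tt->entries);
	kfree(tt);
}

/*
 * Writer side: instead of freeing inline (and racing with an iterator
 * that is still walking ->entries), hand the table to queue_rcu_work().
 * The callback is not invoked until every reader that could still see
 * the old pointers has left its rcu_read_lock() section.  The caller
 * must not touch the table after this returns.
 */
static void tag_table_free(struct tag_table *tt)
{
	INIT_RCU_WORK(&tt->rcu_work, tag_table_rcu_free);
	queue_rcu_work(system_wq, &tt->rcu_work);
}

As in the patch, the reader-side cost stays negligible (rcu_read_lock() is close to a
no-op), and the only change on the teardown path is that the final free happens from a
workqueue after the grace period instead of synchronously. Publication ordering of the
individual entries is out of scope for this sketch, just as the patch relies on the grace
period only to delay freeing the request pages.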