Request pool pages may take up quite a bit of space, and each request
queue may hold at most one unused request pool, so the memory waste can
be significant when there are lots of request queues.

Schedule a delayed work to check whether any entry in tags->rqs[] still
refers to a freed request pool page. If no request in tags->rqs[]
refers to the freed request pool page, release the page immediately.
Otherwise, re-schedule the delayed work after 10 seconds to check and
release the pages again.

Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxx>
Cc: Hannes Reinecke <hare@xxxxxxx>
Cc: Bart Van Assche <bvanassche@xxxxxxx>
Cc: John Garry <john.garry@xxxxxxxxxx>
Cc: Christoph Hellwig <hch@xxxxxx>
---
 block/blk-mq.c         | 55 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/blk-mq.h |  1 +
 2 files changed, 56 insertions(+)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index c644f5cb1549..2865920086ea 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2365,11 +2365,63 @@ static void blk_mq_release_rqs_page(struct page *page)
 	__free_pages(page, blk_mq_rqs_page_order(page));
 }
 
+#define SHRINK_RQS_PAGE_DELAY (10 * HZ)
+
 static void blk_mq_free_rqs_page(struct blk_mq_tag_set *set, struct page *page)
 {
 	spin_lock(&set->free_page_list_lock);
 	list_add_tail(&page->lru, &set->free_page_list);
 	spin_unlock(&set->free_page_list_lock);
+
+	schedule_delayed_work(&set->rqs_page_shrink, SHRINK_RQS_PAGE_DELAY);
+}
+
+static bool blk_mq_can_shrink_rqs_page(struct blk_mq_tag_set *set,
+		struct page *pg)
+{
+	unsigned hctx_idx = blk_mq_rqs_page_hctx_idx(pg);
+	struct blk_mq_tags *tags = set->tags[hctx_idx];
+	unsigned long start = (unsigned long)page_address(pg);
+	unsigned long end = start + order_to_size(blk_mq_rqs_page_order(pg));
+	int i;
+
+	for (i = 0; i < set->queue_depth; i++) {
+		unsigned long rq_addr = (unsigned long)tags->rqs[i];
+		if (rq_addr >= start && rq_addr < end)
+			return false;
+	}
+	return true;
+}
+
+static void blk_mq_rqs_page_shrink_work(struct work_struct *work)
+{
+	struct blk_mq_tag_set *set =
+		container_of(work, struct blk_mq_tag_set, rqs_page_shrink.work);
+	LIST_HEAD(pg_list);
+	struct page *page, *tmp;
+	bool resched;
+
+	spin_lock(&set->free_page_list_lock);
+	list_splice_init(&set->free_page_list, &pg_list);
+	spin_unlock(&set->free_page_list_lock);
+
+	mutex_lock(&set->tag_list_lock);
+	list_for_each_entry_safe(page, tmp, &pg_list, lru) {
+		if (blk_mq_can_shrink_rqs_page(set, page)) {
+			list_del_init(&page->lru);
+			blk_mq_release_rqs_page(page);
+		}
+	}
+	mutex_unlock(&set->tag_list_lock);
+
+	spin_lock(&set->free_page_list_lock);
+	list_splice_init(&pg_list, &set->free_page_list);
+	resched = !list_empty(&set->free_page_list);
+	spin_unlock(&set->free_page_list_lock);
+
+	if (resched)
+		schedule_delayed_work(&set->rqs_page_shrink,
+				SHRINK_RQS_PAGE_DELAY);
 }
 
 static void blk_mq_release_all_rqs_page(struct blk_mq_tag_set *set)
@@ -2377,6 +2429,8 @@ static void blk_mq_release_all_rqs_page(struct blk_mq_tag_set *set)
 	struct page *page;
 	LIST_HEAD(pg_list);
 
+	cancel_delayed_work_sync(&set->rqs_page_shrink);
+
 	spin_lock(&set->free_page_list_lock);
 	list_splice_init(&set->free_page_list, &pg_list);
 	spin_unlock(&set->free_page_list_lock);
@@ -3527,6 +3581,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 
 	spin_lock_init(&set->free_page_list_lock);
 	INIT_LIST_HEAD(&set->free_page_list);
+	INIT_DELAYED_WORK(&set->rqs_page_shrink, blk_mq_rqs_page_shrink_work);
 
 	ret = blk_mq_alloc_map_and_requests(set);
 	if (ret)
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 4c2b135dbbe1..b2adf99dbbef 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -250,6 +250,7 @@ struct blk_mq_tag_set {
 
 	spinlock_t free_page_list_lock;
 	struct list_head free_page_list;
+	struct delayed_work rqs_page_shrink;
 };
 
 /**
-- 
2.25.2
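
For reference, here is a minimal user-space sketch of the address-range
check performed by blk_mq_can_shrink_rqs_page(); the names rqs[],
can_release_pool() and QUEUE_DEPTH are illustrative stand-ins rather
than kernel APIs:

/*
 * Sketch of the reclaim check: a request pool may only be released
 * once no entry in the rqs[] table points into its address range.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

struct request { int tag; };

#define QUEUE_DEPTH	4

/* Stand-in for tags->rqs[]: possibly stale pointers into a request pool. */
static struct request *rqs[QUEUE_DEPTH];

/* True only if no rqs[] entry falls inside [start, start + size). */
static bool can_release_pool(void *start, size_t size)
{
	unsigned long begin = (unsigned long)start;
	unsigned long end = begin + size;
	int i;

	for (i = 0; i < QUEUE_DEPTH; i++) {
		unsigned long addr = (unsigned long)rqs[i];

		if (addr >= begin && addr < end)
			return false;	/* still referenced, retry later */
	}
	return true;			/* safe to free the pool now */
}

int main(void)
{
	size_t pool_size = QUEUE_DEPTH * sizeof(struct request);
	struct request *pool = malloc(pool_size);

	rqs[0] = &pool[2];	/* one entry still points into the pool */
	printf("releasable: %d\n", can_release_pool(pool, pool_size));	/* 0 */

	rqs[0] = NULL;		/* reference dropped */
	printf("releasable: %d\n", can_release_pool(pool, pool_size));	/* 1 */

	free(pool);
	return 0;
}

The patch applies the same test per pending page from the delayed work:
pages that pass are released immediately, and if any page is still
referenced the work re-arms itself and repeats the scan after
SHRINK_RQS_PAGE_DELAY (10 seconds).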