When mTHP is enabled, IO can contain larger folios instead of pages. In
such cases, add the whole folio segment to the bio at once instead of
looping through its pages. This reduces the overhead of iterating
through pages for larger block sizes.

perf diff before and after this change:

Perf diff for write I/O with 128K block size:
    1.26%   -1.04%  [kernel.kallsyms]  [k] bio_iov_iter_get_pages
    1.74%           [kernel.kallsyms]  [k] bvec_try_merge_page

Perf diff for read I/O with 128K block size:
    4.40%   -3.63%  [kernel.kallsyms]  [k] bio_iov_iter_get_pages
    5.60%           [kernel.kallsyms]  [k] bvec_try_merge_page

Signed-off-by: Kundan Kumar <kundan.kumar@xxxxxxxxxxx>
---
Changes since v1:
https://lore.kernel.org/all/20240419091721.1790-1-kundan.kumar@xxxxxxxxxxx/
- Changed the functions bio_iov_add_page() and
  bio_iov_add_zone_append_page() to accept a folio
- Removed the branching and now calculate folio_offset and len in the
  same fashion for both 0-order and larger folios
- Changed the NVMe driver to use nvme_setup_prp_simple() by ignoring
  multiples of NVME_CTRL_PAGE_SIZE in the offset
- Unpin user pages that were added as part of folios, and stopped
  unpinning pages one by one from __bio_release_pages() (suggested by
  Keith)

 block/bio.c             | 48 +++++++++++++++++++++++++----------------
 drivers/nvme/host/pci.c |  3 ++-
 2 files changed, 31 insertions(+), 20 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 38baedb39c6f..0ec453ad15b3 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1155,7 +1155,6 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty)
 
 	bio_for_each_folio_all(fi, bio) {
 		struct page *page;
-		size_t nr_pages;
 
 		if (mark_dirty) {
 			folio_lock(fi.folio);
@@ -1163,11 +1162,7 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty)
 			folio_unlock(fi.folio);
 		}
 		page = folio_page(fi.folio, fi.offset / PAGE_SIZE);
-		nr_pages = (fi.offset + fi.length - 1) / PAGE_SIZE -
-			   fi.offset / PAGE_SIZE + 1;
-		do {
-			bio_release_page(bio, page++);
-		} while (--nr_pages != 0);
+		bio_release_page(bio, page);
 	}
 }
 EXPORT_SYMBOL_GPL(__bio_release_pages);
@@ -1192,7 +1187,7 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
 	bio_set_flag(bio, BIO_CLONED);
 }
 
-static int bio_iov_add_page(struct bio *bio, struct page *page,
+static int bio_iov_add_folio(struct bio *bio, struct folio *folio,
 		unsigned int len, unsigned int offset)
 {
 	bool same_page = false;
@@ -1202,27 +1197,26 @@ static int bio_iov_add_page(struct bio *bio, struct page *page,
 
 	if (bio->bi_vcnt > 0 &&
 	    bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
-				page, len, offset, &same_page)) {
+				&folio->page, len, offset, &same_page)) {
 		bio->bi_iter.bi_size += len;
 		if (same_page)
-			bio_release_page(bio, page);
+			bio_release_page(bio, &folio->page);
 		return 0;
 	}
-	__bio_add_page(bio, page, len, offset);
+	bio_add_folio_nofail(bio, folio, len, offset);
 	return 0;
 }
 
-static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page,
+static int bio_iov_add_zone_append_folio(struct bio *bio, struct folio *folio,
 		unsigned int len, unsigned int offset)
 {
 	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
 	bool same_page = false;
-
-	if (bio_add_hw_page(q, bio, page, len, offset,
+	if (bio_add_hw_page(q, bio, &folio->page, len, offset,
 			queue_max_zone_append_sectors(q), &same_page) != len)
 		return -EINVAL;
 	if (same_page)
-		bio_release_page(bio, page);
+		bio_release_page(bio, &folio->page);
 	return 0;
 }
 
@@ -1247,8 +1241,10 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	struct page **pages = (struct page **)bv;
 	ssize_t size, left;
 	unsigned len, i = 0;
-	size_t offset;
+	size_t offset, folio_offset, size_folio;
 	int ret = 0;
+	int num_pages;
+	struct folio *folio;
 
 	/*
 	 * Move page array up in the allocated memory for the bio vecs as far as
@@ -1289,16 +1285,30 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 
 	for (left = size, i = 0; left > 0; left -= len, i++) {
 		struct page *page = pages[i];
+		folio = page_folio(page);
+
+		/* See the offset in folio and the size */
+		folio_offset = (folio_page_idx(folio, page)
+				 << PAGE_SHIFT) + offset;
+		size_folio = folio_size(folio);
+
+		/* Calculate the length of folio to be added */
+		len = min_t(size_t, (size_folio - folio_offset), left);
+
+		num_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
 
-		len = min_t(size_t, PAGE_SIZE - offset, left);
 		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
-			ret = bio_iov_add_zone_append_page(bio, page, len,
-					offset);
+			ret = bio_iov_add_zone_append_folio(bio, folio, len,
+					folio_offset);
 			if (ret)
 				break;
 		} else
-			bio_iov_add_page(bio, page, len, offset);
+			bio_iov_add_folio(bio, folio, len, folio_offset);
 
+		/* Skip the pages which got added */
+		if (bio_flagged(bio, BIO_PAGE_PINNED) && num_pages > 1)
+			unpin_user_pages(pages + i, num_pages - 1);
+
+		i = i + (num_pages - 1);
 		offset = 0;
 	}
 
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 8e0bb9692685..7c07b0582cae 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -778,7 +778,8 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
 		struct bio_vec bv = req_bvec(req);
 
 		if (!is_pci_p2pdma_page(bv.bv_page)) {
-			if (bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
+			if ((bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) +
+			    bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
 				return nvme_setup_prp_simple(dev, req,
 							     &cmnd->rw, &bv);
 
-- 
2.25.1
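
For reference, the folio_offset/len/num_pages arithmetic in the loop
above can be exercised standalone. Below is a minimal userspace sketch,
not kernel code; PAGE_SHIFT, the 128K folio geometry, and the sample
values are assumptions chosen only to mirror the patch.

#include <stdio.h>

#define PAGE_SHIFT 12				/* assumed 4K pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	/* Assume a 128K folio; the iterator currently points at the
	 * third page of that folio, 512 bytes in. */
	unsigned long size_folio = 32 * PAGE_SIZE;
	unsigned long page_idx = 2;	/* stands in for folio_page_idx() */
	unsigned long offset = 512;	/* offset within the page */
	unsigned long left = 128 * 1024;	/* bytes left in the iter */

	/* Offset of the data within the folio, as computed in the
	 * patched __bio_iov_iter_get_pages(). */
	unsigned long folio_offset = (page_idx << PAGE_SHIFT) + offset;

	/* Length added to the bio in one step, instead of one
	 * PAGE_SIZE chunk per loop iteration. */
	unsigned long len = size_folio - folio_offset;
	if (len > left)
		len = left;

	/* Page slots consumed from the pinned page array; the extra
	 * num_pages - 1 pins are dropped via unpin_user_pages(). */
	unsigned long num_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);

	printf("folio_offset=%lu len=%lu num_pages=%lu\n",
	       folio_offset, len, num_pages);
	return 0;
}

This prints folio_offset=8704 len=122368 num_pages=30: a single
bio_add_folio_nofail() call now covers what previously took 30
per-page iterations of the loop.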