[PATCH] io_uring/rsrc: Add support for multi-folio buffer coalescing

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Currently, a fixed buffer whose pages all belong to the same folio (huge
page) can be coalesced into a single bvec entry at registration.
This patch extends that to support coalescing fixed buffers
that span multiple folios, by:
1. Adding a helper function and a helper struct to do the coalescing work
at buffer registration;
2. Adding the bvec setup procedure for the coalesced path;
3. Storing page_mask and page_shift in io_mapped_ubuf for
later use in io_import_fixed.

Signed-off-by: Chenliang Li <cliang01.li@xxxxxxxxxxx>
---
 io_uring/rsrc.c | 156 +++++++++++++++++++++++++++++++++++-------------
 io_uring/rsrc.h |   9 +++
 2 files changed, 124 insertions(+), 41 deletions(-)

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 65417c9553b1..f9e11131c9a5 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -871,6 +871,80 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
 	return ret;
 }
 
+/*
+ * Check whether a pinned buffer can be coalesced into one bvec per folio.
+ * It must span one or more large folios of the same size; every folio but
+ * the first must be entered at its head page, and every folio but the last
+ * must be covered up to its last page. On success @stats is filled and all
+ * but one pin per folio are dropped; on failure no pin is touched.
+ */
+static bool io_sqe_buffer_try_coalesce(struct page **pages,
+				       unsigned int nr_pages,
+				       struct io_imu_folio_stats *stats)
+{
+	struct folio	*folio;
+	unsigned int	page_cnt = 1, nr_folios = 1;
+	unsigned int	i, j;
+
+	if (nr_pages <= 1)
+		return false;
+
+	folio = page_folio(pages[0]);
+	stats->full_folio_pcnt = folio_nr_pages(folio);
+	if (stats->full_folio_pcnt == 1)
+		return false;
+
+	stats->folio_shift = folio_shift(folio);
+
+	for (i = 1; i < nr_pages; i++) {
+		/* Still walking consecutive pages of the current folio */
+		if (page_folio(pages[i]) == folio &&
+		    pages[i] == pages[i - 1] + 1) {
+			page_cnt++;
+			continue;
+		}
+
+		if (nr_folios == 1) {
+			/* First folio must be covered up to its last page */
+			if (folio_page_idx(folio, pages[i - 1]) !=
+			    stats->full_folio_pcnt - 1)
+				return false;
+			stats->first_folio_pcnt = page_cnt;
+		} else if (page_cnt != stats->full_folio_pcnt) {
+			return false;
+		}
+
+		/* Next folio: same size, entered at its head page */
+		folio = page_folio(pages[i]);
+		if (folio_shift(folio) != stats->folio_shift ||
+		    folio_page_idx(folio, pages[i]) != 0)
+			return false;
+		page_cnt = 1;
+		nr_folios++;
+	}
+	if (nr_folios == 1)
+		stats->first_folio_pcnt = page_cnt;
+	stats->nr_folios = nr_folios;
+
+	/*
+	 * The pages are bound to their folio, so this doesn't actually
+	 * unpin them but drops all but one reference per folio, which is
+	 * usually put down by io_buffer_unmap().
+	 * Note, needs a better helper.
+	 */
+	for (i = 0, j = 0; i < nr_folios; i++) {
+		unsigned int cnt = i ? stats->full_folio_pcnt :
+				       stats->first_folio_pcnt;
+
+		cnt = min(cnt, nr_pages - j);
+		if (cnt > 1)
+			unpin_user_pages(&pages[j + 1], cnt - 1);
+		j += cnt;
+	}
+
+	return true;
+}
+
 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 				  struct io_mapped_ubuf **pimu,
 				  struct page **last_hpage)
@@ -879,8 +953,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 	struct page **pages = NULL;
 	unsigned long off;
 	size_t size;
-	int ret, nr_pages, i;
-	struct folio *folio = NULL;
+	int ret, nr_pages, nr_bvecs, i, j;
+	bool coalesced;
+	struct io_imu_folio_stats stats;
 
 	*pimu = (struct io_mapped_ubuf *)&dummy_ubuf;
 	if (!iov->iov_base)
@@ -895,39 +970,26 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 		goto done;
 	}
 
-	/* If it's a huge page, try to coalesce them into a single bvec entry */
-	if (nr_pages > 1) {
-		folio = page_folio(pages[0]);
-		for (i = 1; i < nr_pages; i++) {
-			/*
-			 * Pages must be consecutive and on the same folio for
-			 * this to work
-			 */
-			if (page_folio(pages[i]) != folio ||
-			    pages[i] != pages[i - 1] + 1) {
-				folio = NULL;
-				break;
-			}
-		}
-		if (folio) {
-			/*
-			 * The pages are bound to the folio, it doesn't
-			 * actually unpin them but drops all but one reference,
-			 * which is usually put down by io_buffer_unmap().
-			 * Note, needs a better helper.
-			 */
-			unpin_user_pages(&pages[1], nr_pages - 1);
-			nr_pages = 1;
-		}
-	}
-
-	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
+	/* If it's multiple huge pages, try to coalesce them into fewer bvec entries */
+	coalesced = io_sqe_buffer_try_coalesce(pages, nr_pages, &stats);
+	nr_bvecs = nr_pages;
+	if (coalesced)
+		nr_bvecs = stats.nr_folios;
+	imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL);
 	if (!imu)
 		goto done;
 
 	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
 	if (ret) {
-		unpin_user_pages(pages, nr_pages);
+		if (coalesced) {
+			unpin_user_page(pages[0]);
+			j = stats.first_folio_pcnt;
+			for (i = 1; i < stats.nr_folios; i++) {
+				unpin_user_page(pages[j]);
+				j += stats.full_folio_pcnt;
+			}
+		} else
+			unpin_user_pages(pages, nr_pages);
 		goto done;
 	}
 
@@ -936,12 +998,29 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 	/* store original address for later verification */
 	imu->ubuf = (unsigned long) iov->iov_base;
 	imu->ubuf_end = imu->ubuf + iov->iov_len;
-	imu->nr_bvecs = nr_pages;
+	imu->nr_bvecs = nr_bvecs;
+	imu->page_shift = PAGE_SHIFT;
+	imu->page_mask = PAGE_MASK;
+	if (coalesced) {
+		imu->page_shift = stats.folio_shift;
+		imu->page_mask = ~((1UL << stats.folio_shift) - 1);
+	}
 	*pimu = imu;
 	ret = 0;
 
-	if (folio) {
-		bvec_set_page(&imu->bvec[0], pages[0], size, off);
+	if (coalesced) {
+		size_t vec_len;
+
+		vec_len = min_t(size_t, size, PAGE_SIZE * stats.first_folio_pcnt - off);
+		bvec_set_page(&imu->bvec[0], pages[0], vec_len, off);
+		size -= vec_len;
+		j = stats.first_folio_pcnt;
+		for (i = 1; i < nr_bvecs; i++) {
+			vec_len = min_t(size_t, size, PAGE_SIZE * stats.full_folio_pcnt);
+			bvec_set_page(&imu->bvec[i], pages[j], vec_len, 0);
+			size -= vec_len;
+			j += stats.full_folio_pcnt;
+		}
 		goto done;
 	}
 	for (i = 0; i < nr_pages; i++) {
@@ -1049,7 +1128,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 		 * we know that:
 		 *
 		 * 1) it's a BVEC iter, we set it up
-		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
+		 * 2) all bvecs are the same in size, except potentially the
 		 *    first and last bvec
 		 *
 		 * So just find our index, and adjust the iterator afterwards.
@@ -1061,11 +1140,6 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 		const struct bio_vec *bvec = imu->bvec;
 
 		if (offset < bvec->bv_len) {
-			/*
-			 * Note, huge pages buffers consists of one large
-			 * bvec entry and should always go this way. The other
-			 * branch doesn't expect non PAGE_SIZE'd chunks.
-			 */
 			iter->bvec = bvec;
 			iter->nr_segs = bvec->bv_len;
 			iter->count -= offset;
@@ -1075,12 +1149,12 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 
 			/* skip first vec */
 			offset -= bvec->bv_len;
-			seg_skip = 1 + (offset >> PAGE_SHIFT);
+			seg_skip = 1 + (offset >> imu->page_shift);
 
 			iter->bvec = bvec + seg_skip;
 			iter->nr_segs -= seg_skip;
 			iter->count -= bvec->bv_len + offset;
-			iter->iov_offset = offset & ~PAGE_MASK;
+			iter->iov_offset = offset & ~(imu->page_mask);
 		}
 	}
 
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index c032ca3436ca..4c655e446150 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -47,9 +47,18 @@ struct io_mapped_ubuf {
 	u64		ubuf_end;
 	unsigned int	nr_bvecs;
 	unsigned long	acct_pages;
+	unsigned int	page_shift;
+	unsigned long	page_mask;
 	struct bio_vec	bvec[] __counted_by(nr_bvecs);
 };
 
+struct io_imu_folio_stats {
+	unsigned int	first_folio_pcnt;	/* buffer pages that fall in the first folio */
+	unsigned int	full_folio_pcnt;	/* folio_nr_pages() of the first folio */
+	unsigned int	nr_folios;		/* folios counted across the buffer */
+	unsigned int	folio_shift;		/* folio_shift() of the first folio */
+};
+
 void io_rsrc_node_ref_zero(struct io_rsrc_node *node);
 void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node);
 struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
-- 
2.34.1





[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux