[PATCH 6/7] io_uring/kbuf: add support for mapping type KBUF_MODE_BVEC

The provided buffer helpers always map to iovecs. Add a new mode,
KBUF_MODE_BVEC, which maps to a bio_vec array instead. This is for use
with zero-copy scenarios, where the caller wants a bio_vec array
anyway: it avoids first iterating and filling out an iovec array, only
for the caller to then iterate that again and turn it into a bio_vec
array.

Since the helpers now manage both iovecs and bvecs, rename the
buf_sel_arg->nr_iovs member to nr_vecs.
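
For reference, a rough sketch of how a zero-copy send path might
consume this (illustrative only; 'fast_bvec' is a made-up inline stash,
not something this patch adds):

	struct buf_sel_arg arg = {
		.bvecs = &kmsg->fast_bvec,	/* hypothetical inline bvec */
		.nr_vecs = 1,
		.max_len = min_not_zero(sr->len, INT_MAX),
		.mode = KBUF_MODE_BVEC | KBUF_MODE_EXPAND,
	};
	int nvecs;

	nvecs = io_buffers_select(req, &arg, issue_flags);
	if (nvecs < 0)
		return nvecs;
	/* arg.bvecs now holds nvecs segments covering arg.out_len bytes */
	iov_iter_bvec(&kmsg->msg.msg_iter, ITER_SOURCE, arg.bvecs, nvecs,
		      arg.out_len);

Note that io_fill_bvecs() merges segments that are physically
contiguous within a folio, so nvecs may be smaller than the number of
pages spanned.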

Signed-off-by: Jens Axboe <axboe@xxxxxxxxx>
---
 io_uring/kbuf.c | 170 +++++++++++++++++++++++++++++++++++++++++++-----
 io_uring/kbuf.h |   9 ++-
 io_uring/net.c  |  10 +--
 3 files changed, 165 insertions(+), 24 deletions(-)
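
A note on the addressing scheme, since it may not be obvious from the
diff alone: with KBUF_MODE_BVEC, the addr of a selected buffer is not a
plain user address. It encodes a registered buffer index in the low
IOU_BUF_REGBUF_BITS, with the byte offset in the bits above that, which
io_ubuf_from_buf() below unpacks. A sketch of the matching encode side
(illustrative; assumes the IOU_BUF_REGBUF_BITS/IOU_BUF_OFFSET_BITS
definitions from earlier in this series):

	/* illustrative: pack regbuf index + offset into a buffer addr */
	u64 addr = ((u64) offset << IOU_BUF_REGBUF_BITS) | buf_index;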

diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 42579525c4bd..10a3a7a27e9a 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -16,6 +16,7 @@
 #include "opdef.h"
 #include "kbuf.h"
 #include "memmap.h"
+#include "rsrc.h"
 
 /* BIDs are addressed by a 16-bit field in a CQE */
 #define MAX_BIDS_PER_BGID (1 << 16)
@@ -117,20 +118,135 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
 	return NULL;
 }
 
+static struct io_mapped_ubuf *io_ubuf_from_buf(struct io_ring_ctx *ctx,
+					       u64 addr, unsigned int *offset)
+{
+	struct io_mapped_ubuf *imu;
+	u16 idx;
+
+	/*
+	 * Get registered buffer index and offset, encoded into the
+	 * addr base value.
+	 */
+	idx = addr & ((1ULL << IOU_BUF_REGBUF_BITS) - 1);
+	addr >>= IOU_BUF_REGBUF_BITS;
+	*offset = addr & ((1ULL << IOU_BUF_OFFSET_BITS) - 1);
+
+	if (unlikely(idx >= ctx->nr_user_bufs))
+		return ERR_PTR(-EFAULT);
+
+	idx = array_index_nospec(idx, ctx->nr_user_bufs);
+	imu = READ_ONCE(ctx->user_bufs[idx]);
+	if (unlikely(*offset >= imu->len))
+		return ERR_PTR(-EFAULT);
+
+	return imu;
+}
+
+static bool io_expand_bvecs(struct buf_sel_arg *arg)
+{
+	int nvecs = arg->nr_vecs + 8;
+	struct bio_vec *bv;
+
+	if (!(arg->mode & KBUF_MODE_EXPAND))
+		return false;
+
+	bv = kmalloc_array(nvecs, sizeof(struct bio_vec), GFP_KERNEL);
+	if (unlikely(!bv))
+		return false;
+	memcpy(bv, arg->bvecs, arg->nr_vecs * sizeof(*bv));
+	if (arg->mode & KBUF_MODE_FREE)
+		kfree(arg->bvecs);
+	arg->bvecs = bv;
+	arg->nr_vecs = nvecs;
+	arg->mode |= KBUF_MODE_FREE;
+	return true;
+}
+
+static int io_fill_bvecs(struct io_ring_ctx *ctx, u64 addr,
+			 struct buf_sel_arg *arg, unsigned int len,
+			 int *vec_off)
+{
+	struct bio_vec *src, *src_prv = NULL;
+	struct io_mapped_ubuf *imu;
+	unsigned int llen = len;
+	unsigned int offset;
+
+	imu = io_ubuf_from_buf(ctx, addr, &offset);
+	if (unlikely(IS_ERR(imu)))
+		return PTR_ERR(imu);
+
+	if (unlikely(offset >= imu->len || len > imu->len))
+		return -EOVERFLOW;
+	if (unlikely(offset > imu->len - len))
+		return -EOVERFLOW;
+
+	src = imu->bvec;
+	if (offset > src->bv_len) {
+		unsigned long seg_skip;
+
+		offset -= src->bv_len;
+		seg_skip = 1 + (offset >> imu->folio_shift);
+		offset &= ((1UL << imu->folio_shift) - 1);
+		src += seg_skip;
+	}
+
+	do {
+		unsigned int this_len = len;
+
+		if (this_len + offset > src->bv_len)
+			this_len = src->bv_len - offset;
+
+		/*
+		 * If contig with previous bio_vec, merge it to minimize the
+		 * number of segments needed. If not, then add a new segment,
+		 * expanding the number of available slots, if needed.
+		 */
+		if (src_prv &&
+		    page_folio(src_prv->bv_page) == page_folio(src->bv_page) &&
+		    src_prv->bv_page + 1 == src->bv_page) {
+			arg->bvecs[*vec_off - 1].bv_len += this_len;
+		} else {
+			struct bio_vec *dst;
+
+			if (*vec_off == arg->nr_vecs && !io_expand_bvecs(arg))
+				break;
+
+			dst = &arg->bvecs[*vec_off];
+			dst->bv_page = src->bv_page;
+			dst->bv_len = this_len;
+			dst->bv_offset = offset;
+			(*vec_off)++;
+		}
+		offset = 0;
+		len -= this_len;
+		src_prv = src++;
+	} while (len);
+
+	return llen - len;
+}
+
 static int io_provided_buffers_select(struct io_kiocb *req,
 				      struct buf_sel_arg *arg,
 				      struct io_buffer_list *bl, size_t *len)
 {
-	struct iovec *iov = arg->iovs;
 	void __user *buf;
+	int ret = 0;
 
 	buf = io_provided_buffer_select(req, len, bl);
 	if (unlikely(!buf))
 		return -ENOBUFS;
 
-	iov[0].iov_base = buf;
-	iov[0].iov_len = *len;
-	return 1;
+	if (arg->mode & KBUF_MODE_BVEC) {
+		u64 addr = (unsigned long)(uintptr_t) buf;
+
+		*len = io_fill_bvecs(req->ctx, addr, arg, *len, &ret);
+	} else {
+		arg->iovs[0].iov_base = buf;
+		arg->iovs[0].iov_len = *len;
+		ret = 1;
+	}
+	return ret;
 }
 
 static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
@@ -196,13 +312,16 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
 #define PEEK_MAX_IMPORT		256
 
 static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
-				struct io_buffer_list *bl)
+				struct io_buffer_list *bl, int *nbufs)
 {
 	struct io_uring_buf_ring *br = bl->buf_ring;
 	struct iovec *iov = arg->iovs;
-	int nr_iovs = arg->nr_iovs;
+	int nr_iovs = arg->nr_vecs;
 	__u16 nr_avail, tail, head;
 	struct io_uring_buf *buf;
+	int vec_off;
+
+	BUILD_BUG_ON(sizeof(struct iovec) > sizeof(struct bio_vec));
 
 	tail = smp_load_acquire(&br->tail);
 	head = bl->head;
@@ -236,10 +355,12 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
 
 	/*
 	 * only alloc a bigger array if we know we have data to map, eg not
-	 * a speculative peek operation.
+	 * a speculative peek operation. Note that struct bio_vec and
+	 * struct iovec are the same size, so we can use them interchangeably
+	 * here as it's just for sizing purposes.
 	 */
 	if (arg->mode & KBUF_MODE_EXPAND && nr_avail > nr_iovs && arg->max_len) {
-		iov = kmalloc_array(nr_avail, sizeof(struct iovec), GFP_KERNEL);
+		iov = kmalloc_array(nr_avail, sizeof(struct bio_vec), GFP_KERNEL);
 		if (unlikely(!iov))
 			return -ENOMEM;
 		if (arg->mode & KBUF_MODE_FREE)
@@ -255,6 +376,7 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
 	if (!arg->max_len)
 		arg->max_len = INT_MAX;
 
+	vec_off = 0;
 	req->buf_index = buf->bid;
 	do {
 		u32 len = buf->len;
@@ -266,15 +388,25 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
 				buf->len = len;
 		}
 
-		iov->iov_base = u64_to_user_ptr(buf->addr);
-		iov->iov_len = len;
-		iov++;
+		if (arg->mode & KBUF_MODE_BVEC) {
+			int ret;
+
+			ret = io_fill_bvecs(req->ctx, buf->addr, arg, len, &vec_off);
+			if (unlikely(ret < 0))
+				return ret;
+			len = ret;
+		} else {
+			iov->iov_base = u64_to_user_ptr(buf->addr);
+			iov->iov_len = len;
+			iov++;
+			vec_off++;
+		}
 
 		arg->out_len += len;
 		arg->max_len -= len;
+		(*nbufs)++;
 		if (!arg->max_len)
 			break;
-
 		buf = io_ring_head_to_buf(br, ++head, bl->mask);
 	} while (--nr_iovs);
 
@@ -283,7 +415,7 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
 
 	req->flags |= REQ_F_BUFFER_RING;
 	req->buf_list = bl;
-	return iov - arg->iovs;
+	return vec_off;
 }
 
 int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
@@ -299,7 +431,9 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
 		goto out_unlock;
 
 	if (bl->flags & IOBL_BUF_RING) {
-		ret = io_ring_buffers_peek(req, arg, bl);
+		int nbufs = 0;
+
+		ret = io_ring_buffers_peek(req, arg, bl, &nbufs);
 		/*
 		 * Don't recycle these buffers if we need to go through poll.
 		 * Nobody else can use them anyway, and holding on to provided
@@ -307,9 +441,9 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
 		 * side anyway with normal buffers. Besides, we already
 		 * committed them, they cannot be put back in the queue.
 		 */
-		if (ret > 0) {
+		if (nbufs) {
 			req->flags |= REQ_F_BUFFERS_COMMIT | REQ_F_BL_NO_RECYCLE;
-			io_kbuf_commit(req, bl, arg->out_len, ret);
+			io_kbuf_commit(req, bl, arg->out_len, nbufs);
 		}
 	} else {
 		ret = io_provided_buffers_select(req, arg, bl, &arg->out_len);
@@ -332,7 +466,9 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg)
 		return -ENOENT;
 
 	if (bl->flags & IOBL_BUF_RING) {
-		ret = io_ring_buffers_peek(req, arg, bl);
+		int nbufs = 0;
+
+		ret = io_ring_buffers_peek(req, arg, bl, &nbufs);
 		if (ret > 0)
 			req->flags |= REQ_F_BUFFERS_COMMIT;
 		return ret;
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 36aadfe5ac00..7c56ba994f21 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -53,13 +53,18 @@ enum {
 	KBUF_MODE_EXPAND	= 1,
 	/* if bigger vec allocated, free old one */
 	KBUF_MODE_FREE		= 2,
+	/* turn into bio_vecs, not iovecs */
+	KBUF_MODE_BVEC		= 4,
 };
 
 struct buf_sel_arg {
-	struct iovec *iovs;
+	union {
+		struct iovec *iovs;
+		struct bio_vec *bvecs;
+	};
 	size_t out_len;
 	size_t max_len;
-	unsigned short nr_iovs;
+	unsigned short nr_vecs;
 	unsigned short mode;
 };
 
diff --git a/io_uring/net.c b/io_uring/net.c
index dbef14aa50f9..154756762a46 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -643,17 +643,17 @@ static int io_send_import(struct io_kiocb *req, unsigned int issue_flags)
 		struct buf_sel_arg arg = {
 			.iovs = &kmsg->fast_iov,
 			.max_len = min_not_zero(sr->len, INT_MAX),
-			.nr_iovs = 1,
+			.nr_vecs = 1,
 		};
 
 		if (kmsg->free_iov) {
-			arg.nr_iovs = kmsg->free_iov_nr;
+			arg.nr_vecs = kmsg->free_iov_nr;
 			arg.iovs = kmsg->free_iov;
 			arg.mode = KBUF_MODE_FREE;
 		}
 
 		if (!(sr->flags & IORING_RECVSEND_BUNDLE))
-			arg.nr_iovs = 1;
+			arg.nr_vecs = 1;
 		else
 			arg.mode |= KBUF_MODE_EXPAND;
 
@@ -1140,12 +1140,12 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
 	    sr->flags & IORING_RECVSEND_BUNDLE) {
 		struct buf_sel_arg arg = {
 			.iovs = &kmsg->fast_iov,
-			.nr_iovs = 1,
+			.nr_vecs = 1,
 			.mode = KBUF_MODE_EXPAND,
 		};
 
 		if (kmsg->free_iov) {
-			arg.nr_iovs = kmsg->free_iov_nr;
+			arg.nr_vecs = kmsg->free_iov_nr;
 			arg.iovs = kmsg->free_iov;
 			arg.mode |= KBUF_MODE_FREE;
 		}
-- 
2.45.2




