Re: [PATCH V8 0/8] io_uring: support sqe group and leased group kbuf

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 10/29/24 2:06 PM, Jens Axboe wrote:
> On 10/29/24 1:18 PM, Jens Axboe wrote:
>> Now, this implementation requires a user buffer, and as far as I'm told,
>> you currently have kernel buffers on the ublk side. There's absolutely
>> no reason why kernel buffers cannot work, we'd most likely just need to
>> add an IORING_RSRC_KBUFFER type to handle that. My question here is how
>> hard is this requirement? Reason I ask is that it's much simpler to work
>> with userspace buffers. Yes the current implementation maps them
>> every time, we could certainly change that, however I don't see this
>> being an issue. It's really no different than O_DIRECT, and you only
>> need to map them once for a read + whatever number of writes you'd need
>> to do. If a 'tag' is provided for LOCAL_BUF, it'll post a CQE whenever
>> that buffer is unmapped. This is a notification for the application that
>> it's done using the buffer. For a pure kernel buffer, we'd either need
>> to be able to reference it (so that we KNOW it's not going away) and/or
>> have a callback associated with the buffer.
> 
> Just to expand on this - if a kernel buffer is absolutely required, for
> example if you're inheriting pages from the page cache or other
> locations you cannot control, we would need to add something ala the
> below:

Here's a more complete one, but utterly untested. But it does the same
thing, mapping a struct request, but it maps it to an io_rsrc_node which
in turn has an io_mapped_ubuf in it. Both BUFFER and KBUFFER use the
same type, only the destruction is different. Then the callback provided
needs to do something ala:

struct io_mapped_ubuf *imu = node->buf;

if (imu && refcount_dec_and_test(&imu->refs))
	kvfree(imu);

when it's done with the imu. Probably an rsrc helper should just be done
for that, but those are details.

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 9621ba533b35..050868a4c9f1 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -8,6 +8,8 @@
 #include <linux/nospec.h>
 #include <linux/hugetlb.h>
 #include <linux/compat.h>
+#include <linux/bvec.h>
+#include <linux/blk-mq.h>
 #include <linux/io_uring.h>
 
 #include <uapi/linux/io_uring.h>
@@ -474,6 +476,9 @@ void io_free_rsrc_node(struct io_rsrc_node *node)
 		if (node->buf)
 			io_buffer_unmap(node->ctx, node);
 		break;
+	case IORING_RSRC_KBUFFER:
+		node->kbuf_fn(node);
+		break;
 	default:
 		WARN_ON_ONCE(1);
 		break;
@@ -1070,6 +1075,65 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
 	return ret;
 }
 
+/*
+ * Map the data of a block layer request into a kernel buffer rsrc node.
+ * Every bvec of @req is copied into a newly allocated io_mapped_ubuf, which
+ * is attached to a new IORING_RSRC_KBUFFER node with its refcount set to 1.
+ * @kbuf_fn is stored in the node; io_free_rsrc_node() invokes it for this
+ * node type and it is expected to drop that reference.
+ * Returns the new node, or NULL if @req has no data or allocation failed.
+ */
+struct io_rsrc_node *io_rsrc_map_request(struct io_ring_ctx *ctx,
+					 struct request *req,
+					 void (*kbuf_fn)(struct io_rsrc_node *))
+{
+	struct io_rsrc_node *node;
+	struct io_mapped_ubuf *imu;
+	struct req_iterator rq_iter;
+	struct bio_vec bv;
+	int nr_bvecs = 0, idx = 0;
+
+	if (!bio_has_data(req->bio))
+		return NULL;
+
+	rq_for_each_bvec(bv, req, rq_iter)
+		nr_bvecs++;
+	if (!nr_bvecs)
+		return NULL;
+
+	/*
+	 * Allocate the imu before the node, so that a failure never leaves a
+	 * half set up KBUFFER node (no ->kbuf_fn yet) that must be put.
+	 */
+	imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_NOIO);
+	if (!imu)
+		return NULL;
+
+	node = io_rsrc_node_alloc(ctx, IORING_RSRC_KBUFFER);
+	if (!node) {
+		kvfree(imu);	/* kvmalloc() memory must not go to kfree() */
+		return NULL;
+	}
+
+	/*
+	 * Fill all nr_bvecs entries from the iterator, for single-bio requests
+	 * too: the iterator already accounts for a bio that starts in the
+	 * middle of a bvec, so no separate offset has to be carried around.
+	 */
+	imu->ubuf = 0;
+	imu->len = 0;
+	rq_for_each_bvec(bv, req, rq_iter) {
+		imu->bvec[idx++] = bv;
+		imu->len += bv.bv_len;
+	}
+	imu->nr_bvecs = nr_bvecs;
+	imu->folio_shift = PAGE_SHIFT;
+	refcount_set(&imu->refs, 1);
+	node->buf = imu;
+	node->kbuf_fn = kbuf_fn;
+	return node;
+}
+
 int io_local_buf_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_ring_ctx *ctx = req->ctx;
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index be9b490c400e..8d479f765fe0 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -11,6 +11,7 @@
 enum {
 	IORING_RSRC_FILE		= 0,
 	IORING_RSRC_BUFFER		= 1,
+	IORING_RSRC_KBUFFER		= 2,	/* freed via node->kbuf_fn() */
 };
 
 struct io_rsrc_node {
@@ -19,6 +20,7 @@ struct io_rsrc_node {
 	u16				type;
 
 	u64 tag;
+	void (*kbuf_fn)(struct io_rsrc_node *);
 	union {
 		unsigned long file_ptr;
 		struct io_mapped_ubuf *buf;
@@ -52,6 +54,10 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 			   struct io_mapped_ubuf *imu,
 			   u64 buf_addr, size_t len);
 
+struct io_rsrc_node *io_rsrc_map_request(struct io_ring_ctx *ctx,
+					 struct request *req,
+					 void (*kbuf_fn)(struct io_rsrc_node *));
+
 int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg);
 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
 int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,

-- 
Jens Axboe




[Index of Archives]     [Linux RAID]     [Linux SCSI]     [Linux ATA RAID]     [IDE]     [Linux Wireless]     [Linux Kernel]     [ATH6KL]     [Linux Bluetooth]     [Linux Netdev]     [Kernel Newbies]     [Security]     [Git]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Device Mapper]

  Powered by Linux