From: Keith Busch <kbusch@xxxxxxxxxx>

Provide an interface for the kernel to leverage the existing
pre-registered buffers that io_uring provides. User space can reference
these later to achieve zero-copy IO. User space must register a sparse
fixed buffer table with io_uring in order for the kernel to make use of
it.

Kernel users of this interface need to register a callback to know when
the last reference is released. io_uring uses the existence of this
callback to differentiate user vs kernel registered buffers.

Signed-off-by: Keith Busch <kbusch@xxxxxxxxxx>
---
 include/linux/io_uring.h       |   1 +
 include/linux/io_uring_types.h |   6 ++
 io_uring/rsrc.c                | 112 ++++++++++++++++++++++++++++++---
 io_uring/rsrc.h                |   2 +
 4 files changed, 113 insertions(+), 8 deletions(-)

diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 85fe4e6b275c7..b5637a2aae340 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -5,6 +5,7 @@
 #include <linux/sched.h>
 #include <linux/xarray.h>
 #include <uapi/linux/io_uring.h>
+#include <linux/blk-mq.h>
 
 #if defined(CONFIG_IO_URING)
 void __io_uring_cancel(bool cancel_all);
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index d5bf336882aa8..b9feba4df60c9 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -696,4 +696,10 @@ static inline bool io_ctx_cqe32(struct io_ring_ctx *ctx)
 	return ctx->flags & IORING_SETUP_CQE32;
 }
 
+int io_buffer_register_bvec(struct io_ring_ctx *ctx, struct request *rq,
+			    void (*release)(void *), unsigned int index,
+			    unsigned int issue_flags);
+void io_buffer_unregister_bvec(struct io_ring_ctx *ctx, unsigned int index,
+			       unsigned int issue_flags);
+
 #endif
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index af39b69eb4fde..0e323ca1e8e5c 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -103,19 +103,23 @@ static int io_buffer_validate(struct iovec *iov)
 
 static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
 {
-	unsigned int i;
+	struct io_mapped_ubuf *imu = node->buf;
 
-	if (node->buf) {
-		struct io_mapped_ubuf *imu = node->buf;
+	if (!refcount_dec_and_test(&imu->refs))
+		return;
+
+	if (imu->release) {
+		imu->release(imu->priv);
+	} else {
+		unsigned int i;
 
-		if (!refcount_dec_and_test(&imu->refs))
-			return;
 		for (i = 0; i < imu->nr_bvecs; i++)
 			unpin_user_page(imu->bvec[i].bv_page);
 		if (imu->acct_pages)
 			io_unaccount_mem(ctx, imu->acct_pages);
-		kvfree(imu);
 	}
+
+	kvfree(imu);
 }
 
 struct io_rsrc_node *io_rsrc_node_alloc(int type)
@@ -764,6 +768,8 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
 	imu->len = iov->iov_len;
 	imu->nr_bvecs = nr_pages;
 	imu->folio_shift = PAGE_SHIFT;
+	imu->release = NULL;
+	imu->priv = NULL;
 	if (coalesced)
 		imu->folio_shift = data.folio_shift;
 	refcount_set(&imu->refs, 1);
@@ -860,6 +866,89 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 	return ret;
 }
 
+int io_buffer_register_bvec(struct io_ring_ctx *ctx, struct request *rq,
+			    void (*release)(void *), unsigned int index,
+			    unsigned int issue_flags)
+{
+	struct io_rsrc_data *data = &ctx->buf_table;
+	struct req_iterator rq_iter;
+	struct io_mapped_ubuf *imu;
+	struct io_rsrc_node *node;
+	int ret = 0, i = 0;
+	struct bio_vec bv;
+	u16 nr_bvecs;
+
+	io_ring_submit_lock(ctx, issue_flags);
+
+	if (index >= data->nr) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	node = data->nodes[index];
+	if (node) {
+		ret = -EBUSY;
+		goto unlock;
+	}
+
+	node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
+	if (!node) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+
+	nr_bvecs = blk_rq_nr_phys_segments(rq);
+	imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL);
+	if (!imu) {
+		kfree(node);
+		ret = -ENOMEM;
+		goto unlock;
+	}
+
+	imu->ubuf = 0;
+	imu->len = blk_rq_bytes(rq);
+	imu->acct_pages = 0;
+	imu->nr_bvecs = nr_bvecs;
+	refcount_set(&imu->refs, 1);
+	imu->release = release;
+	imu->priv = rq;
+
+	rq_for_each_bvec(bv, rq, rq_iter)
+		bvec_set_page(&imu->bvec[i++], bv.bv_page, bv.bv_len,
+			      bv.bv_offset);
+
+	node->buf = imu;
+	data->nodes[index] = node;
+unlock:
+	io_ring_submit_unlock(ctx, issue_flags);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(io_buffer_register_bvec);
+
+void io_buffer_unregister_bvec(struct io_ring_ctx *ctx, unsigned int index,
+			       unsigned int issue_flags)
+{
+	struct io_rsrc_data *data = &ctx->buf_table;
+	struct io_rsrc_node *node;
+
+	io_ring_submit_lock(ctx, issue_flags);
+
+	if (!data->nr)
+		goto unlock;
+	if (index >= data->nr)
+		goto unlock;
+
+	node = data->nodes[index];
+	if (!node || !node->buf)
+		goto unlock;
+	if (!node->buf->release)
+		goto unlock;
+	io_reset_rsrc_node(ctx, data, index);
+unlock:
+	io_ring_submit_unlock(ctx, issue_flags);
+}
+EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec);
+
 int io_import_fixed(int ddir, struct iov_iter *iter,
 			   struct io_mapped_ubuf *imu,
 			   u64 buf_addr, size_t len)
@@ -886,8 +975,8 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 		/*
 		 * Don't use iov_iter_advance() here, as it's really slow for
 		 * using the latter parts of a big fixed buffer - it iterates
-		 * over each segment manually. We can cheat a bit here, because
-		 * we know that:
+		 * over each segment manually. We can cheat a bit here for user
+		 * registered nodes, because we know that:
 		 *
 		 * 1) it's a BVEC iter, we set it up
 		 * 2) all bvecs are the same in size, except potentially the
@@ -901,8 +990,15 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 		 */
 		const struct bio_vec *bvec = imu->bvec;
 
+		/*
+		 * Kernel buffer bvecs, on the other hand, don't necessarily
+		 * have the size property of user registered ones, so we have
+		 * to use the slow iter advance.
+		 */
 		if (offset < bvec->bv_len) {
 			iter->iov_offset = offset;
+		} else if (imu->release) {
+			iov_iter_advance(iter, offset);
 		} else {
 			unsigned long seg_skip;
 
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 190f7ee45de93..2e8d1862caefc 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -33,6 +33,8 @@ struct io_mapped_ubuf {
 	unsigned int	folio_shift;
 	refcount_t	refs;
 	unsigned long	acct_pages;
+	void		(*release)(void *);
+	void		*priv;
 	struct bio_vec	bvec[] __counted_by(nr_bvecs);
 };
 
-- 
2.43.5
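
As a usage illustration (not part of the patch): a minimal sketch of how
a kernel driver might drive this interface. The "mydrv" names and the
blk-mq completion path are hypothetical; only io_buffer_register_bvec()
and io_buffer_unregister_bvec() come from this patch. The driver parks a
request's pages at a fixed buffer table slot, user space issues
zero-copy fixed-buffer IO against that slot, and the release callback
fires once the last reference is dropped, at which point it is safe to
complete the request.

#include <linux/blk-mq.h>
#include <linux/io_uring_types.h>

/* Runs when the last reference on the registered buffer is dropped. */
static void mydrv_buf_release(void *priv)
{
	struct request *rq = priv;

	/* No fixed-buffer IO can still be using the request's pages. */
	blk_mq_end_request(rq, BLK_STS_OK);
}

static int mydrv_attach_request(struct io_ring_ctx *ctx, struct request *rq,
				unsigned int index, unsigned int issue_flags)
{
	int ret;

	/*
	 * Publish the request's bvecs at slot @index of the ring's fixed
	 * buffer table. User space can then issue fixed-buffer reads or
	 * writes against that slot for zero-copy IO.
	 */
	ret = io_buffer_register_bvec(ctx, rq, mydrv_buf_release, index,
				      issue_flags);
	if (ret)
		blk_mq_end_request(rq, BLK_STS_IOERR);
	return ret;
}

static void mydrv_detach_request(struct io_ring_ctx *ctx, unsigned int index,
				 unsigned int issue_flags)
{
	/*
	 * Drop the buffer table's reference. mydrv_buf_release() is only
	 * invoked once any in-flight fixed-buffer IO has dropped its
	 * references as well.
	 */
	io_buffer_unregister_bvec(ctx, index, issue_flags);
}

Note the slot must belong to a sparse fixed buffer table that user space
registered up front, per the commit message above.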