From: Keith Busch <kbusch@xxxxxxxxxx> Provide an interface for the kernel to leverage the existing pre-registered buffers that io_uring provides. User space can reference these later to achieve zero-copy IO. User space must register an empty fixed buffer table with io_uring in order for the kernel to make use of it. Signed-off-by: Keith Busch <kbusch@xxxxxxxxxx> --- include/linux/io_uring.h | 1 + include/linux/io_uring_types.h | 3 + io_uring/rsrc.c | 114 +++++++++++++++++++++++++++++++-- io_uring/rsrc.h | 1 + 4 files changed, 114 insertions(+), 5 deletions(-) diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 85fe4e6b275c7..b5637a2aae340 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -5,6 +5,7 @@ #include <linux/sched.h> #include <linux/xarray.h> #include <uapi/linux/io_uring.h> +#include <linux/blk-mq.h> #if defined(CONFIG_IO_URING) void __io_uring_cancel(bool cancel_all); diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 623d8e798a11a..7e5a5a70c35f2 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -695,4 +695,7 @@ static inline bool io_ctx_cqe32(struct io_ring_ctx *ctx) return ctx->flags & IORING_SETUP_CQE32; } +int io_buffer_register_bvec(struct io_ring_ctx *ctx, const struct request *rq, unsigned int tag); +void io_buffer_unregister_bvec(struct io_ring_ctx *ctx, unsigned int tag); + #endif diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 4d0e1c06c8bc6..8c4c374abcc10 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -111,7 +111,10 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node) if (!refcount_dec_and_test(&imu->refs)) return; for (i = 0; i < imu->nr_bvecs; i++) - unpin_user_page(imu->bvec[i].bv_page); + if (node->type == IORING_RSRC_KBUF) + put_page(imu->bvec[i].bv_page); + else + unpin_user_page(imu->bvec[i].bv_page); if (imu->acct_pages) io_unaccount_mem(ctx, imu->acct_pages); kvfree(imu); @@ -240,6 +243,13 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, struct io_rsrc_node *node; u64 tag = 0; + i = array_index_nospec(up->offset + done, ctx->buf_table.nr); + node = io_rsrc_node_lookup(&ctx->buf_table, i); + if (node && node->type != IORING_RSRC_BUFFER) { + err = -EBUSY; + break; + } + uvec = u64_to_user_ptr(user_data); iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); if (IS_ERR(iov)) { @@ -258,6 +268,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, err = PTR_ERR(node); break; } + if (tag) { if (!node) { err = -EINVAL; @@ -265,7 +276,6 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, } node->tag = tag; } - i = array_index_nospec(up->offset + done, ctx->buf_table.nr); io_reset_rsrc_node(ctx, &ctx->buf_table, i); ctx->buf_table.nodes[i] = node; if (ctx->compat) @@ -453,6 +463,7 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) fput(io_slot_file(node)); break; case IORING_RSRC_BUFFER: + case IORING_RSRC_KBUF: if (node->buf) io_buffer_unmap(ctx, node); break; @@ -860,6 +871,92 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, return ret; } +static struct io_rsrc_node *io_buffer_alloc_node(struct io_ring_ctx *ctx, + unsigned int nr_bvecs, + unsigned int len) +{ + struct io_mapped_ubuf *imu; + struct io_rsrc_node *node; + + node = io_rsrc_node_alloc(IORING_RSRC_KBUF); + if (!node) + return NULL; + + imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL); + if (!imu) { + io_put_rsrc_node(ctx, node); + return NULL; + } + + imu->ubuf = 0; + imu->len = len; + imu->acct_pages = 0; + imu->nr_bvecs = nr_bvecs; + refcount_set(&imu->refs, 1); + + node->buf = imu; + return node; +} + +int io_buffer_register_bvec(struct io_ring_ctx *ctx, const struct request *rq, + unsigned int index) +{ + struct io_rsrc_data *data = &ctx->buf_table; + u16 nr_bvecs = blk_rq_nr_phys_segments(rq); + struct req_iterator rq_iter; + struct io_rsrc_node *node; + struct bio_vec bv; + int i = 0; + + lockdep_assert_held(&ctx->uring_lock); + + if (WARN_ON_ONCE(!data->nr)) + return -EINVAL; + if (WARN_ON_ONCE(index >= data->nr)) + return -EINVAL; + + node = data->nodes[index]; + if (WARN_ON_ONCE(node)) + return -EBUSY; + + node = io_buffer_alloc_node(ctx, nr_bvecs, blk_rq_bytes(rq)); + if (!node) + return -ENOMEM; + + rq_for_each_bvec(bv, rq, rq_iter) { + get_page(bv.bv_page); + node->buf->bvec[i].bv_page = bv.bv_page; + node->buf->bvec[i].bv_len = bv.bv_len; + node->buf->bvec[i].bv_offset = bv.bv_offset; + i++; + } + data->nodes[index] = node; + + return 0; +} +EXPORT_SYMBOL_GPL(io_buffer_register_bvec); + +void io_buffer_unregister_bvec(struct io_ring_ctx *ctx, unsigned int index) +{ + struct io_rsrc_data *data = &ctx->buf_table; + struct io_rsrc_node *node; + + lockdep_assert_held(&ctx->uring_lock); + + if (WARN_ON_ONCE(!data->nr)) + return; + if (WARN_ON_ONCE(index >= data->nr)) + return; + + node = data->nodes[index]; + if (WARN_ON_ONCE(!node || !node->buf)) + return; + if (WARN_ON_ONCE(node->type != IORING_RSRC_KBUF)) + return; + io_reset_rsrc_node(ctx, data, index); +} +EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec); + int io_import_fixed(int ddir, struct iov_iter *iter, struct io_rsrc_node *node, u64 buf_addr, size_t len) { @@ -886,8 +983,8 @@ int io_import_fixed(int ddir, struct iov_iter *iter, struct io_rsrc_node *node, /* * Don't use iov_iter_advance() here, as it's really slow for * using the latter parts of a big fixed buffer - it iterates - * over each segment manually. We can cheat a bit here, because - * we know that: + * over each segment manually. We can cheat a bit here for user + * registered nodes, because we know that: * * 1) it's a BVEC iter, we set it up * 2) all bvecs are the same in size, except potentially the @@ -901,7 +998,14 @@ int io_import_fixed(int ddir, struct iov_iter *iter, struct io_rsrc_node *node, */ const struct bio_vec *bvec = imu->bvec; - if (offset < bvec->bv_len) { + /* + * Kernel buffer bvecs, on the other hand, don't necessarily + * have the size property of user registered ones, so we have + * to use the slow iter advance. + */ + if (node->type == IORING_RSRC_KBUF) + iov_iter_advance(iter, offset); + else if (offset < bvec->bv_len) { iter->iov_offset = offset; } else { unsigned long seg_skip; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index abd0d5d42c3e1..d1d90d9cd2b43 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -13,6 +13,7 @@ enum { IORING_RSRC_FILE = 0, IORING_RSRC_BUFFER = 1, + IORING_RSRC_KBUF = 2, }; struct io_rsrc_node { -- 2.43.5