From: Keith Busch <kbusch@xxxxxxxxxx> Provide an interface for the kernel to leverage the existing pre-registered buffers that io_uring provides. User space can reference these later to achieve zero-copy IO. User space must register an empty fixed buffer table with io_uring in order for the kernel to make use of it. Signed-off-by: Keith Busch <kbusch@xxxxxxxxxx> --- include/linux/io_uring/cmd.h | 7 ++ io_uring/io_uring.c | 3 + io_uring/rsrc.c | 123 +++++++++++++++++++++++++++++++++-- io_uring/rsrc.h | 9 +++ io_uring/rw.c | 3 + 5 files changed, 138 insertions(+), 7 deletions(-) diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 87150dc0a07cf..cf8d80d847344 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -4,6 +4,7 @@ #include <uapi/linux/io_uring.h> #include <linux/io_uring_types.h> +#include <linux/blk-mq.h> /* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */ #define IORING_URING_CMD_CANCELABLE (1U << 30) @@ -125,4 +126,10 @@ static inline struct io_uring_cmd_data *io_uring_cmd_get_async_data(struct io_ur return cmd_to_io_kiocb(cmd)->async_data; } +int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, + void (*release)(void *), unsigned int index, + unsigned int issue_flags); +void io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, + unsigned int issue_flags); + #endif /* _LINUX_IO_URING_CMD_H */ diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index db1c0792def63..2f5dd47e7dbf5 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3947,6 +3947,9 @@ static int __init io_uring_init(void) io_uring_optable_init(); + /* imu->dir is u8 */ + BUILD_BUG_ON((IO_IMU_DEST | IO_IMU_SOURCE) > U8_MAX); + /* * Allow user copy in the per-command field, which starts after the * file in io_kiocb and until the opcode field. The openat2 handling diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index f814526982c36..0eceaf2e03777 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -9,6 +9,7 @@ #include <linux/hugetlb.h> #include <linux/compat.h> #include <linux/io_uring.h> +#include <linux/io_uring/cmd.h> #include <uapi/linux/io_uring.h> @@ -101,17 +102,23 @@ int io_buffer_validate(struct iovec *iov) return 0; } -static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node) +static void io_release_ubuf(void *priv) { - struct io_mapped_ubuf *imu = node->buf; + struct io_mapped_ubuf *imu = priv; unsigned int i; - if (!refcount_dec_and_test(&imu->refs)) - return; for (i = 0; i < imu->nr_bvecs; i++) unpin_user_page(imu->bvec[i].bv_page); +} + +static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) +{ + if (!refcount_dec_and_test(&imu->refs)) + return; + if (imu->acct_pages) io_unaccount_mem(ctx, imu->acct_pages); + imu->release(imu->priv); kvfree(imu); } @@ -451,7 +458,7 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) break; case IORING_RSRC_BUFFER: if (node->buf) - io_buffer_unmap(ctx, node); + io_buffer_unmap(ctx, node->buf); break; default: WARN_ON_ONCE(1); @@ -761,6 +768,10 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, imu->len = iov->iov_len; imu->nr_bvecs = nr_pages; imu->folio_shift = PAGE_SHIFT; + imu->release = io_release_ubuf; + imu->priv = imu; + imu->is_kbuf = false; + imu->dir = IO_IMU_DEST | IO_IMU_SOURCE; if (coalesced) imu->folio_shift = data.folio_shift; refcount_set(&imu->refs, 1); @@ -857,6 +868,95 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, return ret; } +int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, + void (*release)(void *), unsigned int index, + unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; + struct io_rsrc_data *data = &ctx->buf_table; + struct req_iterator rq_iter; + struct io_mapped_ubuf *imu; + struct io_rsrc_node *node; + struct bio_vec bv, *bvec; + u16 nr_bvecs; + int ret = 0; + + io_ring_submit_lock(ctx, issue_flags); + if (index >= data->nr) { + ret = -EINVAL; + goto unlock; + } + index = array_index_nospec(index, data->nr); + + if (data->nodes[index]) { + ret = -EBUSY; + goto unlock; + } + + node = io_rsrc_node_alloc(IORING_RSRC_BUFFER); + if (!node) { + ret = -ENOMEM; + goto unlock; + } + + nr_bvecs = blk_rq_nr_phys_segments(rq); + imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL); + if (!imu) { + kfree(node); + ret = -ENOMEM; + goto unlock; + } + + imu->ubuf = 0; + imu->len = blk_rq_bytes(rq); + imu->acct_pages = 0; + imu->folio_shift = PAGE_SHIFT; + imu->nr_bvecs = nr_bvecs; + refcount_set(&imu->refs, 1); + imu->release = release; + imu->priv = rq; + imu->is_kbuf = true; + + if (op_is_write(req_op(rq))) + imu->dir = IO_IMU_SOURCE; + else + imu->dir = IO_IMU_DEST; + + bvec = imu->bvec; + rq_for_each_bvec(bv, rq, rq_iter) + *bvec++ = bv; + + node->buf = imu; + data->nodes[index] = node; +unlock: + io_ring_submit_unlock(ctx, issue_flags); + return ret; +} +EXPORT_SYMBOL_GPL(io_buffer_register_bvec); + +void io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, + unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; + struct io_rsrc_data *data = &ctx->buf_table; + struct io_rsrc_node *node; + + io_ring_submit_lock(ctx, issue_flags); + if (index >= data->nr) + goto unlock; + index = array_index_nospec(index, data->nr); + + node = data->nodes[index]; + if (!node || !node->buf->is_kbuf) + goto unlock; + + io_put_rsrc_node(ctx, node); + data->nodes[index] = NULL; +unlock: + io_ring_submit_unlock(ctx, issue_flags); +} +EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec); + static int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu, u64 buf_addr, size_t len) @@ -871,6 +971,8 @@ static int io_import_fixed(int ddir, struct iov_iter *iter, /* not inside the mapped region */ if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len))) return -EFAULT; + if (!(imu->dir & (1 << ddir))) + return -EFAULT; /* * Might not be a start of buffer, set size appropriately @@ -883,8 +985,8 @@ static int io_import_fixed(int ddir, struct iov_iter *iter, /* * Don't use iov_iter_advance() here, as it's really slow for * using the latter parts of a big fixed buffer - it iterates - * over each segment manually. We can cheat a bit here, because - * we know that: + * over each segment manually. We can cheat a bit here for user + * registered nodes, because we know that: * * 1) it's a BVEC iter, we set it up * 2) all bvecs are the same in size, except potentially the @@ -898,8 +1000,15 @@ static int io_import_fixed(int ddir, struct iov_iter *iter, */ const struct bio_vec *bvec = imu->bvec; + /* + * Kernel buffer bvecs, on the other hand, don't necessarily + * have the size property of user registered ones, so we have + * to use the slow iter advance. + */ if (offset < bvec->bv_len) { iter->iov_offset = offset; + } else if (imu->is_kbuf) { + iov_iter_advance(iter, offset); } else { unsigned long seg_skip; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index f0e9080599646..7600e2736eeb3 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -20,6 +20,11 @@ struct io_rsrc_node { }; }; +enum { + IO_IMU_DEST = 1 << ITER_DEST, + IO_IMU_SOURCE = 1 << ITER_SOURCE, +}; + struct io_mapped_ubuf { u64 ubuf; unsigned int len; @@ -27,6 +32,10 @@ struct io_mapped_ubuf { unsigned int folio_shift; refcount_t refs; unsigned long acct_pages; + void (*release)(void *); + void *priv; + bool is_kbuf; + u8 dir; struct bio_vec bvec[] __counted_by(nr_bvecs); }; diff --git a/io_uring/rw.c b/io_uring/rw.c index 7bc23802a388e..5ee9f8949e8ba 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -629,6 +629,7 @@ static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) */ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) { + struct io_kiocb *req = cmd_to_io_kiocb(rw); struct kiocb *kiocb = &rw->kiocb; struct file *file = kiocb->ki_filp; ssize_t ret = 0; @@ -644,6 +645,8 @@ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) if ((kiocb->ki_flags & IOCB_NOWAIT) && !(kiocb->ki_filp->f_flags & O_NONBLOCK)) return -EAGAIN; + if ((req->flags & REQ_F_BUF_NODE) && req->buf_node->buf->is_kbuf) + return -EFAULT; ppos = io_kiocb_ppos(kiocb); -- 2.43.5