On Tue, Feb 18, 2025 at 2:42 PM Keith Busch <kbusch@xxxxxxxx> wrote:
>
> From: Keith Busch <kbusch@xxxxxxxxxx>
>
> Provide an interface for the kernel to leverage the existing
> pre-registered buffers that io_uring provides. User space can reference
> these later to achieve zero-copy IO.
>
> User space must register an empty fixed buffer table with io_uring in
> order for the kernel to make use of it.
>
> Signed-off-by: Keith Busch <kbusch@xxxxxxxxxx>
> ---
>  include/linux/io_uring.h       |   1 +
>  include/linux/io_uring_types.h |   6 ++
>  io_uring/rsrc.c                | 115 ++++++++++++++++++++++++++++++---
>  io_uring/rsrc.h                |   4 ++
>  4 files changed, 118 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
> index 85fe4e6b275c7..b5637a2aae340 100644
> --- a/include/linux/io_uring.h
> +++ b/include/linux/io_uring.h
> @@ -5,6 +5,7 @@
>  #include <linux/sched.h>
>  #include <linux/xarray.h>
>  #include <uapi/linux/io_uring.h>
> +#include <linux/blk-mq.h>

Should this be added to the header file that declares
io_buffer_register_bvec() instead?

>
>  #if defined(CONFIG_IO_URING)
>  void __io_uring_cancel(bool cancel_all);
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index 0bcaefc4ffe02..2aed51e8c79ee 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -709,4 +709,10 @@ static inline bool io_ctx_cqe32(struct io_ring_ctx *ctx)
>  	return ctx->flags & IORING_SETUP_CQE32;
>  }
>
> +int io_buffer_register_bvec(struct io_ring_ctx *ctx, struct request *rq,
> +			    void (*release)(void *), unsigned int index,
> +			    unsigned int issue_flags);
> +void io_buffer_unregister_bvec(struct io_ring_ctx *ctx, unsigned int index,
> +			       unsigned int issue_flags);

Hmm, io_uring_types.h seems like a strange place for these. The rest of
this file consists of types and inline functions. Maybe io_uring/rsrc.h
would make more sense?
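For example (sketch only, untested): with the declarations in
io_uring/rsrc.h, a forward declaration of struct request should be
enough, since only the pointer type is used here. Then the
<linux/blk-mq.h> include above could probably come out of the public
header entirely (io_uring/rsrc.c would include it directly for
blk_rq_nr_phys_segments() and friends):

	/* io_uring/rsrc.h (sketch) */
	struct request;

	int io_buffer_register_bvec(struct io_ring_ctx *ctx,
				    struct request *rq,
				    void (*release)(void *),
				    unsigned int index,
				    unsigned int issue_flags);
	void io_buffer_unregister_bvec(struct io_ring_ctx *ctx,
				       unsigned int index,
				       unsigned int issue_flags);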
> +
>  #endif
> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> index 20b884c84e55f..88bcacc77b72e 100644
> --- a/io_uring/rsrc.c
> +++ b/io_uring/rsrc.c
> @@ -103,19 +103,23 @@ int io_buffer_validate(struct iovec *iov)
>
>  static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
>  {
> -	unsigned int i;
> +	struct io_mapped_ubuf *imu = node->buf;
>
> -	if (node->buf) {
> -		struct io_mapped_ubuf *imu = node->buf;
> +	if (!refcount_dec_and_test(&imu->refs))
> +		return;
> +
> +	if (imu->release) {
> +		imu->release(imu->priv);
> +	} else {
> +		unsigned int i;
>
> -		if (!refcount_dec_and_test(&imu->refs))
> -			return;
>  		for (i = 0; i < imu->nr_bvecs; i++)
>  			unpin_user_page(imu->bvec[i].bv_page);
>  		if (imu->acct_pages)
>  			io_unaccount_mem(ctx, imu->acct_pages);
> -		kvfree(imu);
>  	}
> +
> +	kvfree(imu);
>  }
>
>  struct io_rsrc_node *io_rsrc_node_alloc(int type)
> @@ -764,6 +768,10 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
>  	imu->len = iov->iov_len;
>  	imu->nr_bvecs = nr_pages;
>  	imu->folio_shift = PAGE_SHIFT;
> +	imu->release = NULL;
> +	imu->priv = NULL;
> +	imu->readable = true;
> +	imu->writeable = true;
>  	if (coalesced)
>  		imu->folio_shift = data.folio_shift;
>  	refcount_set(&imu->refs, 1);
> @@ -860,6 +868,87 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
>  	return ret;
>  }
>
> +int io_buffer_register_bvec(struct io_ring_ctx *ctx, struct request *rq,
> +			    void (*release)(void *), unsigned int index,
> +			    unsigned int issue_flags)
> +{
> +	struct io_rsrc_data *data = &ctx->buf_table;
> +	struct req_iterator rq_iter;
> +	struct io_mapped_ubuf *imu;
> +	struct io_rsrc_node *node;
> +	struct bio_vec bv, *bvec;
> +	int ret = 0;
> +	u16 nr_bvecs;
> +
> +	io_ring_submit_lock(ctx, issue_flags);
> +
> +	if (io_rsrc_node_lookup(data, index)) {
> +		ret = -EBUSY;
> +		goto unlock;
> +	}
> +
> +	node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
> +	if (!node) {
> +		ret = -ENOMEM;
> +		goto unlock;
> +	}
> +
> +	nr_bvecs = blk_rq_nr_phys_segments(rq);
> +	imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL);
> +	if (!imu) {
> +		kfree(node);
> +		ret = -ENOMEM;
> +		goto unlock;
> +	}
> +
> +	imu->ubuf = 0;
> +	imu->len = blk_rq_bytes(rq);
> +	imu->acct_pages = 0;
> +	imu->folio_shift = PAGE_SHIFT;
> +	imu->nr_bvecs = nr_bvecs;
> +	refcount_set(&imu->refs, 1);
> +	imu->release = release;
> +	imu->priv = rq;
> +
> +	if (rq_data_dir(rq))

Check rq_data_dir(rq) == WRITE instead to avoid depending on the value
of READ/WRITE?

> +		imu->writeable = true;
> +	else
> +		imu->readable = true;
> +
> +	bvec = imu->bvec;
> +	rq_for_each_bvec(bv, rq, rq_iter)
> +		*bvec++ = bv;
> +
> +	node->buf = imu;
> +	data->nodes[index] = node;
> +unlock:
> +	io_ring_submit_unlock(ctx, issue_flags);
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(io_buffer_register_bvec);
> +
> +void io_buffer_unregister_bvec(struct io_ring_ctx *ctx, unsigned int index,
> +			       unsigned int issue_flags)
> +{
> +	struct io_rsrc_data *data = &ctx->buf_table;
> +	struct io_rsrc_node *node;
> +
> +	io_ring_submit_lock(ctx, issue_flags);
> +
> +	if (!data->nr)
> +		goto unlock;
> +
> +	node = io_rsrc_node_lookup(data, index);
> +	if (!node || !node->buf->release)
> +		goto unlock;

Still think the data->nr check is unnecessary, since
io_rsrc_node_lookup() would return NULL in that case.
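That is, the locked section could shrink to just (untested sketch,
reusing the locals above):

	io_ring_submit_lock(ctx, issue_flags);

	node = io_rsrc_node_lookup(data, index);
	if (!node || !node->buf->release)
		goto unlock;

	io_put_rsrc_node(ctx, node);
	data->nodes[index] = NULL;
unlock:
	io_ring_submit_unlock(ctx, issue_flags);

When data->nr is 0, the lookup already fails and we never touch
data->nodes[index], so behavior should be unchanged.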
> +
> +	io_put_rsrc_node(ctx, node);
> +	data->nodes[index] = NULL;
> +unlock:
> +	io_ring_submit_unlock(ctx, issue_flags);
> +}
> +EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec);
> +
>  int io_import_fixed(int ddir, struct iov_iter *iter,
>  		    struct io_mapped_ubuf *imu,
>  		    u64 buf_addr, size_t len)
> @@ -874,6 +963,9 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
>  	/* not inside the mapped region */
>  	if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
>  		return -EFAULT;
> +	if ((ddir == READ && !imu->readable) ||
> +	    (ddir == WRITE && !imu->writeable))
> +		return -EFAULT;

This could be made less branchy by storing a bitmask of allowed data
transfer directions instead of 2 bool fields. Then this could just be:

	if (!(imu->ddirs >> ddir & 1))
		return -EFAULT;

Best,
Caleb

>
>  	/*
>  	 * Might not be a start of buffer, set size appropriately
> @@ -886,8 +978,8 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
>  	/*
>  	 * Don't use iov_iter_advance() here, as it's really slow for
>  	 * using the latter parts of a big fixed buffer - it iterates
> -	 * over each segment manually. We can cheat a bit here, because
> -	 * we know that:
> +	 * over each segment manually. We can cheat a bit here for user
> +	 * registered nodes, because we know that:
>  	 *
>  	 * 1) it's a BVEC iter, we set it up
>  	 * 2) all bvecs are the same in size, except potentially the
> @@ -901,8 +993,15 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
>  	 */
>  	const struct bio_vec *bvec = imu->bvec;
>
> +	/*
> +	 * Kernel buffer bvecs, on the other hand, don't necessarily
> +	 * have the size property of user registered ones, so we have
> +	 * to use the slow iter advance.
> +	 */
>  	if (offset < bvec->bv_len) {
>  		iter->iov_offset = offset;
> +	} else if (imu->release) {
> +		iov_iter_advance(iter, offset);
>  	} else {
>  		unsigned long seg_skip;
>
> diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
> index abf86b5b86140..81c31b93d4f7b 100644
> --- a/io_uring/rsrc.h
> +++ b/io_uring/rsrc.h
> @@ -33,6 +33,10 @@ struct io_mapped_ubuf {
>  	unsigned int folio_shift;
>  	refcount_t refs;
>  	unsigned long acct_pages;
> +	void (*release)(void *);
> +	void *priv;
> +	bool readable;
> +	bool writeable;
>  	struct bio_vec bvec[] __counted_by(nr_bvecs);
>  };
>
> --
> 2.43.5
>
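P.S. To spell out the bitmask idea a bit more (untested sketch; "ddirs"
is a made-up field name): the two bools in struct io_mapped_ubuf would
become

	u8 ddirs;	/* bitmask of allowed data transfer directions */

io_buffer_register_bvec() would set

	imu->ddirs = 1U << rq_data_dir(rq);

and io_sqe_buffer_register() would use (1U << READ) | (1U << WRITE),
leaving io_import_fixed() with only the single-branch check quoted
above. This does assume rq_data_dir() and the ddir argument use the
same READ/WRITE values, so it trades one READ/WRITE value dependency
for another; a small translation helper could avoid that too.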