Allow the user to pre-register a region for passing various parameters.
To use it for passing wait loop arguments, which is wired up in the
following commit, the region has to be registered with the
IORING_PARAM_REGION_WAIT_ARG flag set. The flag also requires the
context to be currently disabled, i.e. IORING_SETUP_R_DISABLED, to
avoid races with otherwise potentially running waiters.

This will also be useful in the future for various request / SQE
arguments like iovecs, the meta read/write API, and also for BPF.

Signed-off-by: Pavel Begunkov <asml.silence@xxxxxxxxx>
---
 include/linux/io_uring_types.h |  6 ++++
 include/uapi/linux/io_uring.h  | 13 ++++++++
 io_uring/io_uring.c            |  1 +
 io_uring/register.c            | 59 ++++++++++++++++++++++++++++++++++
 4 files changed, 79 insertions(+)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 1d3a37234ace..aa5f5ea98076 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -324,6 +324,9 @@ struct io_ring_ctx {
 	unsigned		cq_entries;
 	struct io_ev_fd	__rcu	*io_ev_fd;
 	unsigned		cq_extra;
+
+	void			*cq_wait_arg;
+	size_t			cq_wait_size;
 } ____cacheline_aligned_in_smp;
 
 	/*
@@ -429,6 +432,9 @@ struct io_ring_ctx {
 	unsigned short		n_sqe_pages;
 	struct page		**ring_pages;
 	struct page		**sqe_pages;
+
+	/* used for optimised request parameter and wait argument passing */
+	struct io_mapped_region	param_region;
 };
 
 struct io_tw_state {
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 7ceeccbbf4cb..49b94029c137 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -627,6 +627,8 @@ enum io_uring_register_op {
 	/* resize CQ ring */
 	IORING_REGISTER_RESIZE_RINGS		= 33,
 
+	IORING_REGISTER_PARAM_REGION		= 34,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,
 
@@ -660,6 +662,17 @@ struct io_uring_region_desc {
 	__u64 __resv[4];
 };
 
+enum {
+	/* expose the region as registered wait arguments */
+	IORING_PARAM_REGION_WAIT_ARG		= 1,
+};
+
+struct io_uring_param_region_reg {
+	__u64 region_uptr; /* struct io_uring_region_desc * */
+	__u64 flags;
+	__u64 __resv[2];
+};
+
 /*
  * Register a fully sparse file space, rather than pass in an array of all
  * -1 file descriptors.
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 286b7bb73978..c640b8a4ceee 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2709,6 +2709,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
 	io_futex_cache_free(ctx);
 	io_destroy_buffers(ctx);
+	io_free_region(ctx, &ctx->param_region);
 	mutex_unlock(&ctx->uring_lock);
 	if (ctx->sq_creds)
 		put_cred(ctx->sq_creds);
diff --git a/io_uring/register.c b/io_uring/register.c
index 3c5a3cfb186b..d1ba14da37ea 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -570,6 +570,59 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
 	return ret;
 }
 
+/*
+ * Register a page holding N entries of struct io_uring_reg_wait, which can
+ * be used via io_uring_enter(2) if IORING_GETEVENTS_EXT_ARG_REG is set.
+ * If that is set with IORING_GETEVENTS_EXT_ARG, then instead of passing
+ * in a pointer for a struct io_uring_getevents_arg, an index into this
+ * registered array is passed, avoiding two (arg + timeout) copies per
+ * invocation.
+ */
+static int io_register_mapped_heap(struct io_ring_ctx *ctx, void __user *uarg)
+{
+	struct io_uring_param_region_reg __user *reg_uptr = uarg;
+	struct io_uring_param_region_reg reg;
+	struct io_uring_region_desc __user *rd_uptr;
+	struct io_uring_region_desc rd;
+	int ret;
+
+	if (io_region_is_set(&ctx->param_region))
+		return -EBUSY;
+	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
+		return -EFAULT;
+	rd_uptr = u64_to_user_ptr(reg.region_uptr);
+	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
+		return -EFAULT;
+
+	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
+		return -EINVAL;
+	if (reg.flags != IORING_PARAM_REGION_WAIT_ARG)
+		return -EINVAL;
+
+	/*
+	 * This ensures there are no waiters. Waiters are unlocked and it's
+	 * hard to synchronise with them, especially if we need to initialise
+	 * the region.
+	 */
+	if ((reg.flags & IORING_PARAM_REGION_WAIT_ARG) &&
+	    !(ctx->flags & IORING_SETUP_R_DISABLED))
+		return -EINVAL;
+
+	ret = io_create_region(ctx, &ctx->param_region, &rd);
+	if (ret)
+		return ret;
+	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
+		io_free_region(ctx, &ctx->param_region);
+		return -EFAULT;
+	}
+
+	if (reg.flags & IORING_PARAM_REGION_WAIT_ARG) {
+		ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region);
+		ctx->cq_wait_size = rd.size;
+	}
+	return 0;
+}
+
 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			       void __user *arg, unsigned nr_args)
 	__releases(ctx->uring_lock)
@@ -764,6 +817,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_resize_rings(ctx, arg);
 		break;
+	case IORING_REGISTER_PARAM_REGION:
+		ret = -EINVAL;
+		if (!arg || nr_args != 1)
+			break;
+		ret = io_register_mapped_heap(ctx, arg);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
-- 
2.46.0
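
For reference, below is a minimal userspace sketch (not part of the patch) of
driving the new IORING_REGISTER_PARAM_REGION opcode on a ring created with
IORING_SETUP_R_DISABLED, before enabling it via IORING_REGISTER_ENABLE_RINGS.
It assumes the user_addr/size/flags fields and the IORING_MEM_REGION_TYPE_USER
flag of struct io_uring_region_desc as introduced earlier in the series (they
are not visible in this hunk), and register_wait_arg_region() is only an
illustrative helper name.

/*
 * Sketch: register a user-provided buffer as the wait argument region.
 * Must run while the ring is still IORING_SETUP_R_DISABLED, matching the
 * check in io_register_mapped_heap().
 */
#include <linux/io_uring.h>
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int register_wait_arg_region(int ring_fd, void *mem, size_t size)
{
	struct io_uring_region_desc rd;
	struct io_uring_param_region_reg reg;

	memset(&rd, 0, sizeof(rd));
	rd.user_addr = (uint64_t)(uintptr_t)mem;	/* assumed field name */
	rd.size = size;					/* assumed field name */
	rd.flags = IORING_MEM_REGION_TYPE_USER;		/* assumed flag name */

	memset(&reg, 0, sizeof(reg));
	reg.region_uptr = (uint64_t)(uintptr_t)&rd;
	reg.flags = IORING_PARAM_REGION_WAIT_ARG;

	/* nr_args must be 1, matching the check in __io_uring_register() */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_PARAM_REGION, &reg, 1);
}

Passing region_uptr as a pointer to the descriptor, rather than embedding the
descriptor in the registration struct, lets the kernel report updated
descriptor fields back to userspace, which is why the patch does a
copy_to_user() of the descriptor after io_create_region() succeeds.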