There are lots of parameters we might want to additionally pass to a request, but SQE has limited space and it may require additional parsing and checking in the hot path. Then requests take an index specifying which parameter set to use. The benefit for the kernel is that we can put any number of arguments in there and then do pre-processing at the initialisation time like renumbering flags and enabling static keys for performance deprecated features. The obvious downside is that the user can't use the entire parameter space as there could only be a limited number of sets. The main target here is tuning the waiting loop with finer grained control when we should wake the task and return to the user. The current implementation is crude, it needs a SETUP flag disabling creds/personalities, and is limited to one registration of maximum 16 sets. It could be made to co-exist with creds and be a bit more flexibly registered and expanded. Signed-off-by: Pavel Begunkov <asml.silence@xxxxxxxxx> --- include/linux/io_uring_types.h | 8 ++++++ include/uapi/linux/io_uring.h | 9 ++++++ io_uring/io_uring.c | 36 ++++++++++++++++-------- io_uring/msg_ring.c | 1 + io_uring/net.c | 1 + io_uring/register.c | 51 ++++++++++++++++++++++++++++++++++ 6 files changed, 94 insertions(+), 12 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index ad5001102c86..79f38c07642d 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -75,6 +75,10 @@ struct io_hash_table { unsigned hash_bits; }; +struct io_set { + u32 flags; +}; + /* * Arbitrary limit, can be raised if need be */ @@ -268,6 +272,9 @@ struct io_ring_ctx { unsigned cached_sq_head; unsigned sq_entries; + struct io_set iosets[16]; + unsigned int nr_iosets; + /* * Fixed resources fast path, should be accessed only under * uring_lock, and updated through io_uring_register(2) @@ -635,6 +642,7 @@ struct io_kiocb { struct io_ring_ctx *ctx; struct io_uring_task *tctx; + struct 
io_set *ioset; union { /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index ba373deb8406..6a432383e7c3 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -158,6 +158,8 @@ enum io_uring_sqe_flags_bit { #define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */ #define IORING_SETUP_R_DISABLED (1U << 6) /* start with ring disabled */ #define IORING_SETUP_SUBMIT_ALL (1U << 7) /* continue submit on error */ +#define IORING_SETUP_IOSET (1U << 8) + /* * Cooperative task running. When requests complete, they often require * forcing the submitter to transition to the kernel to complete. If this @@ -634,6 +636,8 @@ enum io_uring_register_op { /* register fixed io_uring_reg_wait arguments */ IORING_REGISTER_CQWAIT_REG = 34, + IORING_REGISTER_IOSETS = 35, + /* this goes last */ IORING_REGISTER_LAST, @@ -895,6 +899,11 @@ struct io_uring_recvmsg_out { __u32 flags; }; +struct io_uring_ioset_reg { + __u64 flags; + __u64 __resv[3]; +}; + /* * Argument for IORING_OP_URING_CMD when file is a socket */ diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index f34fa1ead2cf..cf688a9ff737 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2156,6 +2156,7 @@ static void io_init_req_drain(struct io_kiocb *req) static __cold int io_init_fail_req(struct io_kiocb *req, int err) { + req->ioset = &req->ctx->iosets[0]; /* ensure per-opcode data is cleared if we fail before prep */ memset(&req->cmd.data, 0, sizeof(req->cmd.data)); return err; @@ -2238,19 +2239,27 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, } personality = READ_ONCE(sqe->personality); - if (personality) { - int ret; - - req->creds = xa_load(&ctx->personalities, personality); - if (!req->creds) + if (ctx->flags & IORING_SETUP_IOSET) { + if (unlikely(personality >= ctx->nr_iosets)) return io_init_fail_req(req, -EINVAL); - get_cred(req->creds); - ret = 
security_uring_override_creds(req->creds); - if (ret) { - put_cred(req->creds); - return io_init_fail_req(req, ret); + personality = array_index_nospec(personality, ctx->nr_iosets); + req->ioset = &ctx->iosets[personality]; + } else { + if (personality) { + int ret; + + req->creds = xa_load(&ctx->personalities, personality); + if (!req->creds) + return io_init_fail_req(req, -EINVAL); + get_cred(req->creds); + ret = security_uring_override_creds(req->creds); + if (ret) { + put_cred(req->creds); + return io_init_fail_req(req, ret); + } + req->flags |= REQ_F_CREDS; } - req->flags |= REQ_F_CREDS; + req->ioset = &ctx->iosets[0]; } return def->prep(req, sqe); @@ -3909,6 +3918,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, if (!ctx) return -ENOMEM; + ctx->nr_iosets = 0; + ctx->clockid = CLOCK_MONOTONIC; ctx->clock_offset = 0; @@ -4076,7 +4087,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN | IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY | - IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL)) + IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL | + IORING_SETUP_IOSET)) return -EINVAL; return io_uring_create(entries, &p, params); diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index e63af34004b7..f5a747aa255c 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -98,6 +98,7 @@ static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, io_req_set_res(req, res, cflags); percpu_ref_get(&ctx->refs); req->ctx = ctx; + req->ioset = &ctx->iosets[0]; req->io_task_work.func = io_msg_tw_complete; io_req_task_work_add_remote(req, ctx, IOU_F_TWQ_LAZY_WAKE); return 0; diff --git a/io_uring/net.c b/io_uring/net.c index 2ccc2b409431..785987bf9e6a 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1242,6 +1242,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct 
io_uring_sqe *sqe)
 	notif = zc->notif = io_alloc_notif(ctx);
 	if (!notif)
 		return -ENOMEM;
+	notif->ioset = req->ioset;
 	notif->cqe.user_data = req->cqe.user_data;
 	notif->cqe.res = 0;
 	notif->cqe.flags = IORING_CQE_F_NOTIF;
diff --git a/io_uring/register.c b/io_uring/register.c
index 45edfc57963a..e7571dc46da5 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -86,6 +86,48 @@ int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
 	return -EINVAL;
 }
 
+static int io_update_ioset(struct io_ring_ctx *ctx,
+			   const struct io_uring_ioset_reg *reg,
+			   struct io_set *set)
+{
+	if (!(ctx->flags & IORING_SETUP_IOSET))
+		return -EINVAL;
+	if (reg->flags)
+		return -EINVAL;
+	if (reg->__resv[0] || reg->__resv[1] || reg->__resv[2])
+		return -EINVAL;
+
+	set->flags = reg->flags;
+	return 0;
+}
+
+static int io_register_iosets(struct io_ring_ctx *ctx,
+			      void __user *arg, unsigned int nr_args)
+{
+	struct io_uring_ioset_reg __user *uptr = arg;
+	struct io_uring_ioset_reg reg[16];
+	int i, ret;
+
+	/* TODO: one time setup, max 16 entries, should be made more dynamic */
+	if (ctx->nr_iosets)
+		return -EINVAL;
+	if (nr_args > ARRAY_SIZE(ctx->iosets))
+		return -EINVAL;
+
+	if (copy_from_user(reg, uptr, sizeof(reg[0]) * nr_args))
+		return -EFAULT;
+
+	for (i = 0; i < nr_args; i++) {
+		ret = io_update_ioset(ctx, &reg[i], &ctx->iosets[i]);
+		if (ret) {
+			memset(&ctx->iosets[0], 0, sizeof(ctx->iosets[0]));
+			return ret;
+		}
+	}
+
+	ctx->nr_iosets = nr_args;
+	return 0;
+}
 
 static int io_register_personality(struct io_ring_ctx *ctx)
 {
@@ -93,6 +135,9 @@ static int io_register_personality(struct io_ring_ctx *ctx)
 	u32 id;
 	int ret;
 
+	if (ctx->flags & IORING_SETUP_IOSET)
+		return -EINVAL;
+
 	creds = get_current_cred();
 
 	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
@@ -846,6 +891,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_cqwait_reg(ctx, arg);
 		break;
+	case IORING_REGISTER_IOSETS:
+		ret = -EINVAL;
+		if 
(!arg) + break; + ret = io_register_iosets(ctx, arg, nr_args); + break; default: ret = -EINVAL; break; -- 2.46.0