Add the IORING_SETUP_CQE32 flag to allow setting up a ring with big CQEs,
which are 32 bytes in size. Also modify the uring-cmd completion
infrastructure to accept an additional result and fill it up in the big
CQE.

Signed-off-by: Kanchan Joshi <joshi.k@xxxxxxxxxxx>
Signed-off-by: Anuj Gupta <anuj20.g@xxxxxxxxxxx>
---
 fs/io_uring.c                 | 82 +++++++++++++++++++++++++++++------
 include/linux/io_uring.h      | 10 +++--
 include/uapi/linux/io_uring.h | 11 +++++
 3 files changed, 87 insertions(+), 16 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index bd0e6b102a7b..b819c0ad47fc 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -211,8 +211,8 @@ struct io_mapped_ubuf {
 struct io_ring_ctx;
 
 struct io_overflow_cqe {
-	struct io_uring_cqe cqe;
 	struct list_head list;
+	struct io_uring_cqe cqe; /* this must be kept at end */
 };
 
 struct io_fixed_file {
@@ -1713,6 +1713,13 @@ static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
 		return NULL;
 
 	tail = ctx->cached_cq_tail++;
+
+	/* double index for large CQE */
+	if (ctx->flags & IORING_SETUP_CQE32) {
+		mask = 2 * ctx->cq_entries - 1;
+		tail <<= 1;
+	}
+
 	return &rings->cqes[tail & mask];
 }
 
@@ -1792,13 +1799,16 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 	while (!list_empty(&ctx->cq_overflow_list)) {
 		struct io_uring_cqe *cqe = io_get_cqe(ctx);
 		struct io_overflow_cqe *ocqe;
+		int cqeshift = 0;
 
 		if (!cqe && !force)
 			break;
+		/* copy more for big-cqe */
+		cqeshift = ctx->flags & IORING_SETUP_CQE32 ? 1 : 0;
 		ocqe = list_first_entry(&ctx->cq_overflow_list,
					struct io_overflow_cqe, list);
 		if (cqe)
-			memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
+			memcpy(cqe, &ocqe->cqe, sizeof(*cqe) << cqeshift);
 		else
 			io_account_cq_overflow(ctx);
 
@@ -1884,11 +1894,17 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
 }
 
 static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
-				     s32 res, u32 cflags)
+				     s32 res, u32 cflags, u64 res2,
+				     int bigcqe)
 {
 	struct io_overflow_cqe *ocqe;
+	int size = sizeof(*ocqe);
+
+	/* allocate more for big-cqe */
+	if (bigcqe)
+		size += sizeof(struct io_uring_cqe);
 
-	ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
+	ocqe = kmalloc(size, GFP_ATOMIC | __GFP_ACCOUNT);
 	if (!ocqe) {
 		/*
 		 * If we're in ring overflow flush mode, or in task cancel mode,
@@ -1907,6 +1923,11 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
 	ocqe->cqe.user_data = user_data;
 	ocqe->cqe.res = res;
 	ocqe->cqe.flags = cflags;
+	if (bigcqe) {
+		struct io_uring_cqe32 *bcqe = (struct io_uring_cqe32 *)&ocqe->cqe;
+
+		bcqe->res2 = res2;
+	}
 	list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
 	return true;
 }
@@ -1928,13 +1949,38 @@ static inline bool __fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
 		WRITE_ONCE(cqe->flags, cflags);
 		return true;
 	}
-	return io_cqring_event_overflow(ctx, user_data, res, cflags);
+	return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, false);
 }
 
+static inline bool __fill_big_cqe(struct io_ring_ctx *ctx, u64 user_data,
+				 s32 res, u32 cflags, u64 res2)
+{
+	struct io_uring_cqe32 *bcqe;
+
+	/*
+	 * If we can't get a cq entry, userspace overflowed the
+	 * submission (by quite a lot). Increment the overflow count in
+	 * the ring.
+	 */
+	bcqe = (struct io_uring_cqe32 *) io_get_cqe(ctx);
+	if (likely(bcqe)) {
+		WRITE_ONCE(bcqe->cqe.user_data, user_data);
+		WRITE_ONCE(bcqe->cqe.res, res);
+		WRITE_ONCE(bcqe->cqe.flags, cflags);
+		WRITE_ONCE(bcqe->res2, res2);
+		return true;
+	}
+	return io_cqring_event_overflow(ctx, user_data, res, cflags, res2,
+					true);
+}
 static inline bool __io_fill_cqe(struct io_kiocb *req, s32 res, u32 cflags)
 {
 	trace_io_uring_complete(req->ctx, req, req->user_data, res, cflags);
-	return __fill_cqe(req->ctx, req->user_data, res, cflags);
+	if (!(req->ctx->flags & IORING_SETUP_CQE32))
+		return __fill_cqe(req->ctx, req->user_data, res, cflags);
+	else
+		return __fill_big_cqe(req->ctx, req->user_data, res, cflags,
+				req->uring_cmd.res2);
 }
 
 static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
@@ -4126,10 +4172,12 @@ static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
  * Called by consumers of io_uring_cmd, if they originally returned
  * -EIOCBQUEUED upon receiving the command.
  */
-void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret)
+void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2)
 {
 	struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd);
 
+	/* store secondary result in res2 */
+	req->uring_cmd.res2 = res2;
 	if (ret < 0)
 		req_set_fail(req);
 	io_req_complete(req, ret);
@@ -4163,7 +4211,7 @@ static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
 	/* queued async, consumer will call io_uring_cmd_done() when complete */
 	if (ret == -EIOCBQUEUED)
 		return 0;
-	io_uring_cmd_done(ioucmd, ret);
+	io_uring_cmd_done(ioucmd, ret, 0);
 	return 0;
 }
 
@@ -9026,13 +9074,20 @@ static void *io_mem_alloc(size_t size)
 	return (void *) __get_free_pages(gfp_flags, get_order(size));
 }
 
-static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
-				size_t *sq_offset)
+static unsigned long rings_size(struct io_uring_params *p,
+				size_t *sq_offset)
 {
+	unsigned sq_entries, cq_entries;
 	struct io_rings *rings;
 	size_t off, sq_array_size;
 
-	off = struct_size(rings, cqes, cq_entries);
+	sq_entries = p->sq_entries;
+	cq_entries = p->cq_entries;
+
+	if (p->flags & IORING_SETUP_CQE32)
+		off = struct_size(rings, cqes, 2 * cq_entries);
+	else
+		off = struct_size(rings, cqes, cq_entries);
 	if (off == SIZE_MAX)
 		return SIZE_MAX;
 
@@ -10483,7 +10538,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
 	ctx->sq_entries = p->sq_entries;
 	ctx->cq_entries = p->cq_entries;
 
-	size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
+	size = rings_size(p, &sq_array_offset);
 	if (size == SIZE_MAX)
 		return -EOVERFLOW;
 
@@ -10713,7 +10768,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
 	if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
 			IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
 			IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
-			IORING_SETUP_R_DISABLED | IORING_SETUP_SQE128))
+			IORING_SETUP_R_DISABLED | IORING_SETUP_SQE128 |
+			IORING_SETUP_CQE32))
 		return -EINVAL;
 
 	return io_uring_create(entries, &p, params);
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index cedc68201469..0aba7b50cde6 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -14,7 +14,10 @@ enum io_uring_cmd_flags {
 
 struct io_uring_cmd {
 	struct file	*file;
-	void		*cmd;
+	union {
+		void	*cmd;	/* used on submission */
+		u64	res2;	/* used on completion */
+	};
 	/* for irq-completion - if driver requires doing stuff in task-context*/
 	void (*driver_cb)(struct io_uring_cmd *cmd);
 	u32		flags;
@@ -25,7 +28,7 @@
 };
 
 #if defined(CONFIG_IO_URING)
-void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret);
+void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2);
 void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
 			void (*driver_cb)(struct io_uring_cmd *));
 struct sock *io_uring_get_socket(struct file *file);
@@ -48,7 +51,8 @@ static inline void io_uring_free(struct task_struct *tsk)
 		__io_uring_free(tsk);
 }
 #else
-static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret)
+static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret,
+				ssize_t ret2)
 {
 }
 static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index d7a4bdb9bf3b..85b8ff046496 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -113,6 +113,7 @@ enum {
 #define IORING_SETUP_ATTACH_WQ	(1U << 5)	/* attach to existing wq */
 #define IORING_SETUP_R_DISABLED	(1U << 6)	/* start with ring disabled */
 #define IORING_SETUP_SQE128	(1U << 7)	/* SQEs are 128b */
+#define IORING_SETUP_CQE32	(1U << 8)	/* CQEs are 32b */
 
 enum {
 	IORING_OP_NOP,
@@ -207,6 +208,16 @@ struct io_uring_cqe {
 	__u32	flags;
 };
 
+/*
+ * If the ring is initialized with IORING_SETUP_CQE32, we set up large CQEs.
+ * A large CQE is created by combining two adjacent regular CQEs.
+ */
+struct io_uring_cqe32 {
+	struct io_uring_cqe	cqe;
+	__u64			res2;
+	__u64			unused;
+};
+
 /*
  * cqe->flags
  *
-- 
2.25.1
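
For anyone looking at the userspace side: below is a minimal sketch, not
part of this patch and not liburing code, of how a consumer of a ring
created with IORING_SETUP_CQE32 might reap a big CQE and pick up the
secondary result from res2. The struct cq_view fields and the
peek_big_cqe() helper are hypothetical stand-ins for the usual
IORING_OFF_CQ_RING mmap bookkeeping; only struct io_uring_cqe32 and the
doubled slot indexing mirror what io_get_cqe() does in this patch.

#include <linux/io_uring.h>
#include <stdatomic.h>
#include <stddef.h>

/* Hypothetical view of the mmap'ed CQ ring; field names are illustrative. */
struct cq_view {
	_Atomic unsigned	*khead;		/* shared CQ head, advanced by userspace */
	_Atomic unsigned	*ktail;		/* shared CQ tail, advanced by the kernel */
	unsigned		ring_mask;	/* cq_entries - 1 */
	struct io_uring_cqe	*cqes;		/* holds 2 * cq_entries slots under CQE32 */
};

/* Return the next 32-byte CQE, or NULL if the CQ is currently empty. */
static struct io_uring_cqe32 *peek_big_cqe(struct cq_view *cq)
{
	unsigned head = atomic_load_explicit(cq->khead, memory_order_relaxed);
	unsigned tail = atomic_load_explicit(cq->ktail, memory_order_acquire);

	if (head == tail)
		return NULL;
	/* double the slot index, mirroring io_get_cqe() above */
	return (struct io_uring_cqe32 *)&cq->cqes[(head & cq->ring_mask) << 1];
}

After reading bcqe->cqe.res and bcqe->res2, the reader would publish
head + 1 back to *khead with release semantics, exactly as it does for
regular CQEs; only the array indexing changes.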