IORING_REGISTER_FILES is a good feature to reduce fget/fput overhead for each IO we do on file, but still left one, which is io_uring_enter(2). In io_uring_enter(2), it still fget/fput io_ring fd. I have observed this overhead in some our internal oroutine implementations based on io_uring with low submit batch. To totally remove fget/fput overhead in io_uring, we may add a small struct file cache in io_uring_task and add a new IORING_ENTER_FIXED_FILE flag. Currently the capacity of this file cache is 16, wihcih I think it maybe enough, also not that this cache is per-thread. Signed-off-by: Xiaoguang Wang <xiaoguang.wang@xxxxxxxxxxxxxxxxx> --- fs/io_uring.c | 127 +++++++++++++++++++++++++++++++++++++----- include/linux/io_uring.h | 5 +- include/uapi/linux/io_uring.h | 9 +++ 3 files changed, 126 insertions(+), 15 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 77b9c7e4793b..e1d4b537cb60 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -461,6 +461,8 @@ struct io_ring_ctx { }; }; +#define IO_RINGFD_REG_MAX 16 + struct io_uring_task { /* submission side */ int cached_refs; @@ -476,6 +478,7 @@ struct io_uring_task { struct io_wq_work_list task_list; struct io_wq_work_list prior_task_list; struct callback_head task_work; + struct file **registered_files; bool task_running; }; @@ -8739,8 +8742,16 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task, if (unlikely(!tctx)) return -ENOMEM; + tctx->registered_files = kzalloc(sizeof(struct file *) * IO_RINGFD_REG_MAX, + GFP_KERNEL); + if (unlikely(!tctx->registered_files)) { + kfree(tctx); + return -ENOMEM; + } + ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL); if (unlikely(ret)) { + kfree(tctx->registered_files); kfree(tctx); return ret; } @@ -8749,6 +8760,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task, if (IS_ERR(tctx->io_wq)) { ret = PTR_ERR(tctx->io_wq); percpu_counter_destroy(&tctx->inflight); + kfree(tctx->registered_files); kfree(tctx); return ret; } @@ -9382,6 +9394,56 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx) return -ENXIO; } +static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool locked); + +static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_uring_fd_reg reg; + struct io_uring_task *tctx; + struct file *file; + int ret; + + if (copy_from_user(®, arg, sizeof(struct io_uring_fd_reg))) + return -EFAULT; + if (reg.offset > IO_RINGFD_REG_MAX) + return -EINVAL; + + ret = io_uring_add_tctx_node(ctx, true); + if (unlikely(ret)) + return ret; + + tctx = current->io_uring; + if (tctx->registered_files[reg.offset]) + return -EBUSY; + file = fget(reg.fd); + if (unlikely(!file)) + return -EBADF; + tctx->registered_files[reg.offset] = file; + return 0; +} + +static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_uring_task *tctx; + __u32 offset; + + if (!current->io_uring) + return 0; + + if (copy_from_user(&offset, arg, sizeof(__u32))) + return -EFAULT; + if (offset > IO_RINGFD_REG_MAX) + return -EINVAL; + + tctx = current->io_uring; + if (tctx->registered_files[offset]) { + fput(tctx->registered_files[offset]); + tctx->registered_files[offset] = NULL; + } + + return 0; +} + static void io_destroy_buffers(struct io_ring_ctx *ctx) { struct io_buffer *buf; @@ -9790,7 +9852,7 @@ static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, } } -static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) +static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool locked) { struct io_uring_task *tctx = current->io_uring; struct io_tctx_node *node; @@ -9825,9 +9887,11 @@ static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) return ret; } - mutex_lock(&ctx->uring_lock); + if (!locked) + mutex_lock(&ctx->uring_lock); list_add(&node->ctx_node, &ctx->tctx_list); - mutex_unlock(&ctx->uring_lock); + if (!locked) + mutex_unlock(&ctx->uring_lock); } tctx->last = ctx; return 0; @@ -9836,13 +9900,13 @@ static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) /* * Note that this task has used io_uring. We use it for cancelation purposes. */ -static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx) +static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool locked) { struct io_uring_task *tctx = current->io_uring; if (likely(tctx && tctx->last == ctx)) return 0; - return __io_uring_add_tctx_node(ctx); + return __io_uring_add_tctx_node(ctx, locked); } /* @@ -9973,6 +10037,16 @@ void __io_uring_cancel(bool cancel_all) io_uring_cancel_generic(cancel_all, NULL); } +void io_uring_unreg_ringfd(struct io_uring_task *tctx) +{ + int i; + + for (i = 0; i < IO_RINGFD_REG_MAX; i++) { + if (tctx->registered_files[i]) + fput(tctx->registered_files[i]); + } +} + static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, size_t sz) { @@ -10098,24 +10172,36 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, struct io_ring_ctx *ctx; int submitted = 0; struct fd f; + struct file *file; long ret; io_run_task_work(); if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | - IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG))) + IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG | + IORING_ENTER_FIXED_FILE))) return -EINVAL; - f = fdget(fd); - if (unlikely(!f.file)) - return -EBADF; + if (flags & IORING_ENTER_FIXED_FILE) { + if (fd > IO_RINGFD_REG_MAX || !current->io_uring) + return -EINVAL; + + file = current->io_uring->registered_files[fd]; + if (unlikely(!file)) + return -EBADF; + } else { + f = fdget(fd); + if (unlikely(!f.file)) + return -EBADF; + file = f.file; + } ret = -EOPNOTSUPP; - if (unlikely(f.file->f_op != &io_uring_fops)) + if (unlikely(file->f_op != &io_uring_fops)) goto out_fput; ret = -ENXIO; - ctx = f.file->private_data; + ctx = file->private_data; if (unlikely(!percpu_ref_tryget(&ctx->refs))) goto out_fput; @@ -10145,7 +10231,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, } submitted = to_submit; } else if (to_submit) { - ret = io_uring_add_tctx_node(ctx); + ret = io_uring_add_tctx_node(ctx, false); if (unlikely(ret)) goto out; mutex_lock(&ctx->uring_lock); @@ -10182,7 +10268,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, out: percpu_ref_put(&ctx->refs); out_fput: - fdput(f); + if (!(flags & IORING_ENTER_FIXED_FILE)) + fdput(f); return submitted ? submitted : ret; } @@ -10413,7 +10500,7 @@ static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file) if (fd < 0) return fd; - ret = io_uring_add_tctx_node(ctx); + ret = io_uring_add_tctx_node(ctx, false); if (ret) { put_unused_fd(fd); return ret; @@ -11134,6 +11221,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_register_iowq_max_workers(ctx, arg); break; + case IORING_REGISTER_IORINGFD: + ret = -EINVAL; + if (nr_args != 1) + break; + ret = io_ringfd_register(ctx, arg); + break; + case IORING_UNREGISTER_IORINGFD: + ret = -EINVAL; + if (nr_args != 1) + break; + ret = io_ringfd_unregister(ctx, arg); + break; default: ret = -EINVAL; break; diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 649a4d7c241b..5ddea83912c7 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -9,11 +9,14 @@ struct sock *io_uring_get_socket(struct file *file); void __io_uring_cancel(bool cancel_all); void __io_uring_free(struct task_struct *tsk); +void io_uring_unreg_ringfd(struct io_uring_task *tctx); static inline void io_uring_files_cancel(void) { - if (current->io_uring) + if (current->io_uring) { + io_uring_unreg_ringfd(current->io_uring); __io_uring_cancel(false); + } } static inline void io_uring_task_cancel(void) { diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 787f491f0d2a..f076a203317a 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -261,6 +261,7 @@ struct io_cqring_offsets { #define IORING_ENTER_SQ_WAKEUP (1U << 1) #define IORING_ENTER_SQ_WAIT (1U << 2) #define IORING_ENTER_EXT_ARG (1U << 3) +#define IORING_ENTER_FIXED_FILE (1U << 4) /* * Passed in for io_uring_setup(2). Copied back with updated info on success @@ -325,6 +326,9 @@ enum { /* set/get max number of io-wq workers */ IORING_REGISTER_IOWQ_MAX_WORKERS = 19, + IORING_REGISTER_IORINGFD = 20, + IORING_UNREGISTER_IORINGFD = 21, + /* this goes last */ IORING_REGISTER_LAST }; @@ -422,4 +426,9 @@ struct io_uring_getevents_arg { __u64 ts; }; +struct io_uring_fd_reg { + __u32 offset; + __s32 fd; +}; + #endif -- 2.14.4.44.g2045bb6