Re: [PATCH] io_uring: add io_uring_enter(2) fixed file support

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




On 3/3/22 13:28, Xiaoguang Wang wrote:
IORING_REGISTER_FILES is a good feature to reduce fget/fput overhead for
each IO we do on file, but still left one, which is io_uring_enter(2).
In io_uring_enter(2), it still fget/fput io_ring fd. I have observed
this overhead in some our internal oroutine implementations based on
io_uring with low submit batch. To totally remove fget/fput overhead in
io_uring, we may add a small struct file cache in io_uring_task and add
a new IORING_ENTER_FIXED_FILE flag. Currently the capacity of this file
cache is 16, wihcih I think it maybe enough, also not that this cache is
per-thread.

Signed-off-by: Xiaoguang Wang <xiaoguang.wang@xxxxxxxxxxxxxxxxx>
---
  fs/io_uring.c                 | 127 +++++++++++++++++++++++++++++++++++++-----
  include/linux/io_uring.h      |   5 +-
  include/uapi/linux/io_uring.h |   9 +++
  3 files changed, 126 insertions(+), 15 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 77b9c7e4793b..e1d4b537cb60 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -461,6 +461,8 @@ struct io_ring_ctx {
  	};
  };
+#define IO_RINGFD_REG_MAX 16
+
  struct io_uring_task {
  	/* submission side */
  	int			cached_refs;
@@ -476,6 +478,7 @@ struct io_uring_task {
  	struct io_wq_work_list	task_list;
  	struct io_wq_work_list	prior_task_list;
  	struct callback_head	task_work;
+	struct file		**registered_files;
  	bool			task_running;
  };
@@ -8739,8 +8742,16 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
  	if (unlikely(!tctx))
  		return -ENOMEM;
+ tctx->registered_files = kzalloc(sizeof(struct file *) * IO_RINGFD_REG_MAX,
+					 GFP_KERNEL);
+	if (unlikely(!tctx->registered_files)) {
+		kfree(tctx);
+		return -ENOMEM;
+	}
+
  	ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
  	if (unlikely(ret)) {
+		kfree(tctx->registered_files);
  		kfree(tctx);
  		return ret;
  	}
@@ -8749,6 +8760,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
  	if (IS_ERR(tctx->io_wq)) {
  		ret = PTR_ERR(tctx->io_wq);
  		percpu_counter_destroy(&tctx->inflight);
+		kfree(tctx->registered_files);
  		kfree(tctx);
  		return ret;
  	}
@@ -9382,6 +9394,56 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx)
  	return -ENXIO;
  }
+static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool locked);
+
+static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *arg)
+{
+	struct io_uring_fd_reg reg;
+	struct io_uring_task *tctx;
+	struct file *file;
+	int ret;
+
+	if (copy_from_user(&reg, arg, sizeof(struct io_uring_fd_reg)))
+		return -EFAULT;
+	if (reg.offset > IO_RINGFD_REG_MAX)
should be >=
+		return -EINVAL;
+
+	ret = io_uring_add_tctx_node(ctx, true);
+	if (unlikely(ret))
+		return ret;
+
+	tctx = current->io_uring;
+	if (tctx->registered_files[reg.offset])
+		return -EBUSY;
+	file = fget(reg.fd);
+	if (unlikely(!file))
+		return -EBADF;
+	tctx->registered_files[reg.offset] = file;
+	return 0;
+}
+
+static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *arg)
+{
+	struct io_uring_task *tctx;
+	__u32 offset;
+
+	if (!current->io_uring)
+		return 0;
+
+	if (copy_from_user(&offset, arg, sizeof(__u32)))
+		return -EFAULT;
+	if (offset > IO_RINGFD_REG_MAX)
ditto
+		return -EINVAL;
+
+	tctx = current->io_uring;
+	if (tctx->registered_files[offset]) {
+		fput(tctx->registered_files[offset]);
+		tctx->registered_files[offset] = NULL;
+	}
+
+	return 0;
+}
+
  static void io_destroy_buffers(struct io_ring_ctx *ctx)
  {
  	struct io_buffer *buf;
@@ -9790,7 +9852,7 @@ static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
  	}
  }
-static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
+static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool locked)
  {
  	struct io_uring_task *tctx = current->io_uring;
  	struct io_tctx_node *node;
@@ -9825,9 +9887,11 @@ static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
  			return ret;
  		}
- mutex_lock(&ctx->uring_lock);
+		if (!locked)
+			mutex_lock(&ctx->uring_lock);
  		list_add(&node->ctx_node, &ctx->tctx_list);
-		mutex_unlock(&ctx->uring_lock);
+		if (!locked)
+			mutex_unlock(&ctx->uring_lock);
  	}
  	tctx->last = ctx;
  	return 0;
@@ -9836,13 +9900,13 @@ static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
  /*
   * Note that this task has used io_uring. We use it for cancelation purposes.
   */
-static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
+static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool locked)
  {
  	struct io_uring_task *tctx = current->io_uring;
if (likely(tctx && tctx->last == ctx))
  		return 0;
-	return __io_uring_add_tctx_node(ctx);
+	return __io_uring_add_tctx_node(ctx, locked);
  }
/*
@@ -9973,6 +10037,16 @@ void __io_uring_cancel(bool cancel_all)
  	io_uring_cancel_generic(cancel_all, NULL);
  }
+void io_uring_unreg_ringfd(struct io_uring_task *tctx)
+{
+	int i;
+
+	for (i = 0; i < IO_RINGFD_REG_MAX; i++) {
+		if (tctx->registered_files[i])
+			fput(tctx->registered_files[i]);
+	}
+}
+
  static void *io_uring_validate_mmap_request(struct file *file,
  					    loff_t pgoff, size_t sz)
  {
@@ -10098,24 +10172,36 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
  	struct io_ring_ctx *ctx;
  	int submitted = 0;
  	struct fd f;
+	struct file *file;
  	long ret;
io_run_task_work(); if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
-			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
+			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
+			       IORING_ENTER_FIXED_FILE)))
  		return -EINVAL;
- f = fdget(fd);
-	if (unlikely(!f.file))
-		return -EBADF;
+	if (flags & IORING_ENTER_FIXED_FILE) {
+		if (fd > IO_RINGFD_REG_MAX || !current->io_uring)
ditto
+			return -EINVAL;
+
+		file = current->io_uring->registered_files[fd];
+		if (unlikely(!file))
+			return -EBADF;
+	} else {
+		f = fdget(fd);
+		if (unlikely(!f.file))
+			return -EBADF;
+		file = f.file;
+	}
ret = -EOPNOTSUPP;
-	if (unlikely(f.file->f_op != &io_uring_fops))
+	if (unlikely(file->f_op != &io_uring_fops))
  		goto out_fput;
ret = -ENXIO;
-	ctx = f.file->private_data;
+	ctx = file->private_data;
  	if (unlikely(!percpu_ref_tryget(&ctx->refs)))
  		goto out_fput;
@@ -10145,7 +10231,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
  		}
  		submitted = to_submit;
  	} else if (to_submit) {
-		ret = io_uring_add_tctx_node(ctx);
+		ret = io_uring_add_tctx_node(ctx, false);
  		if (unlikely(ret))
  			goto out;
  		mutex_lock(&ctx->uring_lock);
@@ -10182,7 +10268,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
  out:
  	percpu_ref_put(&ctx->refs);
  out_fput:
-	fdput(f);
+	if (!(flags & IORING_ENTER_FIXED_FILE))
+		fdput(f);
  	return submitted ? submitted : ret;
  }
@@ -10413,7 +10500,7 @@ static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
  	if (fd < 0)
  		return fd;
- ret = io_uring_add_tctx_node(ctx);
+	ret = io_uring_add_tctx_node(ctx, false);
  	if (ret) {
  		put_unused_fd(fd);
  		return ret;
@@ -11134,6 +11221,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
  			break;
  		ret = io_register_iowq_max_workers(ctx, arg);
  		break;
+	case IORING_REGISTER_IORINGFD:
+		ret = -EINVAL;
+		if (nr_args != 1)
+			break;
+		ret = io_ringfd_register(ctx, arg);
+		break;
+	case IORING_UNREGISTER_IORINGFD:
+		ret = -EINVAL;
+		if (nr_args != 1)
+			break;
+		ret = io_ringfd_unregister(ctx, arg);
+		break;
  	default:
  		ret = -EINVAL;
  		break;
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 649a4d7c241b..5ddea83912c7 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -9,11 +9,14 @@
  struct sock *io_uring_get_socket(struct file *file);
  void __io_uring_cancel(bool cancel_all);
  void __io_uring_free(struct task_struct *tsk);
+void io_uring_unreg_ringfd(struct io_uring_task *tctx);
static inline void io_uring_files_cancel(void)
  {
-	if (current->io_uring)
+	if (current->io_uring) {
+		io_uring_unreg_ringfd(current->io_uring);
  		__io_uring_cancel(false);
+	}
  }
  static inline void io_uring_task_cancel(void)
  {
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 787f491f0d2a..f076a203317a 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -261,6 +261,7 @@ struct io_cqring_offsets {
  #define IORING_ENTER_SQ_WAKEUP	(1U << 1)
  #define IORING_ENTER_SQ_WAIT	(1U << 2)
  #define IORING_ENTER_EXT_ARG	(1U << 3)
+#define IORING_ENTER_FIXED_FILE	(1U << 4)
/*
   * Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -325,6 +326,9 @@ enum {
  	/* set/get max number of io-wq workers */
  	IORING_REGISTER_IOWQ_MAX_WORKERS	= 19,
+ IORING_REGISTER_IORINGFD = 20,
+	IORING_UNREGISTER_IORINGFD		= 21,
+
  	/* this goes last */
  	IORING_REGISTER_LAST
  };
@@ -422,4 +426,9 @@ struct io_uring_getevents_arg {
  	__u64	ts;
  };
+struct io_uring_fd_reg {
+	__u32	offset;
+	__s32	fd;
+};
+
  #endif



[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux