Re: Problems replacing epoll with io_uring in tevent

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi Jens,

I'm currently trying to prototype for an IORING_POLL_CANCEL_ON_CLOSE
flag that can be passed to POLL_ADD. With that we'll register
the request in &req->file->f_uring_poll (similar to the file->f_ep list for epoll)
Then we only get a real reference to the file during the call to
vfs_poll() otherwise we drop the fget/fput reference and rely on
an io_uring_poll_release_file() (similar to eventpoll_release_file())
to cancel our registered poll request.

Yes, this is a bit tricky as we hold the file ref across the operation. I'd
be interested in seeing your approach to this, and also how it would
interact with registered files...

Here's my current patch:
https://git.samba.org/?p=metze/linux/wip.git;a=commitdiff;h=b9cccfac515739fc279c6eec87ce655a96f94685
It compiles, but I haven't tested it yet. And I'm not sure if the locking is done correctly...

It doesn't deadlock nor blow up immediately :-)
And it does fix the problem I had.

So what do you think about that patch?
Am I doing stupid things there?

These points might be changed:
- returning -EBADF instead of -ECANCELED
  might be better and allow the caller to avoid
  retrying.
- I guess we could use a single linked list, but
  I'm mostly used to how struct list_head works.
  And I want something that works first.
- We may find a better name than IORING_POLL_CANCEL_ON_CLOSE
- struct io_poll is completely full, as well as io_kiocb->flags
  (maybe we should move flags to 64 bit?),
  so we need to use some other generic struct io_kiocb space,
  which might also be good in order make it possible to keep io_poll_add()
  and io_arm_poll_handler() in common.
  But we may have the new field a bit differently. Note that
  struct io_kiocb (without this patch) still has 32 free bytes before
  4 64 byte cachelines are filled. With my patch 24 bytes are left...
- In struct file it might be possible to share a reference list with
  with the epoll code, where each element can indicate it epoll
  or io_uring is used.

I'm pasting it below in order to make it easier to get comments...

metze

 fs/file_table.c                |   3 ++
 include/linux/fs.h             |   1 +
 include/linux/io_uring.h       |  12 +++++
 include/linux/io_uring_types.h |   4 ++
 include/uapi/linux/io_uring.h  |   1 +
 io_uring/opdef.c               |   1 +
 io_uring/poll.c                | 100 ++++++++++++++++++++++++++++++++++++++++-
 io_uring/poll.h                |   1 +
 8 files changed, 122 insertions(+), 1 deletion(-)

diff --git a/fs/file_table.c b/fs/file_table.c
index dd88701e54a9..cad408e9c0f5 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -16,6 +16,7 @@
 #include <linux/security.h>
 #include <linux/cred.h>
 #include <linux/eventpoll.h>
+#include <linux/io_uring.h>
 #include <linux/rcupdate.h>
 #include <linux/mount.h>
 #include <linux/capability.h>
@@ -147,6 +148,7 @@ static struct file *__alloc_file(int flags, const struct cred *cred)
 	}

 	atomic_long_set(&f->f_count, 1);
+	INIT_LIST_HEAD(&f->f_uring_poll);
 	rwlock_init(&f->f_owner.lock);
 	spin_lock_init(&f->f_lock);
 	mutex_init(&f->f_pos_lock);
@@ -309,6 +311,7 @@ static void __fput(struct file *file)
 	 * in the file cleanup chain.
 	 */
 	eventpoll_release(file);
+	io_uring_poll_release(file);
 	locks_remove_file(file);

 	ima_file_free(file);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e654435f1651..7f99efa7a1dc 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -972,6 +972,7 @@ struct file {
 	/* Used by fs/eventpoll.c to link all the hooks to this file */
 	struct hlist_head	*f_ep;
 #endif /* #ifdef CONFIG_EPOLL */
+	struct list_head	f_uring_poll;
 	struct address_space	*f_mapping;
 	errseq_t		f_wb_err;
 	errseq_t		f_sb_err; /* for syncfs */
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 43bc8a2edccf..c931ea92c29a 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -61,6 +61,15 @@ static inline void io_uring_free(struct task_struct *tsk)
 	if (tsk->io_uring)
 		__io_uring_free(tsk);
 }
+
+void io_uring_poll_release_file(struct file *file);
+static inline void io_uring_poll_release(struct file *file)
+{
+	if (likely(list_empty_careful(&file->f_uring_poll)))
+		return;
+
+	io_uring_poll_release_file(file);
+}
 #else
 static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
 			      struct iov_iter *iter, void *ioucmd)
@@ -92,6 +101,9 @@ static inline const char *io_uring_get_opcode(u8 opcode)
 {
 	return "";
 }
+static inline void io_uring_poll_release(struct file *file)
+{
+}
 #endif

 #endif
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index f5b687a787a3..2373e01c57e7 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -547,8 +547,12 @@ struct io_kiocb {
 	union {
 		/* used by request caches, completion batching and iopoll */
 		struct io_wq_work_node	comp_list;
+		struct {
 		/* cache ->apoll->events */
 		__poll_t apoll_events;
+		u8 poll_cancel_on_close:1;
+		struct list_head		f_uring_poll_entry;
+		};
 	};
 	atomic_t			refs;
 	atomic_t			poll_refs;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index a2ce8ba7abb5..fe311667cb8c 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -276,6 +276,7 @@ enum io_uring_op {
 #define IORING_POLL_UPDATE_EVENTS	(1U << 1)
 #define IORING_POLL_UPDATE_USER_DATA	(1U << 2)
 #define IORING_POLL_ADD_LEVEL		(1U << 3)
+#define IORING_POLL_CANCEL_ON_CLOSE	(1U << 4)

 /*
  * ASYNC_CANCEL flags.
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 34b08c87ffa5..540ee55961a3 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -131,6 +131,7 @@ const struct io_op_def io_op_defs[] = {
 		.name			= "POLL_ADD",
 		.prep			= io_poll_add_prep,
 		.issue			= io_poll_add,
+		.cleanup		= io_poll_cleanup,
 	},
 	[IORING_OP_POLL_REMOVE] = {
 		.audit_skip		= 1,
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 0d9f49c575e0..d4ccf2f2e815 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -163,6 +163,19 @@ static inline void io_poll_remove_entry(struct io_poll *poll)

 static void io_poll_remove_entries(struct io_kiocb *req)
 {
+	if (!list_empty_careful(&req->f_uring_poll_entry)) {
+		spin_lock(&req->file->f_lock);
+		list_del_init_careful(&req->f_uring_poll_entry);
+		/*
+		 * upgrade to a full reference again,
+		 * it will be released in the common
+		 * cleanup code via io_put_file().
+		 */
+		if (!(req->flags & REQ_F_FIXED_FILE))
+			WARN_ON_ONCE(!get_file_rcu(req->file));
+		spin_unlock(&req->file->f_lock);
+	}
+
 	/*
 	 * Nothing to do if neither of those flags are set. Avoid dipping
 	 * into the poll/apoll/double cachelines if we can.
@@ -199,6 +212,54 @@ enum {
 	IOU_POLL_REMOVE_POLL_USE_RES = 2,
 };

+static inline struct file *io_poll_get_additional_file_ref(struct io_kiocb *req,
+							   unsigned issue_flags)
+{
+	if (!(req->poll_cancel_on_close))
+		return NULL;
+
+	if (unlikely(!req->file))
+		return NULL;
+
+	req->flags |= REQ_F_NEED_CLEANUP;
+
+	if (list_empty_careful(&req->f_uring_poll_entry)) {
+		/*
+		 * This first time we need to add ourself to the
+		 * file->f_uring_poll.
+		 */
+		spin_lock(&req->file->f_lock);
+		list_add_tail(&req->f_uring_poll_entry, &req->file->f_uring_poll);
+		spin_unlock(&req->file->f_lock);
+		if (!(req->flags & REQ_F_FIXED_FILE)) {
+			/*
+			 * If it's not a fixed file,
+			 * we can allow the caller to drop the existing
+			 * reference.
+			 */
+			return req->file;
+		}
+		/*
+		 * For fixed files we grab an additional reference
+		 */
+	}
+
+	io_ring_submit_lock(req->ctx, issue_flags);
+	if (unlikely(!req->file)) {
+		io_ring_submit_unlock(req->ctx, issue_flags);
+		return NULL;
+	}
+	rcu_read_lock();
+	if (unlikely(!get_file_rcu(req->file))) {
+		req->file = NULL;
+		req->cqe.fd = -1;
+		io_poll_mark_cancelled(req);
+	}
+	rcu_read_unlock();
+	io_ring_submit_unlock(req->ctx, issue_flags);
+	return req->file;
+}
+
 /*
  * All poll tw should go through this. Checks for poll events, manages
  * references, does rewait, etc.
@@ -230,7 +291,12 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked)
 		/* the mask was stashed in __io_poll_execute */
 		if (!req->cqe.res) {
 			struct poll_table_struct pt = { ._key = req->apoll_events };
+			unsigned issue_flags = (!*locked) ? IO_URING_F_UNLOCKED : 0;
+			struct file *file_to_put = io_poll_get_additional_file_ref(req, issue_flags);
+			if (unlikely(!req->file))
+				return -ECANCELED;
 			req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events;
+			io_put_file(file_to_put);
 		}

 		if ((unlikely(!req->cqe.res)))
@@ -499,6 +565,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
 				 unsigned issue_flags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
+	struct file *file_to_put;
 	int v;

 	INIT_HLIST_NODE(&req->hash_node);
@@ -506,6 +573,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
 	io_init_poll_iocb(poll, mask, io_poll_wake);
 	poll->file = req->file;
 	req->apoll_events = poll->events;
+	INIT_LIST_HEAD(&req->f_uring_poll_entry);

 	ipt->pt._key = mask;
 	ipt->req = req;
@@ -529,7 +597,11 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
 	if (issue_flags & IO_URING_F_UNLOCKED)
 		req->flags &= ~REQ_F_HASH_LOCKED;

+	file_to_put = io_poll_get_additional_file_ref(req, issue_flags);
+	if (unlikely(!req->file))
+		return -ECANCELED;
 	mask = vfs_poll(req->file, &ipt->pt) & poll->events;
+	io_put_file(file_to_put);

 	if (unlikely(ipt->error || !ipt->nr_entries)) {
 		io_poll_remove_entries(req);
@@ -857,11 +929,17 @@ int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (sqe->buf_index || sqe->off || sqe->addr)
 		return -EINVAL;
 	flags = READ_ONCE(sqe->len);
-	if (flags & ~IORING_POLL_ADD_MULTI)
+	if (flags & ~(IORING_POLL_ADD_MULTI|IORING_POLL_CANCEL_ON_CLOSE))
 		return -EINVAL;
 	if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP))
 		return -EINVAL;

+	if (flags & IORING_POLL_CANCEL_ON_CLOSE) {
+		req->poll_cancel_on_close = 1;
+	} else {
+		req->poll_cancel_on_close = 0;
+	}
+
 	poll->events = io_poll_parse_events(sqe, flags);
 	return 0;
 }
@@ -963,3 +1041,23 @@ void io_apoll_cache_free(struct io_cache_entry *entry)
 {
 	kfree(container_of(entry, struct async_poll, cache));
 }
+
+void io_uring_poll_release_file(struct file *file)
+{
+	struct io_kiocb *req, *next;
+
+	list_for_each_entry_safe(req, next, &file->f_uring_poll, f_uring_poll_entry) {
+		io_ring_submit_lock(req->ctx, IO_URING_F_UNLOCKED);
+		io_poll_mark_cancelled(req);
+		list_del_init_careful(&req->f_uring_poll_entry);
+		io_poll_remove_entries(req);
+		req->file = NULL;
+		io_poll_execute(req, 0);
+		io_ring_submit_unlock(req->ctx, IO_URING_F_UNLOCKED);
+	}
+}
+
+void io_poll_cleanup(struct io_kiocb *req)
+{
+	io_poll_remove_entries(req);
+}
diff --git a/io_uring/poll.h b/io_uring/poll.h
index ef25c26fdaf8..43e6b877f1bc 100644
--- a/io_uring/poll.h
+++ b/io_uring/poll.h
@@ -27,6 +27,7 @@ struct async_poll {

 int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_poll_add(struct io_kiocb *req, unsigned int issue_flags);
+void io_poll_cleanup(struct io_kiocb *req);

 int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags);




[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux