On 2/8/25 23:27, Pavel Begunkov wrote:
...
> But it might be better to just poll the epoll fd, reuse all the
> io_uring polling machinery, and implement IO_URING_F_MULTISHOT for
> the epoll opcode.
>
> epoll_issue(issue_flags) {
> 	if (!(issue_flags & IO_URING_F_MULTISHOT))
> 		return -EAGAIN;
> 	res = epoll_check_events();
> 	post_cqe(res);
> 	etc.
> }
>
> I think that would make this patch quite trivial, including
> the multishot mode.
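
Spelling the multishot side out a bit (rough, untested sketch: it
glosses over how multishot actually gets requested and armed, borrows
the io_req_post_cqe() / IOU_ISSUE_SKIP_COMPLETE / IORING_CQE_F_MORE
conventions from multishot recv, and calls the epoll_sendevents()
helper from the diff below):

int io_epoll_wait(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_epoll_wait *iew = io_kiocb_to_cmd(req, struct io_epoll_wait);
	int ret;

	/* Not called from the poll wake path yet: arm poll on the epoll fd */
	if (!(issue_flags & IO_URING_F_MULTISHOT))
		return -EAGAIN;

	ret = epoll_sendevents(req->file, iew->events, iew->maxevents);
	/* Spurious wakeup, nothing ready: go back to waiting */
	if (ret == 0)
		return -EAGAIN;
	if (ret < 0) {
		req_set_fail(req);
		io_req_set_res(req, ret, 0);
		return IOU_OK;
	}
	/* CQE with F_MORE posted: stay armed for the next batch */
	if (io_req_post_cqe(req, ret, IORING_CQE_F_MORE))
		return IOU_ISSUE_SKIP_COMPLETE;
	/* Couldn't post (e.g. CQ overflow): terminate the multishot */
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}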
Something like the diff below instead of the last patch. Completely
untested, the eventpoll.c hunk is dirty and might be incorrect, it
still needs to pass the right mask for polling, and all that. At least
it looks simpler, and probably doesn't need half of the prep patches.
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index b96cc9193517..99dd8c1a2f2c 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1996,33 +1996,6 @@ static int ep_try_send_events(struct eventpoll *ep,
return res;
}
-static int ep_poll_queue(struct eventpoll *ep,
- struct epoll_event __user *events, int maxevents,
- struct wait_queue_entry *wait)
-{
- int res = 0, eavail;
-
- /* See ep_poll() for commentary */
- eavail = ep_events_available(ep);
- while (1) {
- if (eavail) {
- res = ep_try_send_events(ep, events, maxevents);
- if (res)
- return res;
- }
- if (!list_empty_careful(&wait->entry))
- break;
- write_lock_irq(&ep->lock);
- eavail = ep_events_available(ep);
- if (!eavail)
- __add_wait_queue_exclusive(&ep->wq, wait);
- write_unlock_irq(&ep->lock);
- if (!eavail)
- break;
- }
- return -EIOCBQUEUED;
-}
-
static int __epoll_wait_remove(struct eventpoll *ep,
struct wait_queue_entry *wait, int timed_out)
{
@@ -2517,16 +2490,25 @@ static int ep_check_params(struct file *file, struct epoll_event __user *evs,
return 0;
}
-int epoll_queue(struct file *file, struct epoll_event __user *events,
- int maxevents, struct wait_queue_entry *wait)
+int epoll_sendevents(struct file *file, struct epoll_event __user *events,
+ int maxevents)
{
+	struct eventpoll *ep = file->private_data;
+	int res = 0, eavail;
 	int ret;
ret = ep_check_params(file, events, maxevents);
if (unlikely(ret))
return ret;
- return ep_poll_queue(file->private_data, events, maxevents, wait);
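+	/* Deliver only events that are already pending, never block */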
+ eavail = ep_events_available(ep);
+ if (eavail) {
+ res = ep_try_send_events(ep, events, maxevents);
+ if (res)
+ return res;
+ }
+ return 0;
}
/*
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index 6c088d5e945b..751e3f325927 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -25,9 +25,9 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long t
/* Used to release the epoll bits inside the "struct file" */
void eventpoll_release_file(struct file *file);
-/* Use to reap events, and/or queue for a callback on new events */
-int epoll_queue(struct file *file, struct epoll_event __user *events,
- int maxevents, struct wait_queue_entry *wait);
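+/* Reap pending events without blocking or queueing a wait entry */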
+int epoll_sendevents(struct file *file, struct epoll_event __user *events,
+ int maxevents);
/* Remove wait entry */
int epoll_wait_remove(struct file *file, struct wait_queue_entry *wait);
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index e11c82638527..a559e1e1544a 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -278,6 +278,7 @@ enum io_uring_op {
IORING_OP_FTRUNCATE,
IORING_OP_BIND,
IORING_OP_LISTEN,
+ IORING_OP_EPOLL_WAIT,
/* this goes last, obviously */
IORING_OP_LAST,
diff --git a/io_uring/epoll.c b/io_uring/epoll.c
index 7848d9cc073d..6d2c48ba1923 100644
--- a/io_uring/epoll.c
+++ b/io_uring/epoll.c
@@ -20,6 +20,12 @@ struct io_epoll {
struct epoll_event event;
};
+struct io_epoll_wait {
+ struct file *file;
+ int maxevents;
+ struct epoll_event __user *events;
+};
+
int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_epoll *epoll = io_kiocb_to_cmd(req, struct io_epoll);
@@ -57,3 +63,31 @@ int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
io_req_set_res(req, ret, 0);
return IOU_OK;
}
+
+int io_epoll_wait_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_epoll_wait *iew = io_kiocb_to_cmd(req, struct io_epoll_wait);
+
+ if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
+ return -EINVAL;
+
+ iew->maxevents = READ_ONCE(sqe->len);
+ iew->events = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ return 0;
+}
+
+int io_epoll_wait(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_epoll_wait *iew = io_kiocb_to_cmd(req, struct io_epoll_wait);
+ int ret;
+
+ ret = epoll_sendevents(req->file, iew->events, iew->maxevents);
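+	/* Nothing ready: -EAGAIN lets io_uring poll-arm on the epoll fd */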
+ if (ret == 0)
+ return -EAGAIN;
+ if (ret < 0)
+ req_set_fail(req);
+
+ io_req_set_res(req, ret, 0);
+ return IOU_OK;
+}
diff --git a/io_uring/epoll.h b/io_uring/epoll.h
index 870cce11ba98..4111997c360b 100644
--- a/io_uring/epoll.h
+++ b/io_uring/epoll.h
@@ -3,4 +3,6 @@
#if defined(CONFIG_EPOLL)
int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags);
+int io_epoll_wait_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_epoll_wait(struct io_kiocb *req, unsigned int issue_flags);
#endif
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index e8baef4e5146..bd62d6068b61 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -514,6 +514,18 @@ const struct io_issue_def io_issue_defs[] = {
.async_size = sizeof(struct io_async_msghdr),
#else
.prep = io_eopnotsupp_prep,
+#endif
+ },
+ [IORING_OP_EPOLL_WAIT] = {
+ .needs_file = 1,
+ .audit_skip = 1,
+ .pollout = 1,
+ .pollin = 1,
+#if defined(CONFIG_EPOLL)
+ .prep = io_epoll_wait_prep,
+ .issue = io_epoll_wait,
+#else
+ .prep = io_eopnotsupp_prep,
#endif
},
};
@@ -745,6 +757,9 @@ const struct io_cold_def io_cold_defs[] = {
[IORING_OP_LISTEN] = {
.name = "LISTEN",
},
+ [IORING_OP_EPOLL_WAIT] = {
+ .name = "EPOLL_WAIT",
+ },
};
const char *io_uring_get_opcode(u8 opcode)
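
For reference, the userspace side could then be driven along these
lines (sketch only: the raw SQE field mapping follows
io_epoll_wait_prep() above, headers with the new opcode are assumed,
and liburing is used just for the ring plumbing since this patch adds
no prep helper):

#include <errno.h>
#include <liburing.h>
#include <string.h>
#include <sys/epoll.h>

/* Reap already-pending epoll events via IORING_OP_EPOLL_WAIT */
static int uring_epoll_wait(struct io_uring *ring, int epfd,
			    struct epoll_event *events, int maxevents)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	if (!sqe)
		return -EBUSY;
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_EPOLL_WAIT;
	sqe->fd = epfd;				/* the epoll instance */
	sqe->addr = (unsigned long)events;	/* event buffer */
	sqe->len = maxevents;

	io_uring_submit(ring);
	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;
	ret = cqe->res;				/* nr of events or -errno */
	io_uring_cqe_seen(ring, cqe);
	return ret;
}

A multishot consumer would instead keep reaping CQEs until it sees
one without IORING_CQE_F_MORE set.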