Using io_uring with epoll is very expensive because every completion
leads to a __wake_up() call, most of which are unnecessary because the
polling process has already been woken up but has not had a chance to
process the completions. During this time, wq_has_sleeper() still
returns true, therefore this check is not enough.

Perf diff for a HTTP server pushing a static file with splice() into
the TCP socket:

 37.91%  -2.00%  [kernel.kallsyms]  [k] queued_spin_lock_slowpath
  1.69%  -1.67%  [kernel.kallsyms]  [k] ep_poll_callback
  0.95%  +1.64%  [kernel.kallsyms]  [k] io_wq_free_work
  0.88%  -0.35%  [kernel.kallsyms]  [k] _raw_spin_lock_irqsave
  1.66%  +0.28%  [kernel.kallsyms]  [k] io_worker_handle_work
  1.14%  +0.18%  [kernel.kallsyms]  [k] _raw_spin_lock
  0.24%  -0.17%  [kernel.kallsyms]  [k] __wake_up

Signed-off-by: Max Kellermann <max.kellermann@xxxxxxxxx>
---
 include/linux/io_uring_types.h | 10 ++++++++++
 io_uring/io_uring.c            |  4 ++++
 io_uring/io_uring.h            |  2 +-
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 623d8e798a11..01514cb76095 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -384,6 +384,16 @@ struct io_ring_ctx {
 	struct wait_queue_head		poll_wq;
 	struct io_restriction		restrictions;
 
+	/**
+	 * Non-zero if a process is waiting for #poll_wq and reset to
+	 * zero when #poll_wq is woken up. This is supposed to
+	 * eliminate redundant wakeup calls. Only checking
+	 * wq_has_sleeper() is not enough because it will return true
+	 * until the sleeper has actually woken up and has been
+	 * scheduled.
+	 */
+	atomic_t			poll_wq_waiting;
+
 	u32				pers_next;
 	struct xarray			personalities;
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 137c2066c5a3..b65efd07e9f0 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2760,6 +2760,7 @@ static __cold void io_activate_pollwq_cb(struct callback_head *cb)
 	 * Wake ups for some events between start of polling and activation
 	 * might've been lost due to loose synchronisation.
 	 */
+	atomic_set_release(&ctx->poll_wq_waiting, 0);
 	wake_up_all(&ctx->poll_wq);
 	percpu_ref_put(&ctx->refs);
 }
@@ -2793,6 +2794,9 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
 
 	if (unlikely(!ctx->poll_activated))
 		io_activate_pollwq(ctx);
+
+	atomic_set(&ctx->poll_wq_waiting, 1);
+
 	/*
 	 * provides mb() which pairs with barrier from wq_has_sleeper
 	 * call in io_commit_cqring
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index f65e3f3ede51..186cee066f9f 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -287,7 +287,7 @@ static inline void io_commit_cqring(struct io_ring_ctx *ctx)
 
 static inline void io_poll_wq_wake(struct io_ring_ctx *ctx)
 {
-	if (wq_has_sleeper(&ctx->poll_wq))
+	if (wq_has_sleeper(&ctx->poll_wq) && atomic_xchg_release(&ctx->poll_wq_waiting, 0) > 0)
 		__wake_up(&ctx->poll_wq, TASK_NORMAL, 0,
 			  poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
 }
-- 
2.45.2