Some io_uring-eventfd users assume that there won't be spurious wakeups.
That assumption has to be honoured by all io_cqring_ev_posted() callers,
which is inconvenient and from time to time leads to problems, but it
should be maintained so as not to break userspace.

Instead of making the callers track whether a CQE was posted or not, hide
it inside io_eventfd_signal(). It saves the ->cached_cq_tail value it saw
last time and triggers the eventfd only when ->cached_cq_tail has changed
since then.

Signed-off-by: Pavel Begunkov <asml.silence@xxxxxxxxx>
---
 include/linux/io_uring_types.h |  2 ++
 io_uring/io_uring.c            | 44 ++++++++++++++++++++--------------
 io_uring/timeout.c             |  3 +--
 3 files changed, 29 insertions(+), 20 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 779c72da5b8f..327bc7f0808d 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -315,6 +315,8 @@ struct io_ring_ctx {
 
 		struct list_head	defer_list;
 		unsigned		sq_thread_idle;
+		/* protected by ->completion_lock */
+		unsigned		evfd_last_cq_tail;
 	};
 
 	enum {
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 31beb9ccbf12..0875cc649e23 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -473,6 +473,22 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
 static void io_eventfd_signal(struct io_ring_ctx *ctx)
 {
 	struct io_ev_fd *ev_fd;
+	bool skip;
+
+	spin_lock(&ctx->completion_lock);
+	/*
+	 * Eventfd should only get triggered when at least one event has been
+	 * posted. Some applications rely on the eventfd notification count only
+	 * changing IFF a new CQE has been added to the CQ ring. There's no
+	 * dependency on 1:1 relationship between how many times this function is
+	 * called (and hence the eventfd count) and number of CQEs posted to the
+	 * CQ ring.
+	 */
+	skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail;
+	ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
+	spin_unlock(&ctx->completion_lock);
+	if (skip)
+		return;
 
 	rcu_read_lock();
 	/*
@@ -511,13 +527,6 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
 	io_eventfd_signal(ctx);
 }
 
-/*
- * This should only get called when at least one event has been posted.
- * Some applications rely on the eventfd notification count only changing
- * IFF a new CQE has been added to the CQ ring. There's no depedency on
- * 1:1 relationship between how many times this function is called (and
- * hence the eventfd count) and number of CQEs posted to the CQ ring.
- */
 void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 {
 	if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
@@ -530,7 +539,7 @@ void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 /* Returns true if there are no backlogged entries after the flush */
 static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 {
-	bool all_flushed, posted;
+	bool all_flushed;
 	size_t cqe_size = sizeof(struct io_uring_cqe);
 
 	if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
@@ -539,7 +548,6 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 	if (ctx->flags & IORING_SETUP_CQE32)
 		cqe_size <<= 1;
 
-	posted = false;
 	spin_lock(&ctx->completion_lock);
 	while (!list_empty(&ctx->cq_overflow_list)) {
 		struct io_uring_cqe *cqe = io_get_cqe(ctx);
@@ -554,7 +562,6 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 		else
 			io_account_cq_overflow(ctx);
 
-		posted = true;
 		list_del(&ocqe->list);
 		kfree(ocqe);
 	}
@@ -567,8 +574,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 
 	io_commit_cqring(ctx);
 	spin_unlock(&ctx->completion_lock);
-	if (posted)
-		io_cqring_ev_posted(ctx);
+	io_cqring_ev_posted(ctx);
 
 	return all_flushed;
 }
@@ -758,8 +764,7 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx,
 	filled = io_fill_cqe_aux(ctx, user_data, res, cflags);
 	io_commit_cqring(ctx);
 	spin_unlock(&ctx->completion_lock);
-	if (filled)
-		io_cqring_ev_posted(ctx);
+	io_cqring_ev_posted(ctx);
 
 	return filled;
 }
@@ -940,14 +945,12 @@ __cold void io_free_req(struct io_kiocb *req)
 static void __io_req_find_next_prep(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	bool posted;
 
 	spin_lock(&ctx->completion_lock);
-	posted = io_disarm_next(req);
+	io_disarm_next(req);
 	io_commit_cqring(ctx);
 	spin_unlock(&ctx->completion_lock);
-	if (posted)
-		io_cqring_ev_posted(ctx);
+	io_cqring_ev_posted(ctx);
 }
 
 static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
@@ -2431,6 +2434,11 @@ static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
 		kfree(ev_fd);
 		return ret;
 	}
+
+	spin_lock(&ctx->completion_lock);
+	ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
+	spin_unlock(&ctx->completion_lock);
+
 	ev_fd->eventfd_async = eventfd_async;
 	ctx->has_evfd = true;
 	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index 557c637af158..4938c1cdcbcd 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -628,7 +628,6 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
 	spin_unlock_irq(&ctx->timeout_lock);
 	io_commit_cqring(ctx);
 	spin_unlock(&ctx->completion_lock);
-	if (canceled != 0)
-		io_cqring_ev_posted(ctx);
+	io_cqring_ev_posted(ctx);
 	return canceled != 0;
 }
-- 
2.36.1
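(Illustration only, not part of the patch.) For context on the invariant the
commit message relies on, here is a minimal userspace sketch of the pattern
such applications follow, written against liburing; the helper name
wait_and_reap() and the error handling are made up for illustration. The
application registers an eventfd with io_uring_register_eventfd() and then
treats every eventfd wakeup as a guarantee that at least one new CQE can be
reaped:

#include <liburing.h>
#include <sys/eventfd.h>
#include <stdint.h>
#include <unistd.h>

/* Hypothetical helper: reap CQEs after an eventfd wakeup. */
static int wait_and_reap(struct io_uring *ring, int efd)
{
	struct io_uring_cqe *cqe;
	uint64_t cnt;

	/* Blocks until the kernel signals the eventfd registered for this ring. */
	if (read(efd, &cnt, sizeof(cnt)) != sizeof(cnt))
		return -1;

	/*
	 * The application assumes the wakeup was not spurious, i.e. that at
	 * least one new CQE is sitting in the CQ ring at this point.
	 */
	while (io_uring_peek_cqe(ring, &cqe) == 0) {
		/* ... handle cqe->user_data / cqe->res ... */
		io_uring_cqe_seen(ring, cqe);
	}
	return 0;
}

Registration would happen once at setup, e.g. efd = eventfd(0, 0) followed by
io_uring_register_eventfd(ring, efd). The patch keeps this "count only changes
when a new CQE is posted" property intact while allowing io_cqring_ev_posted()
to be called unconditionally.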