Every task_work will try to wake the task so it can be executed, which
causes excessive scheduling with corresponding overhead. For some tw
items that's justified, but others do little more than post a single
CQE. When a task waits for multiple CQEs, every such task_work will
wake it up. Instead, let the task give a hint about how many CQEs it
is waiting for; io_req_local_work_add() will compare against it and
skip the wake-up if the number of posted CQEs plus pending tw items is
not enough to satisfy the task.

The optimisation is used only for simple enough tw items; more complex
and/or urgent items still force a wake-up. It's also limited to
DEFER_TASKRUN. The trade-off is extra atomics in
io_req_local_work_add(), which are outweighed by the savings from not
rescheduling the task.

Signed-off-by: Pavel Begunkov <asml.silence@xxxxxxxxx>
---
 include/linux/io_uring_types.h |  2 +-
 io_uring/io_uring.c            | 41 +++++++++++++++++++++-------------
 io_uring/io_uring.h            |  1 +
 io_uring/notif.h               |  2 +-
 io_uring/rw.c                  |  2 +-
 5 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 00689c12f6ab..fdf0ae28023d 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -295,7 +295,7 @@ struct io_ring_ctx {
 		spinlock_t		completion_lock;
 
 		bool			poll_multi_queue;
-		bool			cq_waiting;
+		atomic_t		cq_wait_nr;
 
 		/*
 		 * ->iopoll_list is protected by the ctx->uring_lock for
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 42ada470845f..0fa4dee8dcf4 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1279,31 +1279,38 @@ static __cold void io_fallback_tw(struct io_uring_task *tctx)
 	}
 }
 
-static void io_req_local_work_add(struct io_kiocb *req)
+static void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
+	bool first;
 
 	percpu_ref_get(&ctx->refs);
-	if (!llist_add(&req->io_task_work.node, &ctx->work_llist))
-		goto put_ref;
-
+	first = llist_add(&req->io_task_work.node, &ctx->work_llist);
 	/* needed for the following wake up */
 	smp_mb__after_atomic();
 
-	if (unlikely(atomic_read(&req->task->io_uring->in_cancel))) {
-		io_move_task_work_from_local(ctx);
-		goto put_ref;
+	if (first) {
+		if (unlikely(atomic_read(&req->task->io_uring->in_cancel))) {
+			io_move_task_work_from_local(ctx);
+			goto put_ref;
+		}
+
+		if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
+			atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
+		if (ctx->has_evfd)
+			io_eventfd_signal(ctx);
 	}
 
-	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
-		atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
-	if (ctx->has_evfd)
-		io_eventfd_signal(ctx);
+	if (atomic_read(&ctx->cq_wait_nr) <= 0)
+		goto put_ref;
 
-	if (READ_ONCE(ctx->cq_waiting))
-		wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE);
+	if (!(flags & IOU_F_TWQ_FACILE))
+		atomic_set(&ctx->cq_wait_nr, 0);
+	else if (atomic_dec_return(&ctx->cq_wait_nr) > 0)
+		goto put_ref;
+
+	wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE);
 put_ref:
 	percpu_ref_put(&ctx->refs);
 }
@@ -1315,7 +1322,7 @@ void __io_req_task_work_add(struct io_kiocb *req, unsigned flags)
 
 	if (!(flags & IOU_F_TWQ_FORCE_NORMAL) &&
 	    (ctx->flags & IORING_SETUP_DEFER_TASKRUN)) {
-		io_req_local_work_add(req);
+		io_req_local_work_add(req, flags);
 		return;
 	}
 
@@ -2601,7 +2608,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 		unsigned long check_cq;
 
 		if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
-			WRITE_ONCE(ctx->cq_waiting, 1);
+			int to_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail);
+
+			atomic_set(&ctx->cq_wait_nr, to_wait);
 			set_current_state(TASK_INTERRUPTIBLE);
 		} else {
 			prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
@@ -2610,7 +2619,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 
 		ret = io_cqring_wait_schedule(ctx, &iowq);
 		__set_current_state(TASK_RUNNING);
-		WRITE_ONCE(ctx->cq_waiting, 0);
+		atomic_set(&ctx->cq_wait_nr, 0);
 
 		if (ret < 0)
 			break;
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index cd2e702f206c..98ff9b71d498 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -18,6 +18,7 @@ enum {
 	/* don't use deferred task_work */
 	IOU_F_TWQ_FORCE_NORMAL	= 1,
+	IOU_F_TWQ_FACILE	= 2,
 };
 
 enum {
diff --git a/io_uring/notif.h b/io_uring/notif.h
index c88c800cd89d..ec9998fb0be6 100644
--- a/io_uring/notif.h
+++ b/io_uring/notif.h
@@ -33,7 +33,7 @@ static inline void io_notif_flush(struct io_kiocb *notif)
 
 	/* drop slot's master ref */
 	if (refcount_dec_and_test(&nd->uarg.refcnt))
-		io_req_task_work_add(notif);
+		__io_req_task_work_add(notif, IOU_F_TWQ_FACILE);
 }
 
 static inline int io_notif_account_mem(struct io_kiocb *notif, unsigned len)
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 4c233910e200..a4578c120973 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -304,7 +304,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res)
 		return;
 	io_req_set_res(req, io_fixup_rw_res(req, res), 0);
 	req->io_task_work.func = io_req_rw_complete;
-	io_req_task_work_add(req);
+	__io_req_task_work_add(req, IOU_F_TWQ_FACILE);
 }
 
 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
-- 
2.39.1
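
[Editor's note] For readers less familiar with the wake-skip logic described in the commit
message, here is a small user-space sketch of the decision it implements. This is a
simplified model, not the kernel code: struct demo_ctx, should_wake() and the counts in
main() are hypothetical, and C11 atomics stand in for the kernel's atomic_t helpers.

	/* Standalone model of the wake-skip decision; not kernel code. */
	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	struct demo_ctx {
		/* Mirrors ctx->cq_wait_nr: how many more CQEs the waiter still
		 * needs, or <= 0 when nobody is waiting. */
		atomic_int cq_wait_nr;
	};

	/* Returns true if the producer side should wake the waiting task. */
	static bool should_wake(struct demo_ctx *ctx, bool facile)
	{
		if (atomic_load(&ctx->cq_wait_nr) <= 0)
			return false;			/* nobody is waiting */

		if (!facile) {
			/* complex/urgent work: always wake, disable later skips */
			atomic_store(&ctx->cq_wait_nr, 0);
			return true;
		}
		/* simple single-CQE work: wake only once enough items are queued */
		return atomic_fetch_sub(&ctx->cq_wait_nr, 1) - 1 <= 0;
	}

	int main(void)
	{
		struct demo_ctx ctx;

		/* waiter wants 3 CQEs, like atomic_set(&ctx->cq_wait_nr, to_wait) */
		atomic_init(&ctx.cq_wait_nr, 3);

		for (int i = 1; i <= 3; i++)
			printf("facile item %d -> wake: %d\n", i, should_wake(&ctx, true));
		return 0;
	}

Only the third facile item reports a wake-up, which is the saving the patch is after: the
first two items skip the reschedule because they cannot yet satisfy the waiter.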