On 8/22/24 7:46 AM, Pavel Begunkov wrote: > On 8/21/24 15:16, Jens Axboe wrote: >> Waiting for events with io_uring has two knobs that can be set: >> >> 1) The number of events to wake for >> 2) The timeout associated with the event >> >> Waiting will abort when either of those conditions is met, as expected. >> >> This adds support for a third condition, which is associated with the number >> of events to wait for. Applications generally like to handle batches of >> completions, and right now they'd set a number of events to wait for and >> the timeout for that. If no events have been received but the timeout >> triggers, control is returned to the application and it can wait again. >> However, if the application doesn't have anything to do until events are >> reaped, then it's possible to make this waiting more efficient. >> >> For example, the application may have a latency time of 50 usecs and >> want to handle a batch of 8 requests at a time. If it uses 50 usecs >> as the timeout, then it'll be doing 20K context switches per second even >> if nothing is happening. >> >> This introduces the notion of min batch wait time. If the min batch wait >> time expires, then we'll return to userspace if we have any events at all. >> If none are available, the general wait time is applied. Any request >> arriving after the min batch wait time will cause waiting to stop and >> return control to the application. 
>> >> Signed-off-by: Jens Axboe <axboe@xxxxxxxxx> >> --- >> io_uring/io_uring.c | 88 ++++++++++++++++++++++++++++++++++++++------- >> io_uring/io_uring.h | 2 ++ >> 2 files changed, 77 insertions(+), 13 deletions(-) >> >> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c >> index 4ba5292137c3..87e7cf6551d7 100644 >> --- a/io_uring/io_uring.c >> +++ b/io_uring/io_uring.c >> @@ -2322,7 +2322,8 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, >> * Cannot safely flush overflowed CQEs from here, ensure we wake up >> * the task, and the next invocation will do it. >> */ >> - if (io_should_wake(iowq) || io_has_work(iowq->ctx) || iowq->hit_timeout) >> + if (io_should_wake(iowq) || io_has_work(iowq->ctx) || >> + READ_ONCE(iowq->hit_timeout)) >> return autoremove_wake_function(curr, mode, wake_flags, key); >> return -1; >> } >> @@ -2359,13 +2360,66 @@ static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer) >> return HRTIMER_NORESTART; >> } >> +/* >> + * Doing min_timeout portion. If we saw any timeouts, events, or have work, >> + * wake up. If not, and we have a normal timeout, switch to that and keep >> + * sleeping. 
>> + */ >> +static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer) >> +{ >> + struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); >> + struct io_ring_ctx *ctx = iowq->ctx; >> + >> + /* no general timeout, or shorter, we are done */ >> + if (iowq->timeout == KTIME_MAX || >> + ktime_after(iowq->min_timeout, iowq->timeout)) >> + goto out_wake; >> + /* work we may need to run, wake function will see if we need to wake */ >> + if (io_has_work(ctx)) >> + goto out_wake; >> + /* got events since we started waiting, min timeout is done */ >> + if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail)) >> + goto out_wake; >> + /* if we have any events and min timeout expired, we're done */ >> + if (io_cqring_events(ctx)) >> + goto out_wake; >> + >> + /* >> + * If using deferred task_work running and application is waiting on >> + * more than one request, ensure we reset it now where we are switching >> + * to normal sleeps. Any request completion post min_wait should wake >> + * the task and return. >> + */ >> + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { >> + atomic_set(&ctx->cq_wait_nr, 1); >> + smp_mb(); >> + if (!llist_empty(&ctx->work_llist)) >> + goto out_wake; >> + } >> + >> + iowq->t.function = io_cqring_timer_wakeup; >> + hrtimer_set_expires(timer, iowq->timeout); >> + return HRTIMER_RESTART; >> +out_wake: >> + return io_cqring_timer_wakeup(timer); >> +} >> + >> static int io_cqring_schedule_timeout(struct io_wait_queue *iowq, >> - clockid_t clock_id) >> + clockid_t clock_id, ktime_t start_time) >> { >> - iowq->hit_timeout = 0; >> + ktime_t timeout; >> + >> + WRITE_ONCE(iowq->hit_timeout, 0); >> hrtimer_init_on_stack(&iowq->t, clock_id, HRTIMER_MODE_ABS); >> - iowq->t.function = io_cqring_timer_wakeup; >> - hrtimer_set_expires_range_ns(&iowq->t, iowq->timeout, 0); >> + if (iowq->min_timeout) { > > What's the default, 0 or KTIME_MAX? __io_cqring_wait_schedule() > checks KTIME_MAX instead. 
In practice either one works, but let's keep it consistent - since it's a relative value (eg you ask for xx usec), I'll change the one that checks for KTIME_MAX to just check if it's set. > It likely needs to account for hit_timeout. Not looking deep into > the new callback, but imagine that it expired and you promoted the > timeout to the next stage (long wait). Then you get a spurious wake > up, it cancels timeouts, loops in io_cqring_wait() and gets back to > schedule timeout. Since nothing modified ->min_timeout it'll > try a short timeout again. Yeah good point, we don't want to redo it for that case. With hit_timeout being set earlier now, we can just check it in here. >> @@ -2379,7 +2433,8 @@ static int io_cqring_schedule_timeout(struct io_wait_queue *iowq, >> } >> static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, >> - struct io_wait_queue *iowq) >> + struct io_wait_queue *iowq, >> + ktime_t start_time) >> { >> int ret = 0; >> @@ -2390,8 +2445,8 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, >> */ >> if (current_pending_io()) >> current->in_iowait = 1; >> - if (iowq->timeout != KTIME_MAX) >> - ret = io_cqring_schedule_timeout(iowq, ctx->clockid); >> + if (iowq->timeout != KTIME_MAX || iowq->min_timeout != KTIME_MAX) >> + ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time); >> else >> schedule(); >> current->in_iowait = 0; >> @@ -2400,7 +2455,8 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, >> /* If this returns > 0, the caller should retry */ >> static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, >> - struct io_wait_queue *iowq) >> + struct io_wait_queue *iowq, >> + ktime_t start_time) >> { >> if (unlikely(READ_ONCE(ctx->check_cq))) >> return 1; >> @@ -2413,7 +2469,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, >> if (unlikely(io_should_wake(iowq))) >> return 0; >> - return __io_cqring_wait_schedule(ctx, iowq); >> + return __io_cqring_wait_schedule(ctx, iowq, 
start_time); >> } >> struct ext_arg { >> @@ -2431,6 +2487,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, >> { >> struct io_wait_queue iowq; >> struct io_rings *rings = ctx->rings; >> + ktime_t start_time; >> int ret; >> if (!io_allowed_run_tw(ctx)) >> @@ -2449,8 +2506,11 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, >> INIT_LIST_HEAD(&iowq.wq.entry); >> iowq.ctx = ctx; >> iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); >> + iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail); >> iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; >> + iowq.min_timeout = 0; >> iowq.timeout = KTIME_MAX; >> + start_time = io_get_time(ctx); >> if (ext_arg->ts) { >> struct timespec64 ts; >> @@ -2460,7 +2520,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, >> iowq.timeout = timespec64_to_ktime(ts); >> if (!(flags & IORING_ENTER_ABS_TIMER)) >> - iowq.timeout = ktime_add(iowq.timeout, io_get_time(ctx)); >> + iowq.timeout = ktime_add(iowq.timeout, start_time); >> } >> if (ext_arg->sig) { >> @@ -2484,14 +2544,16 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, >> unsigned long check_cq; >> if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { >> - atomic_set(&ctx->cq_wait_nr, nr_wait); >> + /* if min timeout has been hit, don't reset wait count */ >> + if (!READ_ONCE(iowq.hit_timeout)) > > Why read once? You're out of io_cqring_schedule_timeout(), > timers are cancelled and everything should've been synchronised > by this point. Just for consistency's sake. >> + atomic_set(&ctx->cq_wait_nr, nr_wait); > > if (hit_timeout) > nr_wait = 1; > atomic_set(cq_wait_nr, nr_wait); > > otherwise, you're risking not to be ever woken up > ever again for this wait by tw. Good point, I'll init nr_wait rather than check here. -- Jens Axboe