On Thu, 2022-08-18 at 11:42 +0100, Pavel Begunkov wrote:
> On 8/16/22 16:37, Dylan Yudaken wrote:
> > Allow deferring async tasks until the user calls io_uring_enter(2)
> > with the IORING_ENTER_GETEVENTS flag. Enable this mode with a flag
> > at io_uring_setup time. This functionality requires that the later
> > io_uring_enter will be called from the same submission task, and
> > therefore restrict this flag to work only when
> > IORING_SETUP_SINGLE_ISSUER is also set.
> > 
> > Being able to hand pick when tasks are run prevents the problem
> > where there is current work to be done, however task work runs
> > anyway.
> > 
> > For example, a common workload would obtain a batch of CQEs, and
> > process each one. Interrupting this to do additional task work
> > would add latency but not gain anything. If instead task work is
> > deferred to just before more CQEs are obtained then no additional
> > latency is added.
> > 
> > The way this is implemented is by trying to keep task work local
> > to an io_ring_ctx, rather than to the submission task. This is
> > required, as the application will want to wake up only a single
> > io_ring_ctx at a time to process work, and so the lists of work
> > have to be kept separate.
> > 
> > This has some other benefits, like not having to check the task
> > continually in handle_tw_list (and potentially unlocking/locking
> > those), and reducing locks in the submit & process completions
> > path.
> > 
> > There are networking cases where using this option can reduce
> > request latency by 50%. For example, a contrived example using [1]
> > where the client sends 2k data and receives the same data back
> > while doing some system calls (to trigger task work) shows this
> > reduction. The reason ends up being that if sending responses is
> > delayed by processing task work, then the client side sits idle.
> > Whereas reordering the sends first means that the client runs its
> > workload in parallel with the local task work.
> > 
> > [1]:
> > Using https://github.com/DylanZA/netbench/tree/defer_run
> > Client:
> > ./netbench --client_only 1 --control_port 10000 --host <host> --tx
> > "epoll --threads 16 --per_thread 1 --size 2048 --resp 2048
> > --workload 1000"
> > Server:
> > ./netbench --server_only 1 --control_port 10000 --rx "io_uring
> > --defer_taskrun 0 --workload 100" --rx "io_uring --defer_taskrun 1
> > --workload 100"
> > 
> > Signed-off-by: Dylan Yudaken <dylany@xxxxxx>
> > ---
> >   include/linux/io_uring_types.h |   2 +
> >   include/uapi/linux/io_uring.h  |   7 ++
> >   io_uring/cancel.c              |   2 +-
> >   io_uring/io_uring.c            | 119 +++++++++++++++++++++++++++++----
> >   io_uring/io_uring.h            |  30 ++++++++-
> >   io_uring/rsrc.c                |   2 +-
> >   6 files changed, 146 insertions(+), 16 deletions(-)
> > 
> > [...]
> > 
> > @@ -2230,7 +2299,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
> >   		io_cqring_overflow_flush(ctx);
> >   		if (io_cqring_events(ctx) >= min_events)
> >   			return 0;
> > -		if (!io_run_task_work())
> > +		if (!io_run_task_work_ctx(ctx, false))
> 
> Sounds like there should be true, but see comments
> around io_run_task_work_ctx().

I think it is good as it is. We only want to run task work that could
cause completions here.

> Also, should we do
> 
> if (DEFER_TW) {
> 	if (current != ctx->submitter_task) // + SQPOLL handling
> 		return -E<error>;
> }
> 
> And on top we can remove moving local tw -> normal tw in
> io_run_local_work(), which probably will only be needed in
> exit_work().

Yes - this seems sensible now that we enforce single issuer.
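
Roughly what I have in mind for io_uring_enter(2) (untested sketch; the
helper name is made up, -EEXIST is only a placeholder since you left
the actual errno open, and SQPOLL will still need its own handling):

static int io_check_defer_taskrun_task(struct io_ring_ctx *ctx)
{
	/* only DEFER_TASKRUN rings are restricted to one task */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		return 0;
	/* TODO: SQPOLL handling, as you note */
	if (unlikely(current != ctx->submitter_task))
		return -EEXIST; /* placeholder errno */
	return 0;
}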
> > 
> >   			break;
> >   		} while (1);
> > 
> > @@ -2768,6 +2837,8 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
> >   		}
> >   	}
> > 
> > +	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
> > +		ret |= io_run_local_work(ctx, false);
> >   	ret |= io_cancel_defer_files(ctx, task, cancel_all);
> >   	mutex_lock(&ctx->uring_lock);
> >   	ret |= io_poll_remove_all(ctx, task, cancel_all);
> > @@ -2837,7 +2908,7 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
> >   		}
> > 
> >   		prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
> > -		io_run_task_work();
> > +		io_run_task_work_ctx(ctx, true);
> 
> Here we rely on normal tw waking up the task, which is not done
> by the new local tw

Makes sense. I'll fix this up.
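
One option (hypothetical sketch, helper name invented): when work is
queued onto the ctx-local list, also wake the submitter's tctx
waitqueue, so a task sleeping in io_uring_cancel_generic() re-checks
its inflight count:

static void io_local_work_wake_cancel_waiters(struct io_ring_ctx *ctx)
{
	struct task_struct *task = READ_ONCE(ctx->submitter_task);

	/*
	 * io_uring_cancel_generic() sleeps on tctx->wait; local tw skips
	 * task_work_add(), so nothing would wake it otherwise.
	 */
	if (task && task->io_uring)
		wake_up(&task->io_uring->wait);
}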
> >   		io_uring_drop_tctx_refs(current);
> > 
> >   		/*
> > @@ -3057,10 +3128,17 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
> >   		}
> >   		if ((flags & IORING_ENTER_GETEVENTS) && ctx->syscall_iopoll)
> >   			goto iopoll_locked;
> > +		if ((flags & IORING_ENTER_GETEVENTS) &&
> > +		    (ctx->flags & IORING_SETUP_DEFER_TASKRUN)) {
> > +			io_run_local_work(ctx, true);
> > +			goto getevents_ran_local;
> > +		}
> >   		mutex_unlock(&ctx->uring_lock);
> >   	}
> > +
> >   	if (flags & IORING_ENTER_GETEVENTS) {
> >   		int ret2;
> > +
> >   		if (ctx->syscall_iopoll) {
> >   			/*
> >   			 * We disallow the app entering submit/complete with
> > @@ -3081,6 +3159,9 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
> >   			const sigset_t __user *sig;
> >   			struct __kernel_timespec __user *ts;
> > 
> > +			if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
> > +				io_run_local_work(ctx, false);
> > +getevents_ran_local:
> >   			ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
> >   			if (likely(!ret2)) {
> >   				min_complete = min(min_complete,
> > @@ -3289,17 +3370,29 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
> >   	if (ctx->flags & IORING_SETUP_SQPOLL) {
> >   		/* IPI related flags don't make sense with SQPOLL */
> >   		if (ctx->flags & (IORING_SETUP_COOP_TASKRUN |
> > -				  IORING_SETUP_TASKRUN_FLAG))
> > +				  IORING_SETUP_TASKRUN_FLAG |
> > +				  IORING_SETUP_DEFER_TASKRUN))
> >   			goto err;
> >   		ctx->notify_method = TWA_SIGNAL_NO_IPI;
> >   	} else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) {
> >   		ctx->notify_method = TWA_SIGNAL_NO_IPI;
> >   	} else {
> > -		if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
> > +		if (ctx->flags & IORING_SETUP_TASKRUN_FLAG &&
> > +		    !(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
> >   			goto err;
> >   		ctx->notify_method = TWA_SIGNAL;
> >   	}
> > 
> > +	/*
> > +	 * For DEFER_TASKRUN we require the completion task to be the same
> > +	 * as the submission task. This implies that there is only one
> > +	 * submitter, so enforce that.
> > +	 */
> > +	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN &&
> > +	    !(ctx->flags & IORING_SETUP_SINGLE_ISSUER)) {
> > +		goto err;
> > +	}
> > +
> >   	/*
> >   	 * This is just grabbed for accounting purposes. When a process exits,
> >   	 * the mm is exited and dropped before the files, hence we need to hang
> > @@ -3400,7 +3493,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
> >   			IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL |
> >   			IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
> >   			IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
> > -			IORING_SETUP_SINGLE_ISSUER))
> > +			IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN))
> >   		return -EINVAL;
> > 
> >   	return io_uring_create(entries, &p, params);
> > @@ -3872,7 +3965,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
> > 
> >   	ctx = f.file->private_data;
> > 
> > -	io_run_task_work();
> > +	io_run_task_work_ctx(ctx, true);
> > 
> >   	mutex_lock(&ctx->uring_lock);
> >   	ret = __io_uring_register(ctx, opcode, arg, nr_args);
> > diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
> > index 2f73f83af960..4d0e9c2e00d0 100644
> > --- a/io_uring/io_uring.h
> > +++ b/io_uring/io_uring.h
> > @@ -26,7 +26,8 @@ enum {
> > 
> >   struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx);
> >   bool io_req_cqe_overflow(struct io_kiocb *req);
> > -int io_run_task_work_sig(void);
> > +int io_run_task_work_sig(struct io_ring_ctx *ctx);
> > +bool io_run_local_work(struct io_ring_ctx *ctx, bool locked);
> >   void io_req_complete_failed(struct io_kiocb *req, s32 res);
> >   void __io_req_complete(struct io_kiocb *req, unsigned issue_flags);
> >   void io_req_complete_post(struct io_kiocb *req);
> > @@ -234,6 +235,33 @@ static inline bool io_run_task_work(void)
> >   	return false;
> >   }
> > 
> > +static inline bool io_run_task_work_ctx(struct io_ring_ctx *ctx, bool all)
> > +{
> > +	bool ret = false;
> 
> I'd rather prefer to have two inlined back to back calls like
> 
> io_run_local_work();
> io_run_task_work();
> 
> instead of passing true/false, which are always confusing.

Will do.
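
Something like this, I imagine (sketch only; local work runs first so
that anything it pushes to normal task work still gets picked up):

static inline bool io_run_task_work_ctx(struct io_ring_ctx *ctx)
{
	bool ret = false;

	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
		ret = io_run_local_work(ctx, false);
	/* run after the local work in case more was queued */
	ret |= io_run_task_work();
	return ret;
}

Thanks,
Dylan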