While profiling task_work intensive workloads, I noticed that most of
the time in tctx_task_work() is spent stalled on loading 'req'. This is
one of the unfortunate side effects of using linked lists, particularly
when they end up being passed around.

Move the list entry to where the rest of the data we care about is, and
prefetch the next entry while iterating the list and processing the
work items.

This reduces tctx_task_work() overhead from ~3% to 1-1.5% in my testing.

Signed-off-by: Jens Axboe <axboe@xxxxxxxxx>

---

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 17a76bc04344..5cdd3a6c9268 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -933,6 +933,8 @@ struct io_kiocb {
 	struct io_ring_ctx		*ctx;
 	struct task_struct		*task;
 
+	struct io_task_work		io_task_work;
+
 	struct percpu_ref		*fixed_rsrc_refs;
 	/* store used ubuf, so we can prevent reloading */
 	struct io_mapped_ubuf		*imu;
@@ -942,7 +944,6 @@ struct io_kiocb {
 	atomic_t			refs;
 	atomic_t			poll_refs;
 	struct io_kiocb			*link;
-	struct io_task_work		io_task_work;
 	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
 	struct hlist_node		hash_node;
 	/* internal polling, see IORING_FEAT_FAST_POLL */
@@ -2483,6 +2486,9 @@ static void handle_tw_list(struct io_wq_work_node *node,
 		struct io_kiocb *req = container_of(node, struct io_kiocb,
 						    io_task_work.node);
 
+		/* fetch next entry early to avoid stalling */
+		prefetch(next);
+
 		if (req->ctx != *ctx) {
 			ctx_flush_and_put(*ctx, locked);
 			*ctx = req->ctx;
-- 
Jens Axboe
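
As an aside for readers outside the kernel tree, below is a minimal,
standalone sketch of the same prefetch-while-iterating idea on a singly
linked list. The types and helpers here (work_node, process_item(),
handle_list()) are made up for illustration and are not io_uring code;
in userspace, prefetch() is approximated with the compiler's
__builtin_prefetch().

#include <stdio.h>

/*
 * Illustrative stand-in, not an io_uring type: a singly linked list
 * node with its payload kept right next to the link, mirroring the
 * idea of placing io_task_work near the data the handler touches.
 */
struct work_node {
	struct work_node *next;
	int payload;
};

/* Userspace approximation of the kernel's prefetch() helper. */
#define prefetch(x) __builtin_prefetch(x)

static void process_item(struct work_node *req)
{
	/* Placeholder for the real work (e.g. a task_work callback). */
	printf("processing %d\n", req->payload);
}

static void handle_list(struct work_node *node)
{
	while (node) {
		struct work_node *next = node->next;

		/*
		 * Start pulling the next node into cache while we are
		 * still busy with the current one, so the load at the
		 * top of the next iteration is less likely to stall.
		 */
		prefetch(next);

		process_item(node);
		node = next;
	}
}

int main(void)
{
	struct work_node c = { NULL, 3 };
	struct work_node b = { &c, 2 };
	struct work_node a = { &b, 1 };

	handle_list(&a);
	return 0;
}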