While profiling task_work intensive workloads, I noticed that most of
the time in tctx_task_work() is spent stalled on loading 'req'. This is
one of the unfortunate side effects of using linked lists, particularly
when they end up being passed around.

Move the list entry to where the rest of the data we care about is, and
prefetch the next entry while iterating the list and processing the
work items.

This reduces tctx_task_work() overhead from ~3% to 1-1.5% in my testing.

Signed-off-by: Jens Axboe <axboe@xxxxxxxxx>

---

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 17a76bc04344..5cdd3a6c9268 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -933,6 +933,8 @@ struct io_kiocb {
 	struct io_ring_ctx		*ctx;
 	struct task_struct		*task;
 
+	struct io_task_work		io_task_work;
+
 	struct percpu_ref		*fixed_rsrc_refs;
 	/* store used ubuf, so we can prevent reloading */
 	struct io_mapped_ubuf		*imu;
@@ -942,7 +944,6 @@ struct io_kiocb {
 	atomic_t			refs;
 	atomic_t			poll_refs;
 	struct io_kiocb			*link;
-	struct io_task_work		io_task_work;
 	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
 	struct hlist_node		hash_node;
 	/* internal polling, see IORING_FEAT_FAST_POLL */
@@ -2483,6 +2486,9 @@ static void handle_tw_list(struct io_wq_work_node *node,
 		struct io_kiocb *req = container_of(node, struct io_kiocb,
 						    io_task_work.node);
 
+		/* fetch next entry early to avoid stalling */
+		prefetch(next);
+
 		if (req->ctx != *ctx) {
 			ctx_flush_and_put(*ctx, locked);
 			*ctx = req->ctx;
-- 
Jens Axboe
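
As an aside for readers outside the kernel tree, below is a minimal,
standalone sketch of the same prefetch-while-iterating idea on a singly
linked list. The types and helpers here (work_node, process_item(),
handle_list()) are made up for illustration and are not io_uring code;
in userspace, prefetch() is approximated with the compiler's
__builtin_prefetch().

#include <stdio.h>

/*
 * Illustrative stand-in, not an io_uring type: a singly linked list
 * node with its payload kept right next to the link, mirroring the
 * idea of placing io_task_work near the data the handler touches.
 */
struct work_node {
	struct work_node *next;
	int payload;
};

/* Userspace approximation of the kernel's prefetch() helper. */
#define prefetch(x) __builtin_prefetch(x)

static void process_item(struct work_node *req)
{
	/* Placeholder for the real work (e.g. a task_work callback). */
	printf("processing %d\n", req->payload);
}

static void handle_list(struct work_node *node)
{
	while (node) {
		struct work_node *next = node->next;

		/*
		 * Start pulling the next node into cache while we are
		 * still busy with the current one, so the load at the
		 * top of the next iteration is less likely to stall.
		 */
		prefetch(next);

		process_item(node);
		node = next;
	}
}

int main(void)
{
	struct work_node c = { NULL, 3 };
	struct work_node b = { &c, 2 };
	struct work_node a = { &b, 1 };

	handle_list(&a);
	return 0;
}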