Re: [PATCH 2/4] io_uring: switch deferred task_work to an io_wq_work_list

On 3/26/24 18:42, Jens Axboe wrote:
Lockless lists may be handy for some things, but they mean that items
end up in reverse order, as we can only add to the head of the list.
That in turn means that iterating items on the list needs to reverse it
first, if the processing is sensitive to the ordering between items.

Switch the DEFER_TASKRUN work list from an llist to a normal
io_wq_work_list, and protect it with a lock. Then we can get rid of the
manual reversing of the list when running it, which costs considerable
cycles, particularly for bursty task_work additions.
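
For illustration only (this sketch is not part of the patch, nor kernel
code; all names here are made up), the difference described above looks
roughly like this in plain C11/userspace terms:

#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

struct node {
	struct node *next;
};

/* llist-style scheme: lock-free push to the head, so LIFO order */
static _Atomic(struct node *) stack_head;

static void stack_push(struct node *n)
{
	struct node *head = atomic_load(&stack_head);

	do {
		n->next = head;
	} while (!atomic_compare_exchange_weak(&stack_head, &head, n));
}

/*
 * The consumer detaches the whole chain and must reverse it to process
 * items in the order they were queued -- the cost the patch removes.
 */
static struct node *stack_pop_all_in_order(void)
{
	struct node *node = atomic_exchange(&stack_head, NULL);
	struct node *in_order = NULL;

	while (node) {
		struct node *next = node->next;

		node->next = in_order;
		in_order = node;
		node = next;
	}
	return in_order;
}

/* locked-list scheme: append at the tail, already FIFO, no reversal */
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *list_first, **list_tail = &list_first;

static void list_add_tail_locked(struct node *n)
{
	pthread_mutex_lock(&list_lock);
	n->next = NULL;
	*list_tail = n;
	list_tail = &n->next;
	pthread_mutex_unlock(&list_lock);
}

(The patch itself uses io_wq_work_list under spin_lock_irqsave() rather
than a mutex; the sketch only shows why the tail-append variant does not
need the reversal pass.)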

Signed-off-by: Jens Axboe <axboe@xxxxxxxxx>
---
  include/linux/io_uring_types.h |  11 ++--
  io_uring/io_uring.c            | 117 ++++++++++++---------------------
  io_uring/io_uring.h            |   4 +-
  3 files changed, 51 insertions(+), 81 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index aeb4639785b5..e51bf15196e4 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -329,7 +329,9 @@ struct io_ring_ctx {
  	 * regularly bounce b/w CPUs.

...

-static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
+static inline void io_req_local_work_add(struct io_kiocb *req, unsigned tw_flags)
  {
  	struct io_ring_ctx *ctx = req->ctx;
-	unsigned nr_wait, nr_tw, nr_tw_prev;
-	struct llist_node *head;
+	unsigned nr_wait, nr_tw;
+	unsigned long flags;

  	/* See comment above IO_CQ_WAKE_INIT */
  	BUILD_BUG_ON(IO_CQ_WAKE_FORCE <= IORING_MAX_CQ_ENTRIES);

  	/*
-	 * We don't know how many reuqests is there in the link and whether
+	 * We don't know how many requests is there in the link and whether
  	 * they can even be queued lazily, fall back to non-lazy.
  	 */
  	if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
-		flags &= ~IOU_F_TWQ_LAZY_WAKE;
-
-	head = READ_ONCE(ctx->work_llist.first);
-	do {
-		nr_tw_prev = 0;
-		if (head) {
-			struct io_kiocb *first_req = container_of(head,
-							struct io_kiocb,
-							io_task_work.node);
-			/*
-			 * Might be executed at any moment, rely on
-			 * SLAB_TYPESAFE_BY_RCU to keep it alive.
-			 */
-			nr_tw_prev = READ_ONCE(first_req->nr_tw);
-		}
-
-		/*
-		 * Theoretically, it can overflow, but that's fine as one of
-		 * previous adds should've tried to wake the task.
-		 */
-		nr_tw = nr_tw_prev + 1;
-		if (!(flags & IOU_F_TWQ_LAZY_WAKE))
-			nr_tw = IO_CQ_WAKE_FORCE;

Aren't you just killing the entire IOU_F_TWQ_LAZY_WAKE handling?
nr_tw is assigned IO_CQ_WAKE_FORCE so that it always passes the
nr_wait check before the wake_up below.
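
To make that concrete, here is a condensed, self-contained restatement
of the removed decision logic quoted above (the macro values below are
placeholders for illustration, not the kernel's definitions):

#include <stdbool.h>

#define IO_CQ_WAKE_FORCE	(~0u)	/* placeholder: always >= nr_wait */
#define IOU_F_TWQ_LAZY_WAKE	1u	/* placeholder value */

static bool old_should_wake(unsigned flags, unsigned nr_tw_prev,
			    unsigned nr_wait)
{
	unsigned nr_tw = nr_tw_prev + 1;

	/*
	 * A non-lazy add forces nr_tw high enough to always pass the
	 * nr_tw >= nr_wait test below, so the waiter gets woken no
	 * matter how many requests were already queued.
	 */
	if (!(flags & IOU_F_TWQ_LAZY_WAKE))
		nr_tw = IO_CQ_WAKE_FORCE;

	if (nr_tw < nr_wait)		/* not enough, or no one waiting */
		return false;
	if (nr_tw_prev >= nr_wait)	/* a previous add already woke it */
		return false;
	return true;
}

In the new version quoted below, nr_tw is just the item count, so a
non-lazy add no longer forces the wakeup -- which is the point being
raised here.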

+		tw_flags &= ~IOU_F_TWQ_LAZY_WAKE;
-		req->nr_tw = nr_tw;
-		req->io_task_work.node.next = head;
-	} while (!try_cmpxchg(&ctx->work_llist.first, &head,
-			      &req->io_task_work.node));
+	spin_lock_irqsave(&ctx->work_lock, flags);
+	wq_list_add_tail(&req->io_task_work.node, &ctx->work_list);
+	nr_tw = ++ctx->work_items;
+	spin_unlock_irqrestore(&ctx->work_lock, flags);

An smp_mb() is needed here, see the comment below; and fwiw,
smp_mb__after_atomic() would not work.
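
For readers following along, the pairing the quoted comment below talks
about is the classic store-buffering pattern; a userspace C11 sketch
(illustrative only, not kernel code, names made up):

#include <stdatomic.h>
#include <stdbool.h>

static _Atomic int work_queued;	/* stands in for the work list */
static _Atomic int waiters;	/* stands in for ctx->cq_wait_nr */

/* adder: publish the work, then check whether anyone needs waking */
static bool adder_sees_waiter(void)
{
	atomic_store_explicit(&work_queued, 1, memory_order_relaxed);
	/*
	 * Full fence: the store above must be visible before the load
	 * below.  try_cmpxchg() gave this ordering for free, while a
	 * spin_unlock() is only a release and does not order a prior
	 * store against a later load -- hence the explicit smp_mb().
	 */
	atomic_thread_fence(memory_order_seq_cst);
	return atomic_load_explicit(&waiters, memory_order_relaxed) > 0;
}

/* waiter: announce the wait, then re-check for work before sleeping */
static bool waiter_may_sleep(void)
{
	atomic_store_explicit(&waiters, 1, memory_order_relaxed);
	/* in the kernel this side's ordering comes from set_current_state() */
	atomic_thread_fence(memory_order_seq_cst);
	return atomic_load_explicit(&work_queued, memory_order_relaxed) == 0;
}

Without the full fence on the adder side, both sides can read the
other's old value: the adder sees no waiter, the waiter sees no work and
goes to sleep, and the wakeup is lost.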

  	/*
  	 * cmpxchg implies a full barrier, which pairs with the barrier
@@ -1289,7 +1254,7 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
  	 * is similar to the wait/wake task state sync.
  	 */
-	if (!head) {
+	if (nr_tw == 1) {
  		if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
  			atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
  		if (ctx->has_evfd)
@@ -1297,13 +1262,8 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
  	}

  	nr_wait = atomic_read(&ctx->cq_wait_nr);
-	/* not enough or no one is waiting */
-	if (nr_tw < nr_wait)
-		return;
-	/* the previous add has already woken it up */
-	if (nr_tw_prev >= nr_wait)
-		return;
-	wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE);
+	if (nr_tw >= nr_wait)
+		wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE);

IIUC, you're removing a very important optimisation, and I don't
understand why you'd do that. It used to wake up only on the transition
from "don't need to wake" to "has to be woken up", i.e. just once per
splicing of the list on the waiting side.

It seems like this version will keep pounding wake_up_state() for every
single tw queued at or beyond @nr_wait, which is quite often just 1.
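
A tiny self-contained sketch of that difference (threshold and loop are
made up for illustration; it only models the window before the waiter
splices the list):

#include <stdbool.h>
#include <stdio.h>

/* old check, per the removed lines: wake only when crossing the threshold */
static bool old_wake(unsigned nr_tw_prev, unsigned nr_tw, unsigned nr_wait)
{
	return nr_tw >= nr_wait && nr_tw_prev < nr_wait;
}

/* new check, per the added lines: wake whenever the count is at/above it */
static bool new_wake(unsigned nr_tw, unsigned nr_wait)
{
	return nr_tw >= nr_wait;
}

int main(void)
{
	unsigned nr_wait = 1;	/* waiter asked for a single completion */
	unsigned old_wakeups = 0, new_wakeups = 0;

	/* queue 8 task_work items before the waiter gets to run */
	for (unsigned nr_tw = 1; nr_tw <= 8; nr_tw++) {
		old_wakeups += old_wake(nr_tw - 1, nr_tw, nr_wait);
		new_wakeups += new_wake(nr_tw, nr_wait);
	}
	printf("old: %u wake_up call(s), new: %u\n", old_wakeups, new_wakeups);
	/* prints: old: 1 wake_up call(s), new: 8 */
	return 0;
}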

--
Pavel Begunkov



