On 9/12/24 11:49, Ming Lei wrote:
...
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -111,13 +111,15 @@
...
+static void io_complete_group_member(struct io_kiocb *req)
+{
+ struct io_kiocb *lead = get_group_leader(req);
+
+ if (WARN_ON_ONCE(!(req->flags & REQ_F_SQE_GROUP) ||
+ lead->grp_refs <= 0))
+ return;
+
+ /* member CQE needs to be posted first */
+ if (!(req->flags & REQ_F_CQE_SKIP))
+ io_req_commit_cqe(req->ctx, req);
+
+ req->flags &= ~REQ_F_SQE_GROUP;
I can't say I like this implicit state machine too much,
but let's at least add a comment explaining why the flag needs
to be cleared here. It seems it wouldn't be needed if not for
mark_last_group_member() below, which sets it back to tunnel
the leader to io_free_batch_list().
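Something along these lines would do, as a rough, untested
sketch of the wording (adjust to whatever matches the actual
design):

	/*
	 * Clear REQ_F_SQE_GROUP so the member is treated as a plain
	 * request from here on. mark_last_group_member() below sets
	 * the flag back on the final member only, as a marker for
	 * io_free_batch_list() to find and free the leader.
	 */
	req->flags &= ~REQ_F_SQE_GROUP;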
+
+ /* Set leader as failed in case of any member failed */
+ if (unlikely((req->flags & REQ_F_FAIL)))
+ req_set_fail(lead);
+
+ if (!--lead->grp_refs) {
+ mark_last_group_member(req);
+ if (!(lead->flags & REQ_F_CQE_SKIP))
+ io_req_commit_cqe(lead->ctx, lead);
+ } else if (lead->grp_refs == 1 && (lead->flags & REQ_F_SQE_GROUP)) {
+ /*
+ * The single uncompleted leader will degenerate to plain
+ * request, so group leader can be always freed via the
+ * last completed member.
+ */
+ lead->flags &= ~REQ_F_SQE_GROUP_LEADER;
What does this try to handle? A group with a leader but no
members? If that's the case, io_group_sqe() and io_submit_state_end()
just need to fail such groups (and clear REQ_F_SQE_GROUP before
that).
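E.g. a rough, untested sketch for the io_submit_state_end()
side, guessing that group.head == group.last means no members
were queued, and using req_fail_link_node() as the failing
helper (substitute whatever actually fits there):

	if (state->group.head) {
		struct io_kiocb *lead = state->group.head;

		/*
		 * Leader-only group: clear the group marker and fail
		 * it at submission instead of special casing it at
		 * completion time.
		 */
		if (state->group.last == lead) {
			lead->flags &= ~REQ_F_SQE_GROUP;
			req_fail_link_node(lead, -EINVAL);
		}
		...
	}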
+ }
+}
+
+static void io_complete_group_leader(struct io_kiocb *req)
+{
+ WARN_ON_ONCE(req->grp_refs <= 1);
+ req->flags &= ~REQ_F_SQE_GROUP;
+ req->grp_refs -= 1;
+}
+
+static void io_complete_group_req(struct io_kiocb *req)
+{
+ if (req_is_group_leader(req))
+ io_complete_group_leader(req);
+ else
+ io_complete_group_member(req);
+}
+
static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -890,7 +1005,8 @@ static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
* Handle special CQ sync cases via task_work. DEFER_TASKRUN requires
* the submitter task context, IOPOLL protects with uring_lock.
*/
- if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) {
+ if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL) ||
+ req_is_group_leader(req)) {
We'd be better off pushing all group requests to
io_req_task_complete(), not just group leaders. While this
looks correct, it just overcomplicates the request's flow: a
member can post a CQE here, but is then still expected to do
group handling in the CQE posting loop
(flush_completions -> io_complete_group_req), which might post
another CQE for the leader, followed by yet another
post-processing pass in io_free_batch_list().
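IOW, something like this (untested), just checking the group
flag here, which, if I follow the flag lifetime right, is still
set for both the leader and the members at this point:

	if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL) ||
	    (req->flags & REQ_F_SQE_GROUP)) {
		req->io_task_work.func = io_req_task_complete;
		io_req_task_work_add(req);
		return;
	}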
req->io_task_work.func = io_req_task_complete;
io_req_task_work_add(req);
return;
@@ -1388,11 +1504,43 @@ static void io_free_batch_list(struct io_ring_ctx *ctx,
comp_list);
if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) {
+ if (req_is_last_group_member(req) ||
+ req_is_group_leader(req)) {
+ struct io_kiocb *leader;
+
+ /* Leader is freed via the last member */
+ if (req_is_group_leader(req)) {
+ if (req->grp_link)
+ io_queue_group_members(req);
+ node = req->comp_list.next;
+ continue;
+ }
+
+ /*
+ * Prepare for freeing leader since we are the
+ * last group member
+ */
+ leader = get_group_leader(req);
+ leader->flags &= ~REQ_F_SQE_GROUP_LEADER;
+ req->flags &= ~REQ_F_SQE_GROUP;
+ /*
+ * Link leader to current request's next,
+ * this way works because the iterator
+ * always check the next node only.
+ *
+ * Be careful when you change the iterator
+ * in future
+ */
+ wq_stack_add_head(&leader->comp_list,
+ &req->comp_list);
+ }
+
if (req->flags & REQ_F_REFCOUNT) {
node = req->comp_list.next;
if (!req_ref_put_and_test(req))
continue;
}
+
if ((req->flags & REQ_F_POLLED) && req->apoll) {
struct async_poll *apoll = req->apoll;
@@ -1427,8 +1575,16 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
struct io_kiocb *req = container_of(node, struct io_kiocb,
comp_list);
- if (!(req->flags & REQ_F_CQE_SKIP))
- io_req_commit_cqe(ctx, req);
+ if (unlikely(req->flags & (REQ_F_CQE_SKIP | REQ_F_SQE_GROUP))) {
+ if (req->flags & REQ_F_SQE_GROUP) {
+ io_complete_group_req(req);
+ continue;
+ }
+
+ if (req->flags & REQ_F_CQE_SKIP)
+ continue;
+ }
+ io_req_commit_cqe(ctx, req);
}
__io_cq_unlock_post(ctx);
@@ -1638,8 +1794,12 @@ static u32 io_get_sequence(struct io_kiocb *req)
struct io_kiocb *cur;
/* need original cached_sq_head, but it was increased for each req */
- io_for_each_link(cur, req)
- seq--;
+ io_for_each_link(cur, req) {
+ if (req_is_group_leader(cur))
+ seq -= cur->grp_refs;
+ else
+ seq--;
+ }
return seq;
}
...
@@ -2217,8 +2470,22 @@ static void io_submit_state_end(struct io_ring_ctx *ctx)
{
struct io_submit_state *state = &ctx->submit_state;
- if (unlikely(state->link.head))
- io_queue_sqe_fallback(state->link.head);
+ if (unlikely(state->group.head || state->link.head)) {
+ /* the last member must set REQ_F_SQE_GROUP */
+ if (state->group.head) {
+ struct io_kiocb *lead = state->group.head;
+
+ state->group.last->grp_link = NULL;
+ if (lead->flags & IO_REQ_LINK_FLAGS)
+ io_link_sqe(&state->link, lead);
+ else
+ io_queue_sqe_fallback(lead);
req1(F_LINK), req2(F_GROUP), req3

is supposed to be turned into

req1 -> { group: req2 (lead), req3 }

but note that req2 (the leader) doesn't have F_LINK set itself,
so the IO_REQ_LINK_FLAGS check above falls through to
io_queue_sqe_fallback() and the group never gets linked behind
req1. I think it should be like this instead:

	if (state->link.head)
		io_link_sqe(&state->link, lead);
	else
		io_queue_sqe_fallback(lead);
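With that, the group branch would end up looking roughly like
this (untested):

	if (state->group.head) {
		struct io_kiocb *lead = state->group.head;

		state->group.last->grp_link = NULL;
		/* the link head (req1), not the leader, carries the link flags */
		if (state->link.head)
			io_link_sqe(&state->link, lead);
		else
			io_queue_sqe_fallback(lead);
	}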
+ }
+
+ if (unlikely(state->link.head))
+ io_queue_sqe_fallback(state->link.head);
+ }
+
/* flush only after queuing links as they can generate completions */
io_submit_flush_completions(ctx);
if (state->plug_started)
--
Pavel Begunkov