On Fri, Oct 04, 2024 at 02:12:28PM +0100, Pavel Begunkov wrote:
> On 9/12/24 11:49, Ming Lei wrote:
> ...
> > --- a/io_uring/io_uring.c
> > +++ b/io_uring/io_uring.c
> > @@ -111,13 +111,15 @@
> ...
> > +static void io_complete_group_member(struct io_kiocb *req)
> > +{
> > +	struct io_kiocb *lead = get_group_leader(req);
> > +
> > +	if (WARN_ON_ONCE(!(req->flags & REQ_F_SQE_GROUP) ||
> > +			 lead->grp_refs <= 0))
> > +		return;
> > +
> > +	/* member CQE needs to be posted first */
> > +	if (!(req->flags & REQ_F_CQE_SKIP))
> > +		io_req_commit_cqe(req->ctx, req);
> > +
> > +	req->flags &= ~REQ_F_SQE_GROUP;
> 
> I can't say I like this implicit state machine too much,
> but let's add a comment on why we need to clear it, i.e.
> it seems it wouldn't be needed if not for the
> mark_last_group_member() below that puts it back to tunnel
> the leader to io_free_batch_list().

Yeah, the main purpose is to reuse the flag for marking the last member;
I will add a comment for this usage.

> 
> > +
> > +	/* Set leader as failed in case of any member failed */
> > +	if (unlikely((req->flags & REQ_F_FAIL)))
> > +		req_set_fail(lead);
> > +
> > +	if (!--lead->grp_refs) {
> > +		mark_last_group_member(req);
> > +		if (!(lead->flags & REQ_F_CQE_SKIP))
> > +			io_req_commit_cqe(lead->ctx, lead);
> > +	} else if (lead->grp_refs == 1 && (lead->flags & REQ_F_SQE_GROUP)) {
> > +		/*
> > +		 * The single uncompleted leader will degenerate to plain
> > +		 * request, so group leader can be always freed via the
> > +		 * last completed member.
> > +		 */
> > +		lead->flags &= ~REQ_F_SQE_GROUP_LEADER;
> 
> What does this try to handle? A group with a leader but no
> members? If that's the case, io_group_sqe() and io_submit_state_end()
> just need to fail such groups (and clear REQ_F_SQE_GROUP before
> that).

The code block allows the leader and members to be issued concurrently,
but we have changed to always issue members after the leader is
completed, so the above code can be removed now.

> 
> > +	}
> > +}
> > +
> > +static void io_complete_group_leader(struct io_kiocb *req)
> > +{
> > +	WARN_ON_ONCE(req->grp_refs <= 1);
> > +	req->flags &= ~REQ_F_SQE_GROUP;
> > +	req->grp_refs -= 1;
> > +}
> > +
> > +static void io_complete_group_req(struct io_kiocb *req)
> > +{
> > +	if (req_is_group_leader(req))
> > +		io_complete_group_leader(req);
> > +	else
> > +		io_complete_group_member(req);
> > +}
> > +
> >  static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
> >  {
> >  	struct io_ring_ctx *ctx = req->ctx;
> > @@ -890,7 +1005,8 @@ static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
> >  	 * Handle special CQ sync cases via task_work. DEFER_TASKRUN requires
> >  	 * the submitter task context, IOPOLL protects with uring_lock.
> >  	 */
> > -	if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) {
> > +	if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL) ||
> > +	    req_is_group_leader(req)) {
> 
> We're better off pushing all group requests to io_req_task_complete(),
> not just the group leader. While it seems to be correct, that just
> overcomplicates the request's flow: it can post a CQE here, but then
> still expects to do group stuff in the CQE posting loop
> (flush_completions -> io_complete_group_req), which might post another
> CQE for the leader, and then do yet another post-processing loop in
> io_free_batch_list().

OK, it is simpler to complete all group requests via tw.
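Something like the delta below should be enough (just a rough, untested
sketch of the same hunk in io_req_complete_post(); it relies on
REQ_F_SQE_GROUP being set on both the leader and the members at this
point, as the flush path already assumes):

	if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL) ||
	    (req->flags & REQ_F_SQE_GROUP)) {
		/* defer the whole group to task_work completion */
		req->io_task_work.func = io_req_task_complete;
		io_req_task_work_add(req);
		return;
	}

Then all of the member/leader CQE ordering is handled in one place,
__io_submit_flush_completions().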
> 
> >  		req->io_task_work.func = io_req_task_complete;
> >  		io_req_task_work_add(req);
> >  		return;
> > @@ -1388,11 +1504,43 @@ static void io_free_batch_list(struct io_ring_ctx *ctx,
> >  						    comp_list);
> >  		if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) {
> > +			if (req_is_last_group_member(req) ||
> > +			    req_is_group_leader(req)) {
> > +				struct io_kiocb *leader;
> > +
> > +				/* Leader is freed via the last member */
> > +				if (req_is_group_leader(req)) {
> > +					if (req->grp_link)
> > +						io_queue_group_members(req);
> > +					node = req->comp_list.next;
> > +					continue;
> > +				}
> > +
> > +				/*
> > +				 * Prepare for freeing leader since we are the
> > +				 * last group member
> > +				 */
> > +				leader = get_group_leader(req);
> > +				leader->flags &= ~REQ_F_SQE_GROUP_LEADER;
> > +				req->flags &= ~REQ_F_SQE_GROUP;
> > +				/*
> > +				 * Link leader to current request's next,
> > +				 * this way works because the iterator
> > +				 * always check the next node only.
> > +				 *
> > +				 * Be careful when you change the iterator
> > +				 * in future
> > +				 */
> > +				wq_stack_add_head(&leader->comp_list,
> > +						  &req->comp_list);
> > +			}
> > +
> >  			if (req->flags & REQ_F_REFCOUNT) {
> >  				node = req->comp_list.next;
> >  				if (!req_ref_put_and_test(req))
> >  					continue;
> >  			}
> > +
> >  			if ((req->flags & REQ_F_POLLED) && req->apoll) {
> >  				struct async_poll *apoll = req->apoll;
> > @@ -1427,8 +1575,16 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
> >  		struct io_kiocb *req = container_of(node, struct io_kiocb,
> >  						    comp_list);
> > -		if (!(req->flags & REQ_F_CQE_SKIP))
> > -			io_req_commit_cqe(ctx, req);
> > +		if (unlikely(req->flags & (REQ_F_CQE_SKIP | REQ_F_SQE_GROUP))) {
> > +			if (req->flags & REQ_F_SQE_GROUP) {
> > +				io_complete_group_req(req);
> > +				continue;
> > +			}
> > +
> > +			if (req->flags & REQ_F_CQE_SKIP)
> > +				continue;
> > +		}
> > +		io_req_commit_cqe(ctx, req);
> >  	}
> >  	__io_cq_unlock_post(ctx);
> > @@ -1638,8 +1794,12 @@ static u32 io_get_sequence(struct io_kiocb *req)
> >  	struct io_kiocb *cur;
> >  	/* need original cached_sq_head, but it was increased for each req */
> > -	io_for_each_link(cur, req)
> > -		seq--;
> > +	io_for_each_link(cur, req) {
> > +		if (req_is_group_leader(cur))
> > +			seq -= cur->grp_refs;
> > +		else
> > +			seq--;
> > +	}
> >  	return seq;
> >  }
> ...
> > @@ -2217,8 +2470,22 @@ static void io_submit_state_end(struct io_ring_ctx *ctx)
> >  {
> >  	struct io_submit_state *state = &ctx->submit_state;
> > -	if (unlikely(state->link.head))
> > -		io_queue_sqe_fallback(state->link.head);
> > +	if (unlikely(state->group.head || state->link.head)) {
> > +		/* the last member must set REQ_F_SQE_GROUP */
> > +		if (state->group.head) {
> > +			struct io_kiocb *lead = state->group.head;
> > +
> > +			state->group.last->grp_link = NULL;
> > +			if (lead->flags & IO_REQ_LINK_FLAGS)
> > +				io_link_sqe(&state->link, lead);
> > +			else
> > +				io_queue_sqe_fallback(lead);
> 
> req1(F_LINK), req2(F_GROUP), req3
> 
> is supposed to be turned into
> 
> req1 -> {group: req2 (lead), req3 }
> 
> but note that req2 here doesn't have F_LINK set.
> I think it should be like this instead:
> 
> if (state->link.head)
> 	io_link_sqe();
> else
> 	io_queue_sqe_fallback(lead);

Indeed, the above change is correct; a rough sketch of the resulting
hunk is appended below.

Thanks,
Ming
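For reference, a rough and untested sketch of how the io_submit_state_end()
branch would look with that change folded in (only the condition differs
from the quoted hunk above):

	/* the last member must set REQ_F_SQE_GROUP */
	if (state->group.head) {
		struct io_kiocb *lead = state->group.head;

		state->group.last->grp_link = NULL;
		/* append the group leader to a pending link chain, if any */
		if (state->link.head)
			io_link_sqe(&state->link, lead);
		else
			io_queue_sqe_fallback(lead);
	}

This way a group that follows a linked request gets chained via
state->link.head itself, instead of relying on F_LINK being set on the
group leader.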