> > > > +	return !--lead->grp_refs;
> > > > +}
> > > > +
> > > > +static inline bool leader_is_the_last(struct io_kiocb *lead)
> > > > +{
> > > > +	return lead->grp_refs == 1 && (lead->flags & REQ_F_SQE_GROUP);
> > > > +}
> > > > +
> > > > +static void io_complete_group_member(struct io_kiocb *req)
> > > > +{
> > > > +	struct io_kiocb *lead = get_group_leader(req);
> > > > +
> > > > +	if (WARN_ON_ONCE(!(req->flags & REQ_F_SQE_GROUP)))
> > > > +		return;
> > > > +
> > > > +	/* member CQE needs to be posted first */
> > > > +	if (!(req->flags & REQ_F_CQE_SKIP))
> > > > +		io_req_commit_cqe(req->ctx, req);
> > > > +
> > > > +	if (__io_complete_group_member(req, lead)) {
> > > > +		/*
> > > > +		 * SQE_GROUP flag is kept for the last member, so the leader
> > > > +		 * can be retrieved & freed from this last member
> > > > +		 */
> > > > +		req->flags |= REQ_F_SQE_GROUP;
> >
> > 'req' is the last completed request, so mark it as the last one
> > by reusing REQ_F_SQE_GROUP, so we can free the group leader in
> > io_free_batch_list() when observing the last flag.
> >
> > But it should have been documented.
> >
> > > > +		if (!(lead->flags & REQ_F_CQE_SKIP))
> > > > +			io_req_commit_cqe(lead->ctx, lead);
> > > > +	} else if (leader_is_the_last(lead)) {
> > > > +		/* leader will degenerate to plain req if it is the last */
> > > > +		lead->flags &= ~(REQ_F_SQE_GROUP | REQ_F_SQE_GROUP_LEADER);
> > >
> > > What's this chunk about?
> >
> > The leader becomes the only request not completed in the group, so it is
> > degenerated into a normal one by clearing the two flags. This simplifies
> > the logic for completing the group leader.
> >
> ...
> >
> > > > @@ -1388,11 +1501,33 @@ static void io_free_batch_list(struct io_ring_ctx *ctx,
> > > >  						    comp_list);
> > > >  		if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) {
> > > > +			if (req->flags & (REQ_F_SQE_GROUP |
> > > > +					  REQ_F_SQE_GROUP_LEADER)) {
> > > > +				struct io_kiocb *leader;
> > > > +
> > > > +				/* Leader is freed via the last member */
> > > > +				if (req_is_group_leader(req)) {
> > > > +					node = req->comp_list.next;
> > > > +					continue;
> > > > +				}
> > > > +
> > > > +				/*
> > > > +				 * Only the last member keeps GROUP flag,
> > > > +				 * free leader and this member together
> > > > +				 */
> > > > +				leader = get_group_leader(req);
> > > > +				leader->flags &= ~REQ_F_SQE_GROUP_LEADER;
> > > > +				req->flags &= ~REQ_F_SQE_GROUP;
> > > > +				wq_stack_add_head(&leader->comp_list,
> > > > +						  &req->comp_list);
> > >
> > > That's quite hacky, but at least we can replace it with
> > > task work if it gets in the way later on.
> >
> > io_free_batch_list() is already called in task context, and it isn't
> > necessary to schedule one extra tw, which would hurt perf more or less.
> >
> > Another way is to store these leaders in one temp list, and
> > call io_free_batch_list() for this temp list one more time.
>
> What I'm saying is, it's fine to leave it as is for now. In the
> future, if it becomes a problem for some reason or another, we can
> do it the task_work-like way.

OK, got it. I will leave it as is and document the potential risk for
future changes.

> ...
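BTW, to make the refcount rule above easier to follow, here is a minimal
userspace model of the completion ordering. This is only a hypothetical
sketch, not the kernel code: post_cqe() stands in for io_req_commit_cqe(),
the 'completed' field stands in for the REQ_F_SQE_GROUP check inside
leader_is_the_last(), and "one ref for the leader plus one per member" is
my reading of how grp_refs ends up initialized.

#include <stdio.h>
#include <stdbool.h>

struct req {
	const char *name;
	int grp_refs;		/* leader only: one for itself + one per member */
	bool completed;		/* has the leader itself completed? */
	struct req *grp_leader;	/* member -> leader */
};

/* stand-in for io_req_commit_cqe() */
static void post_cqe(const struct req *r)
{
	printf("CQE: %s\n", r->name);
}

static void complete_member(struct req *m)
{
	struct req *lead = m->grp_leader;

	/* member CQE is posted first, like in io_complete_group_member() */
	post_cqe(m);

	if (!--lead->grp_refs) {
		/*
		 * Leader had already completed: this member is the last
		 * one, it posts the leader's CQE and both are freed
		 * together.
		 */
		post_cqe(lead);
	} else if (lead->grp_refs == 1 && !lead->completed) {
		/* all members done, only the leader itself remains */
		printf("%s degenerates to a normal request\n", lead->name);
	}
}

int main(void)
{
	struct req lead = { .name = "leader", .grp_refs = 3 }; /* self + 2 members */
	struct req m1 = { .name = "member1", .grp_leader = &lead };
	struct req m2 = { .name = "member2", .grp_leader = &lead };

	complete_member(&m1);	/* refs: 3 -> 2 */
	complete_member(&m2);	/* refs: 2 -> 1, leader degenerates */

	/* the degenerated leader now completes like any plain request */
	lead.completed = true;
	post_cqe(&lead);
	return 0;
}

Only the ordering matters here: member CQEs are posted first, and the
leader is either finished by the last member or, if it outlives all its
members, falls back to the normal completion path.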
> > > > @@ -2101,6 +2251,62 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
> > > >  	return def->prep(req, sqe);
> > > >  }
> > > > +static struct io_kiocb *io_group_sqe(struct io_submit_link *group,
> > > > +				     struct io_kiocb *req)
> > > > +{
> > > > +	/*
> > > > +	 * Group chain is similar with link chain: starts with 1st sqe with
> > > > +	 * REQ_F_SQE_GROUP, and ends with the 1st sqe without REQ_F_SQE_GROUP
> > > > +	 */
> > > > +	if (group->head) {
> > > > +		struct io_kiocb *lead = group->head;
> > > > +
> > > > +		/* members can't be in link chain, can't be drained */
> > > > +		if (req->flags & (IO_REQ_LINK_FLAGS | REQ_F_IO_DRAIN))
> > > > +			req_fail_link_node(lead, -EINVAL);
> > >
> > > That should fail the entire link (if any) as well.
> >
> > Good catch, here we should fail the link head by following the logic
> > in io_submit_fail_init().
> >
> > > I have even more doubts we even want to mix links and groups. Apart
> >
> > Wrt. ublk, the group provides zero copy, and the ublk io (group) is generic
> > IO; sometimes IO_LINK is really needed & helpful, such as in ublk-nbd, where
> > send(tcp) requests need to be linked & zc. And we shouldn't limit IO_LINK
> > for generic io_uring IO.
> >
> > > from nuances as such, which would be quite hard to track, the semantics
> > > of IOSQE_CQE_SKIP_SUCCESS is unclear.
> >
> > An IO group just follows every normal request.
>
> It tries to mimic that, but groups don't and essentially can't do it the
> same way, at least in some aspects. E.g. IOSQE_CQE_SKIP_SUCCESS
> usually means that all following will be silenced. What if a
> member is CQE_SKIP, should it stop the leader from posting a CQE?
> And whatever the answer is, it'll be different from the link's
> behaviour.

Here it looks easier than for links:

- only the leader's IOSQE_CQE_SKIP_SUCCESS follows the linked request's rule
- each member just respects the flag for itself, independent of the leader's

>
> Regardless, let's forbid IOSQE_CQE_SKIP_SUCCESS and linked timeouts
> for groups, that can be discussed afterwards.

It should be easy to forbid IOSQE_CQE_SKIP_SUCCESS, which is per-sqe;
I will do it in V6.

I am not sure if it is easy to disallow IORING_OP_LINK_TIMEOUT, which
covers all linked sqes, and the group leader could be just one of them.
Can you share any idea about the implementation to forbid LINK_TIMEOUT
for an sqe group?

>
> > 1) fail in linked chain
> > - follows IO_LINK's behavior since io_fail_links() covers the io group
> >
> > 2) otherwise
> > - just respect IOSQE_CQE_SKIP_SUCCESS
> >
> > > And also it doesn't work with IORING_OP_LINK_TIMEOUT.
> >
> > REQ_F_LINK_TIMEOUT can work on the whole group (or group leader) only,
> > and I will document it in V6.
>
> It would still be troublesome. When a linked timeout fires it searches
> for the request it's attached to and cancels it; however, group leaders
> that queued up their members aren't discoverable. But let's say you can
> find them in some way, then the only sensible thing to do is cancel
> members, which should be doable by checking req->grp_leader, but it might
> be easier to leave it to follow-up patches.

We have changed sqe group to start queueing members only after the leader
is completed. A link timeout will cancel the leader together with all its
members via leader->grp_link, so this behavior should respect
IORING_OP_LINK_TIMEOUT completely. Please see io_fail_links() and
io_cancel_group_members().
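To make the last paragraph concrete, the member cancelling can be pictured
roughly like the sketch below. This is just a sketch from my side, not
necessarily the exact io_cancel_group_members() in the patchset, and the
final completion/queueing of each failed member is left out:

static void io_cancel_group_members(struct io_kiocb *lead, int res)
{
	struct io_kiocb *member = lead->grp_link;

	/* members haven't been queued yet, so walking grp_link is safe */
	while (member) {
		struct io_kiocb *next = member->grp_link;

		req_set_fail(member);
		io_req_set_res(member, res, 0);
		/* actual completion of the failed member omitted here */
		member = next;
	}
}

This works because, as said above, members are only queued once the leader
completes, so a timeout firing before that can still reach every member
through leader->grp_link.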
> > > > +
> > > > +		lead->grp_refs += 1;
> > > > +		group->last->grp_link = req;
> > > > +		group->last = req;
> > > > +
> > > > +		if (req->flags & REQ_F_SQE_GROUP)
> > > > +			return NULL;
> > > > +
> > > > +		req->grp_link = NULL;
> > > > +		req->flags |= REQ_F_SQE_GROUP;
> > > > +		group->head = NULL;
> > > > +		if (lead->flags & REQ_F_FAIL) {
> > > > +			io_queue_sqe_fallback(lead);
> > >
> > > Let's say the group was in the middle of a link, it'll
> > > complete that group and continue with assembling / executing
> > > the link when it should've failed it and honoured the
> > > request order.
> >
> > OK, here we can simply remove the above two lines, and the link submit
> > state can handle this failure in the link chain.
>
> If you just delete them then nobody would check for REQ_F_FAIL and
> fail the request.

io_link_assembling() & io_link_sqe() check for REQ_F_FAIL and call
io_queue_sqe_fallback(), whether the request is in a link chain or not.

> Assuming you'd also set the fail flag on the
> link head when appropriate, how about deleting these two lines
> and doing it like below? (can be further prettified)
>
> bool io_group_assembling()
> {
> 	return state->group.head || (req->flags & REQ_F_SQE_GROUP);
> }
> bool io_link_assembling()
> {
> 	return state->link.head || (req->flags & IO_REQ_LINK_FLAGS);
> }
>
> static inline int io_submit_sqe()
> {
> 	...
> 	if (unlikely(io_link_assembling(state, req) ||
> 		     io_group_assembling(state, req) ||
> 		     req->flags & REQ_F_FAIL)) {
> 		if (io_group_assembling(state, req)) {
> 			req = io_group_sqe(&state->group, req);
> 			if (!req)
> 				return 0;
> 		}
> 		if (io_link_assembling(state, req)) {
> 			req = io_link_sqe(&state->link, req);
> 			if (!req)
> 				return 0;
> 		}
> 		if (req->flags & REQ_F_FAIL) {
> 			io_queue_sqe_fallback(req);
> 			return 0;

As I mentioned above, io_link_assembling() & io_link_sqe() cover the
failure handling.

Thanks,
Ming
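P.S. For reference, below is how your suggestion could look when filled
in. This is only my sketch: the io_submit_sqe() parameter list and the
fall-through fast path are assumptions, not the final code.

static inline bool io_group_assembling(const struct io_submit_state *state,
				       const struct io_kiocb *req)
{
	return state->group.head || (req->flags & REQ_F_SQE_GROUP);
}

static inline bool io_link_assembling(const struct io_submit_state *state,
				      const struct io_kiocb *req)
{
	return state->link.head || (req->flags & IO_REQ_LINK_FLAGS);
}

static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
				const struct io_uring_sqe *sqe)
	__must_hold(&ctx->uring_lock)
{
	struct io_submit_state *state = &ctx->submit_state;
	int ret;

	ret = io_init_req(ctx, req, sqe);
	if (unlikely(ret))
		return io_submit_fail_init(sqe, req, ret);

	/* slow path: group/link assembly and failed requests */
	if (unlikely(io_link_assembling(state, req) ||
		     io_group_assembling(state, req) ||
		     req->flags & REQ_F_FAIL)) {
		if (io_group_assembling(state, req)) {
			req = io_group_sqe(&state->group, req);
			if (!req)
				return 0;
		}
		if (io_link_assembling(state, req)) {
			req = io_link_sqe(&state->link, req);
			if (!req)
				return 0;
		}
		if (req->flags & REQ_F_FAIL) {
			io_queue_sqe_fallback(req);
			return 0;
		}
	}
	io_queue_sqe(req);
	return 0;
}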