On 9/12/24 11:49, Ming Lei wrote:
...
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -111,13 +111,15 @@
...
+static void io_complete_group_member(struct io_kiocb *req)
+{
+ struct io_kiocb *lead = get_group_leader(req);
+
+ if (WARN_ON_ONCE(!(req->flags & REQ_F_SQE_GROUP) ||
+ lead->grp_refs <= 0))
+ return;
+
+ /* member CQE needs to be posted first */
+ if (!(req->flags & REQ_F_CQE_SKIP))
+ io_req_commit_cqe(req->ctx, req);
+
+ req->flags &= ~REQ_F_SQE_GROUP;
I can't say I like this implicit state machine too much,
but let's at least add a comment explaining why the flag needs
to be cleared here. It seems it wouldn't be needed if not for
mark_last_group_member() below, which sets it back to tunnel
the leader to io_free_batch_list().
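Something along these lines would do, as a rough, untested
sketch of the wording (adjust to whatever matches the actual
design):

	/*
	 * Clear REQ_F_SQE_GROUP so the member is treated as a plain
	 * request from here on. mark_last_group_member() below sets
	 * the flag back on the final member only, as a marker for
	 * io_free_batch_list() to find and free the leader.
	 */
	req->flags &= ~REQ_F_SQE_GROUP;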
+
+ /* Set leader as failed in case of any member failed */
+ if (unlikely((req->flags & REQ_F_FAIL)))
+ req_set_fail(lead);
+
+ if (!--lead->grp_refs) {
+ mark_last_group_member(req);
+ if (!(lead->flags & REQ_F_CQE_SKIP))
+ io_req_commit_cqe(lead->ctx, lead);
+ } else if (lead->grp_refs == 1 && (lead->flags & REQ_F_SQE_GROUP)) {
+ /*
+ * The single uncompleted leader will degenerate to plain
+ * request, so group leader can be always freed via the
+ * last completed member.
+ */
+ lead->flags &= ~REQ_F_SQE_GROUP_LEADER;
What does this try to handle? A group with a leader but no
members? If that's the case, io_group_sqe() and io_submit_state_end()
just need to fail such groups (and clear REQ_F_SQE_GROUP before
that).
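E.g. a rough, untested sketch for the io_submit_state_end()
side, guessing that group.head == group.last means no members
were queued, and using req_fail_link_node() as the failing
helper (substitute whatever actually fits there):

	if (state->group.head) {
		struct io_kiocb *lead = state->group.head;

		/*
		 * Leader-only group: clear the group marker and fail
		 * it at submission instead of special casing it at
		 * completion time.
		 */
		if (state->group.last == lead) {
			lead->flags &= ~REQ_F_SQE_GROUP;
			req_fail_link_node(lead, -EINVAL);
		}
		...
	}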
+ }
+}
+
+static void io_complete_group_leader(struct io_kiocb *req)
+{
+ WARN_ON_ONCE(req->grp_refs <= 1);
+ req->flags &= ~REQ_F_SQE_GROUP;
+ req->grp_refs -= 1;
+}
+
+static void io_complete_group_req(struct io_kiocb *req)
+{
+ if (req_is_group_leader(req))
+ io_complete_group_leader(req);
+ else
+ io_complete_group_member(req);
+}
+
static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -890,7 +1005,8 @@ static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
* Handle special CQ sync cases via task_work. DEFER_TASKRUN requires
* the submitter task context, IOPOLL protects with uring_lock.
*/
- if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) {
+ if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL) ||
+ req_is_group_leader(req)) {
We'd be better off pushing all group requests to
io_req_task_complete(), not just group leaders. While this
looks correct, it just overcomplicates the request's flow: a
member can post a CQE here, but is then still expected to do
group handling in the CQE posting loop
(flush_completions -> io_complete_group_req), which might post
another CQE for the leader, followed by yet another
post-processing pass in io_free_batch_list().
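IOW, something like this (untested), just checking the group
flag here, which, if I follow the flag lifetime right, is still
set for both the leader and the members at this point:

	if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL) ||
	    (req->flags & REQ_F_SQE_GROUP)) {
		req->io_task_work.func = io_req_task_complete;
		io_req_task_work_add(req);
		return;
	}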
req->io_task_work.func = io_req_task_complete;
io_req_task_work_add(req);
return;
@@ -1388,11 +1504,43 @@ static void io_free_batch_list(struct io_ring_ctx *ctx,
comp_list);
if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) {
+ if (req_is_last_group_member(req) ||
+ req_is_group_leader(req)) {
+ struct io_kiocb *leader;
+
+ /* Leader is freed via the last member */
+ if (req_is_group_leader(req)) {
+ if (req->grp_link)
+ io_queue_group_members(req);
+ node = req->comp_list.next;
+ continue;
+ }
+
+ /*
+ * Prepare for freeing leader since we are the
+ * last group member
+ */
+ leader = get_group_leader(req);
+ leader->flags &= ~REQ_F_SQE_GROUP_LEADER;
+ req->flags &= ~REQ_F_SQE_GROUP;
+ /*
+ * Link leader to current request's next,
+ * this way works because the iterator
+ * always check the next node only.
+ *
+ * Be careful when you change the iterator
+ * in future
+ */
+ wq_stack_add_head(&leader->comp_list,
+ &req->comp_list);
+ }
+
if (req->flags & REQ_F_REFCOUNT) {
node = req->comp_list.next;
if (!req_ref_put_and_test(req))
continue;
}
+
if ((req->flags & REQ_F_POLLED) && req->apoll) {
struct async_poll *apoll = req->apoll;
@@ -1427,8 +1575,16 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
struct io_kiocb *req = container_of(node, struct io_kiocb,
comp_list);
- if (!(req->flags & REQ_F_CQE_SKIP))
- io_req_commit_cqe(ctx, req);
+ if (unlikely(req->flags & (REQ_F_CQE_SKIP | REQ_F_SQE_GROUP))) {
+ if (req->flags & REQ_F_SQE_GROUP) {
+ io_complete_group_req(req);
+ continue;
+ }
+
+ if (req->flags & REQ_F_CQE_SKIP)
+ continue;
+ }
+ io_req_commit_cqe(ctx, req);
}
__io_cq_unlock_post(ctx);
@@ -1638,8 +1794,12 @@ static u32 io_get_sequence(struct io_kiocb *req)
struct io_kiocb *cur;
/* need original cached_sq_head, but it was increased for each req */
- io_for_each_link(cur, req)
- seq--;
+ io_for_each_link(cur, req) {
+ if (req_is_group_leader(cur))
+ seq -= cur->grp_refs;
+ else
+ seq--;
+ }
return seq;
}
...
@@ -2217,8 +2470,22 @@ static void io_submit_state_end(struct io_ring_ctx *ctx)
{
struct io_submit_state *state = &ctx->submit_state;
- if (unlikely(state->link.head))
- io_queue_sqe_fallback(state->link.head);
+ if (unlikely(state->group.head || state->link.head)) {
+ /* the last member must set REQ_F_SQE_GROUP */
+ if (state->group.head) {
+ struct io_kiocb *lead = state->group.head;
+
+ state->group.last->grp_link = NULL;
+ if (lead->flags & IO_REQ_LINK_FLAGS)
+ io_link_sqe(&state->link, lead);
+ else
+ io_queue_sqe_fallback(lead);
req1(F_LINK), req2(F_GROUP), req3

is supposed to be turned into

req1 -> { group: req2 (lead), req3 }

but note that req2 (the leader) doesn't have F_LINK set itself,
so the IO_REQ_LINK_FLAGS check above falls through to
io_queue_sqe_fallback() and the group never gets linked behind
req1. I think it should be like this instead:

	if (state->link.head)
		io_link_sqe(&state->link, lead);
	else
		io_queue_sqe_fallback(lead);
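With that, the group branch would end up looking roughly like
this (untested):

	if (state->group.head) {
		struct io_kiocb *lead = state->group.head;

		state->group.last->grp_link = NULL;
		/* the link head (req1), not the leader, carries the link flags */
		if (state->link.head)
			io_link_sqe(&state->link, lead);
		else
			io_queue_sqe_fallback(lead);
	}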
+ }
+
+ if (unlikely(state->link.head))
+ io_queue_sqe_fallback(state->link.head);
+ }
+
/* flush only after queuing links as they can generate completions */
io_submit_flush_completions(ctx);
if (state->plug_started)
--
Pavel Begunkov