Re: [PATCH for-next 5/7] io_uring: post msg_ring CQE in task context

On 12/6/22 3:42 AM, Pavel Begunkov wrote:
> On 12/5/22 15:18, Jens Axboe wrote:
>> On 12/5/22 8:12 AM, Dylan Yudaken wrote:
>>> On Mon, 2022-12-05 at 04:53 -0700, Jens Axboe wrote:
>>>> On 12/4/22 7:44 PM, Pavel Begunkov wrote:
>>>>> We want to limit post_aux_cqe() to the task context when
>>>>> ->task_complete is set, and so we can't just deliver an
>>>>> IORING_OP_MSG_RING CQE to another thread. Instead of trying to
>>>>> invent a new delayed CQE posting mechanism, push them into the
>>>>> overflow list.
>>>>
>>>> This is really the only one out of the series that I'm not a big
>>>> fan of. If we always rely on overflow for msg_ring, then that
>>>> basically removes it from being usable in a higher performance
>>>> setting.
>>>>
>>>> The natural way to do this would be to post the cqe via task_work
>>>> for the target ring, but we also don't have any storage available
>>>> for that. Might still be better to alloc something a la
>>>>
>>>> struct tw_cqe_post {
>>>> 	struct task_work work;
>>>> 	s32 res;
>>>> 	u32 flags;
>>>> 	u64 user_data;
>>>> };
>>>>
>>>> and post it with that?
> 
> What does it change performance-wise? I need to add a patch to
> "try to flush before overflowing", but apart from that it's one
> additional allocation in both cases, and it also adds extra
> raw / non-batched task_work.

It adds an alloc+free for each one, and an overflow flush is needed on
the recipient side. It also adds a cq lock/unlock, though I don't think
that part will be a big deal.
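
To make the above concrete, a rough sketch of what the allocated
task_work approach could look like (untested; the struct grows a ctx
field on top of the earlier snippet since the callback has to know
where to post, the embedded member is a struct callback_head as that's
what task_work_add() takes, and io_tw_cqe_post()/io_msg_post_tw() are
made-up names):

struct tw_cqe_post {
	struct callback_head work;
	struct io_ring_ctx *ctx;
	s32 res;
	u32 flags;
	u64 user_data;
};

static void io_tw_cqe_post(struct callback_head *cb)
{
	struct tw_cqe_post *p = container_of(cb, struct tw_cqe_post, work);

	/* runs in the target's task context, so posting is safe;
	 * overflow handling ignored for brevity */
	io_post_aux_cqe(p->ctx, p->user_data, p->res, p->flags);
	kfree(p);
}

static int io_msg_post_tw(struct io_ring_ctx *ctx, u64 user_data,
			  s32 res, u32 cflags)
{
	struct tw_cqe_post *p;
	int ret;

	p = kmalloc(sizeof(*p), GFP_KERNEL);
	if (!p)
		return -ENOMEM;
	p->ctx = ctx;
	p->res = res;
	p->flags = cflags;
	p->user_data = user_data;
	init_task_work(&p->work, io_tw_cqe_post);
	/* queue it on the target ring's submitter task */
	ret = task_work_add(ctx->submitter_task, &p->work, TWA_SIGNAL);
	if (ret)
		kfree(p);
	return ret;
}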

>>> It might work to post the whole request to the target, post the cqe,
>>> and then return the request back to the originating ring via tw for the
>>> msg_ring CQE and cleanup.
>>
>> I did consider that, but then you need to ref that request as well as
>> bounce it twice via task_work. Probably easier to just alloc at that
>> point? Though if you do that, then the target cqe would post later than
>> the original. And potentially lose -EOVERFLOW if the target ring is
>> overflown...
> 
> Double tw is interesting for future plans, but yeah, I don't think
> it's so much of a difference in context of this series.

I did a half-assed patch for that... Below.


diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 36cb63e4174f..974eeaac313f 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1254,10 +1254,10 @@ static void io_req_local_work_add(struct io_kiocb *req)
 	__io_cqring_wake(ctx);
 }
 
-void __io_req_task_work_add(struct io_kiocb *req, bool allow_local)
+void __io_req_task_work_add_ctx(struct io_ring_ctx *ctx, struct io_kiocb *req,
+				struct task_struct *task, bool allow_local)
 {
-	struct io_uring_task *tctx = req->task->io_uring;
-	struct io_ring_ctx *ctx = req->ctx;
+	struct io_uring_task *tctx = task->io_uring;
 
 	if (allow_local && ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
 		io_req_local_work_add(req);
@@ -1277,6 +1277,11 @@ void __io_req_task_work_add(struct io_kiocb *req, bool allow_local)
 	io_fallback_tw(tctx);
 }
 
+void __io_req_task_work_add(struct io_kiocb *req, bool allow_local)
+{
+	__io_req_task_work_add_ctx(req->ctx, req, req->task, allow_local);
+}
+
 static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
 {
 	struct llist_node *node;
@@ -1865,7 +1870,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
 		return ret;
 
 	/* If the op doesn't have a file, we're not polling for it */
-	if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file)
+	if ((req->ctx->flags & IORING_SETUP_IOPOLL) && !def->noiopoll && req->file)
 		io_iopoll_req_issued(req, issue_flags);
 
 	return 0;
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index c20f15f5024d..3d24cba17504 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -51,6 +51,8 @@ static inline bool io_req_ffs_set(struct io_kiocb *req)
 }
 
 void __io_req_task_work_add(struct io_kiocb *req, bool allow_local);
+void __io_req_task_work_add_ctx(struct io_ring_ctx *ctx, struct io_kiocb *req,
+				struct task_struct *task, bool allow_local);
 bool io_is_uring_fops(struct file *file);
 bool io_alloc_async_data(struct io_kiocb *req);
 void io_req_task_queue(struct io_kiocb *req);
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index 7717fe519b07..fdc189b04d30 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -23,14 +23,41 @@ struct io_msg {
 	u32 flags;
 };
 
+static void io_msg_cqe_post(struct io_kiocb *req, bool *locked)
+{
+	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
+	struct io_ring_ctx *ctx = req->file->private_data;
+	unsigned issue_flags = 0;
+	int ret = 0;
+
+	if (!io_post_aux_cqe(ctx, msg->user_data, msg->len, msg->flags))
+		ret = -EOVERFLOW;
+
+	io_req_set_res(req, ret, 0);
+	if (!*locked)
+		issue_flags = IO_URING_F_UNLOCKED;
+	io_req_complete_post(req, issue_flags);
+}
+
+static int io_msg_post_remote(struct io_ring_ctx *ctx, struct io_kiocb *req)
+{
+	req->io_task_work.func = io_msg_cqe_post;
+	__io_req_task_work_add_ctx(ctx, req, ctx->submitter_task, true);
+	return IOU_ISSUE_SKIP_COMPLETE;
+}
+
 /* post cqes to another ring */
-static int io_msg_post_cqe(struct io_ring_ctx *ctx,
-			   u64 user_data, s32 res, u32 cflags)
+static int io_msg_post_cqe(struct io_ring_ctx *ctx, struct io_kiocb *req)
 {
-	if (!ctx->task_complete || current == ctx->submitter_task)
-		return io_post_aux_cqe(ctx, user_data, res, cflags);
-	else
-		return io_post_aux_cqe_overflow(ctx, user_data, res, cflags);
+	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
+
+	if (!ctx->task_complete || current == ctx->submitter_task) {
+		if (io_post_aux_cqe(ctx, msg->user_data, msg->len, msg->flags))
+			return 0;
+		return -EOVERFLOW;
+	}
+
+	return io_msg_post_remote(ctx, req);
 }
 
 static int io_msg_ring_data(struct io_kiocb *req)
@@ -41,10 +68,7 @@ static int io_msg_ring_data(struct io_kiocb *req)
 	if (msg->src_fd || msg->dst_fd || msg->flags)
 		return -EINVAL;
 
-	if (io_msg_post_cqe(target_ctx, msg->user_data, msg->len, 0))
-		return 0;
-
-	return -EOVERFLOW;
+	return io_msg_post_cqe(target_ctx, req);
 }
 
 static void io_double_unlock_ctx(struct io_ring_ctx *ctx,
@@ -126,8 +150,7 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
 	 * completes with -EOVERFLOW, then the sender must ensure that a
 	 * later IORING_OP_MSG_RING delivers the message.
 	 */
-	if (!io_msg_post_cqe(target_ctx, msg->user_data, msg->len, 0))
-		ret = -EOVERFLOW;
+	ret = io_msg_post_cqe(target_ctx, req);
 out_unlock:
 	io_double_unlock_ctx(ctx, target_ctx, issue_flags);
 	return ret;
@@ -173,13 +196,11 @@ int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
 		break;
 	}
 
+	if (ret == IOU_ISSUE_SKIP_COMPLETE)
+		return IOU_ISSUE_SKIP_COMPLETE;
 done:
 	if (ret < 0)
 		req_set_fail(req);
 	io_req_set_res(req, ret, 0);
-	/* put file to avoid an attempt to IOPOLL the req */
-	if (!(req->flags & REQ_F_FIXED_FILE))
-		io_put_file(req->file);
-	req->file = NULL;
 	return IOU_OK;
 }
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 83dc0f9ad3b2..638df83895fb 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -436,6 +436,7 @@ const struct io_op_def io_op_defs[] = {
 	[IORING_OP_MSG_RING] = {
 		.needs_file		= 1,
 		.iopoll			= 1,
+		.noiopoll		= 1,
 		.name			= "MSG_RING",
 		.prep			= io_msg_ring_prep,
 		.issue			= io_msg_ring,
diff --git a/io_uring/opdef.h b/io_uring/opdef.h
index 3efe06d25473..e378eb240538 100644
--- a/io_uring/opdef.h
+++ b/io_uring/opdef.h
@@ -25,6 +25,8 @@ struct io_op_def {
 	unsigned		ioprio : 1;
 	/* supports iopoll */
 	unsigned		iopoll : 1;
+	/* don't iopoll for this request */
+	unsigned		noiopoll : 1;
 	/* opcode specific path will handle ->async_data allocation if needed */
 	unsigned		manual_alloc : 1;
 	/* size of async data needed, if any */

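For reference, a minimal userspace sketch (liburing 2.2+, illustrative
only, error handling mostly skipped) of the case this targets: a second
thread using IORING_OP_MSG_RING to post a CQE into a
SINGLE_ISSUER|DEFER_TASKRUN ring owned by the main thread, which is the
->task_complete case. The sender() helper and values are made up:

#include <liburing.h>
#include <pthread.h>
#include <stdio.h>

static int dst_fd;

static void *sender(void *arg)
{
	struct io_uring src;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;

	io_uring_queue_init(8, &src, 0);
	sqe = io_uring_get_sqe(&src);
	/* post user_data 0xcafe, res 42 into the target ring */
	io_uring_prep_msg_ring(sqe, dst_fd, 42, 0xcafe, 0);
	io_uring_submit(&src);
	/* sender sees the MSG_RING completion on its own ring */
	io_uring_wait_cqe(&src, &cqe);
	io_uring_cqe_seen(&src, cqe);
	io_uring_queue_exit(&src);
	return NULL;
}

int main(void)
{
	struct io_uring_params p = { };
	struct io_uring dst;
	struct io_uring_cqe *cqe;
	pthread_t t;

	p.flags = IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN;
	if (io_uring_queue_init_params(8, &dst, &p))
		return 1;
	dst_fd = dst.ring_fd;

	pthread_create(&t, NULL, sender, NULL);

	/* with DEFER_TASKRUN, this wait from the submitter task is
	 * what runs the task_work that posts the messaged CQE */
	io_uring_wait_cqe(&dst, &cqe);
	printf("data=0x%llx res=%d\n",
	       (unsigned long long) cqe->user_data, cqe->res);
	io_uring_cqe_seen(&dst, cqe);
	pthread_join(t, NULL);
	io_uring_queue_exit(&dst);
	return 0;
}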
-- 
Jens Axboe


