Re: io_uring_enter() with opcode IORING_OP_RECV ignores MSG_WAITALL in msg_flags

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 3/23/22 7:12 AM, Constantine Gavrilov wrote:
>> From: Jens Axboe <axboe@xxxxxxxxx>
>> Sent: Wednesday, March 23, 2022 14:19
>> To: Constantine Gavrilov <CONSTG@xxxxxxxxxx>; linux-kernel@xxxxxxxxxxxxxxx <linux-kernel@xxxxxxxxxxxxxxx>
>> Cc: io-uring <io-uring@xxxxxxxxxxxxxxx>
>> Subject: [EXTERNAL] Re: io_uring_enter() with opcode IORING_OP_RECV ignores MSG_WAITALL in msg_flags
>>
>> On 3/23/22 4:31 AM, Constantine Gavrilov wrote:
>>> I get partial receives on TCP socket, even though I specify
>>> MSG_WAITALL with IORING_OP_RECV opcode. Looking at tcpdump in
>>> wireshark, I see entire reassambled packet (+4k), so it is not a
>>> disconnect. The MTU is smaller than 4k.
>>>
>>> From the mailing list history, looks like this was discussed before
>>> and it seems the fix was supposed to be in. Can someone clarify the
>>> expected behavior?
>>>
>>> I do not think rsvmsg() has this issue.
>>
>> Do you have a test case? I added the io-uring list, that's the
>> appropriate forum for this kind of discussion.
>>
>> --
>> Jens Axboe
> 
> Yes, I have a real test case. I cannot share it vebratim, but with a
> little effort I believe I can come with a simple code of
> client/server.
> 
> It seems the issue shall be directly seen from the implementation, but
> if it is not so, I will provide a sample code.
> 
> Forgot to mention that the issue is seen of Fedora kernel version
> 5.16.12-200.fc35.x86_64.

Can you try with the below? Neither recv nor recvmsg handle MSG_WAITALL
correctly as far as I can tell.


diff --git a/fs/io_uring.c b/fs/io_uring.c
index 810d2bd90f4d..ee3848da885a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -612,6 +612,7 @@ struct io_sr_msg {
 	int				msg_flags;
 	int				bgid;
 	size_t				len;
+	size_t				done_io;
 };
 
 struct io_open {
@@ -782,6 +783,7 @@ enum {
 	REQ_F_SKIP_LINK_CQES_BIT,
 	REQ_F_SINGLE_POLL_BIT,
 	REQ_F_DOUBLE_POLL_BIT,
+	REQ_F_PARTIAL_IO_BIT,
 	/* keep async read/write and isreg together and in order */
 	REQ_F_SUPPORT_NOWAIT_BIT,
 	REQ_F_ISREG_BIT,
@@ -844,6 +846,8 @@ enum {
 	REQ_F_SINGLE_POLL	= BIT(REQ_F_SINGLE_POLL_BIT),
 	/* double poll may active */
 	REQ_F_DOUBLE_POLL	= BIT(REQ_F_DOUBLE_POLL_BIT),
+	/* request has already done partial IO */
+	REQ_F_PARTIAL_IO	= BIT(REQ_F_PARTIAL_IO_BIT),
 };
 
 struct async_poll {
@@ -1391,6 +1395,9 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
 
 	if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
 		return;
+	/* don't recycle if we already did IO to this buffer */
+	if (req->flags & REQ_F_PARTIAL_IO)
+		return;
 
 	if (issue_flags & IO_URING_F_UNLOCKED)
 		mutex_lock(&ctx->uring_lock);
@@ -5431,12 +5438,14 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (req->ctx->compat)
 		sr->msg_flags |= MSG_CMSG_COMPAT;
 #endif
+	sr->done_io = 0;
 	return 0;
 }
 
 static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_async_msghdr iomsg, *kmsg;
+	struct io_sr_msg *sr = &req->sr_msg;
 	struct socket *sock;
 	struct io_buffer *kbuf;
 	unsigned flags;
@@ -5479,6 +5488,11 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 			return io_setup_async_msg(req, kmsg);
 		if (ret == -ERESTARTSYS)
 			ret = -EINTR;
+		if (ret > 0 && flags & MSG_WAITALL) {
+			sr->done_io += ret;
+			req->flags |= REQ_F_PARTIAL_IO;
+			return io_setup_async_msg(req, kmsg);
+		}
 		req_set_fail(req);
 	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
 		req_set_fail(req);
@@ -5488,6 +5502,10 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 	if (kmsg->free_iov)
 		kfree(kmsg->free_iov);
 	req->flags &= ~REQ_F_NEED_CLEANUP;
+	if (ret >= 0)
+		ret += sr->done_io;
+	else if (sr->done_io)
+		ret = sr->done_io;
 	__io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
 	return 0;
 }
@@ -5538,12 +5556,23 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
 			return -EAGAIN;
 		if (ret == -ERESTARTSYS)
 			ret = -EINTR;
+		if (ret > 0 && flags & MSG_WAITALL) {
+			sr->len -= ret;
+			sr->buf += ret;
+			sr->done_io += ret;
+			req->flags |= REQ_F_PARTIAL_IO;
+			return -EAGAIN;
+		}
 		req_set_fail(req);
 	} else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
 out_free:
 		req_set_fail(req);
 	}
 
+	if (ret >= 0)
+		ret += sr->done_io;
+	else if (sr->done_io)
+		ret = sr->done_io;
 	__io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
 	return 0;
 }

-- 
Jens Axboe




[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux