On 3/23/22 7:12 AM, Constantine Gavrilov wrote:
>> From: Jens Axboe <axboe@xxxxxxxxx>
>> Sent: Wednesday, March 23, 2022 14:19
>> To: Constantine Gavrilov <CONSTG@xxxxxxxxxx>; linux-kernel@xxxxxxxxxxxxxxx <linux-kernel@xxxxxxxxxxxxxxx>
>> Cc: io-uring <io-uring@xxxxxxxxxxxxxxx>
>> Subject: [EXTERNAL] Re: io_uring_enter() with opcode IORING_OP_RECV ignores MSG_WAITALL in msg_flags
>>
>> On 3/23/22 4:31 AM, Constantine Gavrilov wrote:
>>> I get partial receives on a TCP socket, even though I specify
>>> MSG_WAITALL with the IORING_OP_RECV opcode. Looking at tcpdump in
>>> wireshark, I see the entire reassembled packet (+4k), so it is not a
>>> disconnect. The MTU is smaller than 4k.
>>>
>>> From the mailing list history, it looks like this was discussed before
>>> and it seems the fix was supposed to be in. Can someone clarify the
>>> expected behavior?
>>>
>>> I do not think recvmsg() has this issue.
>>
>> Do you have a test case? I added the io-uring list, that's the
>> appropriate forum for this kind of discussion.
>>
>> --
>> Jens Axboe
>
> Yes, I have a real test case. I cannot share it verbatim, but with a
> little effort I believe I can come up with simple client/server code.
>
> The issue should be directly visible from the implementation, but
> if it is not, I will provide sample code.
>
> Forgot to mention that the issue is seen on Fedora kernel version
> 5.16.12-200.fc35.x86_64.

Can you try with the below? Neither recv nor recvmsg handle MSG_WAITALL
correctly as far as I can tell.

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 810d2bd90f4d..ee3848da885a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -612,6 +612,7 @@ struct io_sr_msg {
 	int				msg_flags;
 	int				bgid;
 	size_t				len;
+	size_t				done_io;
 };
 
 struct io_open {
@@ -782,6 +783,7 @@ enum {
 	REQ_F_SKIP_LINK_CQES_BIT,
 	REQ_F_SINGLE_POLL_BIT,
 	REQ_F_DOUBLE_POLL_BIT,
+	REQ_F_PARTIAL_IO_BIT,
 	/* keep async read/write and isreg together and in order */
 	REQ_F_SUPPORT_NOWAIT_BIT,
 	REQ_F_ISREG_BIT,
@@ -844,6 +846,8 @@ enum {
 	REQ_F_SINGLE_POLL	= BIT(REQ_F_SINGLE_POLL_BIT),
 	/* double poll may active */
 	REQ_F_DOUBLE_POLL	= BIT(REQ_F_DOUBLE_POLL_BIT),
+	/* request has already done partial IO */
+	REQ_F_PARTIAL_IO	= BIT(REQ_F_PARTIAL_IO_BIT),
 };
 
 struct async_poll {
@@ -1391,6 +1395,9 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
 
 	if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
 		return;
+	/* don't recycle if we already did IO to this buffer */
+	if (req->flags & REQ_F_PARTIAL_IO)
+		return;
 
 	if (issue_flags & IO_URING_F_UNLOCKED)
 		mutex_lock(&ctx->uring_lock);
@@ -5431,12 +5438,14 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (req->ctx->compat)
 		sr->msg_flags |= MSG_CMSG_COMPAT;
 #endif
+	sr->done_io = 0;
 	return 0;
 }
 
 static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_async_msghdr iomsg, *kmsg;
+	struct io_sr_msg *sr = &req->sr_msg;
 	struct socket *sock;
 	struct io_buffer *kbuf;
 	unsigned flags;
@@ -5479,6 +5488,11 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 			return io_setup_async_msg(req, kmsg);
 		if (ret == -ERESTARTSYS)
 			ret = -EINTR;
+		if (ret > 0 && flags & MSG_WAITALL) {
+			sr->done_io += ret;
+			req->flags |= REQ_F_PARTIAL_IO;
+			return io_setup_async_msg(req, kmsg);
+		}
 		req_set_fail(req);
 	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
 		req_set_fail(req);
@@ -5488,6 +5502,10 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 	if (kmsg->free_iov)
 		kfree(kmsg->free_iov);
 	req->flags &= ~REQ_F_NEED_CLEANUP;
+	if (ret >= 0)
+		ret += sr->done_io;
+	else if (sr->done_io)
+		ret = sr->done_io;
 	__io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
 	return 0;
 }
@@ -5538,12 +5556,23 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
 			return -EAGAIN;
 		if (ret == -ERESTARTSYS)
 			ret = -EINTR;
+		if (ret > 0 && flags & MSG_WAITALL) {
+			sr->len -= ret;
+			sr->buf += ret;
+			sr->done_io += ret;
+			req->flags |= REQ_F_PARTIAL_IO;
+			return -EAGAIN;
+		}
 		req_set_fail(req);
 	} else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
out_free:
 		req_set_fail(req);
 	}
 
+	if (ret >= 0)
+		ret += sr->done_io;
+	else if (sr->done_io)
+		ret = sr->done_io;
 	__io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
 	return 0;
 }

--
Jens Axboe
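
For reference, a minimal sketch of the kind of client/server reproducer discussed above might look like the following. It assumes liburing and an already-connected TCP socket whose peer sends more than one MTU worth of data; the helper name recv_waitall and the BUF_LEN constant are illustrative only. Without a fix along the lines of the patch above, the completion may report fewer than BUF_LEN bytes even though MSG_WAITALL was requested.

/*
 * Illustrative sketch: submit one IORING_OP_RECV with MSG_WAITALL for
 * more data than one MTU-sized segment and check whether the CQE
 * reports a short read. Assumes sockfd is a connected TCP socket and
 * the peer sends at least BUF_LEN bytes.
 */
#include <liburing.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/socket.h>

#define BUF_LEN		(64 * 1024)	/* well above a typical MTU */

static int recv_waitall(int sockfd)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	char *buf;
	int ret;

	buf = malloc(BUF_LEN);
	if (!buf)
		return -1;
	ret = io_uring_queue_init(8, &ring, 0);
	if (ret < 0) {
		free(buf);
		return ret;
	}

	/* ask for BUF_LEN bytes with MSG_WAITALL semantics */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_recv(sqe, sockfd, buf, BUF_LEN, MSG_WAITALL);
	io_uring_submit(&ring);

	ret = io_uring_wait_cqe(&ring, &cqe);
	if (ret < 0) {
		fprintf(stderr, "wait_cqe: %d\n", ret);
	} else {
		/* a short res here is the behavior being reported */
		printf("recv completed with %d of %d bytes\n", cqe->res, BUF_LEN);
		io_uring_cqe_seen(&ring, cqe);
	}

	io_uring_queue_exit(&ring);
	free(buf);
	return ret;
}

The recvmsg path can be exercised the same way by switching to io_uring_prep_recvmsg() and passing MSG_WAITALL as its flags argument.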