Note: this is the first I've seen of this series -- not sure why I never received any of these patches. That means I haven't seen the cover letter and do not have any context for this proposed change. > On Mar 16, 2023, at 12:17 PM, Trond Myklebust <trondmy@xxxxxxxxxxxxxxx> wrote: > >> On Mar 16, 2023, at 11:26, David Howells <dhowells@xxxxxxxxxx> wrote: >> >> When transmitting data, call down into TCP using a single sendmsg with >> MSG_SPLICE_PAGES to indicate that content should be spliced rather than >> performing several sendmsg and sendpage calls to transmit header, data >> pages and trailer. We've tried combining the sendpages calls in here before. It results in a significant and measurable performance regression. See: da1661b93bf4 ("SUNRPC: Teach server to use xprt_sock_sendmsg for socket sends") and its subsequent revert: 4a85a6a3320b ("SUNRPC: Handle TCP socket sends with kernel_sendpage() again") Therefore, this kind of change needs to be accompanied by both benchmark results and some field testing to convince me it won't cause harm. Also, I'd rather see struct xdr_buf changed to /replace/ the head/pagevec/tail arrangement with bvecs before we do this kind of overhaul. And, we have to make certain that this doesn't break operation with kTLS sockets... do they support MSG_SPLICE_PAGES? >> To make this work, the data is assembled in a bio_vec array and attached to >> a BVEC-type iterator. The bio_vec array has two extra slots before the >> first for headers and one after the last for a trailer. The headers and >> trailer are copied into memory acquired from zcopy_alloc() which just >> breaks a page up into small pieces that can be freed with put_page(). >> >> Signed-off-by: David Howells <dhowells@xxxxxxxxxx> >> cc: Trond Myklebust <trond.myklebust@xxxxxxxxxxxxxxx> >> cc: Anna Schumaker <anna@xxxxxxxxxx> >> cc: Chuck Lever <chuck.lever@xxxxxxxxxx> >> cc: Jeff Layton <jlayton@xxxxxxxxxx> >> cc: "David S. 
Miller" <davem@xxxxxxxxxxxxx> >> cc: Eric Dumazet <edumazet@xxxxxxxxxx> >> cc: Jakub Kicinski <kuba@xxxxxxxxxx> >> cc: Paolo Abeni <pabeni@xxxxxxxxxx> >> cc: Jens Axboe <axboe@xxxxxxxxx> >> cc: Matthew Wilcox <willy@xxxxxxxxxxxxx> >> cc: linux-nfs@xxxxxxxxxxxxxxx >> cc: netdev@xxxxxxxxxxxxxxx >> --- >> net/sunrpc/svcsock.c | 70 ++++++++++++-------------------------------- >> net/sunrpc/xdr.c | 24 ++++++++++++--- >> 2 files changed, 38 insertions(+), 56 deletions(-) >> >> diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c >> index 03a4f5615086..1fa41ddbc40e 100644 >> --- a/net/sunrpc/svcsock.c >> +++ b/net/sunrpc/svcsock.c >> @@ -36,6 +36,7 @@ >> #include <linux/skbuff.h> >> #include <linux/file.h> >> #include <linux/freezer.h> >> +#include <linux/zcopy_alloc.h> >> #include <net/sock.h> >> #include <net/checksum.h> >> #include <net/ip.h> >> @@ -1060,16 +1061,8 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) >> return 0; /* record not complete */ >> } >> >> -static int svc_tcp_send_kvec(struct socket *sock, const struct kvec *vec, >> - int flags) >> -{ >> - return kernel_sendpage(sock, virt_to_page(vec->iov_base), >> - offset_in_page(vec->iov_base), >> - vec->iov_len, flags); >> -} >> - >> /* >> - * kernel_sendpage() is used exclusively to reduce the number of >> + * MSG_SPLICE_PAGES is used exclusively to reduce the number of >> * copy operations in this path. Therefore the caller must ensure >> * that the pages backing @xdr are unchanging. 
>> * >> @@ -1081,65 +1074,38 @@ static int svc_tcp_sendmsg(struct socket *sock, struct xdr_buf *xdr, >> { >> const struct kvec *head = xdr->head; >> const struct kvec *tail = xdr->tail; >> - struct kvec rm = { >> - .iov_base = &marker, >> - .iov_len = sizeof(marker), >> - }; >> struct msghdr msg = { >> - .msg_flags = 0, >> + .msg_flags = MSG_SPLICE_PAGES, >> }; >> - int ret; >> + int ret, n = xdr_buf_pagecount(xdr), size; >> >> *sentp = 0; >> ret = xdr_alloc_bvec(xdr, GFP_KERNEL); >> if (ret < 0) >> return ret; >> >> - ret = kernel_sendmsg(sock, &msg, &rm, 1, rm.iov_len); >> + ret = zcopy_memdup(sizeof(marker), &marker, &xdr->bvec[-2], GFP_KERNEL); >> if (ret < 0) >> return ret; >> - *sentp += ret; >> - if (ret != rm.iov_len) >> - return -EAGAIN; >> >> - ret = svc_tcp_send_kvec(sock, head, 0); >> + ret = zcopy_memdup(head->iov_len, head->iov_base, &xdr->bvec[-1], GFP_KERNEL); >> if (ret < 0) >> return ret; >> - *sentp += ret; >> - if (ret != head->iov_len) >> - goto out; >> >> - if (xdr->page_len) { >> - unsigned int offset, len, remaining; >> - struct bio_vec *bvec; >> - >> - bvec = xdr->bvec + (xdr->page_base >> PAGE_SHIFT); >> - offset = offset_in_page(xdr->page_base); >> - remaining = xdr->page_len; >> - while (remaining > 0) { >> - len = min(remaining, bvec->bv_len - offset); >> - ret = kernel_sendpage(sock, bvec->bv_page, >> - bvec->bv_offset + offset, >> - len, 0); >> - if (ret < 0) >> - return ret; >> - *sentp += ret; >> - if (ret != len) >> - goto out; >> - remaining -= len; >> - offset = 0; >> - bvec++; >> - } >> - } >> + ret = zcopy_memdup(tail->iov_len, tail->iov_base, &xdr->bvec[n], GFP_KERNEL); >> + if (ret < 0) >> + return ret; >> >> - if (tail->iov_len) { >> - ret = svc_tcp_send_kvec(sock, tail, 0); >> - if (ret < 0) >> - return ret; >> - *sentp += ret; >> - } >> + size = sizeof(marker) + head->iov_len + xdr->page_len + tail->iov_len; >> + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, xdr->bvec - 2, n + 3, size); >> >> -out: >> + ret = 
sock_sendmsg(sock, &msg); >> + if (ret < 0) >> + return ret; >> + if (ret > 0) >> + *sentp = ret; >> + if (ret != size) >> + return -EAGAIN; >> return 0; >> } >> >> diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c >> index 36835b2f5446..6dff0b4f17b8 100644 >> --- a/net/sunrpc/xdr.c >> +++ b/net/sunrpc/xdr.c >> @@ -145,14 +145,19 @@ xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp) >> { >> size_t i, n = xdr_buf_pagecount(buf); >> >> - if (n != 0 && buf->bvec == NULL) { >> - buf->bvec = kmalloc_array(n, sizeof(buf->bvec[0]), gfp); >> + if (buf->bvec == NULL) { >> + /* Allow for two headers and a trailer to be attached */ >> + buf->bvec = kmalloc_array(n + 3, sizeof(buf->bvec[0]), gfp); >> if (!buf->bvec) >> return -ENOMEM; >> + buf->bvec += 2; >> + buf->bvec[-2].bv_page = NULL; >> + buf->bvec[-1].bv_page = NULL; > > NACK. > >> for (i = 0; i < n; i++) { >> bvec_set_page(&buf->bvec[i], buf->pages[i], PAGE_SIZE, >> 0); >> } >> + buf->bvec[n].bv_page = NULL; >> } >> return 0; >> } >> @@ -160,8 +165,19 @@ xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp) >> void >> xdr_free_bvec(struct xdr_buf *buf) >> { >> - kfree(buf->bvec); >> - buf->bvec = NULL; >> + if (buf->bvec) { >> + size_t n = xdr_buf_pagecount(buf); >> + >> + if (buf->bvec[-2].bv_page) >> + put_page(buf->bvec[-2].bv_page); >> + if (buf->bvec[-1].bv_page) >> + put_page(buf->bvec[-1].bv_page); >> + if (buf->bvec[n].bv_page) >> + put_page(buf->bvec[n].bv_page); >> + buf->bvec -= 2; >> + kfree(buf->bvec); >> + buf->bvec = NULL; >> + } >> } >> >> /** >> > -- Chuck Lever