> On Mar 16, 2023, at 11:26, David Howells <dhowells@xxxxxxxxxx> wrote:
>
> When transmitting data, call down into TCP using a single sendmsg with
> MSG_SPLICE_PAGES to indicate that content should be spliced rather than
> performing several sendmsg and sendpage calls to transmit header, data
> pages and trailer.
>
> To make this work, the data is assembled in a bio_vec array and attached to
> a BVEC-type iterator. The bio_vec array has two extra slots before the
> first for headers and one after the last for a trailer. The headers and
> trailer are copied into memory acquired from zcopy_alloc() which just
> breaks a page up into small pieces that can be freed with put_page().
>
> Signed-off-by: David Howells <dhowells@xxxxxxxxxx>
> cc: Trond Myklebust <trond.myklebust@xxxxxxxxxxxxxxx>
> cc: Anna Schumaker <anna@xxxxxxxxxx>
> cc: Chuck Lever <chuck.lever@xxxxxxxxxx>
> cc: Jeff Layton <jlayton@xxxxxxxxxx>
> cc: "David S. Miller" <davem@xxxxxxxxxxxxx>
> cc: Eric Dumazet <edumazet@xxxxxxxxxx>
> cc: Jakub Kicinski <kuba@xxxxxxxxxx>
> cc: Paolo Abeni <pabeni@xxxxxxxxxx>
> cc: Jens Axboe <axboe@xxxxxxxxx>
> cc: Matthew Wilcox <willy@xxxxxxxxxxxxx>
> cc: linux-nfs@xxxxxxxxxxxxxxx
> cc: netdev@xxxxxxxxxxxxxxx
> ---
>  net/sunrpc/svcsock.c | 70 ++++++++++++--------------------------------
>  net/sunrpc/xdr.c     | 24 ++++++++++++---
>  2 files changed, 38 insertions(+), 56 deletions(-)
>
> diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
> index 03a4f5615086..1fa41ddbc40e 100644
> --- a/net/sunrpc/svcsock.c
> +++ b/net/sunrpc/svcsock.c
> @@ -36,6 +36,7 @@
>  #include <linux/skbuff.h>
>  #include <linux/file.h>
>  #include <linux/freezer.h>
> +#include <linux/zcopy_alloc.h>
>  #include <net/sock.h>
>  #include <net/checksum.h>
>  #include <net/ip.h>
> @@ -1060,16 +1061,8 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
>  	return 0; /* record not complete */
>  }
>
> -static int svc_tcp_send_kvec(struct socket *sock, const struct kvec *vec,
> -			     int flags)
> -{
> -	return kernel_sendpage(sock, virt_to_page(vec->iov_base),
> -			       offset_in_page(vec->iov_base),
> -			       vec->iov_len, flags);
> -}
> -
>  /*
> - * kernel_sendpage() is used exclusively to reduce the number of
> + * MSG_SPLICE_PAGES is used exclusively to reduce the number of
>   * copy operations in this path. Therefore the caller must ensure
>   * that the pages backing @xdr are unchanging.
>   *
> @@ -1081,65 +1074,38 @@ static int svc_tcp_sendmsg(struct socket *sock, struct xdr_buf *xdr,
>  {
>  	const struct kvec *head = xdr->head;
>  	const struct kvec *tail = xdr->tail;
> -	struct kvec rm = {
> -		.iov_base = &marker,
> -		.iov_len = sizeof(marker),
> -	};
>  	struct msghdr msg = {
> -		.msg_flags = 0,
> +		.msg_flags = MSG_SPLICE_PAGES,
>  	};
> -	int ret;
> +	int ret, n = xdr_buf_pagecount(xdr), size;
>
>  	*sentp = 0;
>  	ret = xdr_alloc_bvec(xdr, GFP_KERNEL);
>  	if (ret < 0)
>  		return ret;
>
> -	ret = kernel_sendmsg(sock, &msg, &rm, 1, rm.iov_len);
> +	ret = zcopy_memdup(sizeof(marker), &marker, &xdr->bvec[-2], GFP_KERNEL);
>  	if (ret < 0)
>  		return ret;
> -	*sentp += ret;
> -	if (ret != rm.iov_len)
> -		return -EAGAIN;
>
> -	ret = svc_tcp_send_kvec(sock, head, 0);
> +	ret = zcopy_memdup(head->iov_len, head->iov_base, &xdr->bvec[-1], GFP_KERNEL);
>  	if (ret < 0)
>  		return ret;
> -	*sentp += ret;
> -	if (ret != head->iov_len)
> -		goto out;
>
> -	if (xdr->page_len) {
> -		unsigned int offset, len, remaining;
> -		struct bio_vec *bvec;
> -
> -		bvec = xdr->bvec + (xdr->page_base >> PAGE_SHIFT);
> -		offset = offset_in_page(xdr->page_base);
> -		remaining = xdr->page_len;
> -		while (remaining > 0) {
> -			len = min(remaining, bvec->bv_len - offset);
> -			ret = kernel_sendpage(sock, bvec->bv_page,
> -					      bvec->bv_offset + offset,
> -					      len, 0);
> -			if (ret < 0)
> -				return ret;
> -			*sentp += ret;
> -			if (ret != len)
> -				goto out;
> -			remaining -= len;
> -			offset = 0;
> -			bvec++;
> -		}
> -	}
> +	ret = zcopy_memdup(tail->iov_len, tail->iov_base, &xdr->bvec[n], GFP_KERNEL);
> +	if (ret < 0)
> +		return ret;
>
> -	if (tail->iov_len) {
> -		ret = svc_tcp_send_kvec(sock, tail, 0);
> -		if (ret < 0)
> -			return ret;
> -		*sentp += ret;
> -	}
> +	size = sizeof(marker) + head->iov_len + xdr->page_len + tail->iov_len;
> +	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, xdr->bvec - 2, n + 3, size);
>
> -out:
> +	ret = sock_sendmsg(sock, &msg);
> +	if (ret < 0)
> +		return ret;
> +	if (ret > 0)
> +		*sentp = ret;
> +	if (ret != size)
> +		return -EAGAIN;
>  	return 0;
>  }
>
> diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
> index 36835b2f5446..6dff0b4f17b8 100644
> --- a/net/sunrpc/xdr.c
> +++ b/net/sunrpc/xdr.c
> @@ -145,14 +145,19 @@ xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp)
>  {
>  	size_t i, n = xdr_buf_pagecount(buf);
>
> -	if (n != 0 && buf->bvec == NULL) {
> -		buf->bvec = kmalloc_array(n, sizeof(buf->bvec[0]), gfp);
> +	if (buf->bvec == NULL) {
> +		/* Allow for two headers and a trailer to be attached */
> +		buf->bvec = kmalloc_array(n + 3, sizeof(buf->bvec[0]), gfp);
>  		if (!buf->bvec)
>  			return -ENOMEM;
> +		buf->bvec += 2;
> +		buf->bvec[-2].bv_page = NULL;
> +		buf->bvec[-1].bv_page = NULL;

NACK.

>  		for (i = 0; i < n; i++) {
>  			bvec_set_page(&buf->bvec[i], buf->pages[i], PAGE_SIZE,
>  				      0);
>  		}
> +		buf->bvec[n].bv_page = NULL;
>  	}
>  	return 0;
>  }
> @@ -160,8 +165,19 @@ xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp)
>  void
>  xdr_free_bvec(struct xdr_buf *buf)
>  {
> -	kfree(buf->bvec);
> -	buf->bvec = NULL;
> +	if (buf->bvec) {
> +		size_t n = xdr_buf_pagecount(buf);
> +
> +		if (buf->bvec[-2].bv_page)
> +			put_page(buf->bvec[-2].bv_page);
> +		if (buf->bvec[-1].bv_page)
> +			put_page(buf->bvec[-1].bv_page);
> +		if (buf->bvec[n].bv_page)
> +			put_page(buf->bvec[n].bv_page);
> +		buf->bvec -= 2;
> +		kfree(buf->bvec);
> +		buf->bvec = NULL;
> +	}
>  }
>
>  /**
>