Re: [RFC PATCH 27/28] sunrpc: Use sendmsg(MSG_SPLICE_PAGES) rather then sendpage

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Note: this is the first I've seen of this series -- not sure why
I never received any of these patches.

That means I haven't seen the cover letter and do not have any
context for this proposed change.


> On Mar 16, 2023, at 12:17 PM, Trond Myklebust <trondmy@xxxxxxxxxxxxxxx> wrote:
> 
>> On Mar 16, 2023, at 11:26, David Howells <dhowells@xxxxxxxxxx> wrote:
>> 
>> When transmitting data, call down into TCP using a single sendmsg with
>> MSG_SPLICE_PAGES to indicate that content should be spliced rather than
>> performing several sendmsg and sendpage calls to transmit header, data
>> pages and trailer.

We've tried combining the sendpages calls in here before. It
results in a significant and measurable performance regression.
See:

da1661b93bf4 ("SUNRPC: Teach server to use xprt_sock_sendmsg for socket sends")

and it's subsequent revert:

4a85a6a3320b ("SUNRPC: Handle TCP socket sends with kernel_sendpage() again")


Therefore, this kind of change needs to be accompanied by both
benchmark results and some field testing to convince me it won't
cause harm.

Also, I'd rather see struct xdr_buf changed to /replace/ the
head/pagevec/tail arrangement with bvecs before we do this
kind of overhaul.

And, we have to make certain that this doesn't break operation
with kTLS sockets... do they support MSG_SPLICE_PAGES ?


>> To make this work, the data is assembled in a bio_vec array and attached to
>> a BVEC-type iterator.  The bio_vec array has two extra slots before the
>> first for headers and one after the last for a trailer.  The headers and
>> trailer are copied into memory acquired from zcopy_alloc() which just
>> breaks a page up into small pieces that can be freed with put_page().
>> 
>> Signed-off-by: David Howells <dhowells@xxxxxxxxxx>
>> cc: Trond Myklebust <trond.myklebust@xxxxxxxxxxxxxxx>
>> cc: Anna Schumaker <anna@xxxxxxxxxx>
>> cc: Chuck Lever <chuck.lever@xxxxxxxxxx>
>> cc: Jeff Layton <jlayton@xxxxxxxxxx>
>> cc: "David S. Miller" <davem@xxxxxxxxxxxxx>
>> cc: Eric Dumazet <edumazet@xxxxxxxxxx>
>> cc: Jakub Kicinski <kuba@xxxxxxxxxx>
>> cc: Paolo Abeni <pabeni@xxxxxxxxxx>
>> cc: Jens Axboe <axboe@xxxxxxxxx>
>> cc: Matthew Wilcox <willy@xxxxxxxxxxxxx>
>> cc: linux-nfs@xxxxxxxxxxxxxxx
>> cc: netdev@xxxxxxxxxxxxxxx
>> ---
>> net/sunrpc/svcsock.c | 70 ++++++++++++--------------------------------
>> net/sunrpc/xdr.c     | 24 ++++++++++++---
>> 2 files changed, 38 insertions(+), 56 deletions(-)
>> 
>> diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
>> index 03a4f5615086..1fa41ddbc40e 100644
>> --- a/net/sunrpc/svcsock.c
>> +++ b/net/sunrpc/svcsock.c
>> @@ -36,6 +36,7 @@
>> #include <linux/skbuff.h>
>> #include <linux/file.h>
>> #include <linux/freezer.h>
>> +#include <linux/zcopy_alloc.h>
>> #include <net/sock.h>
>> #include <net/checksum.h>
>> #include <net/ip.h>
>> @@ -1060,16 +1061,8 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
>> return 0; /* record not complete */
>> }
>> 
>> -static int svc_tcp_send_kvec(struct socket *sock, const struct kvec *vec,
>> -      int flags)
>> -{
>> - return kernel_sendpage(sock, virt_to_page(vec->iov_base),
>> -       offset_in_page(vec->iov_base),
>> -       vec->iov_len, flags);
>> -}
>> -
>> /*
>> - * kernel_sendpage() is used exclusively to reduce the number of
>> + * MSG_SPLICE_PAGES is used exclusively to reduce the number of
>> * copy operations in this path. Therefore the caller must ensure
>> * that the pages backing @xdr are unchanging.
>> *
>> @@ -1081,65 +1074,38 @@ static int svc_tcp_sendmsg(struct socket *sock, struct xdr_buf *xdr,
>> {
>> const struct kvec *head = xdr->head;
>> const struct kvec *tail = xdr->tail;
>> - struct kvec rm = {
>> - .iov_base = &marker,
>> - .iov_len = sizeof(marker),
>> - };
>> struct msghdr msg = {
>> - .msg_flags = 0,
>> + .msg_flags = MSG_SPLICE_PAGES,
>> };
>> - int ret;
>> + int ret, n = xdr_buf_pagecount(xdr), size;
>> 
>> *sentp = 0;
>> ret = xdr_alloc_bvec(xdr, GFP_KERNEL);
>> if (ret < 0)
>> return ret;
>> 
>> - ret = kernel_sendmsg(sock, &msg, &rm, 1, rm.iov_len);
>> + ret = zcopy_memdup(sizeof(marker), &marker, &xdr->bvec[-2], GFP_KERNEL);
>> if (ret < 0)
>> return ret;
>> - *sentp += ret;
>> - if (ret != rm.iov_len)
>> - return -EAGAIN;
>> 
>> - ret = svc_tcp_send_kvec(sock, head, 0);
>> + ret = zcopy_memdup(head->iov_len, head->iov_base, &xdr->bvec[-1], GFP_KERNEL);
>> if (ret < 0)
>> return ret;
>> - *sentp += ret;
>> - if (ret != head->iov_len)
>> - goto out;
>> 
>> - if (xdr->page_len) {
>> - unsigned int offset, len, remaining;
>> - struct bio_vec *bvec;
>> -
>> - bvec = xdr->bvec + (xdr->page_base >> PAGE_SHIFT);
>> - offset = offset_in_page(xdr->page_base);
>> - remaining = xdr->page_len;
>> - while (remaining > 0) {
>> - len = min(remaining, bvec->bv_len - offset);
>> - ret = kernel_sendpage(sock, bvec->bv_page,
>> -      bvec->bv_offset + offset,
>> -      len, 0);
>> - if (ret < 0)
>> - return ret;
>> - *sentp += ret;
>> - if (ret != len)
>> - goto out;
>> - remaining -= len;
>> - offset = 0;
>> - bvec++;
>> - }
>> - }
>> + ret = zcopy_memdup(tail->iov_len, tail->iov_base, &xdr->bvec[n], GFP_KERNEL);
>> + if (ret < 0)
>> + return ret;
>> 
>> - if (tail->iov_len) {
>> - ret = svc_tcp_send_kvec(sock, tail, 0);
>> - if (ret < 0)
>> - return ret;
>> - *sentp += ret;
>> - }
>> + size = sizeof(marker) + head->iov_len + xdr->page_len + tail->iov_len;
>> + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, xdr->bvec - 2, n + 3, size);
>> 
>> -out:
>> + ret = sock_sendmsg(sock, &msg);
>> + if (ret < 0)
>> + return ret;
>> + if (ret > 0)
>> + *sentp = ret;
>> + if (ret != size)
>> + return -EAGAIN;
>> return 0;
>> }
>> 
>> diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
>> index 36835b2f5446..6dff0b4f17b8 100644
>> --- a/net/sunrpc/xdr.c
>> +++ b/net/sunrpc/xdr.c
>> @@ -145,14 +145,19 @@ xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp)
>> {
>> size_t i, n = xdr_buf_pagecount(buf);
>> 
>> - if (n != 0 && buf->bvec == NULL) {
>> - buf->bvec = kmalloc_array(n, sizeof(buf->bvec[0]), gfp);
>> + if (buf->bvec == NULL) {
>> + /* Allow for two headers and a trailer to be attached */
>> + buf->bvec = kmalloc_array(n + 3, sizeof(buf->bvec[0]), gfp);
>> if (!buf->bvec)
>> return -ENOMEM;
>> + buf->bvec += 2;
>> + buf->bvec[-2].bv_page = NULL;
>> + buf->bvec[-1].bv_page = NULL;
> 
> NACK.
> 
>> for (i = 0; i < n; i++) {
>> bvec_set_page(&buf->bvec[i], buf->pages[i], PAGE_SIZE,
>>     0);
>> }
>> + buf->bvec[n].bv_page = NULL;
>> }
>> return 0;
>> }
>> @@ -160,8 +165,19 @@ xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp)
>> void
>> xdr_free_bvec(struct xdr_buf *buf)
>> {
>> - kfree(buf->bvec);
>> - buf->bvec = NULL;
>> + if (buf->bvec) {
>> + size_t n = xdr_buf_pagecount(buf);
>> +
>> + if (buf->bvec[-2].bv_page)
>> + put_page(buf->bvec[-2].bv_page);
>> + if (buf->bvec[-1].bv_page)
>> + put_page(buf->bvec[-1].bv_page);
>> + if (buf->bvec[n].bv_page)
>> + put_page(buf->bvec[n].bv_page);
>> + buf->bvec -= 2;
>> + kfree(buf->bvec);
>> + buf->bvec = NULL;
>> + }
>> }
>> 
>> /**
>> 
> 

--
Chuck Lever






[Index of Archives]     [Linux Filesystem Development]     [Linux USB Development]     [Linux Media Development]     [Video for Linux]     [Linux NILFS]     [Linux Audio Users]     [Yosemite Info]     [Linux SCSI]

  Powered by Linux