> On Dec 11, 2020, at 5:33 PM, Chuck Lever <chuck.lever@xxxxxxxxxx> wrote: > > Daire Byrne reports a ~50% aggregrate throughput regression on his > Linux NFS server after commit da1661b93bf4 ("SUNRPC: Teach server to > use xprt_sock_sendmsg for socket sends"), which replaced > kernel_send_page() calls in NFSD's socket send path with calls to > sock_sendmsg() using iov_iter. > > Investigation showed that tcp_sendmsg() was not using zero-copy to > send the xdr_buf's bvec pages, but instead was relying on memcpy. > This means copying every byte of a large NFS READ payload. > > It looks like TLS sockets do indeed support a ->sendpage method, > so it's really not necessary to use xprt_sock_sendmsg() to support > TLS fully on the server. A mechanical reversion of da1661b93bf4 is > not possible at this point, but we can re-implement the server's > TCP socket sendmsg path using kernel_sendpage(). > > No Fixes: tag. If needed, please backport this fix by hand. > > Reported-by: Daire Byrne <daire@xxxxxxxx> > BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=209439 > Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx> > --- > net/sunrpc/svcsock.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++-- > 1 file changed, 88 insertions(+), 4 deletions(-) > > > This replaces the SVC zero-copy send patch I posted a couple of > weeks ago. > > > diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c > index b248f2349437..30332111bd37 100644 > --- a/net/sunrpc/svcsock.c > +++ b/net/sunrpc/svcsock.c > @@ -1062,6 +1062,92 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) > return 0; /* record not complete */ > } > > +/** > + * svc_tcp_sendmsg - Send an RPC message on a TCP socket > + * @sock: socket to write the RPC message onto > + * @xdr: XDR buffer containing the RPC message > + * @marker: TCP record marker > + * @sentp: OUT: number of bytes actually written > + * > + * Caller serializes calls on this @sock, and ensures the pages > + * backing @xdr are unchanging. In addition, it is assumed that > + * no .bv_len is larger than PAGE_SIZE. > + * > + * Returns zero on success or a negative errno value. > + */ > +static int svc_tcp_sendmsg(struct socket *sock, const struct xdr_buf *xdr, > + rpc_fraghdr marker, unsigned int *sentp) > +{ > + struct kvec vec[2] = { > + [0] = { > + .iov_base = &marker, > + .iov_len = sizeof(marker), > + }, > + [1] = *xdr->head, > + }; > + size_t len = vec[0].iov_len + vec[1].iov_len; > + const struct kvec *tail = xdr->tail; > + struct msghdr msg = { > + .msg_flags = 0, > + }; > + int ret; > + > + *sentp = 0; > + > + /* > + * Optimized for the common case where we have just the record > + * marker and xdr->head. > + */ > + if (xdr->head[0].iov_len < xdr->len) > + msg.msg_flags = MSG_MORE; > + iov_iter_kvec(&msg.msg_iter, WRITE, vec, ARRAY_SIZE(vec), len); > + ret = sock_sendmsg(sock, &msg); > + if (ret < 0) > + return ret; > + *sentp += ret; > + if (*sentp != len) > + goto out; > + > + if (xdr->page_len) { > + unsigned int offset, len, remaining; > + struct bio_vec *bvec; > + int flags, ret; > + > + bvec = xdr->bvec; > + offset = xdr->page_base; > + remaining = xdr->page_len; > + flags = MSG_MORE | MSG_SENDPAGE_NOTLAST; > + while (remaining > 0) { > + if (remaining <= PAGE_SIZE && tail->iov_len == 0) > + flags = 0; > + len = min(remaining, bvec->bv_len); > + ret = kernel_sendpage(sock, bvec->bv_page, > + bvec->bv_offset + offset, > + len, flags); > + if (ret < 0) > + return ret; > + *sentp += ret; > + if (ret != len) > + goto out; > + remaining -= len; > + offset = 0; > + bvec++; > + } > + } > + > + if (tail->iov_len) { > + ret = kernel_sendpage(sock, virt_to_page(tail->iov_base), > + offset_in_page(tail->iov_base), > + tail->iov_len, 0); > + if (ret < 0) > + return ret; > + *sentp += ret; > + } > + > +out: > + return 0; > +} > + > /** > * svc_tcp_sendto - Send out a reply on a TCP socket > * @rqstp: completed svc_rqst > @@ -1078,18 +1164,16 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp) > struct xdr_buf *xdr = &rqstp->rq_res; > rpc_fraghdr marker = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT | > (u32)xdr->len); > - struct msghdr msg = { > - .msg_flags = 0, > - }; > unsigned int sent; > int err; > > svc_tcp_release_rqst(rqstp); > + xdr_alloc_bvec(xdr); This should be + xdr_alloc_bvec(xdr, GFP_KERNEL); > mutex_lock(&xprt->xpt_mutex); > if (svc_xprt_is_dead(xprt)) > goto out_notconn; > - err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, marker, &sent); > + err = svc_tcp_sendmsg(svsk->sk_sock, xdr, marker, &sent); > xdr_free_bvec(xdr); > trace_svcsock_tcp_send(xprt, err < 0 ? err : sent); > if (err < 0 || sent != (xdr->len + sizeof(marker))) > > -- Chuck Lever