Daire Byrne reports a ~50% aggregrate throughput regression on his Linux NFS server after commit da1661b93bf4 ("SUNRPC: Teach server to use xprt_sock_sendmsg for socket sends"), which replaced kernel_send_page() calls in NFSD's socket send path with calls to sock_sendmsg() using iov_iter. Investigation showed that tcp_sendmsg() was not using zero-copy to send the xdr_buf's bvec pages, but instead was relying on memcpy. This means copying every byte of a large NFS READ payload. It looks like TLS sockets do indeed support a ->sendpage method, so it's really not necessary to use xprt_sock_sendmsg() to support TLS fully on the server. A mechanical reversion of da1661b93bf4 is not possible at this point, but we can re-implement the server's TCP socket sendmsg path using kernel_sendpage(). No Fixes: tag. If needed, please backport this fix by hand. Reported-by: Daire Byrne <daire@xxxxxxxx> BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=209439 Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx> --- net/sunrpc/svcsock.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 88 insertions(+), 4 deletions(-) This replaces the SVC zero-copy send patch I posted a couple of weeks ago. diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index b248f2349437..30332111bd37 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1062,6 +1062,92 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) return 0; /* record not complete */ } +/** + * svc_tcp_sendmsg - Send an RPC message on a TCP socket + * @sock: socket to write the RPC message onto + * @xdr: XDR buffer containing the RPC message + * @marker: TCP record marker + * @sentp: OUT: number of bytes actually written + * + * Caller serializes calls on this @sock, and ensures the pages + * backing @xdr are unchanging. In addition, it is assumed that + * no .bv_len is larger than PAGE_SIZE. + * + * Returns zero on success or a negative errno value. + */ +static int svc_tcp_sendmsg(struct socket *sock, const struct xdr_buf *xdr, + rpc_fraghdr marker, unsigned int *sentp) +{ + struct kvec vec[2] = { + [0] = { + .iov_base = &marker, + .iov_len = sizeof(marker), + }, + [1] = *xdr->head, + }; + size_t len = vec[0].iov_len + vec[1].iov_len; + const struct kvec *tail = xdr->tail; + struct msghdr msg = { + .msg_flags = 0, + }; + int ret; + + *sentp = 0; + + /* + * Optimized for the common case where we have just the record + * marker and xdr->head. + */ + if (xdr->head[0].iov_len < xdr->len) + msg.msg_flags = MSG_MORE; + iov_iter_kvec(&msg.msg_iter, WRITE, vec, ARRAY_SIZE(vec), len); + ret = sock_sendmsg(sock, &msg); + if (ret < 0) + return ret; + *sentp += ret; + if (*sentp != len) + goto out; + + if (xdr->page_len) { + unsigned int offset, len, remaining; + struct bio_vec *bvec; + int flags, ret; + + bvec = xdr->bvec; + offset = xdr->page_base; + remaining = xdr->page_len; + flags = MSG_MORE | MSG_SENDPAGE_NOTLAST; + while (remaining > 0) { + if (remaining <= PAGE_SIZE && tail->iov_len == 0) + flags = 0; + len = min(remaining, bvec->bv_len); + ret = kernel_sendpage(sock, bvec->bv_page, + bvec->bv_offset + offset, + len, flags); + if (ret < 0) + return ret; + *sentp += ret; + if (ret != len) + goto out; + remaining -= len; + offset = 0; + bvec++; + } + } + + if (tail->iov_len) { + ret = kernel_sendpage(sock, virt_to_page(tail->iov_base), + offset_in_page(tail->iov_base), + tail->iov_len, 0); + if (ret < 0) + return ret; + *sentp += ret; + } + +out: + return 0; +} + /** * svc_tcp_sendto - Send out a reply on a TCP socket * @rqstp: completed svc_rqst @@ -1078,18 +1164,16 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp) struct xdr_buf *xdr = &rqstp->rq_res; rpc_fraghdr marker = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT | (u32)xdr->len); - struct msghdr msg = { - .msg_flags = 0, - }; unsigned int sent; int err; svc_tcp_release_rqst(rqstp); + xdr_alloc_bvec(xdr); mutex_lock(&xprt->xpt_mutex); if (svc_xprt_is_dead(xprt)) goto out_notconn; - err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, marker, &sent); + err = svc_tcp_sendmsg(svsk->sk_sock, xdr, marker, &sent); xdr_free_bvec(xdr); trace_svcsock_tcp_send(xprt, err < 0 ? err : sent); if (err < 0 || sent != (xdr->len + sizeof(marker)))