[PATCH RFC] SUNRPC: Handle TCP socket sends with kernel_sendpage() again

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Daire Byrne reports a ~50% aggregrate throughput regression on his
Linux NFS server after commit da1661b93bf4 ("SUNRPC: Teach server to
use xprt_sock_sendmsg for socket sends"), which replaced
kernel_send_page() calls in NFSD's socket send path with calls to
sock_sendmsg() using iov_iter.

Investigation showed that tcp_sendmsg() was not using zero-copy to
send the xdr_buf's bvec pages, but instead was relying on memcpy.
This means copying every byte of a large NFS READ payload.

It looks like TLS sockets do indeed support a ->sendpage method,
so it's really not necessary to use xprt_sock_sendmsg() to support
TLS fully on the server. A mechanical reversion of da1661b93bf4 is
not possible at this point, but we can re-implement the server's
TCP socket sendmsg path using kernel_sendpage().

No Fixes: tag. If needed, please backport this fix by hand.

Reported-by: Daire Byrne <daire@xxxxxxxx>
BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=209439
Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx>
---
 net/sunrpc/svcsock.c |   92 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 88 insertions(+), 4 deletions(-)


This replaces the SVC zero-copy send patch I posted a couple of
weeks ago.


diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index b248f2349437..30332111bd37 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1062,6 +1062,92 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	return 0;	/* record not complete */
 }
 
+/**
+ * svc_tcp_sendmsg - Send an RPC message on a TCP socket
+ * @sock: socket to write the RPC message onto
+ * @xdr: XDR buffer containing the RPC message
+ * @marker: TCP record marker
+ * @sentp: OUT: number of bytes actually written
+ *
+ * Caller serializes calls on this @sock, and ensures the pages
+ * backing @xdr are unchanging. In addition, it is assumed that
+ * no .bv_len is larger than PAGE_SIZE.
+ *
+ * Returns zero on success or a negative errno value.
+ */
+static int svc_tcp_sendmsg(struct socket *sock, const struct xdr_buf *xdr,
+			   rpc_fraghdr marker, unsigned int *sentp)
+{
+	struct kvec vec[2] = {
+		[0] = {
+			.iov_base	= &marker,
+			.iov_len	= sizeof(marker),
+		},
+		[1] = *xdr->head,
+	};
+	size_t len = vec[0].iov_len + vec[1].iov_len;
+	const struct kvec *tail = xdr->tail;
+	struct msghdr msg = {
+		.msg_flags	= 0,
+	};
+	int ret;
+
+	*sentp = 0;
+
+	/*
+	 * Optimized for the common case where we have just the record
+	 * marker and xdr->head.
+	 */
+	if (xdr->head[0].iov_len < xdr->len)
+		msg.msg_flags = MSG_MORE;
+	iov_iter_kvec(&msg.msg_iter, WRITE, vec, ARRAY_SIZE(vec), len);
+	ret = sock_sendmsg(sock, &msg);
+	if (ret < 0)
+		return ret;
+	*sentp += ret;
+	if (*sentp != len)
+		goto out;
+
+	if (xdr->page_len) {
+		unsigned int offset, len, remaining;
+		struct bio_vec *bvec;
+		int flags, ret;
+
+		bvec = xdr->bvec;
+		offset = xdr->page_base;
+		remaining = xdr->page_len;
+		flags = MSG_MORE | MSG_SENDPAGE_NOTLAST;
+		while (remaining > 0) {
+			if (remaining <= PAGE_SIZE && tail->iov_len == 0)
+				flags = 0;
+			len = min(remaining, bvec->bv_len);
+			ret = kernel_sendpage(sock, bvec->bv_page,
+						bvec->bv_offset + offset,
+						len, flags);
+			if (ret < 0)
+				return ret;
+			*sentp += ret;
+			if (ret != len)
+				goto out;
+			remaining -= len;
+			offset = 0;
+			bvec++;
+		}
+	}
+
+	if (tail->iov_len) {
+		ret = kernel_sendpage(sock, virt_to_page(tail->iov_base),
+				      offset_in_page(tail->iov_base),
+				      tail->iov_len, 0);
+		if (ret < 0)
+			return ret;
+		*sentp += ret;
+	}
+
+out:
+	return 0;
+}
+
 /**
  * svc_tcp_sendto - Send out a reply on a TCP socket
  * @rqstp: completed svc_rqst
@@ -1078,18 +1164,16 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp)
 	struct xdr_buf *xdr = &rqstp->rq_res;
 	rpc_fraghdr marker = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT |
 					 (u32)xdr->len);
-	struct msghdr msg = {
-		.msg_flags	= 0,
-	};
 	unsigned int sent;
 	int err;
 
 	svc_tcp_release_rqst(rqstp);
+	xdr_alloc_bvec(xdr);
 
 	mutex_lock(&xprt->xpt_mutex);
 	if (svc_xprt_is_dead(xprt))
 		goto out_notconn;
-	err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, marker, &sent);
+	err = svc_tcp_sendmsg(svsk->sk_sock, xdr, marker, &sent);
 	xdr_free_bvec(xdr);
 	trace_svcsock_tcp_send(xprt, err < 0 ? err : sent);
 	if (err < 0 || sent != (xdr->len + sizeof(marker)))





[Index of Archives]     [Linux Filesystem Development]     [Linux USB Development]     [Linux Media Development]     [Video for Linux]     [Linux NILFS]     [Linux Audio Users]     [Yosemite Info]     [Linux SCSI]

  Powered by Linux