On some platforms, DMA mapping part of a page is more costly than
copying bytes. Indeed, not involving the I/O MMU can help the
RPC/RDMA transport scale better for tiny I/Os across more RDMA
devices. This is because interaction with the I/O MMU is eliminated
for each of these small I/Os. Without the explicit unmapping, the
NIC no longer needs to do a costly internal TLB shoot down for
buffers that are just a handful of bytes.

The heuristic for now is to pull-up when the size of the RPC
message body is smaller than half the minimum Send buffer size.

Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx>
---
 include/trace/events/rpcrdma.h        | 40 +++++++++++++++++++++++++++++++++
 net/sunrpc/xprtrdma/svc_rdma_sendto.c | 25 +++++++++++++++++----
 2 files changed, 61 insertions(+), 4 deletions(-)

diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index c0e4c93324f5..6f0d3e8ce95c 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -336,6 +336,44 @@
 			), \
 			TP_ARGS(rqst))
 
+DECLARE_EVENT_CLASS(xdr_buf_class,
+	TP_PROTO(
+		const struct xdr_buf *xdr
+	),
+
+	TP_ARGS(xdr),
+
+	TP_STRUCT__entry(
+		__field(const void *, head_base)
+		__field(size_t, head_len)
+		__field(const void *, tail_base)
+		__field(size_t, tail_len)
+		__field(unsigned int, page_len)
+		__field(unsigned int, msg_len)
+	),
+
+	TP_fast_assign(
+		__entry->head_base = xdr->head[0].iov_base;
+		__entry->head_len = xdr->head[0].iov_len;
+		__entry->tail_base = xdr->tail[0].iov_base;
+		__entry->tail_len = xdr->tail[0].iov_len;
+		__entry->page_len = xdr->page_len;
+		__entry->msg_len = xdr->len;
+	),
+
+	TP_printk("head=[%p,%zu] page=%u tail=[%p,%zu] len=%u",
+		__entry->head_base, __entry->head_len, __entry->page_len,
+		__entry->tail_base, __entry->tail_len, __entry->msg_len
+	)
+);
+
+#define DEFINE_XDRBUF_EVENT(name)					\
+		DEFINE_EVENT(xdr_buf_class, name,			\
+				TP_PROTO(				\
+					const struct xdr_buf *xdr	\
+				),					\
+				TP_ARGS(xdr))
+
 /**
  ** Connection events
  **/
@@ -1634,6 +1672,8 @@
 	)
 );
 
+DEFINE_XDRBUF_EVENT(svcrdma_send_pullup);
+
 TRACE_EVENT(svcrdma_send_failed,
 	TP_PROTO(
 		const struct svc_rqst *rqst,
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index a11983c2056f..8ea21ca351e2 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -537,16 +537,32 @@ void svc_rdma_sync_reply_hdr(struct svcxprt_rdma *rdma,
 				      DMA_TO_DEVICE);
 }
 
-/* If the xdr_buf has more elements than the device can
- * transmit in a single RDMA Send, then the reply will
- * have to be copied into a bounce buffer.
+/**
+ * svc_rdma_pull_up_needed - Determine whether to use pull-up
+ * @rdma: controlling transport
+ * @ctxt: I/O resources for an RDMA Send
+ * @xdr: xdr_buf containing RPC message to transmit
+ * @wr_lst: pointer to start of Write chunk list
+ *
+ * Returns:
+ *	%true if pull-up should be used
+ *	%false otherwise
  */
 static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma,
+				    struct svc_rdma_send_ctxt *ctxt,
 				    struct xdr_buf *xdr,
 				    __be32 *wr_lst)
 {
 	int elements;
 
+	/* Avoid the overhead of DMA mapping for small messages.
+	 */
+	if (xdr->len < RPCRDMA_V1_DEF_INLINE_SIZE >> 1)
+		return true;
+
+	/* Check whether the xdr_buf has more elements than can
+	 * fit in a single RDMA Send.
+	 */
 	/* xdr->head */
 	elements = 1;
 
@@ -627,6 +643,7 @@ static int svc_rdma_pull_up_reply_msg(struct svcxprt_rdma *rdma,
 				      ctxt->sc_sges[0].length,
 				      DMA_TO_DEVICE);
 
+	trace_svcrdma_send_pullup(xdr);
 	return 0;
 }
 
@@ -652,7 +669,7 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
 	u32 xdr_pad;
 	int ret;
 
-	if (svc_rdma_pull_up_needed(rdma, xdr, wr_lst))
+	if (svc_rdma_pull_up_needed(rdma, ctxt, xdr, wr_lst))
 		return svc_rdma_pull_up_reply_msg(rdma, ctxt, xdr, wr_lst);
 
 	++ctxt->sc_cur_sge_no;
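
For anyone who wants to poke at the cutoff outside the kernel, here is a
minimal user-space sketch of the new heuristic, not the transport code
itself. It assumes RPCRDMA_V1_DEF_INLINE_SIZE is 1024 bytes (the
RPC-over-RDMA v1 default inline size) and substitutes a stripped-down
stand-in for struct xdr_buf; only the comparison against xdr->len mirrors
the patch.

/* pullup_sketch.c: stand-alone illustration of the pull-up cutoff.
 * RPCRDMA_V1_DEF_INLINE_SIZE (assumed to be 1024) and struct
 * sketch_xdr_buf are simplified stand-ins, not the kernel definitions.
 */
#include <stdbool.h>
#include <stdio.h>

#define RPCRDMA_V1_DEF_INLINE_SIZE 1024	/* assumed v1 default inline size */

struct sketch_xdr_buf {
	unsigned int len;	/* total length of the RPC message body */
};

/* Mirrors the new test at the top of svc_rdma_pull_up_needed(): copy
 * (pull up) the message into the already-mapped Send buffer when it is
 * smaller than half the minimum Send buffer size.
 */
static bool pull_up_small_message(const struct sketch_xdr_buf *xdr)
{
	return xdr->len < RPCRDMA_V1_DEF_INLINE_SIZE >> 1;
}

int main(void)
{
	struct sketch_xdr_buf tiny = { .len = 120 };
	struct sketch_xdr_buf large = { .len = 4096 };

	printf("120-byte reply: %s\n",
	       pull_up_small_message(&tiny) ? "pull-up" : "DMA map");
	printf("4096-byte reply: %s\n",
	       pull_up_small_message(&large) ? "pull-up" : "DMA map");
	return 0;
}

With the assumed 1024-byte inline size the cutoff works out to 512 bytes:
the 120-byte reply is copied, while the 4096-byte reply still goes through
the DMA-mapping path in svc_rdma_map_reply_msg().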