On some platforms, DMA mapping part of a page is more costly than
copying bytes. Indeed, not involving the I/O MMU can help the
RPC/RDMA transport scale better for tiny I/Os across more RDMA
devices. This is because interaction with the I/O MMU is eliminated
for each of these small I/Os. Without the explicit unmapping, the
NIC no longer needs to do a costly internal TLB shoot down for
buffers that are just a handful of bytes.

The heuristic for now is to pull-up when the size of the RPC
message body is smaller than half the minimum Send buffer size.

Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx>
---
 include/trace/events/rpcrdma.h        | 40 +++++++++++++++++++++++++++++++++
 net/sunrpc/xprtrdma/svc_rdma_sendto.c | 25 +++++++++++++++++----
 2 files changed, 61 insertions(+), 4 deletions(-)

diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index c0e4c93324f5..6f0d3e8ce95c 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -336,6 +336,44 @@
 			), \
 			TP_ARGS(rqst))
 
+DECLARE_EVENT_CLASS(xdr_buf_class,
+	TP_PROTO(
+		const struct xdr_buf *xdr
+	),
+
+	TP_ARGS(xdr),
+
+	TP_STRUCT__entry(
+		__field(const void *, head_base)
+		__field(size_t, head_len)
+		__field(const void *, tail_base)
+		__field(size_t, tail_len)
+		__field(unsigned int, page_len)
+		__field(unsigned int, msg_len)
+	),
+
+	TP_fast_assign(
+		__entry->head_base = xdr->head[0].iov_base;
+		__entry->head_len = xdr->head[0].iov_len;
+		__entry->tail_base = xdr->tail[0].iov_base;
+		__entry->tail_len = xdr->tail[0].iov_len;
+		__entry->page_len = xdr->page_len;
+		__entry->msg_len = xdr->len;
+	),
+
+	TP_printk("head=[%p,%zu] page=%u tail=[%p,%zu] len=%u",
+		__entry->head_base, __entry->head_len, __entry->page_len,
+		__entry->tail_base, __entry->tail_len, __entry->msg_len
+	)
+);
+
+#define DEFINE_XDRBUF_EVENT(name)					\
+		DEFINE_EVENT(xdr_buf_class, name,			\
+				TP_PROTO(				\
+					const struct xdr_buf *xdr	\
+				),					\
+				TP_ARGS(xdr))
+
 /**
  ** Connection events
  **/
@@ -1634,6 +1672,8 @@
 	)
 );
 
+DEFINE_XDRBUF_EVENT(svcrdma_send_pullup);
+
 TRACE_EVENT(svcrdma_send_failed,
 	TP_PROTO(
 		const struct svc_rqst *rqst,
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index a11983c2056f..8ea21ca351e2 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -537,16 +537,32 @@ void svc_rdma_sync_reply_hdr(struct svcxprt_rdma *rdma,
 				      DMA_TO_DEVICE);
 }
 
-/* If the xdr_buf has more elements than the device can
- * transmit in a single RDMA Send, then the reply will
- * have to be copied into a bounce buffer.
+/**
+ * svc_rdma_pull_up_needed - Determine whether to use pull-up
+ * @rdma: controlling transport
+ * @ctxt: I/O resources for an RDMA Send
+ * @xdr: xdr_buf containing RPC message to transmit
+ * @wr_lst: pointer to start of Write chunk list
+ *
+ * Returns:
+ *	%true if pull-up should be used
+ *	%false otherwise
  */
 static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma,
+				    struct svc_rdma_send_ctxt *ctxt,
 				    struct xdr_buf *xdr,
 				    __be32 *wr_lst)
 {
 	int elements;
 
+	/* Avoid the overhead of DMA mapping for small messages.
+	 */
+	if (xdr->len < RPCRDMA_V1_DEF_INLINE_SIZE >> 1)
+		return true;
+
+	/* Check whether the xdr_buf has more elements than can
+	 * fit in a single RDMA Send.
+	 */
 	/* xdr->head */
 	elements = 1;
 
@@ -627,6 +643,7 @@ static int svc_rdma_pull_up_reply_msg(struct svcxprt_rdma *rdma,
 				      ctxt->sc_sges[0].length,
 				      DMA_TO_DEVICE);
 
+	trace_svcrdma_send_pullup(xdr);
 	return 0;
 }
 
@@ -652,7 +669,7 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
 	u32 xdr_pad;
 	int ret;
 
-	if (svc_rdma_pull_up_needed(rdma, xdr, wr_lst))
+	if (svc_rdma_pull_up_needed(rdma, ctxt, xdr, wr_lst))
 		return svc_rdma_pull_up_reply_msg(rdma, ctxt, xdr, wr_lst);
 
 	++ctxt->sc_cur_sge_no;
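
For anyone who wants to poke at the cutoff outside the kernel, here is a
minimal user-space sketch of the new heuristic, not the transport code
itself. It assumes RPCRDMA_V1_DEF_INLINE_SIZE is 1024 bytes (the
RPC-over-RDMA v1 default inline size) and substitutes a stripped-down
stand-in for struct xdr_buf; only the comparison against xdr->len mirrors
the patch.

/* pullup_sketch.c: stand-alone illustration of the pull-up cutoff.
 * RPCRDMA_V1_DEF_INLINE_SIZE (assumed to be 1024) and struct
 * sketch_xdr_buf are simplified stand-ins, not the kernel definitions.
 */
#include <stdbool.h>
#include <stdio.h>

#define RPCRDMA_V1_DEF_INLINE_SIZE 1024	/* assumed v1 default inline size */

struct sketch_xdr_buf {
	unsigned int len;	/* total length of the RPC message body */
};

/* Mirrors the new test at the top of svc_rdma_pull_up_needed(): copy
 * (pull up) the message into the already-mapped Send buffer when it is
 * smaller than half the minimum Send buffer size.
 */
static bool pull_up_small_message(const struct sketch_xdr_buf *xdr)
{
	return xdr->len < RPCRDMA_V1_DEF_INLINE_SIZE >> 1;
}

int main(void)
{
	struct sketch_xdr_buf tiny = { .len = 120 };
	struct sketch_xdr_buf large = { .len = 4096 };

	printf("120-byte reply: %s\n",
	       pull_up_small_message(&tiny) ? "pull-up" : "DMA map");
	printf("4096-byte reply: %s\n",
	       pull_up_small_message(&large) ? "pull-up" : "DMA map");
	return 0;
}

With the assumed 1024-byte inline size the cutoff works out to 512 bytes:
the 120-byte reply is copied, while the 4096-byte reply still goes through
the DMA-mapping path in svc_rdma_map_reply_msg().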