While looking at another issue, I noticed that deferred completion
happens to run on the same CPU as Receive completion, thanks to the
fact that the deferred completion workqueue is BOUND. That suggests
there's really no benefit to deferring completion unless it will
have to context switch while waiting for LocalInv to complete.

A somewhat non-intuitive side benefit of this change is that there
are fewer waits for Send completions. Now that this wait is always
done in the Reply handler (a single process), it serializes
subsequent replies. Send completions are batched, so waiting for
one Send completion means waiting for all outstanding Send
completions at once. When the Reply handler gets to subsequent
replies, waiting (and the context switch that goes with it) is less
likely to be needed.

Measurements of IOPS throughput without deferred completion show an
improvement of several percent, and latency is just as good or
slightly better for 4KB 100% read and 8KB 70% read / 30% write
workloads.

Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx>
---
 net/sunrpc/xprtrdma/rpc_rdma.c  | 31 +++++++++++++++++++++++++------
 net/sunrpc/xprtrdma/verbs.c     |  8 ++++----
 net/sunrpc/xprtrdma/xprt_rdma.h |  1 -
 3 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index b759b16..c3bd18a 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -1226,7 +1226,7 @@ static int decode_reply_chunk(struct xdr_stream *xdr, u32 *length)
  * RPC completion while holding the transport lock to ensure
  * the rep, rqst, and rq_task pointers remain stable.
  */
-void rpcrdma_complete_rqst(struct rpcrdma_rep *rep)
+static void rpcrdma_complete_rqst(struct rpcrdma_rep *rep)
 {
 	struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
 	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
@@ -1268,6 +1268,12 @@ void rpcrdma_complete_rqst(struct rpcrdma_rep *rep)
 	goto out;
 }
 
+/**
+ * rpcrdma_release_rqst - Release hardware resources
+ * @r_xprt: controlling transport
+ * @req: request with resources to release
+ *
+ */
 void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 {
 	/* Invalidate and unmap the data payloads before waking
@@ -1295,7 +1301,11 @@ void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 	}
 }
 
-/* Reply handling runs in the poll worker thread. Anything that
+/**
+ * rpcrdma_deferred_completion
+ * @work: work struct embedded in an rpcrdma_rep
+ *
+ * Reply handling runs in the poll worker thread. Anything that
  * might wait is deferred to a separate workqueue.
  */
 void rpcrdma_deferred_completion(struct work_struct *work)
@@ -1306,13 +1316,14 @@ void rpcrdma_deferred_completion(struct work_struct *work)
 	struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
 
 	trace_xprtrdma_defer_cmp(rep);
-	if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)
-		frwr_reminv(rep, &req->rl_registered);
+	rpcrdma_release_rqst(r_xprt, req);
 	rpcrdma_complete_rqst(rep);
 }
 
-/* Process received RPC/RDMA messages.
+/**
+ * rpcrdma_reply_handler - Process received RPC/RDMA messages
+ * @rep: Incoming rpcrdma_rep object to process
  *
  * Errors must result in the RPC task either being awakened, or
  * allowed to timeout, to discover the errors at that time.
@@ -1375,7 +1386,15 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
 	clear_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags);
 
 	trace_xprtrdma_reply(rqst->rq_task, rep, req, credits);
-	queue_work(buf->rb_completion_wq, &rep->rr_work);
+
+	if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)
+		frwr_reminv(rep, &req->rl_registered);
+	if (!list_empty(&req->rl_registered)) {
+		queue_work(buf->rb_completion_wq, &rep->rr_work);
+	} else {
+		rpcrdma_release_rqst(r_xprt, req);
+		rpcrdma_complete_rqst(rep);
+	}
 	return;
 
 out_badversion:
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 30cfc0e..fe005c6 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1106,10 +1106,10 @@ struct rpcrdma_req *
 	if (rc)
 		goto out;
 
-	buf->rb_completion_wq = alloc_workqueue("rpcrdma-%s",
-						WQ_MEM_RECLAIM | WQ_HIGHPRI,
-						0,
-			r_xprt->rx_xprt.address_strings[RPC_DISPLAY_ADDR]);
+	buf->rb_completion_wq =
+		alloc_workqueue("rpcrdma-%s",
+				WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 0,
+				r_xprt->rx_xprt.address_strings[RPC_DISPLAY_ADDR]);
 	if (!buf->rb_completion_wq) {
 		rc = -ENOMEM;
 		goto out;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 10f6593..6a49597 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -613,7 +613,6 @@ int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
 void rpcrdma_unmap_sendctx(struct rpcrdma_sendctx *sc);
 int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst);
 void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *);
-void rpcrdma_complete_rqst(struct rpcrdma_rep *rep);
 void rpcrdma_reply_handler(struct rpcrdma_rep *rep);
 void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt,
 			  struct rpcrdma_req *req);
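
P.S. For anyone who wants to poke at the new dispatch decision outside
the kernel, below is a minimal userspace sketch of the logic this patch
adds to the tail of rpcrdma_reply_handler(). It is an illustration only:
struct rep, reminv(), and the two path functions are simplified
stand-ins, not the kernel's real data structures or API, and Remote
Invalidation is modeled as covering a single MR per completion.

/* Minimal sketch only -- all names below are stand-ins, not kernel API. */
#include <stdbool.h>
#include <stdio.h>

struct rep {
	bool wc_with_invalidate;	/* models IB_WC_WITH_INVALIDATE */
	int registered_mrs;		/* models the rl_registered list */
};

/* Models frwr_reminv(): Remote Invalidation covers one rkey. */
static void reminv(struct rep *rep)
{
	if (rep->wc_with_invalidate && rep->registered_mrs > 0)
		rep->registered_mrs--;
}

static void complete_inline(void)
{
	printf("fast path: complete in Receive context\n");
}

static void defer_to_workqueue(void)
{
	printf("slow path: queue rr_work, LocalInv may sleep\n");
}

/* The dispatch decision added to the tail of the Reply handler. */
static void reply_handler_tail(struct rep *rep)
{
	reminv(rep);
	if (rep->registered_mrs)
		defer_to_workqueue();
	else
		complete_inline();
}

int main(void)
{
	struct rep remote = { .wc_with_invalidate = true,  .registered_mrs = 1 };
	struct rep local  = { .wc_with_invalidate = false, .registered_mrs = 1 };

	reply_handler_tail(&remote);	/* peer invalidated the MR for us */
	reply_handler_tail(&local);	/* MR left over: defer for LocalInv */
	return 0;
}

The ordering mirrors the patch: reminv() runs before the emptiness
test, so only MRs that still need LocalInv force the queue_work()
and its context switch.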