While looking at another issue, I noticed that deferred completion
happens to run on the same CPU as Receive completion, thanks to the
fact that the deferred completion workqueue is BOUND. That suggests
there's really no benefit to deferring completion unless it will
have to context switch while waiting for LocalInv to complete.

A somewhat non-intuitive side benefit of this change is that there
are fewer waits for Send completions. Now that this wait is always
done in the Reply handler (a single process), it serializes
subsequent replies. Send completions are batched, so waiting for
one Send completion means waiting for all outstanding Send
completions at once. When the Reply handler gets to subsequent
replies, waiting (and the context switch that goes with it) is less
likely to be needed.

Measurements of IOPS throughput without deferred completion show an
improvement of several percent, and latency is just as good or
slightly better for 4KB 100% read and 8KB 70% read / 30% write
workloads.

Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx>
---
 net/sunrpc/xprtrdma/rpc_rdma.c  | 31 +++++++++++++++++++++++++------
 net/sunrpc/xprtrdma/verbs.c     |  8 ++++----
 net/sunrpc/xprtrdma/xprt_rdma.h |  1 -
 3 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index b759b16..c3bd18a 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -1226,7 +1226,7 @@ static int decode_reply_chunk(struct xdr_stream *xdr, u32 *length)
  * RPC completion while holding the transport lock to ensure
  * the rep, rqst, and rq_task pointers remain stable.
  */
-void rpcrdma_complete_rqst(struct rpcrdma_rep *rep)
+static void rpcrdma_complete_rqst(struct rpcrdma_rep *rep)
 {
 	struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
 	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
@@ -1268,6 +1268,12 @@ void rpcrdma_complete_rqst(struct rpcrdma_rep *rep)
 	goto out;
 }
 
+/**
+ * rpcrdma_release_rqst - Release hardware resources
+ * @r_xprt: controlling transport
+ * @req: request with resources to release
+ *
+ */
 void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 {
 	/* Invalidate and unmap the data payloads before waking
@@ -1295,7 +1301,11 @@ void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 	}
 }
 
-/* Reply handling runs in the poll worker thread. Anything that
+/**
+ * rpcrdma_deferred_completion
+ * @work: work struct embedded in an rpcrdma_rep
+ *
+ * Reply handling runs in the poll worker thread. Anything that
  * might wait is deferred to a separate workqueue.
  */
 void rpcrdma_deferred_completion(struct work_struct *work)
@@ -1306,13 +1316,14 @@ void rpcrdma_deferred_completion(struct work_struct *work)
 	struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
 
 	trace_xprtrdma_defer_cmp(rep);
-	if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)
-		frwr_reminv(rep, &req->rl_registered);
+	rpcrdma_release_rqst(r_xprt, req);
 	rpcrdma_complete_rqst(rep);
 }
 
-/* Process received RPC/RDMA messages.
+/**
+ * rpcrdma_reply_handler - Process received RPC/RDMA messages
+ * @rep: Incoming rpcrdma_rep object to process
  *
  * Errors must result in the RPC task either being awakened, or
  * allowed to timeout, to discover the errors at that time.
@@ -1375,7 +1386,15 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
 	clear_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags);
 
 	trace_xprtrdma_reply(rqst->rq_task, rep, req, credits);
-	queue_work(buf->rb_completion_wq, &rep->rr_work);
+
+	if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)
+		frwr_reminv(rep, &req->rl_registered);
+	if (!list_empty(&req->rl_registered)) {
+		queue_work(buf->rb_completion_wq, &rep->rr_work);
+	} else {
+		rpcrdma_release_rqst(r_xprt, req);
+		rpcrdma_complete_rqst(rep);
+	}
 	return;
 
 out_badversion:
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 30cfc0e..fe005c6 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1106,10 +1106,10 @@ struct rpcrdma_req *
 	if (rc)
 		goto out;
 
-	buf->rb_completion_wq = alloc_workqueue("rpcrdma-%s",
-						WQ_MEM_RECLAIM | WQ_HIGHPRI,
-						0,
-			r_xprt->rx_xprt.address_strings[RPC_DISPLAY_ADDR]);
+	buf->rb_completion_wq =
+		alloc_workqueue("rpcrdma-%s",
+				WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 0,
+				r_xprt->rx_xprt.address_strings[RPC_DISPLAY_ADDR]);
 	if (!buf->rb_completion_wq) {
 		rc = -ENOMEM;
 		goto out;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 10f6593..6a49597 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -613,7 +613,6 @@ int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
 void rpcrdma_unmap_sendctx(struct rpcrdma_sendctx *sc);
 int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst);
 void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *);
-void rpcrdma_complete_rqst(struct rpcrdma_rep *rep);
 void rpcrdma_reply_handler(struct rpcrdma_rep *rep);
 void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt,
 			  struct rpcrdma_req *req);
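
P.S. For anyone who wants to poke at the new dispatch decision outside
the kernel, below is a minimal userspace sketch of the logic this patch
adds to the tail of rpcrdma_reply_handler(). It is an illustration only:
struct rep, reminv(), and the two path functions are simplified
stand-ins, not the kernel's real data structures or API, and Remote
Invalidation is modeled as covering a single MR per completion.

/* Minimal sketch only -- all names below are stand-ins, not kernel API. */
#include <stdbool.h>
#include <stdio.h>

struct rep {
	bool wc_with_invalidate;	/* models IB_WC_WITH_INVALIDATE */
	int registered_mrs;		/* models the rl_registered list */
};

/* Models frwr_reminv(): Remote Invalidation covers one rkey. */
static void reminv(struct rep *rep)
{
	if (rep->wc_with_invalidate && rep->registered_mrs > 0)
		rep->registered_mrs--;
}

static void complete_inline(void)
{
	printf("fast path: complete in Receive context\n");
}

static void defer_to_workqueue(void)
{
	printf("slow path: queue rr_work, LocalInv may sleep\n");
}

/* The dispatch decision added to the tail of the Reply handler. */
static void reply_handler_tail(struct rep *rep)
{
	reminv(rep);
	if (rep->registered_mrs)
		defer_to_workqueue();
	else
		complete_inline();
}

int main(void)
{
	struct rep remote = { .wc_with_invalidate = true,  .registered_mrs = 1 };
	struct rep local  = { .wc_with_invalidate = false, .registered_mrs = 1 };

	reply_handler_tail(&remote);	/* peer invalidated the MR for us */
	reply_handler_tail(&local);	/* MR left over: defer for LocalInv */
	return 0;
}

The ordering mirrors the patch: reminv() runs before the emptiness
test, so only MRs that still need LocalInv force the queue_work()
and its context switch.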