I noticed that on RDMA, NFSv4 operations were using "hardway"
allocations far more often than not. A "hardway" allocation uses
GFP_NOFS during each RPC to allocate the XDR buffer, instead of
using a pre-allocated, pre-registered buffer for each RPC. The
pre-allocated buffers are 2200 bytes in length.

The requested XDR buffer sizes looked like this:

  GETATTR: 3220 bytes
  LOOKUP:  3612 bytes
  WRITE:   3256 bytes
  OPEN:    6344 bytes

But an NFSv4 GETATTR RPC request should be small. It's the reply
part of GETATTR that can grow large.

call_allocate() passes a single value as the XDR buffer size: the
sum of the call and reply buffer sizes. However, the xprtrdma
transport allocates its XDR request and reply buffers separately.
xprtrdma needs to know the maximum call size, as guidance for how
large the outgoing request is going to be and how the NFS payload
will be marshalled into chunks. But RDMA XDR reply buffers are
pre-posted, fixed-size buffers, not allocated by
xprt_rdma_allocate().

Because of the sum passed through ->buf_alloc(), xprtrdma's
->buf_alloc() always allocates more XDR buffer than it will ever
use. For NFSv4, this unnecessarily triggers the slow "hardway"
path for almost every RPC.

Pass the call and reply buffer sizes separately to the transport's
->buf_alloc method. The RDMA transport's ->buf_alloc can now ignore
the reply size and allocate just what it will use for the call
buffer. The socket transport's ->buf_alloc can simply add the two
together, as call_allocate() did before.

With this patch, an NFSv4 GETATTR request now allocates a 476-byte
RDMA XDR buffer. I didn't see a single NFSv4 request that did not
fit into the transport's pre-allocated XDR buffer.
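To make the new contract concrete, here is a minimal sketch
(illustrative only, not part of the patch; the function names,
the RDMA_SENDBUF_SIZE constant, and the rdma_use_sendbuf() /
rdma_hardway_alloc() helpers are hypothetical) of how the two
styles of transport treat the split sizes:

/* Hypothetical sketches of the new ->buf_alloc contract. */

/* A socket-style transport still needs one contiguous buffer that
 * holds both the marshalled call and the received reply, so it
 * sums the two sizes, just as call_allocate() summed them before
 * this patch. rpc_malloc()'s new signature takes them separately. */
static void *example_sock_buf_alloc(struct rpc_task *task,
				    size_t call, size_t reply)
{
	return rpc_malloc(task, call, reply);
}

/* An RDMA-style transport receives replies in pre-posted,
 * fixed-size buffers, so only the call size matters when deciding
 * whether the pre-registered send buffer is large enough or a
 * slower "hardway" allocation is required. */
static void *example_rdma_buf_alloc(struct rpc_task *task,
				    size_t call, size_t reply)
{
	if (call <= RDMA_SENDBUF_SIZE)		/* e.g. 2200 bytes */
		return rdma_use_sendbuf(task);	/* hypothetical */
	return rdma_hardway_alloc(task, call);	/* hypothetical */
}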
Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx>
---
 include/linux/sunrpc/sched.h    |    2 +-
 include/linux/sunrpc/xprt.h     |    3 ++-
 net/sunrpc/clnt.c               |    4 ++--
 net/sunrpc/sched.c              |    6 ++++--
 net/sunrpc/xprtrdma/transport.c |    2 +-
 net/sunrpc/xprtsock.c           |    3 ++-
 6 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 1a89599..68fa71d 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -232,7 +232,7 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *,
 					void *);
 void		rpc_wake_up_status(struct rpc_wait_queue *, int);
 void		rpc_delay(struct rpc_task *, unsigned long);
-void *		rpc_malloc(struct rpc_task *, size_t);
+void		*rpc_malloc(struct rpc_task *, size_t, size_t);
 void		rpc_free(void *);
 int		rpciod_up(void);
 void		rpciod_down(void);
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index fcbfe87..632685c 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -124,7 +124,8 @@ struct rpc_xprt_ops {
 	void		(*rpcbind)(struct rpc_task *task);
 	void		(*set_port)(struct rpc_xprt *xprt, unsigned short port);
 	void		(*connect)(struct rpc_xprt *xprt, struct rpc_task *task);
-	void *		(*buf_alloc)(struct rpc_task *task, size_t size);
+	void *		(*buf_alloc)(struct rpc_task *task,
+				     size_t call, size_t reply);
 	void		(*buf_free)(void *buffer);
 	int		(*send_request)(struct rpc_task *task);
 	void		(*set_retrans_timeout)(struct rpc_task *task);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 488ddee..5e817d6 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1599,8 +1599,8 @@ call_allocate(struct rpc_task *task)
 	req->rq_rcvsize = RPC_REPHDRSIZE + slack + proc->p_replen;
 	req->rq_rcvsize <<= 2;
 
-	req->rq_buffer = xprt->ops->buf_alloc(task,
-					req->rq_callsize + req->rq_rcvsize);
+	req->rq_buffer = xprt->ops->buf_alloc(task, req->rq_callsize,
+					req->rq_rcvsize);
 	if (req->rq_buffer != NULL)
 		return;
 
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 9358c79..fc4f939 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -829,7 +829,8 @@ static void rpc_async_schedule(struct work_struct *work)
 /**
  * rpc_malloc - allocate an RPC buffer
  * @task: RPC task that will use this buffer
- * @size: requested byte size
+ * @call: maximum size of on-the-wire RPC call, in bytes
+ * @reply: maximum size of on-the-wire RPC reply, in bytes
  *
  * To prevent rpciod from hanging, this allocator never sleeps,
  * returning NULL and suppressing warning if the request cannot be serviced
@@ -843,8 +844,9 @@ static void rpc_async_schedule(struct work_struct *work)
  * In order to avoid memory starvation triggering more writebacks of
  * NFS requests, we avoid using GFP_KERNEL.
  */
-void *rpc_malloc(struct rpc_task *task, size_t size)
+void *rpc_malloc(struct rpc_task *task, size_t call, size_t reply)
 {
+	size_t size = call + reply;
 	struct rpc_buffer *buf;
 	gfp_t gfp = GFP_NOWAIT | __GFP_NOWARN;
 
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 2faac49..6e9d0a7 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -459,7 +459,7 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
  * the receive buffer portion when using reply chunks.
  */
 static void *
-xprt_rdma_allocate(struct rpc_task *task, size_t size)
+xprt_rdma_allocate(struct rpc_task *task, size_t size, size_t replen)
 {
 	struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
 	struct rpcrdma_req *req, *nreq;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 43cd89e..b4aca48 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2423,8 +2423,9 @@ static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
  * we allocate pages instead doing a kmalloc like rpc_malloc is because we want
  * to use the server side send routines.
  */
-static void *bc_malloc(struct rpc_task *task, size_t size)
+static void *bc_malloc(struct rpc_task *task, size_t call, size_t reply)
 {
+	size_t size = call + reply;
 	struct page *page;
 	struct rpc_buffer *buf;
 
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html