Currently call_bind_status places a hard limit of 9 seconds for retries on an EACCES error. This limit was added to prevent the RPC request from being retried forever if the remote server has a problem and never comes up. However, this 9-second timeout is too short compared to other RPC timeouts, which are generally in minutes. This causes intermittent failures with EIO on the client side when there is a lot of NLM activity and the NFS server is restarted. Removing the max timeout for retry and relying on the RPC timeout mechanism to handle the retry could lead to the RPC being retried forever if the remote NLM service fails to come up. Instead, this patch simply increases the max timeout of call_bind_status from 9 to 90 seconds, which should allow enough time for NLM to register after a restart without retrying forever if there is a real problem with the remote system. Fixes: 0b760113a3a1 ("NLM: Don't hang forever on NLM unlock requests") Reported-by: Helen Chao <helen.chao@xxxxxxxxxx> Tested-by: Helen Chao <helen.chao@xxxxxxxxxx> Signed-off-by: Dai Ngo <dai.ngo@xxxxxxxxxx> --- include/linux/sunrpc/clnt.h | 3 +++ include/linux/sunrpc/sched.h | 4 ++-- net/sunrpc/clnt.c | 2 +- net/sunrpc/sched.c | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 770ef2cb5775..81afc5ea2665 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -162,6 +162,9 @@ struct rpc_add_xprt_test { #define RPC_CLNT_CREATE_REUSEPORT (1UL << 11) #define RPC_CLNT_CREATE_CONNECTED (1UL << 12) +#define RPC_CLNT_REBIND_DELAY 3 +#define RPC_CLNT_REBIND_MAX_TIMEOUT 90 + struct rpc_clnt *rpc_create(struct rpc_create_args *args); struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *, const struct rpc_program *, u32); diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index b8ca3ecaf8d7..e9dc142f10bb 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ 
-90,8 +90,8 @@ struct rpc_task { #endif unsigned char tk_priority : 2,/* Task priority */ tk_garb_retry : 2, - tk_cred_retry : 2, - tk_rebind_retry : 2; + tk_cred_retry : 2; + unsigned char tk_rebind_retry; }; typedef void (*rpc_action)(struct rpc_task *); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index fd7e1c630493..222578af6b01 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2053,7 +2053,7 @@ call_bind_status(struct rpc_task *task) if (task->tk_rebind_retry == 0) break; task->tk_rebind_retry--; - rpc_delay(task, 3*HZ); + rpc_delay(task, RPC_CLNT_REBIND_DELAY * HZ); goto retry_timeout; case -ENOBUFS: rpc_delay(task, HZ >> 2); diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index be587a308e05..5c18a35752aa 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -817,7 +817,8 @@ rpc_init_task_statistics(struct rpc_task *task) /* Initialize retry counters */ task->tk_garb_retry = 2; task->tk_cred_retry = 2; - task->tk_rebind_retry = 2; + task->tk_rebind_retry = RPC_CLNT_REBIND_MAX_TIMEOUT / + RPC_CLNT_REBIND_DELAY; /* starting timestamp */ task->tk_start = ktime_get(); -- 2.9.5