On Fri, 2023-04-07 at 20:30 -0700, Dai Ngo wrote: > Currently call_bind_status places a hard limit of 9 seconds for > retries > on EACCES error. This limit was done to prevent the RPC request from > being retried forever if the remote server has problem and never > comes > up > > However this 9 seconds timeout is too short, comparing to other RPC > timeouts which are generally in minutes. This causes intermittent > failure > with EIO on the client side when there are lots of NLM activity and > the > NFS server is restarted. > > Instead of removing the max timeout for retry and relying on the RPC > timeout mechanism to handle the retry, which can lead to the RPC > being > retried forever if the remote NLM service fails to come up. This > patch > simply increases the max timeout of call_bind_status from 9 to 90 > seconds > which should allow enough time for NLM to register after a restart, > and > not retrying forever if there is real problem with the remote system. > > Fixes: 0b760113a3a1 ("NLM: Don't hang forever on NLM unlock > requests") > Reported-by: Helen Chao <helen.chao@xxxxxxxxxx> > Tested-by: Helen Chao <helen.chao@xxxxxxxxxx> > Signed-off-by: Dai Ngo <dai.ngo@xxxxxxxxxx> > --- > include/linux/sunrpc/clnt.h | 3 +++ > include/linux/sunrpc/sched.h | 4 ++-- > net/sunrpc/clnt.c | 2 +- > net/sunrpc/sched.c | 3 ++- > 4 files changed, 8 insertions(+), 4 deletions(-) > > diff --git a/include/linux/sunrpc/clnt.h > b/include/linux/sunrpc/clnt.h > index 770ef2cb5775..81afc5ea2665 100644 > --- a/include/linux/sunrpc/clnt.h > +++ b/include/linux/sunrpc/clnt.h > @@ -162,6 +162,9 @@ struct rpc_add_xprt_test { > #define RPC_CLNT_CREATE_REUSEPORT (1UL << 11) > #define RPC_CLNT_CREATE_CONNECTED (1UL << 12) > > +#define RPC_CLNT_REBIND_DELAY 3 > +#define RPC_CLNT_REBIND_MAX_TIMEOUT 90 > + > struct rpc_clnt *rpc_create(struct rpc_create_args *args); > struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *, > const struct rpc_program *, u32); > diff --git a/include/linux/sunrpc/sched.h > b/include/linux/sunrpc/sched.h > index b8ca3ecaf8d7..e9dc142f10bb 100644 > --- a/include/linux/sunrpc/sched.h > +++ b/include/linux/sunrpc/sched.h > @@ -90,8 +90,8 @@ struct rpc_task { > #endif > unsigned char tk_priority : 2,/* Task priority */ > tk_garb_retry : 2, > - tk_cred_retry : 2, > - tk_rebind_retry : 2; > + tk_cred_retry : 2; > + unsigned char tk_rebind_retry; > }; > > typedef void (*rpc_action)(struct rpc_task *); > diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c > index fd7e1c630493..222578af6b01 100644 > --- a/net/sunrpc/clnt.c > +++ b/net/sunrpc/clnt.c > @@ -2053,7 +2053,7 @@ call_bind_status(struct rpc_task *task) > if (task->tk_rebind_retry == 0) > break; > task->tk_rebind_retry--; > - rpc_delay(task, 3*HZ); > + rpc_delay(task, RPC_CLNT_REBIND_DELAY * HZ); > goto retry_timeout; > case -ENOBUFS: > rpc_delay(task, HZ >> 2); > diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c > index be587a308e05..5c18a35752aa 100644 > --- a/net/sunrpc/sched.c > +++ b/net/sunrpc/sched.c > @@ -817,7 +817,8 @@ rpc_init_task_statistics(struct rpc_task *task) > /* Initialize retry counters */ > task->tk_garb_retry = 2; > task->tk_cred_retry = 2; > - task->tk_rebind_retry = 2; > + task->tk_rebind_retry = RPC_CLNT_REBIND_MAX_TIMEOUT / > + RPC_CLNT_REBIND_DELAY; Why not just implement an exponential back off? If the server is slow to come up, then pounding the rpcbind service every 3 seconds isn't going to help. > > /* starting timestamp */ > task->tk_start = ktime_get(); -- Trond Myklebust Linux NFS client maintainer, Hammerspace trond.myklebust@xxxxxxxxxxxxxxx