On Sat, 2010-10-30 at 17:46 -0400, Brian J. Murrell wrote: > On Sat, 2010-10-30 at 17:41 -0400, Trond Myklebust wrote: > > > > There are 2 cases which can trigger recovery: server reboot, and network > > partition (i.e. a networking fault that causes the client to be unable > > to contact the server in time in order to renew its lease). > > Yes, seems two cases which I would suspect also. > > > If none of the above apply, > > None should be applicable. Of course I could never know if the network > "blipped" (but highly doubt even that happened) but there should not > have been an outage long enough to prevent a lease renewal. > > > then we need to look at whether it is the > > client or the server that is screwed up. > > Yes, fair enough. I suppose having more than 1 client using the same > server could be sufficient to determine fault at the client or server? > Or is there a more direct route? How about something like the following patch? It should allow you to log only the errors that lead to a state recovery situation. Cheers Trond ------------------------------------------------------------------------------ From 16d0f28f44f4b4075548c359bf2027c8e6c52816 Mon Sep 17 00:00:00 2001 From: Trond Myklebust <Trond.Myklebust@xxxxxxxxxx> Date: Sat, 30 Oct 2010 18:12:00 -0400 Subject: [PATCH] NFSv4: Allow administrators to monitor NFSv4 state recovery When the client receives an error from the server that requires it to initiate state recovery, it may be useful to log that error via the dprintk mechanism. 
This patch allows the administrator to turn on logging with echo 4096 > /proc/sys/sunrpc/nfs_debug Signed-off-by: Trond Myklebust <Trond.Myklebust@xxxxxxxxxx> --- fs/nfs/nfs4proc.c | 28 +++++++++++++++++++--------- include/linux/nfs_fs.h | 1 + 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 330a3c9..fa9b465 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -234,6 +234,15 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) return res; } +static void nfs4_error_recover_state(struct nfs_client *clp, int error) +{ + dfprintk(STATE_RECOVERY, "%s: Received error %d from server %s. " + "Initiating NFSv4 state recovery\n", + __func__, error, + clp->cl_rpcclient->cl_server); + nfs4_schedule_state_recovery(clp); +} + /* This is the error handling routine for processes that are allowed * to sleep. */ @@ -271,7 +280,7 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, case -NFS4ERR_SEQ_MISORDERED: dprintk("%s ERROR: %d Reset session\n", __func__, errorcode); - nfs4_schedule_state_recovery(clp); + nfs4_error_recover_state(clp, errorcode); exception->retry = 1; break; #endif /* defined(CONFIG_NFS_V4_1) */ @@ -295,7 +304,7 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, /* We failed to handle the error */ return nfs4_map_errors(ret); do_state_recovery: - nfs4_schedule_state_recovery(clp); + nfs4_error_recover_state(clp, errorcode); ret = nfs4_wait_clnt_recover(clp); if (ret == 0) exception->retry = 1; @@ -1223,14 +1232,15 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state case -NFS4ERR_BAD_HIGH_SLOT: case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: case -NFS4ERR_DEADSESSION: - nfs4_schedule_state_recovery( - server->nfs_client); + nfs4_error_recover_state(server->nfs_client, + err); goto out; case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: /* Don't recall a delegation if it was 
lost */ - nfs4_schedule_state_recovery(server->nfs_client); + nfs4_error_recover_state(server->nfs_client, + err); goto out; case -ERESTARTSYS: /* @@ -3195,7 +3205,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata) if (task->tk_status < 0) { /* Unless we're shutting down, schedule state recovery! */ if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0) - nfs4_schedule_state_recovery(clp); + nfs4_error_recover_state(clp, task->tk_status); return; } spin_lock(&clp->cl_lock); @@ -3495,7 +3505,7 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, case -NFS4ERR_SEQ_MISORDERED: dprintk("%s ERROR %d, Reset session\n", __func__, task->tk_status); - nfs4_schedule_state_recovery(clp); + nfs4_error_recover_state(clp, task->tk_status); task->tk_status = 0; return -EAGAIN; #endif /* CONFIG_NFS_V4_1 */ @@ -3515,7 +3525,7 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, return 0; do_state_recovery: rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); - nfs4_schedule_state_recovery(clp); + nfs4_error_recover_state(clp, task->tk_status); if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); task->tk_status = 0; @@ -4398,7 +4408,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) case -NFS4ERR_BAD_HIGH_SLOT: case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: case -NFS4ERR_DEADSESSION: - nfs4_schedule_state_recovery(server->nfs_client); + nfs4_error_recover_state(server->nfs_client, err); goto out; case -ERESTARTSYS: /* diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index bad4d12..7c2eb94 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -604,6 +604,7 @@ extern void * nfs_root_data(void); #define NFSDBG_CLIENT 0x0200 #define NFSDBG_MOUNT 0x0400 #define NFSDBG_FSCACHE 0x0800 +#define NFSDBG_STATE_RECOVERY 0x1000 #define NFSDBG_ALL 0xFFFF #ifdef __KERNEL__ -- 1.7.2.3 -- To unsubscribe from this 
list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html