Re: Error: state manager failed on NFSv4 server linux with error 127

Trond Myklebust <Trond.Myklebust@xxxxxxxxxx> · Sat, 30 Oct 2010 18:22:07 -0400

On Sat, 2010-10-30 at 17:46 -0400, Brian J. Murrell wrote:
> On Sat, 2010-10-30 at 17:41 -0400, Trond Myklebust wrote: 
> > 
> > There are 2 cases which can trigger recovery: server reboot, and network
> > partition (i.e. a networking fault that causes the client to be unable
> > to contact the server in time in order to renew its lease).
> 
> Yes, those are the two cases I would suspect as well.
> 
> > If none of the above apply,
> 
> None should be applicable.  Of course I could never know if the network
> "blipped" (but I highly doubt even that happened) but there should not
> have been an outage long enough to prevent a lease renewal.
> 
> > then we need to look at whether it is the
> > client or the server that is screwed up.
> 
> Yes, fair enough.  I suppose having more than 1 client using the same
> server could be sufficient to determine fault at the client or server?
> Or is there a more direct route?

How about something like the following patch? It should allow you to log
only the errors that lead to a state recovery situation.

Cheers
  Trond

------------------------------------------------------------------------------
>From 16d0f28f44f4b4075548c359bf2027c8e6c52816 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@xxxxxxxxxx>
Date: Sat, 30 Oct 2010 18:12:00 -0400
Subject: [PATCH] NFSv4: Allow administrators to monitor NFSv4 state recovery

When the client receives an error from the server that requires it to
initiate state recovery, it may be useful to log that error via the
dprintk mechanism.

This patch allows the administrator to turn on logging with
  echo 4096 > /proc/sys/sunrpc/nfs_debug

Signed-off-by: Trond Myklebust <Trond.Myklebust@xxxxxxxxxx>
---
 fs/nfs/nfs4proc.c      |   28 +++++++++++++++++++---------
 include/linux/nfs_fs.h |    1 +
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 330a3c9..fa9b465 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -234,6 +234,15 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
 	return res;
 }
 
+static void nfs4_error_recover_state(struct nfs_client *clp, int error)
+{
+	dfprintk(STATE_RECOVERY, "%s: Received error %d from server %s. "
+			"Initiating NFSv4 state recovery\n",
+			__func__, error,
+			clp->cl_rpcclient->cl_server);
+	nfs4_schedule_state_recovery(clp);
+}
+
 /* This is the error handling routine for processes that are allowed
  * to sleep.
  */
@@ -271,7 +280,7 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
 		case -NFS4ERR_SEQ_MISORDERED:
 			dprintk("%s ERROR: %d Reset session\n", __func__,
 				errorcode);
-			nfs4_schedule_state_recovery(clp);
+			nfs4_error_recover_state(clp, errorcode);
 			exception->retry = 1;
 			break;
 #endif /* defined(CONFIG_NFS_V4_1) */
@@ -295,7 +304,7 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
 	/* We failed to handle the error */
 	return nfs4_map_errors(ret);
 do_state_recovery:
-	nfs4_schedule_state_recovery(clp);
+	nfs4_error_recover_state(clp, errorcode);
 	ret = nfs4_wait_clnt_recover(clp);
 	if (ret == 0)
 		exception->retry = 1;
@@ -1223,14 +1232,15 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
 			case -NFS4ERR_BAD_HIGH_SLOT:
 			case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
 			case -NFS4ERR_DEADSESSION:
-				nfs4_schedule_state_recovery(
-					server->nfs_client);
+				nfs4_error_recover_state(server->nfs_client,
+						err);
 				goto out;
 			case -NFS4ERR_STALE_CLIENTID:
 			case -NFS4ERR_STALE_STATEID:
 			case -NFS4ERR_EXPIRED:
 				/* Don't recall a delegation if it was lost */
-				nfs4_schedule_state_recovery(server->nfs_client);
+				nfs4_error_recover_state(server->nfs_client,
+						err);
 				goto out;
 			case -ERESTARTSYS:
 				/*
@@ -3195,7 +3205,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata)
 	if (task->tk_status < 0) {
 		/* Unless we're shutting down, schedule state recovery! */
 		if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0)
-			nfs4_schedule_state_recovery(clp);
+			nfs4_error_recover_state(clp, task->tk_status);
 		return;
 	}
 	spin_lock(&clp->cl_lock);
@@ -3495,7 +3505,7 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 		case -NFS4ERR_SEQ_MISORDERED:
 			dprintk("%s ERROR %d, Reset session\n", __func__,
 				task->tk_status);
-			nfs4_schedule_state_recovery(clp);
+			nfs4_error_recover_state(clp, task->tk_status);
 			task->tk_status = 0;
 			return -EAGAIN;
 #endif /* CONFIG_NFS_V4_1 */
@@ -3515,7 +3525,7 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 	return 0;
 do_state_recovery:
 	rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
-	nfs4_schedule_state_recovery(clp);
+	nfs4_error_recover_state(clp, task->tk_status);
 	if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
 		rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
 	task->tk_status = 0;
@@ -4398,7 +4408,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
 			case -NFS4ERR_BAD_HIGH_SLOT:
 			case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
 			case -NFS4ERR_DEADSESSION:
-				nfs4_schedule_state_recovery(server->nfs_client);
+				nfs4_error_recover_state(server->nfs_client, err);
 				goto out;
 			case -ERESTARTSYS:
 				/*
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index bad4d12..7c2eb94 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -604,6 +604,7 @@ extern void * nfs_root_data(void);
 #define NFSDBG_CLIENT		0x0200
 #define NFSDBG_MOUNT		0x0400
 #define NFSDBG_FSCACHE		0x0800
+#define NFSDBG_STATE_RECOVERY	0x1000
 #define NFSDBG_ALL		0xFFFF
 
 #ifdef __KERNEL__
-- 
1.7.2.3


--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html