[PATCH 6/7] pnfs-submit: forgetful client (layouts)

Alexandros Batsakis <batsakis@xxxxxxxxxx> · Tue, 8 Jun 2010 15:15:00 -0700

Forgetful client model:

If we receive a CB_LAYOUTRECALL for a single file:
        - we spawn a thread to handle the recall
        (XXX: for now only one recall can be active at a time; a second
        concurrent recall gets NFS4ERR_DELAY)
        - we check the stateid seqid;
        if it does not match, we return NFS4ERR_DELAY
        - we check for pending I/O;
        if there is any, we return NFS4ERR_DELAY
        Otherwise we return NFS4ERR_NOMATCHING_LAYOUT.
        Note that for whole-file layouts there is no need to serialize LAYOUTGETs/LAYOUTRETURNs.
For bulk recalls (_FSID or _ALL), if there is an active layout, we return
NFS4_OK and start cleaning up the layouts asynchronously. When the cleanup is
done, we send a single bulk LAYOUTRETURN.
Note that there is no need to explicitly prevent new LAYOUTGETs, as the server should reject them.

Signed-off-by: Alexandros Batsakis <batsakis@xxxxxxxxxx>
---
 fs/nfs/callback_proc.c |  146 ++++++++++++++++++++++++++++++++++--------------
 fs/nfs/nfs4_fs.h       |    1 +
 fs/nfs/pnfs.c          |   70 ++++++++++-------------
 3 files changed, 136 insertions(+), 81 deletions(-)

diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 3bae785..abdbf40 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -129,6 +129,38 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf
 
 #if defined(CONFIG_NFS_V4_1)
 
+static bool
+pnfs_is_next_layout_stateid(const struct pnfs_layout_type *lo,
+			    const nfs4_stateid stateid)
+{
+	int seqlock;
+	bool res;
+	u32 oldseqid, newseqid;
+
+	do { /* seqlock read side: redo the check if read_seqretry() says so */
+		seqlock = read_seqbegin(&lo->seqlock);
+		oldseqid = be32_to_cpu(lo->stateid.u.stateid.seqid);
+		newseqid = be32_to_cpu(stateid.u.stateid.seqid);
+		res = !memcmp(lo->stateid.u.stateid.other,
+			      stateid.u.stateid.other,
+			      NFS4_STATEID_OTHER_SIZE);
+		if (res) { /* same "other": seqid must be old + 1 (~0 -> 1) */
+			if (oldseqid == ~0)
+				res = (newseqid == 1);
+			else
+				res = (newseqid == oldseqid + 1);
+		} else { /* open stateid: lo stateid must still be all-zero */
+			res = !memcmp(lo->stateid.u.data,
+				      &zero_stateid,
+				      NFS4_STATEID_SIZE);
+			if (res)
+				res = (newseqid == 1);
+		}
+	} while (read_seqretry(&lo->seqlock, seqlock));
+
+	return res;
+}
+
 /*
  * Retrieve an inode based on layout recall parameters
  *
@@ -191,9 +223,10 @@ static int pnfs_recall_layout(void *data)
 	struct inode *inode, *ino;
 	struct nfs_client *clp;
 	struct cb_pnfs_layoutrecallargs rl;
+	struct nfs4_pnfs_layoutreturn *lrp;
 	struct recall_layout_threadargs *args =
 		(struct recall_layout_threadargs *)data;
-	int status;
+	int status = 0;
 
 	daemonize("nfsv4-layoutreturn");
 
@@ -204,47 +237,59 @@ static int pnfs_recall_layout(void *data)
 	clp = args->clp;
 	inode = args->inode;
 	rl = *args->rl;
-	args->result = 0;
-	complete(&args->started);
-	args = NULL;
-	/* Note: args must not be used after this point!!! */
-
-/* FIXME: need barrier here:
-   pause I/O to data servers
-   pause layoutgets
-   drain all outstanding writes to storage devices
-   wait for any outstanding layoutreturns and layoutgets mentioned in
-   cb_sequence.
-   then return layouts, resume after layoutreturns complete
- */
 
 	/* support whole file layouts only */
 	rl.cbl_seg.offset = 0;
 	rl.cbl_seg.length = NFS4_MAX_UINT64;
 
 	if (rl.cbl_recall_type == RETURN_FILE) {
-		status = pnfs_return_layout(inode, &rl.cbl_seg, &rl.cbl_stateid,
-					    RETURN_FILE, true);
+		if (pnfs_is_next_layout_stateid(&NFS_I(inode)->layout,
+						rl.cbl_stateid))
+			status = pnfs_return_layout(inode, &rl.cbl_seg,
+						    &rl.cbl_stateid, RETURN_FILE,
+						    false);
+		else
+			status = cpu_to_be32(NFS4ERR_DELAY);
 		if (status)
 			dprintk("%s RETURN_FILE error: %d\n", __func__, status);
+		else
+			status =  cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT);
+		args->result = status;
+		complete(&args->started);
 		goto out;
 	}
 
-	/* FIXME: This loop is inefficient, running in O(|s_inodes|^2) */
+	status = cpu_to_be32(NFS4_OK);
+	args->result = status;
+	complete(&args->started);
+	args = NULL;
+
+	/* IMPROVEME: This loop is inefficient, running in O(|s_inodes|^2) */
 	while ((ino = nfs_layoutrecall_find_inode(clp, &rl)) != NULL) {
-		/* XXX need to check status on pnfs_return_layout */
-		pnfs_return_layout(ino, &rl.cbl_seg, NULL, RETURN_FILE, true);
+		/* FIXME: need to check status on pnfs_return_layout */
+		pnfs_return_layout(ino, &rl.cbl_seg, NULL, RETURN_FILE, false);
 		iput(ino);
 	}
 
+	lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
+	if (!lrp) {
+		dprintk("%s: allocation failed. Cannot send last LAYOUTRETURN\n",
+			__func__);
+		goto out;
+	}
+
 	/* send final layoutreturn */
-	status = pnfs_return_layout(inode, &rl.cbl_seg, NULL,
-				    rl.cbl_recall_type, true);
-	if (status)
-		printk(KERN_INFO "%s: ignoring pnfs_return_layout status=%d\n",
-				__func__, status);
+	lrp->args.reclaim = 0;
+	lrp->args.layout_type = rl.cbl_layout_type;
+	lrp->args.return_type = rl.cbl_recall_type;
+	lrp->args.lseg = rl.cbl_seg;
+	lrp->args.inode = inode;
+	lrp->lo = NULL;
+	pnfs4_proc_layoutreturn(lrp, true);
+
 out:
-	iput(inode);
+	clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state);
+	nfs_put_client(clp);
 	module_put_and_exit(0);
 	dprintk("%s: exit status %d\n", __func__, 0);
 	return 0;
@@ -262,15 +307,18 @@ static int pnfs_async_return_layout(struct nfs_client *clp, struct inode *inode,
 		.rl = rl,
 	};
 	struct task_struct *t;
-	int status;
-
-	/* should have returned NFS4ERR_NOMATCHING_LAYOUT... */
-	BUG_ON(inode == NULL);
+	int status = -EAGAIN;
 
 	dprintk("%s: -->\n", __func__);
 
+	/* FIXME: do not allow two concurrent layout recalls */
+	if (test_and_set_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state))
+		return status;
+
 	init_completion(&data.started);
 	__module_get(THIS_MODULE);
+	if (!atomic_inc_not_zero(&clp->cl_count))
+		goto out_put_no_client;
 
 	t = kthread_run(pnfs_recall_layout, &data, "%s", "pnfs_recall_layout");
 	if (IS_ERR(t)) {
@@ -284,6 +332,9 @@ static int pnfs_async_return_layout(struct nfs_client *clp, struct inode *inode,
 	wait_for_completion(&data.started);
 	return data.result;
 out_module_put:
+	nfs_put_client(clp);
+out_put_no_client:
+	clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state);
 	module_put(THIS_MODULE);
 	return status;
 }
@@ -294,35 +345,46 @@ __be32 pnfs_cb_layoutrecall(struct cb_pnfs_layoutrecallargs *args,
 	struct nfs_client *clp;
 	struct inode *inode = NULL;
 	__be32 res;
+	int status;
 	unsigned int num_client = 0;
 
 	dprintk("%s: -->\n", __func__);
 
-	res = htonl(NFS4ERR_INVAL);
-	clp = nfs_find_client(args->cbl_addr, 4);
+	res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
+	clp  = nfs_find_client(args->cbl_addr, 4);
 	if (clp == NULL) {
 		dprintk("%s: no client for addr %u.%u.%u.%u\n",
 			__func__, NIPQUAD(args->cbl_addr));
 		goto out;
 	}
 
-	res = htonl(NFS4ERR_NOMATCHING_LAYOUT);
+	res = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT);
 	do {
 		struct nfs_client *prev = clp;
 		num_client++;
-		inode = nfs_layoutrecall_find_inode(clp, args);
-		if (inode != NULL) {
-			if (PNFS_LD(&NFS_I(inode)->layout)->id ==
-			    args->cbl_layout_type) {
-				/* Set up a helper thread to actually
-				 * return the delegation */
-				res = pnfs_async_return_layout(clp, inode, args);
-				if (res != 0)
-					res = htonl(NFS4ERR_RESOURCE);
-				break;
+		/* the callback must come from the MDS personality */
+		if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS))
+			goto loop;
+		if (args->cbl_recall_type == RETURN_FILE) {
+			inode = nfs_layoutrecall_find_inode(clp, args);
+			if (inode != NULL) {
+				status = pnfs_async_return_layout(clp, inode,
+								  args);
+				if (status)
+					res = cpu_to_be32(NFS4ERR_DELAY);
+				iput(inode);
 			}
+		} else { /* _ALL or _FSID */
+			/* we need the inode to get the nfs_server struct */
+			inode = nfs_layoutrecall_find_inode(clp, args);
+			if (!inode)
+				goto loop;
+			status = pnfs_async_return_layout(clp, inode, args);
+			if (status)
+				res = cpu_to_be32(NFS4ERR_DELAY);
 			iput(inode);
 		}
+loop:
 		clp = nfs_find_client_next(prev);
 		nfs_put_client(prev);
 	} while (clp != NULL);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index ebc9b3b..2f7974b 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -47,6 +47,7 @@ enum nfs4_client_state {
 	NFS4CLNT_SESSION_RESET,
 	NFS4CLNT_SESSION_DRAINING,
 	NFS4CLNT_RECALL_SLOT,
+	NFS4CLNT_LAYOUT_RECALL,
 };
 
 /*
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index bdd0d19..42c46d8 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -687,6 +687,8 @@ return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *range,
 
 	dprintk("--> %s\n", __func__);
 
+	BUG_ON(type != RETURN_FILE);
+
 	lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
 	if (lrp == NULL) {
 		if (lo && (type == RETURN_FILE))
@@ -723,13 +725,11 @@ _pnfs_return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *range,
 
 	dprintk("--> %s type %d\n", __func__, type);
 
-	if (range)
-		arg = *range;
-	else {
-		arg.iomode = IOMODE_ANY;
-		arg.offset = 0;
-		arg.length = NFS4_MAX_UINT64;
-	}
+
+	arg.iomode = range ? range->iomode : IOMODE_ANY;
+	arg.offset = 0;
+	arg.length = NFS4_MAX_UINT64;
+
 	if (type == RETURN_FILE) {
 		lo = get_lock_current_layout(nfsi);
 		if (lo && !has_layout_to_return(lo, &arg)) {
@@ -738,11 +738,7 @@ _pnfs_return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *range,
 		}
 		if (!lo) {
 			dprintk("%s: no layout segments to return\n", __func__);
-			/* must send the LAYOUTRETURN in response to recall */
-			if (stateid)
-				goto send_return;
-			else
-				goto out;
+			goto out;
 		}
 
 		/* unlock w/o put rebalanced by eventual call to
@@ -751,12 +747,23 @@ _pnfs_return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *range,
 		spin_unlock(&nfsi->lo_lock);
 
 		if (pnfs_return_layout_barrier(nfsi, &arg)) {
+			if (stateid) { /* callback */
+				status = -EAGAIN;
+				spin_lock(&nfsi->lo_lock);
+				put_unlock_current_layout(lo);
+				goto out;
+			}
 			dprintk("%s: waiting\n", __func__);
 			wait_event(nfsi->lo_waitq,
-				!pnfs_return_layout_barrier(nfsi, &arg));
+				   !pnfs_return_layout_barrier(nfsi, &arg));
 		}
 
 		if (layoutcommit_needed(nfsi)) {
+			if (stateid && !wait) { /* callback */
+				dprintk("%s: layoutcommit pending\n", __func__);
+				status = -EAGAIN;
+				goto out;
+			}
 			status = pnfs_layoutcommit_inode(ino, wait);
 			if (status) {
 				dprintk("%s: layoutcommit failed, status=%d. "
@@ -765,9 +772,13 @@ _pnfs_return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *range,
 				status = 0;
 			}
 		}
+
+		if (stateid && wait)
+			status = return_layout(ino, &arg, stateid, type,
+					       lo, wait);
+		else
+			pnfs_layout_release(lo, &arg);
 	}
-send_return:
-	status = return_layout(ino, &arg, stateid, type, lo, wait);
 out:
 	dprintk("<-- %s status: %d\n", __func__, status);
 	return status;
@@ -1022,7 +1033,7 @@ pnfs_update_layout(struct inode *ino,
 	struct nfs4_pnfs_layout_segment arg = {
 		.iomode = iomode,
 		.offset = 0,
-		.length = ~0
+		.length = NFS4_MAX_UINT64,
 	};
 	struct nfs_inode *nfsi = NFS_I(ino);
 	struct pnfs_layout_type *lo;
@@ -1041,31 +1052,12 @@ pnfs_update_layout(struct inode *ino,
 	/* Check to see if the layout for the given range already exists */
 	lseg = pnfs_has_layout(lo, &arg, take_ref, !take_ref);
 	if (lseg && !lseg->valid) {
-		spin_unlock(&nfsi->lo_lock);
 		if (take_ref)
 			put_lseg(lseg);
-		for (;;) {
-			prepare_to_wait(&nfsi->lo_waitq, &__wait,
-					TASK_KILLABLE);
-			spin_lock(&nfsi->lo_lock);
-			lseg = pnfs_has_layout(lo, &arg, take_ref, !take_ref);
-			if (!lseg || lseg->valid)
-				break;
-			dprintk("%s: invalid lseg %p ref %d\n", __func__,
-				lseg, atomic_read(&lseg->kref.refcount)-1);
-			if (take_ref)
-				put_lseg(lseg);
-			if (signal_pending(current)) {
-				lseg = NULL;
-				result = -ERESTARTSYS;
-				break;
-			}
-			spin_unlock(&nfsi->lo_lock);
-			schedule();
-		}
-		finish_wait(&nfsi->lo_waitq, &__wait);
-		if (result)
-			goto out_put;
+
+		/* someone is cleaning the layout */
+		result = -EAGAIN;
+		goto out_put;
 	}
 
 	if (lseg) {
-- 
1.6.2.5

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html