[PATCH 13/18] pnfs-submit: rewrite of layout state handling and cb_layoutrecall

Fred Isaman <iisaman@xxxxxxxxxx> · Thu, 4 Nov 2010 11:22:26 -0400

Remove NFS_LAYOUT_STATEID_SET in favor of just checking list_empty(lo->segs).

LAYOUTGETs with openstateid are serialized.  Waiting on the condition
(list_empty(lo->segs) && plh_outstanding>0) both drains outstanding RPCs once
the stateid is invalidated and allows only a single LAYOUTGET(openstateid)
through at a time.

Before sending a LAYOUTRETURN, plh_block_lgets is incremented.  It is
decremented in the rpc_release function.  While set, LAYOUTGETs are
paused in their rpc_prepare function, and any responses are
forgotten.

Callbacks are handled by blocking any matching LAYOUTGETS while processing and
initiating drain of IO.  A notification system is set up so that when
all relevant IO is finished, the state manager thread is invoked, which
synchronously sends the final matching LAYOUTRETURN before unblocking
LAYOUTGETS.

Signed-off-by: Fred Isaman <iisaman@xxxxxxxxxx>
---
 fs/nfs/callback.h         |    4 +-
 fs/nfs/callback_proc.c    |  471 +++++++++++++++++++++++----------------------
 fs/nfs/client.c           |    3 +
 fs/nfs/inode.c            |    3 +-
 fs/nfs/nfs4proc.c         |  105 +++++++---
 fs/nfs/nfs4state.c        |    4 +
 fs/nfs/nfs4xdr.c          |   16 ++-
 fs/nfs/pnfs.c             |  181 +++++++++++++----
 fs/nfs/pnfs.h             |   41 +++-
 include/linux/nfs_fs.h    |    1 +
 include/linux/nfs_fs_sb.h |    4 +
 11 files changed, 518 insertions(+), 315 deletions(-)

diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 817b0f4..c1c7f3e 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -161,7 +161,8 @@ struct cb_layoutrecallargs {
 extern unsigned nfs4_callback_layoutrecall(
 	struct cb_layoutrecallargs *args,
 	void *dummy, struct cb_process_state *cps);
-
+extern bool matches_outstanding_recall(struct inode *ino,
+				       struct pnfs_layout_range *range);
 #endif /* CONFIG_NFS_V4_1 */
 
 extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
@@ -171,6 +172,7 @@ extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
 				   struct cb_process_state *cps);
 
 #ifdef CONFIG_NFS_V4
+extern void nfs_client_return_layouts(struct nfs_client *clp);
 extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt);
 extern void nfs_callback_down(int minorversion);
 extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation,
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 1509c34..583446b 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -132,270 +132,291 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf
 #if defined(CONFIG_NFS_V4_1)
 
 static bool
-pnfs_is_next_layout_stateid(const struct pnfs_layout_hdr *lo,
-			    const nfs4_stateid stateid)
+_recall_matches_lget(struct pnfs_cb_lrecall_info *cb_info,
+		     struct inode *ino, struct pnfs_layout_range *range)
 {
-	bool res;
-	u32 oldseqid, newseqid;
-
-	spin_lock(&lo->inode->i_lock);
-	{
-		oldseqid = be32_to_cpu(lo->stateid.stateid.seqid);
-		newseqid = be32_to_cpu(stateid.stateid.seqid);
-		res = !memcmp(lo->stateid.stateid.other,
-			      stateid.stateid.other,
-			      NFS4_STATEID_OTHER_SIZE);
-		if (res) { /* comparing layout stateids */
-			if (oldseqid == ~0)
-				res = (newseqid == 1);
-			else
-				res = (newseqid == oldseqid + 1);
-		} else { /* open stateid */
-			res = !memcmp(lo->stateid.data,
-				      &zero_stateid,
-				      NFS4_STATEID_SIZE);
-			if (res)
-				res = (newseqid == 1);
-		}
-	}
-	spin_unlock(&lo->inode->i_lock);
+	struct cb_layoutrecallargs *cb_args = &cb_info->pcl_args;
 
-	return res;
+	switch (cb_args->cbl_recall_type) {
+	case RETURN_ALL:
+		return true;
+	case RETURN_FSID:
+		return !memcmp(&NFS_SERVER(ino)->fsid, &cb_args->cbl_fsid,
+			       sizeof(struct nfs_fsid));
+	case RETURN_FILE:
+		if (ino != cb_info->pcl_ino)
+			return false;
+		return should_free_lseg(range, &cb_args->cbl_range);
+	default:
+		BUG();
+	}
 }
 
-/*
- * Retrieve an inode based on layout recall parameters
- *
- * Note: caller must iput(inode) to dereference the inode.
- */
-static struct inode *
-nfs_layoutrecall_find_inode(struct nfs_client *clp,
-			    const struct cb_layoutrecallargs *args)
+bool
+matches_outstanding_recall(struct inode *ino, struct pnfs_layout_range *range)
 {
-	struct nfs_inode *nfsi;
-	struct pnfs_layout_hdr *lo;
-	struct nfs_server *server;
-	struct inode *ino = NULL;
-
-	dprintk("%s: Begin recall_type=%d clp %p\n",
-		__func__, args->cbl_recall_type, clp);
-
-	spin_lock(&clp->cl_lock);
-	list_for_each_entry(lo, &clp->cl_layouts, layouts) {
-		nfsi = NFS_I(lo->inode);
-		if (!nfsi)
-			continue;
-
-		dprintk("%s: Searching inode=%lu\n",
-			__func__, nfsi->vfs_inode.i_ino);
-
-		if (args->cbl_recall_type == RETURN_FILE) {
-		    if (nfs_compare_fh(&args->cbl_fh, &nfsi->fh))
-			continue;
-		} else if (args->cbl_recall_type == RETURN_FSID) {
-			server = NFS_SERVER(&nfsi->vfs_inode);
-			if (server->fsid.major != args->cbl_fsid.major ||
-			    server->fsid.minor != args->cbl_fsid.minor)
-				continue;
+	struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
+	struct pnfs_cb_lrecall_info *cb_info;
+	bool rv = false;
+
+	assert_spin_locked(&clp->cl_lock);
+	list_for_each_entry(cb_info, &clp->cl_layoutrecalls, pcl_list) {
+		if (_recall_matches_lget(cb_info, ino, range)) {
+			rv = true;
+			break;
 		}
-
-		/* Make sure client didn't clean up layout without
-		 * telling the server */
-		if (!has_layout(nfsi))
-			continue;
-
-		ino = igrab(&nfsi->vfs_inode);
-		dprintk("%s: Found inode=%p\n", __func__, ino);
-		break;
 	}
-	spin_unlock(&clp->cl_lock);
-	return ino;
+	return rv;
 }
 
-struct recall_layout_threadargs {
-	struct inode *inode;
-	struct nfs_client *clp;
-	struct completion started;
-	struct cb_layoutrecallargs *rl;
-	int result;
-};
-
-static int pnfs_recall_layout(void *data)
+/* Send a synchronous LAYOUTRETURN.  By the time this is called, we know
+ * all IO has been drained, any matching lsegs deleted, and that no
+ * overlapping LAYOUTGETs will be sent or processed for the duration
+ * of this call.
+ * Note that it is possible that when this is called, the stateid has
+ * been invalidated.  But it will not be cleared, so it can still be used.
+ */
+static int
+pnfs_send_layoutreturn(struct nfs_client *clp,
+		       struct pnfs_cb_lrecall_info *cb_info)
 {
-	struct inode *inode, *ino;
-	struct nfs_client *clp;
-	struct cb_layoutrecallargs rl;
+	struct cb_layoutrecallargs *args = &cb_info->pcl_args;
 	struct nfs4_layoutreturn *lrp;
-	struct recall_layout_threadargs *args =
-		(struct recall_layout_threadargs *)data;
-	int status = 0;
-
-	daemonize("nfsv4-layoutreturn");
-
-	dprintk("%s: recall_type=%d fsid 0x%llx-0x%llx start\n",
-		__func__, args->rl->cbl_recall_type,
-		args->rl->cbl_fsid.major, args->rl->cbl_fsid.minor);
-
-	clp = args->clp;
-	inode = args->inode;
-	rl = *args->rl;
-
-	/* support whole file layouts only */
-	rl.cbl_range.offset = 0;
-	rl.cbl_range.length = NFS4_MAX_UINT64;
-
-	if (rl.cbl_recall_type == RETURN_FILE) {
-		if (pnfs_is_next_layout_stateid(NFS_I(inode)->layout,
-						rl.cbl_stateid))
-			status = pnfs_return_layout(inode, &rl.cbl_range,
-						    &rl.cbl_stateid, RETURN_FILE,
-						    false);
-		else
-			status = cpu_to_be32(NFS4ERR_DELAY);
-		if (status)
-			dprintk("%s RETURN_FILE error: %d\n", __func__, status);
-		else
-			status =  cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT);
-		args->result = status;
-		complete(&args->started);
-		goto out;
-	}
-
-	status = cpu_to_be32(NFS4_OK);
-	args->result = status;
-	complete(&args->started);
-	args = NULL;
-
-	/* IMPROVEME: This loop is inefficient, running in O(|s_inodes|^2) */
-	while ((ino = nfs_layoutrecall_find_inode(clp, &rl)) != NULL) {
-		/* FIXME: need to check status on pnfs_return_layout */
-		pnfs_return_layout(ino, &rl.cbl_range, NULL, RETURN_FILE, false);
-		iput(ino);
-	}
 
 	lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
-	if (!lrp) {
-		dprintk("%s: allocation failed. Cannot send last LAYOUTRETURN\n",
-			__func__);
-		goto out;
-	}
-
-	/* send final layoutreturn */
+	if (!lrp)
+		return -ENOMEM;
 	lrp->args.reclaim = 0;
-	lrp->args.layout_type = rl.cbl_layout_type;
-	lrp->args.return_type = rl.cbl_recall_type;
+	lrp->args.layout_type = args->cbl_layout_type;
+	lrp->args.return_type = args->cbl_recall_type;
 	lrp->clp = clp;
-	lrp->args.range = rl.cbl_range;
-	lrp->args.inode = inode;
-	nfs4_proc_layoutreturn(lrp, true);
-
-out:
-	clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state);
-	nfs_put_client(clp);
-	module_put_and_exit(0);
-	dprintk("%s: exit status %d\n", __func__, 0);
-	return 0;
+	if (args->cbl_recall_type == RETURN_FILE) {
+		lrp->args.range = args->cbl_range;
+		lrp->args.inode = cb_info->pcl_ino;
+	} else {
+		lrp->args.range.iomode = IOMODE_ANY;
+		lrp->args.inode = NULL;
+	}
+	return nfs4_proc_layoutreturn(lrp, true);
 }
 
-/*
- * Asynchronous layout recall!
+/* Called by state manager to finish CB_LAYOUTRECALLS initiated by
+ * nfs4_callback_layoutrecall().
  */
-static int pnfs_async_return_layout(struct nfs_client *clp, struct inode *inode,
-				    struct cb_layoutrecallargs *rl)
+void nfs_client_return_layouts(struct nfs_client *clp)
 {
-	struct recall_layout_threadargs data = {
-		.clp = clp,
-		.inode = inode,
-		.rl = rl,
-	};
-	struct task_struct *t;
-	int status = -EAGAIN;
+	struct pnfs_cb_lrecall_info *cb_info;
 
-	dprintk("%s: -->\n", __func__);
+	spin_lock(&clp->cl_lock);
+	while (true) {
+		if (list_empty(&clp->cl_layoutrecalls)) {
+			spin_unlock(&clp->cl_lock);
+			break;
+		}
+		cb_info = list_first_entry(&clp->cl_layoutrecalls,
+					   struct pnfs_cb_lrecall_info,
+					   pcl_list);
+		spin_unlock(&clp->cl_lock);
+		if (atomic_read(&cb_info->pcl_count) != 0)
+			break;
+		/* What to do on error return?  These layoutreturns are
+		 * required by the protocol.  So if we do not get a
+		 * successful reply, we probably have to do something
+		 * more drastic.
+		 */
+		pnfs_send_layoutreturn(clp, cb_info);
+		spin_lock(&clp->cl_lock);
+		/* Removing from the list unblocks LAYOUTGETs */
+		list_del(&cb_info->pcl_list);
+		clp->cl_cb_lrecall_count--;
+		rpc_wake_up(&clp->cl_rpcwaitq_recall);
+		kfree(cb_info);
+	}
+}
 
-	/* FIXME: do not allow two concurrent layout recalls */
-	if (test_and_set_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state))
-		return status;
-
-	init_completion(&data.started);
-	__module_get(THIS_MODULE);
-	atomic_inc(&clp->cl_count);
-
-	t = kthread_run(pnfs_recall_layout, &data, "%s", "pnfs_recall_layout");
-	if (IS_ERR(t)) {
-		printk(KERN_INFO "NFS: Layout recall callback thread failed "
-			"for client (clientid %08x/%08x)\n",
-			(unsigned)(clp->cl_clientid >> 32),
-			(unsigned)(clp->cl_clientid));
-		status = PTR_ERR(t);
-		goto out_module_put;
+void notify_drained(struct pnfs_cb_lrecall_info *d)
+{
+	if (d && atomic_dec_and_test(&d->pcl_count)) {
+		set_bit(NFS4CLNT_LAYOUT_RECALL, &d->pcl_clp->cl_state);
+		nfs4_schedule_state_manager(d->pcl_clp);
 	}
-	wait_for_completion(&data.started);
-	return data.result;
-out_module_put:
-	nfs_put_client(clp);
-	clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state);
-	module_put(THIS_MODULE);
-	return status;
 }
 
-static int pnfs_recall_all_layouts(struct nfs_client *clp)
+static int initiate_layout_draining(struct pnfs_cb_lrecall_info *cb_info)
 {
-	struct cb_layoutrecallargs rl;
-	struct inode *inode;
-	int status = 0;
-
-	rl.cbl_recall_type = RETURN_ALL;
-	rl.cbl_range.iomode = IOMODE_ANY;
-	rl.cbl_range.offset = 0;
-	rl.cbl_range.length = NFS4_MAX_UINT64;
-
-	/* we need the inode to get the nfs_server struct */
-	inode = nfs_layoutrecall_find_inode(clp, &rl);
-	if (!inode)
-		return status;
-	status = pnfs_async_return_layout(clp, inode, &rl);
-	iput(inode);
+	struct nfs_client *clp = cb_info->pcl_clp;
+	struct pnfs_layout_hdr *lo;
+	int rv = NFS4ERR_NOMATCHING_LAYOUT;
+	struct cb_layoutrecallargs *args = &cb_info->pcl_args;
+
+	if (args->cbl_recall_type == RETURN_FILE) {
+		LIST_HEAD(free_me_list);
+
+		spin_lock(&clp->cl_lock);
+		list_for_each_entry(lo, &clp->cl_layouts, layouts) {
+			if (nfs_compare_fh(&args->cbl_fh,
+					   &NFS_I(lo->inode)->fh))
+				continue;
+			if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
+				rv = NFS4ERR_DELAY;
+			else {
+				/* FIXME I need to better understand igrab and
+				 * does having a layout ref keep ino around?
+				 *  It should.
+				 */
+				/* We need to hold the reference until any
+				 * potential LAYOUTRETURN is finished.
+				 */
+				get_layout_hdr(lo);
+				cb_info->pcl_ino = lo->inode;
+				rv = NFS4_OK;
+			}
+			break;
+		}
+		spin_unlock(&clp->cl_lock);
+
+		spin_lock(&lo->inode->i_lock);
+		if (rv == NFS4_OK) {
+			lo->plh_block_lgets++;
+			nfs4_asynch_forget_layouts(lo, &args->cbl_range,
+						   cb_info, &free_me_list);
+		}
+		pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
+		spin_unlock(&lo->inode->i_lock);
+		pnfs_free_lseg_list(&free_me_list);
+	} else {
+		struct pnfs_layout_hdr *tmp;
+		LIST_HEAD(recall_list);
+		LIST_HEAD(free_me_list);
+		struct pnfs_layout_range range = {
+			.iomode = IOMODE_ANY,
+			.offset = 0,
+			.length = NFS4_MAX_UINT64,
+		};
+
+		spin_lock(&clp->cl_lock);
+		/* Per RFC 5661, 12.5.5.2.1.5, bulk recall must be serialized */
+		if (!list_is_singular(&clp->cl_layoutrecalls)) {
+			spin_unlock(&clp->cl_lock);
+			return NFS4ERR_DELAY;
+		}
+		list_for_each_entry(lo, &clp->cl_layouts, layouts) {
+			if ((args->cbl_recall_type == RETURN_FSID) &&
+			    memcmp(&NFS_SERVER(lo->inode)->fsid,
+				   &args->cbl_fsid, sizeof(struct nfs_fsid)))
+				continue;
+			get_layout_hdr(lo);
+			/* We could list_del(&lo->layouts) here */
+			BUG_ON(!list_empty(&lo->plh_bulk_recall));
+			list_add(&lo->plh_bulk_recall, &recall_list);
+		}
+		spin_unlock(&clp->cl_lock);
+		list_for_each_entry_safe(lo, tmp,
+					 &recall_list, plh_bulk_recall) {
+			spin_lock(&lo->inode->i_lock);
+			set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
+			nfs4_asynch_forget_layouts(lo, &range, cb_info,
+						   &free_me_list);
+			list_del_init(&lo->plh_bulk_recall);
+			spin_unlock(&lo->inode->i_lock);
+			put_layout_hdr(lo->inode);
+			rv = NFS4_OK;
+		}
+		pnfs_free_lseg_list(&free_me_list);
+	}
+	return rv;
+}
+
+static u32 do_callback_layoutrecall(struct nfs_client *clp,
+				    struct cb_layoutrecallargs *args)
+{
+	struct pnfs_cb_lrecall_info *new;
+	u32 res;
+
+	dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type);
+	new = kmalloc(sizeof(*new), GFP_KERNEL);
+	if (!new) {
+		res = NFS4ERR_RESOURCE;
+		goto out;
+	}
+	memcpy(&new->pcl_args, args, sizeof(*args));
+	atomic_set(&new->pcl_count, 1);
+	new->pcl_clp = clp;
+	new->pcl_ino = NULL;
+	spin_lock(&clp->cl_lock);
+	if (clp->cl_cb_lrecall_count >= PNFS_MAX_CB_LRECALLS) {
+		kfree(new);
+		res = NFS4ERR_DELAY;
+		spin_unlock(&clp->cl_lock);
+		goto out;
+	}
+	clp->cl_cb_lrecall_count++;
+	/* Adding to the list will block conflicting LGET activity */
+	list_add_tail(&new->pcl_list, &clp->cl_layoutrecalls);
+	spin_unlock(&clp->cl_lock);
+	res = initiate_layout_draining(new);
+	if (res || atomic_dec_and_test(&new->pcl_count)) {
+		spin_lock(&clp->cl_lock);
+		list_del(&new->pcl_list);
+		clp->cl_cb_lrecall_count--;
+		rpc_wake_up(&clp->cl_rpcwaitq_recall);
+		spin_unlock(&clp->cl_lock);
+		if (res == NFS4_OK) {
+			if (args->cbl_recall_type == RETURN_FILE) {
+				struct pnfs_layout_hdr *lo;
+
+				lo = NFS_I(new->pcl_ino)->layout;
+				spin_lock(&lo->inode->i_lock);
+				lo->plh_block_lgets--;
+				if (!pnfs_layoutgets_blocked(lo, NULL))
+					rpc_wake_up(&NFS_I(lo->inode)->lo_rpcwaitq_stateid);
+				spin_unlock(&lo->inode->i_lock);
+				put_layout_hdr(new->pcl_ino);
+			}
+			res = NFS4ERR_NOMATCHING_LAYOUT;
+		}
+		kfree(new);
+	}
+out:
+	dprintk("%s returning %i\n", __func__, res);
+	return res;
 
-	return status;
 }
 
 __be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args,
 				  void *dummy, struct cb_process_state *cps)
 {
 	struct nfs_client *clp;
-	struct inode *inode = NULL;
-	__be32 res;
-	int status;
+	u32 res;
 
 	dprintk("%s: -->\n", __func__);
 
-	res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
 	if (cps->session) /* set in cb_sequence */
 		clp = cps->session->clp;
-	else
+	else {
+		res = NFS4ERR_OP_NOT_IN_SESSION;
 		goto out;
-
+	}
 	/* the callback must come from the MDS personality */
-	res = cpu_to_be32(NFS4ERR_NOTSUPP);
-	if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS))
+	if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS)) {
+		res = NFS4ERR_INVAL;
 		goto out;
-
-	res = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT);
-	/*
-	 * In the _ALL or _FSID case, we need the inode to get
-	 * the nfs_server struct.
-	 */
-	inode = nfs_layoutrecall_find_inode(clp, args);
-	if (!inode)
-		goto out;
-	status = pnfs_async_return_layout(clp, inode, args);
-	if (status)
-		res = cpu_to_be32(NFS4ERR_DELAY);
-	iput(inode);
+	}
+	res = do_callback_layoutrecall(clp, args);
 out:
-	dprintk("%s: exit with status = %d\n", __func__, ntohl(res));
-	return res;
+	dprintk("%s: exit with status = %d\n", __func__, res);
+	return cpu_to_be32(res);
+}
+
+static void pnfs_recall_all_layouts(struct nfs_client *clp)
+{
+	struct cb_layoutrecallargs args;
+
+	/* Pretend we got a CB_LAYOUTRECALL(ALL) */
+	memset(&args, 0, sizeof(args));
+	args.cbl_recall_type = RETURN_ALL;
+	/* FIXME we ignore errors, what should we do? */
+	do_callback_layoutrecall(clp, &args);
 }
 
 int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
@@ -677,9 +698,7 @@ __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy,
 		flags |= FMODE_WRITE;
 	if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *)
 		     &args->craa_type_mask))
-		if (pnfs_recall_all_layouts(clp) == -EAGAIN)
-			status = cpu_to_be32(NFS4ERR_DELAY);
-
+		pnfs_recall_all_layouts(clp);
 	if (flags)
 		nfs_expire_all_delegation_types(clp, flags);
 out:
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 3c8c841..dbf43e7 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -158,6 +158,9 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
 		clp->cl_machine_cred = cred;
 #if defined(CONFIG_NFS_V4_1)
 	INIT_LIST_HEAD(&clp->cl_layouts);
+	INIT_LIST_HEAD(&clp->cl_layoutrecalls);
+	rpc_init_wait_queue(&clp->cl_rpcwaitq_recall,
+			    "NFS client CB_LAYOUTRECALLS");
 #endif
 	nfs_fscache_get_client_cookie(clp);
 
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 72f27cc..8727ade 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1459,7 +1459,8 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi)
 	nfsi->delegation = NULL;
 	nfsi->delegation_state = 0;
 	init_rwsem(&nfsi->rwsem);
-	rpc_init_wait_queue(&nfsi->lo_rpcwaitq, "pNFS Layout");
+	rpc_init_wait_queue(&nfsi->lo_rpcwaitq, "pNFS Layoutreturn");
+	rpc_init_wait_queue(&nfsi->lo_rpcwaitq_stateid, "pNFS Layoutstateid");
 	nfsi->layout = NULL;
 #endif
 }
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index be19e225..87b2b63 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5346,42 +5346,58 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
 	struct inode *ino = lgp->args.inode;
 	struct nfs_inode *nfsi = NFS_I(ino);
 	struct nfs_server *server = NFS_SERVER(ino);
-	struct pnfs_layout_segment *lseg;
+	struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
 
 	dprintk("--> %s\n", __func__);
+	spin_lock(&clp->cl_lock);
+	if (matches_outstanding_recall(ino, &lgp->args.range)) {
+		rpc_sleep_on(&clp->cl_rpcwaitq_recall, task, NULL);
+		spin_unlock(&clp->cl_lock);
+		return;
+	}
+	spin_unlock(&clp->cl_lock);
+	/* Note there is a race here, where a CB_LAYOUTRECALL can come in
+	 * right now covering the LAYOUTGET we are about to send.
+	 * However, that is not so catastrophic, and there seems
+	 * to be no way to prevent it completely.
+	 */
 	spin_lock(&ino->i_lock);
-	lseg = pnfs_has_layout(nfsi->layout, &lgp->args.range);
-	if (likely(!lseg)) {
+	if (pnfs_layoutgets_blocked(nfsi->layout, NULL)) {
+		rpc_sleep_on(&nfsi->lo_rpcwaitq_stateid, task, NULL);
 		spin_unlock(&ino->i_lock);
-		dprintk("%s: no lseg found, proceeding\n", __func__);
-		if (!nfs4_setup_sequence(server, NULL, &lgp->args.seq_args,
-					 &lgp->res.seq_res, 0, task))
-			rpc_call_start(task);
 		return;
 	}
-	if (!lseg->valid) {
+	/* This needs to happen after, but atomically with, the above check
+	 * in order to properly serialize openstateid LAYOUTGETs.
+	 */
+	nfsi->layout->plh_outstanding++;
+	spin_unlock(&ino->i_lock);
+
+	if (nfs4_setup_sequence(server, NULL, &lgp->args.seq_args,
+				&lgp->res.seq_res, 0, task)) {
+		spin_lock(&ino->i_lock);
+		nfsi->layout->plh_outstanding--;
 		spin_unlock(&ino->i_lock);
-		dprintk("%s: invalid lseg found, waiting\n", __func__);
-		rpc_sleep_on(&nfsi->lo_rpcwaitq, task, NULL);
 		return;
 	}
-	get_lseg(lseg);
-	*lgp->lsegpp = lseg;
-	spin_unlock(&ino->i_lock);
-	dprintk("%s: valid lseg found, no rpc required\n", __func__);
-	rpc_exit(task, NFS4_OK);
+	rpc_call_start(task);
 }
 
 static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs4_layoutget *lgp = calldata;
-	struct nfs_server *server = NFS_SERVER(lgp->args.inode);
+	struct inode *ino = lgp->args.inode;
 
 	dprintk("--> %s\n", __func__);
 
-	if (!nfs4_sequence_done(task, &lgp->res.seq_res))
+	if (!nfs4_sequence_done(task, &lgp->res.seq_res)) {
+		/* layout code relies on fact that in this case
+		 * code falls back to tk_action=call_start, but not
+		 * back to rpc_prepare_task, to keep plh_outstanding
+		 * correct.
+		 */
 		return;
-
+	}
 	switch (task->tk_status) {
 	case 0:
 		break;
@@ -5390,7 +5406,11 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
 		task->tk_status = -NFS4ERR_DELAY;
 		/* Fall through */
 	default:
-		if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) {
+		if (nfs4_async_handle_error(task, NFS_SERVER(ino),
+					    NULL, NULL) == -EAGAIN) {
+			spin_lock(&ino->i_lock);
+			NFS_I(ino)->layout->plh_outstanding--;
+			spin_unlock(&ino->i_lock);
 			rpc_restart_call_prepare(task);
 			return;
 		}
@@ -5448,13 +5468,20 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
 	if (IS_ERR(task))
 		return PTR_ERR(task);
 	status = nfs4_wait_for_completion_rpc_task(task);
-	if (status != 0)
-		goto out;
-	status = task->tk_status;
-	if (status != 0)
-		goto out;
-	status = pnfs_layout_process(lgp);
-out:
+	if (status == 0)
+		status = task->tk_status;
+	if (status == 0)
+		status = pnfs_layout_process(lgp);
+	else {
+		struct inode *ino = lgp->args.inode;
+		struct pnfs_layout_hdr *lo = NFS_I(ino)->layout;
+
+		spin_lock(&ino->i_lock);
+		lo->plh_outstanding--;
+		if (!pnfs_layoutgets_blocked(lo, NULL))
+			rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq_stateid);
+		spin_unlock(&ino->i_lock);
+	}
 	rpc_put_task(task);
 	dprintk("<-- %s status=%d\n", __func__, status);
 	return status;
@@ -5598,9 +5625,9 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
 
 		spin_lock(&lo->inode->i_lock);
 		if (lrp->res.lrs_present)
-			pnfs_set_layout_stateid(lo, &lrp->res.stateid);
+			pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
 		else
-			pnfs_invalidate_layout_stateid(lo);
+			BUG_ON(!list_empty(&lo->segs));
 		spin_unlock(&lo->inode->i_lock);
 	}
 	dprintk("<-- %s\n", __func__);
@@ -5611,8 +5638,18 @@ static void nfs4_layoutreturn_release(void *calldata)
 	struct nfs4_layoutreturn *lrp = calldata;
 
 	dprintk("--> %s return_type %d\n", __func__, lrp->args.return_type);
-	if (lrp->args.return_type == RETURN_FILE)
-		put_layout_hdr(lrp->args.inode);
+	if (lrp->args.return_type == RETURN_FILE) {
+		struct inode *ino = lrp->args.inode;
+		struct pnfs_layout_hdr *lo = NFS_I(ino)->layout;
+
+		spin_lock(&ino->i_lock);
+		lo->plh_block_lgets--;
+		lo->plh_outstanding--;
+		if (!pnfs_layoutgets_blocked(lo, NULL))
+			rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq_stateid);
+		spin_unlock(&ino->i_lock);
+		put_layout_hdr(ino);
+	}
 	kfree(calldata);
 	dprintk("<-- %s\n", __func__);
 }
@@ -5641,6 +5678,14 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync)
 	int status = 0;
 
 	dprintk("--> %s\n", __func__);
+	if (lrp->args.return_type == RETURN_FILE) {
+		struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;
+		/* FIXME we should test for BULK here */
+		spin_lock(&lo->inode->i_lock);
+		BUG_ON(lo->plh_block_lgets == 0);
+		lo->plh_outstanding++;
+		spin_unlock(&lo->inode->i_lock);
+	}
 	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task))
 		return PTR_ERR(task);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 00632f6..ceb0d66 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1560,6 +1560,10 @@ static void nfs4_state_manager(struct nfs_client *clp)
 			nfs_client_return_marked_delegations(clp);
 			continue;
 		}
+		if (test_and_clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state)) {
+			nfs_client_return_layouts(clp);
+			continue;
+		}
 		/* Recall session slots */
 		if (test_and_clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state)
 		   && nfs4_has_session(clp)) {
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 10a6f4a..5208ef7 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1827,13 +1827,14 @@ encode_getdeviceinfo(struct xdr_stream *xdr,
 	hdr->replen += decode_getdeviceinfo_maxsz;
 }
 
-static void
+static int
 encode_layoutget(struct xdr_stream *xdr,
 		      const struct nfs4_layoutget_args *args,
 		      struct compound_hdr *hdr)
 {
 	nfs4_stateid stateid;
 	__be32 *p;
+	int status;
 
 	p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(OP_LAYOUTGET);
@@ -1843,8 +1844,11 @@ encode_layoutget(struct xdr_stream *xdr,
 	p = xdr_encode_hyper(p, args->range.offset);
 	p = xdr_encode_hyper(p, args->range.length);
 	p = xdr_encode_hyper(p, args->minlength);
-	pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout,
-				args->ctx->state);
+	status = pnfs_choose_layoutget_stateid(&stateid,
+					       NFS_I(args->inode)->layout,
+					       args->ctx->state);
+	if (status)
+		return status;
 	p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE);
 	*p = cpu_to_be32(args->maxcount);
 
@@ -1857,6 +1861,7 @@ encode_layoutget(struct xdr_stream *xdr,
 		args->maxcount);
 	hdr->nops++;
 	hdr->replen += decode_layoutget_maxsz;
+	return 0;
 }
 
 static int
@@ -2781,12 +2786,15 @@ static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p,
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
+	int status;
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
 	encode_compound_hdr(&xdr, req, &hdr);
 	encode_sequence(&xdr, &args->seq_args, &hdr);
 	encode_putfh(&xdr, NFS_FH(args->inode), &hdr);
-	encode_layoutget(&xdr, args, &hdr);
+	status = encode_layoutget(&xdr, args, &hdr);
+	if (status)
+		return status;
 	encode_nops(&hdr);
 	return 0;
 }
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index ca8be8d..8d04cf2 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -233,7 +233,7 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
  */
 
 /* Need to hold i_lock if caller does not already hold reference */
-static void
+void
 get_layout_hdr(struct pnfs_layout_hdr *lo)
 {
 	atomic_inc(&lo->plh_refcount);
@@ -278,24 +278,29 @@ init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
 	smp_mb();
 	lseg->valid = true;
 	lseg->layout = lo;
+	lseg->drain_notification = NULL;
 }
 
 static void
 _put_lseg_common(struct pnfs_layout_segment *lseg)
 {
+	struct inode *ino = lseg->layout->inode;
+
 	BUG_ON(lseg->valid == true);
 	list_del(&lseg->fi_list);
 	if (list_empty(&lseg->layout->segs)) {
 		struct nfs_client *clp;
 
-		clp = NFS_SERVER(lseg->layout->inode)->nfs_client;
+		clp = NFS_SERVER(ino)->nfs_client;
 		spin_lock(&clp->cl_lock);
 		/* List does not take a reference, so no need for put here */
 		list_del_init(&lseg->layout->layouts);
 		spin_unlock(&clp->cl_lock);
-		pnfs_invalidate_layout_stateid(lseg->layout);
+		clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->layout->plh_flags);
+		if (!pnfs_layoutgets_blocked(lseg->layout, NULL))
+			rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq_stateid);
 	}
-	rpc_wake_up(&NFS_I(lseg->layout->inode)->lo_rpcwaitq);
+	rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq);
 }
 
 /* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg
@@ -325,9 +330,12 @@ put_lseg(struct pnfs_layout_segment *lseg)
 		atomic_read(&lseg->pls_refcount), lseg->valid);
 	ino = lseg->layout->inode;
 	if (atomic_dec_and_lock(&lseg->pls_refcount, &ino->i_lock)) {
+		struct pnfs_cb_lrecall_info *drain_info = lseg->drain_notification;
+
 		_put_lseg_common(lseg);
 		spin_unlock(&ino->i_lock);
 		NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
+		notify_drained(drain_info);
 		/* Matched by get_layout_hdr_locked in pnfs_insert_layout */
 		put_layout_hdr(ino);
 	}
@@ -345,7 +353,7 @@ EXPORT_SYMBOL_GPL(put_lseg);
  * READ		READ	true
  * READ		RW	false
  */
-static int
+bool
 should_free_lseg(struct pnfs_layout_range *lseg_range,
 		 struct pnfs_layout_range *recall_range)
 {
@@ -388,16 +396,19 @@ pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list,
 	dprintk("%s:Return\n", __func__);
 }
 
-static void
+void
 pnfs_free_lseg_list(struct list_head *free_me)
 {
 	struct pnfs_layout_segment *lseg, *tmp;
 	struct inode *ino;
+	struct pnfs_cb_lrecall_info *drain_info;
 
 	list_for_each_entry_safe(lseg, tmp, free_me, fi_list) {
 		BUG_ON(atomic_read(&lseg->pls_refcount) != 0);
 		ino = lseg->layout->inode;
+		drain_info = lseg->drain_notification;
 		NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
+		notify_drained(drain_info);
 		/* Matched by get_layout_hdr_locked in pnfs_insert_layout */
 		put_layout_hdr(ino);
 	}
@@ -453,31 +464,32 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
 	}
 }
 
-/* update lo->stateid with new if is more recent
- *
- * lo->stateid could be the open stateid, in which case we just use what given.
- */
+/* update lo->stateid with new if it is more recent */
 void
-pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
-			const nfs4_stateid *new)
+pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
+			bool update_barrier)
 {
-	nfs4_stateid *old = &lo->stateid;
-	bool overwrite = false;
+	u32 oldseq, newseq;
 
 	assert_spin_locked(&lo->inode->i_lock);
-	if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags) ||
-	    memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other)))
-		overwrite = true;
-	else {
-		u32 oldseq, newseq;
-
-		oldseq = be32_to_cpu(old->stateid.seqid);
-		newseq = be32_to_cpu(new->stateid.seqid);
-		if ((int)(newseq - oldseq) > 0)
-			overwrite = true;
+	oldseq = be32_to_cpu(lo->stateid.stateid.seqid);
+	newseq = be32_to_cpu(new->stateid.seqid);
+	if ((int)(newseq - oldseq) > 0) {
+		memcpy(&lo->stateid, &new->stateid, sizeof(new->stateid));
+		if (update_barrier)
+			lo->plh_barrier = be32_to_cpu(new->stateid.seqid);
+		else {
+			/* Because of wraparound, we want to keep the barrier
+			 * "close" to the current seqids.  It needs to be
+			 * within 2**31 to count as "behind", so if it
+			 * gets too near that limit, give us a little leeway
+			 * and bring it to within 2**30.
+			 * NOTE - and yes, this is all unsigned arithmetic.
+			 */
+			if (unlikely((newseq - lo->plh_barrier) > (3 << 29)))
+				lo->plh_barrier = newseq - (1 << 30);
+		}
 	}
-	if (overwrite)
-		memcpy(&old->stateid, &new->stateid, sizeof(new->stateid));
 }
 
 /* Layoutreturn may use an invalid stateid, just copy what is there */
@@ -487,13 +499,21 @@ void pnfs_copy_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo)
 	memcpy(dst->data, lo->stateid.data, sizeof(lo->stateid.data));
 }
 
-void
-pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
-			struct nfs4_state *open_state)
+int
+pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
+			      struct nfs4_state *open_state)
 {
+	int status = 0;
+
 	dprintk("--> %s\n", __func__);
 	spin_lock(&lo->inode->i_lock);
-	if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags)) {
+	if (lo->plh_block_lgets ||
+	    test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+		/* We avoid -EAGAIN, as that has special meaning to
+		 * some callers.
+		 */
+		status = -NFS4ERR_LAYOUTTRYLATER;
+	} else if (list_empty(&lo->segs)) {
 		int seq;
 
 		do {
@@ -501,12 +521,11 @@ pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
 			memcpy(dst->data, open_state->stateid.data,
 			       sizeof(open_state->stateid.data));
 		} while (read_seqretry(&open_state->seqlock, seq));
-		set_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags);
 	} else
-		memcpy(dst->data, lo->stateid.data,
-		       sizeof(lo->stateid.data));
+		memcpy(dst->data, lo->stateid.data, sizeof(lo->stateid.data));
 	spin_unlock(&lo->inode->i_lock);
 	dprintk("<-- %s\n", __func__);
+	return status;
 }
 
 /*
@@ -573,6 +592,28 @@ has_layout_to_return(struct pnfs_layout_hdr *lo,
 	return out;
 }
 
+void nfs4_asynch_forget_layouts(struct pnfs_layout_hdr *lo,
+				struct pnfs_layout_range *range,
+				struct pnfs_cb_lrecall_info *drain_info,
+				struct list_head *tmp_list)
+{
+	struct pnfs_layout_segment *lseg, *tmp;
+
+	assert_spin_locked(&lo->inode->i_lock);
+	list_for_each_entry_safe(lseg, tmp, &lo->segs, fi_list)
+		if (should_free_lseg(&lseg->range, range)) {
+			/* FIXME - need to change to something like a
+			 * notification bitmap to remove the restriction
+			 * of only being able to process a single
+			 * CB_LAYOUTRECALL at a time.
+			 */
+			BUG_ON(lseg->drain_notification);
+			lseg->drain_notification = drain_info;
+			atomic_inc(&drain_info->pcl_count);
+			mark_lseg_invalid(lseg, tmp_list);
+		}
+}
+
 /* Return true if there is layout based io in progress in the given range.
  * Assumes range has already been marked invalid, and layout marked to
  * prevent any new lseg from being inserted.
@@ -661,6 +702,7 @@ _pnfs_return_layout(struct inode *ino, struct pnfs_layout_range *range,
 			goto out;
 		}
 
+		lo->plh_block_lgets++;
 		list_for_each_entry_safe(lseg, tmp, &lo->segs, fi_list)
 			if (should_free_lseg(&lseg->range, &arg))
 				mark_lseg_invalid(lseg, &tmp_list);
@@ -717,14 +759,6 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
 	dprintk("%s:Begin\n", __func__);
 
 	assert_spin_locked(&lo->inode->i_lock);
-	if (list_empty(&lo->segs)) {
-		struct nfs_client *clp = NFS_SERVER(lo->inode)->nfs_client;
-
-		spin_lock(&clp->cl_lock);
-		BUG_ON(!list_empty(&lo->layouts));
-		list_add_tail(&lo->layouts, &clp->cl_layouts);
-		spin_unlock(&clp->cl_lock);
-	}
 	list_for_each_entry(lp, &lo->segs, fi_list) {
 		if (cmp_layout(&lp->range, &lseg->range) > 0)
 			continue;
@@ -741,6 +775,9 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
 	}
 	if (!found) {
 		list_add_tail(&lseg->fi_list, &lo->segs);
+		if (list_is_singular(&lo->segs) &&
+		    !pnfs_layoutgets_blocked(lo, NULL))
+			rpc_wake_up(&NFS_I(lo->inode)->lo_rpcwaitq_stateid);
 		dprintk("%s: inserted lseg %p "
 			"iomode %d offset %llu length %llu at tail\n",
 			__func__, lseg, lseg->range.iomode,
@@ -762,6 +799,7 @@ alloc_init_layout_hdr(struct inode *ino)
 	atomic_set(&lo->plh_refcount, 1);
 	INIT_LIST_HEAD(&lo->layouts);
 	INIT_LIST_HEAD(&lo->segs);
+	INIT_LIST_HEAD(&lo->plh_bulk_recall);
 	lo->inode = ino;
 	return lo;
 }
@@ -849,6 +887,7 @@ pnfs_update_layout(struct inode *ino,
 		.length = NFS4_MAX_UINT64,
 	};
 	struct nfs_inode *nfsi = NFS_I(ino);
+	struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
 	struct pnfs_layout_hdr *lo;
 	struct pnfs_layout_segment *lseg = NULL;
 
@@ -884,9 +923,28 @@ pnfs_update_layout(struct inode *ino,
 		goto out_unlock;
 
 	get_layout_hdr(lo); /* Matched in pnfs_layoutget_release */
+	if (list_empty(&lo->segs)) {
+		/* The lo must be on the clp list if there is any
+		 * chance of a CB_LAYOUTRECALL(FILE) coming in.
+		 */
+		spin_lock(&clp->cl_lock);
+		BUG_ON(!list_empty(&lo->layouts));
+		list_add_tail(&lo->layouts, &clp->cl_layouts);
+		spin_unlock(&clp->cl_lock);
+	}
 	spin_unlock(&ino->i_lock);
 
 	lseg = send_layoutget(lo, ctx, &arg);
+	if (!lseg) {
+		spin_lock(&ino->i_lock);
+		if (list_empty(&lo->segs)) {
+			spin_lock(&clp->cl_lock);
+			list_del_init(&lo->layouts);
+			spin_unlock(&clp->cl_lock);
+			clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
+		}
+		spin_unlock(&ino->i_lock);
+	}
 out:
 	dprintk("%s end, state 0x%lx lseg %p\n", __func__,
 		nfsi->layout->plh_flags, lseg);
@@ -896,6 +954,18 @@ out_unlock:
 	goto out;
 }
 
+bool
+pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid)
+{
+	assert_spin_locked(&lo->inode->i_lock);
+	if ((stateid) &&
+	    (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
+		return true;
+	return lo->plh_block_lgets ||
+		test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
+		(list_empty(&lo->segs) && lo->plh_outstanding);
+}
+
 int
 pnfs_layout_process(struct nfs4_layoutget *lgp)
 {
@@ -903,6 +973,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
 	struct nfs4_layoutget_res *res = &lgp->res;
 	struct pnfs_layout_segment *lseg;
 	struct inode *ino = lo->inode;
+	struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
 	int status = 0;
 
 	/* Inject layout blob into I/O device driver */
@@ -914,10 +985,25 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
 			status = PTR_ERR(lseg);
 		dprintk("%s: Could not allocate layout: error %d\n",
 		       __func__, status);
+		spin_lock(&ino->i_lock);
 		goto out;
 	}
 
 	spin_lock(&ino->i_lock);
+	/* decrement needs to be done before call to pnfs_layoutgets_blocked */
+	lo->plh_outstanding--;
+	spin_lock(&clp->cl_lock);
+	if (matches_outstanding_recall(ino, &res->range)) {
+		spin_unlock(&clp->cl_lock);
+		dprintk("%s forget reply due to recall\n", __func__);
+		goto out_forget_reply;
+	}
+	spin_unlock(&clp->cl_lock);
+
+	if (pnfs_layoutgets_blocked(lo, &res->stateid)) {
+		dprintk("%s forget reply due to state\n", __func__);
+		goto out_forget_reply;
+	}
 	init_lseg(lo, lseg);
 	lseg->range = res->range;
 	get_lseg(lseg);
@@ -933,10 +1019,19 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
 	}
 
 	/* Done processing layoutget. Set the layout stateid */
-	pnfs_set_layout_stateid(lo, &res->stateid);
-	spin_unlock(&ino->i_lock);
+	pnfs_set_layout_stateid(lo, &res->stateid, false);
 out:
+	if (!pnfs_layoutgets_blocked(lo, NULL))
+		rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq_stateid);
+	spin_unlock(&ino->i_lock);
 	return status;
+
+out_forget_reply:
+	spin_unlock(&ino->i_lock);
+	lseg->layout = lo;
+	NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
+	spin_lock(&ino->i_lock);
+	goto out;
 }
 
 void
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index e631487..810714a 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -31,6 +31,7 @@
 #define FS_NFS_PNFS_H
 
 #include <linux/nfs_page.h>
+#include "callback.h" /* for cb_layoutrecallargs */
 
 struct pnfs_layout_segment {
 	struct list_head fi_list;
@@ -38,6 +39,7 @@ struct pnfs_layout_segment {
 	atomic_t pls_refcount;
 	bool valid;
 	struct pnfs_layout_hdr *layout;
+	struct pnfs_cb_lrecall_info *drain_notification;
 };
 
 enum pnfs_try_status {
@@ -52,7 +54,7 @@ enum pnfs_try_status {
 enum {
 	NFS_LAYOUT_RO_FAILED = 0,	/* get ro layout failed stop trying */
 	NFS_LAYOUT_RW_FAILED,		/* get rw layout failed stop trying */
-	NFS_LAYOUT_STATEID_SET,		/* have a valid layout stateid */
+	NFS_LAYOUT_BULK_RECALL,		/* bulk recall affecting layout */
 	NFS_LAYOUT_NEED_LCOMMIT,	/* LAYOUTCOMMIT needed */
 };
 
@@ -94,9 +96,13 @@ struct pnfs_layoutdriver_type {
 struct pnfs_layout_hdr {
 	atomic_t		plh_refcount;
 	struct list_head	layouts;   /* other client layouts */
+	struct list_head	plh_bulk_recall; /* clnt list of bulk recalls */
 	struct list_head	segs;      /* layout segments list */
 	int			roc_iomode;/* return on close iomode, 0=none */
 	nfs4_stateid		stateid;
+	unsigned long		plh_outstanding; /* number of RPCs out */
+	unsigned long		plh_block_lgets; /* block LAYOUTGET if >0 */
+	u32			plh_barrier; /* ignore lower seqids */
 	unsigned long		plh_flags;
 	struct rpc_cred		*cred;     /* layoutcommit credential */
 	/* DH: These vars keep track of the maximum write range
@@ -117,6 +123,14 @@ struct pnfs_device {
 	unsigned int  pglen;
 };
 
+struct pnfs_cb_lrecall_info {
+	struct list_head	pcl_list; /* hook into cl_layoutrecalls list */
+	atomic_t		pcl_count;
+	struct nfs_client	*pcl_clp;
+	struct inode		*pcl_ino;
+	struct cb_layoutrecallargs pcl_args;
+};
+
 /*
  * Device ID RCU cache. A device ID is unique per client ID and layout type.
  */
@@ -175,7 +189,10 @@ extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data,
 extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool wait);
 
 /* pnfs.c */
+void get_layout_hdr(struct pnfs_layout_hdr *lo);
 void put_lseg(struct pnfs_layout_segment *lseg);
+bool should_free_lseg(struct pnfs_layout_range *lseg_range,
+		      struct pnfs_layout_range *recall_range);
 struct pnfs_layout_segment *
 pnfs_has_layout(struct pnfs_layout_hdr *lo, struct pnfs_layout_range *range);
 struct pnfs_layout_segment *
@@ -200,15 +217,25 @@ enum pnfs_try_status pnfs_try_to_commit(struct nfs_write_data *,
 void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *,
 			   struct nfs_open_context *, struct list_head *);
 void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
+bool pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid);
 int pnfs_layout_process(struct nfs4_layoutget *lgp);
+void pnfs_free_lseg_list(struct list_head *tmp_list);
 void pnfs_destroy_layout(struct nfs_inode *);
 void pnfs_destroy_all_layouts(struct nfs_client *);
 void put_layout_hdr(struct inode *inode);
 void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
-			     const nfs4_stateid *new);
+			     const nfs4_stateid *new,
+			     bool update_barrier);
 void pnfs_copy_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo);
-void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
-			     struct nfs4_state *open_state);
+int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
+				  struct pnfs_layout_hdr *lo,
+				  struct nfs4_state *open_state);
+void nfs4_asynch_forget_layouts(struct pnfs_layout_hdr *lo,
+				struct pnfs_layout_range *range,
+				struct pnfs_cb_lrecall_info *drain_info,
+				struct list_head *tmp_list);
+/* FIXME - this should be in callback.h, but pnfs_cb_lrecall_info needs to be there too */
+extern void notify_drained(struct pnfs_cb_lrecall_info *d);
 
 static inline bool
 has_layout(struct nfs_inode *nfsi)
@@ -222,12 +249,6 @@ static inline int lo_fail_bit(u32 iomode)
 			 NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
 }
 
-static inline void pnfs_invalidate_layout_stateid(struct pnfs_layout_hdr *lo)
-{
-	assert_spin_locked(&lo->inode->i_lock);
-	clear_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags);
-}
-
 static inline void get_lseg(struct pnfs_layout_segment *lseg)
 {
 	atomic_inc(&lseg->pls_refcount);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index d8bfa42..061d81a 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -191,6 +191,7 @@ struct nfs_inode {
 
 	/* pNFS layout information */
 	struct rpc_wait_queue lo_rpcwaitq;
+	struct rpc_wait_queue	lo_rpcwaitq_stateid;
 	struct pnfs_layout_hdr *layout;
 #endif /* CONFIG_NFS_V4*/
 #ifdef CONFIG_NFS_FSCACHE
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 3cae408..80dcc00 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -83,6 +83,10 @@ struct nfs_client {
 	u32			cl_exchange_flags;
 	struct nfs4_session	*cl_session; 	/* sharred session */
 	struct list_head	cl_layouts;
+	struct list_head	cl_layoutrecalls;
+	unsigned long		cl_cb_lrecall_count;
+#define PNFS_MAX_CB_LRECALLS (1)
+	struct rpc_wait_queue	cl_rpcwaitq_recall;
 	struct pnfs_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
 #endif /* CONFIG_NFS_V4_1 */
 
-- 
1.7.2.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html