Re: [PATCH 16/22] pnfs-submit: rewrite of layout state handling and cb_layoutrecall

Benny Halevy <bhalevy@xxxxxxxxxxx> · Sun, 14 Nov 2010 13:44:13 +0200

On 2010-11-13 11:11, Trond Myklebust wrote:
> On Fri, 2010-11-12 at 03:48 -0500, Fred Isaman wrote:
>> Remove NFS_LAYOUT_STATEID_SET in favor of just checking list_empty(lo->segs).
>>
>> LAYOUTGETs with openstateid are serialized.  Waiting on the condition
>> (list_empty(lo->segs) && plh_outstanding>0) both drains outstanding RPCs once
>> the stateid is invalidated and allows only a single LAYOUTGET(openstateid)
>> through at a time.
>>
>> Before sending a LAYOUTRETURN, plh_block_lgets is incremented.  It is
>> decremented in the rpc_release function.  While set, LAYOUTGETs are
>> paused in their rpc_prepare function, and any responses are
>> forgotten.
>>
>> Callbacks are handled by blocking any matching LAYOUTGETS while processing and
>> initiating drain of IO.  A notification system is set up so that when
>> all relevant IO is finished, the state manager thread is invoked, which
>> synchronously sends the final matching LAYOUTRETURN before unblocking
>> LAYOUTGETS.
>>
>> Signed-off-by: Fred Isaman <iisaman@xxxxxxxxxx>
>> ---
>>  fs/nfs/callback.h         |    7 +
>>  fs/nfs/callback_proc.c    |  466 +++++++++++++++++++++++----------------------
>>  fs/nfs/client.c           |    3 +
>>  fs/nfs/nfs4proc.c         |   81 ++++++--
>>  fs/nfs/nfs4state.c        |    4 +
>>  fs/nfs/nfs4xdr.c          |   16 ++-
>>  fs/nfs/pnfs.c             |  177 +++++++++++++-----
>>  fs/nfs/pnfs.h             |   41 +++-
>>  include/linux/nfs_fs_sb.h |    4 +
>>  9 files changed, 497 insertions(+), 302 deletions(-)
>>
>> diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
>> index cea58cc..4a9905b 100644
>> --- a/fs/nfs/callback.h
>> +++ b/fs/nfs/callback.h
>> @@ -163,6 +163,9 @@ struct cb_layoutrecallargs {
>>  extern unsigned nfs4_callback_layoutrecall(
>>  	struct cb_layoutrecallargs *args,
>>  	void *dummy, struct cb_process_state *cps);
>> +extern bool matches_outstanding_recall(struct inode *ino,
>> +				       struct pnfs_layout_range *range);
>> +extern void nfs_client_return_layouts(struct nfs_client *clp);
>>  
>>  static inline void put_session_client(struct nfs4_session *session)
>>  {
>> @@ -178,6 +181,10 @@ find_client_from_cps(struct cb_process_state *cps, struct sockaddr *addr)
>>  
>>  #else
>>  
>> +static inline void nfs_client_return_layouts(struct nfs_client *clp)
>> +{
>> +}
>> +
>>  static inline struct nfs_client *
>>  find_client_from_cps(struct cb_process_state *cps, struct sockaddr *addr)
>>  {
>> diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
>> index 6e0fc40..af405cf 100644
>> --- a/fs/nfs/callback_proc.c
>> +++ b/fs/nfs/callback_proc.c
>> @@ -124,265 +124,283 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf
>>  #if defined(CONFIG_NFS_V4_1)
>>  
>>  static bool
>> -pnfs_is_next_layout_stateid(const struct pnfs_layout_hdr *lo,
>> -			    const nfs4_stateid stateid)
>> +_recall_matches_lget(struct pnfs_cb_lrecall_info *cb_info,
>> +		     struct inode *ino, struct pnfs_layout_range *range)
>>  {
>> -	bool res;
>> -	u32 oldseqid, newseqid;
>> -
>> -	spin_lock(&lo->inode->i_lock);
>> -	{
>> -		oldseqid = be32_to_cpu(lo->stateid.stateid.seqid);
>> -		newseqid = be32_to_cpu(stateid.stateid.seqid);
>> -		res = !memcmp(lo->stateid.stateid.other,
>> -			      stateid.stateid.other,
>> -			      NFS4_STATEID_OTHER_SIZE);
>> -		if (res) { /* comparing layout stateids */
>> -			if (oldseqid == ~0)
>> -				res = (newseqid == 1);
>> -			else
>> -				res = (newseqid == oldseqid + 1);
>> -		} else { /* open stateid */
>> -			res = !memcmp(lo->stateid.data,
>> -				      &zero_stateid,
>> -				      NFS4_STATEID_SIZE);
>> -			if (res)
>> -				res = (newseqid == 1);
>> -		}
>> -	}
>> -	spin_unlock(&lo->inode->i_lock);
>> +	struct cb_layoutrecallargs *cb_args = &cb_info->pcl_args;
>>  
>> -	return res;
>> +	switch (cb_args->cbl_recall_type) {
>> +	case RETURN_ALL:
>> +		return true;
>> +	case RETURN_FSID:
>> +		return !memcmp(&NFS_SERVER(ino)->fsid, &cb_args->cbl_fsid,
>> +			       sizeof(struct nfs_fsid));
>> +	case RETURN_FILE:
>> +		return (ino == cb_info->pcl_ino) &&
>> +			should_free_lseg(range, &cb_args->cbl_range);
>> +	default:
>> +		BUG();
> 
> Why should we BUG() just because the server is screwed up? That's not a
> client bug.
> 

Agreed.  This should be handled earlier, in nfs4_callback_layoutrecall
or do_callback_layoutrecall, so that we can return NFS4ERR_INVAL.

>> +	}
>>  }
>>  
>> -/*
>> - * Retrieve an inode based on layout recall parameters
>> - *
>> - * Note: caller must iput(inode) to dereference the inode.
>> - */
>> -static struct inode *
>> -nfs_layoutrecall_find_inode(struct nfs_client *clp,
>> -			    const struct cb_layoutrecallargs *args)
>> +bool
>> +matches_outstanding_recall(struct inode *ino, struct pnfs_layout_range *range)
>>  {
>> -	struct nfs_inode *nfsi;
>> -	struct pnfs_layout_hdr *lo;
>> -	struct nfs_server *server;
>> -	struct inode *ino = NULL;
>> -
>> -	dprintk("%s: Begin recall_type=%d clp %p\n",
>> -		__func__, args->cbl_recall_type, clp);
>> -
>> -	spin_lock(&clp->cl_lock);
>> -	list_for_each_entry(lo, &clp->cl_layouts, layouts) {
>> -		nfsi = NFS_I(lo->inode);
>> -		if (!nfsi)
>> -			continue;
>> -
>> -		dprintk("%s: Searching inode=%lu\n",
>> -			__func__, nfsi->vfs_inode.i_ino);
>> -
>> -		if (args->cbl_recall_type == RETURN_FILE) {
>> -		    if (nfs_compare_fh(&args->cbl_fh, &nfsi->fh))
>> -			continue;
>> -		} else if (args->cbl_recall_type == RETURN_FSID) {
>> -			server = NFS_SERVER(&nfsi->vfs_inode);
>> -			if (server->fsid.major != args->cbl_fsid.major ||
>> -			    server->fsid.minor != args->cbl_fsid.minor)
>> -				continue;
>> +	struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
>> +	struct pnfs_cb_lrecall_info *cb_info;
>> +	bool rv = false;
>> +
>> +	assert_spin_locked(&clp->cl_lock);
> 
> Can we please go easy on the asserts? There is way too much asserting
> going on in the NFSv4.1 code. This isn't a publicly visible interface,
> so just get it right in the debugging process before the merge, and then
> kill these asserts...
> 

OK. We can keep them in a DEVONLY patch that lives only in the development
tree (they come in handy whenever changes are made to these code paths).

>> +	list_for_each_entry(cb_info, &clp->cl_layoutrecalls, pcl_list) {
>> +		if (_recall_matches_lget(cb_info, ino, range)) {
>> +			rv = true;
>> +			break;
>>  		}
>> -
>> -		/* Make sure client didn't clean up layout without
>> -		 * telling the server */
>> -		if (!has_layout(nfsi))
>> -			continue;
>> -
>> -		ino = igrab(&nfsi->vfs_inode);
>> -		dprintk("%s: Found inode=%p\n", __func__, ino);
>> -		break;
>>  	}
>> -	spin_unlock(&clp->cl_lock);
>> -	return ino;
>> +	return rv;
>>  }
>>  
>> -struct recall_layout_threadargs {
>> -	struct inode *inode;
>> -	struct nfs_client *clp;
>> -	struct completion started;
>> -	struct cb_layoutrecallargs *rl;
>> -	int result;
>> -};
>> -
>> -static int pnfs_recall_layout(void *data)
>> +/* Send a synchronous LAYOUTRETURN.  By the time this is called, we know
>> + * all IO has been drained, any matching lsegs deleted, and that no
>> + * overlapping LAYOUTGETs will be sent or processed for the duration
>> + * of this call.
>> + * Note that it is possible that when this is called, the stateid has
>> + * been invalidated.  But will not be cleared, so can still use.
>> + */
>> +static int
>> +pnfs_send_layoutreturn(struct nfs_client *clp,
>> +		       struct pnfs_cb_lrecall_info *cb_info)
>>  {
>> -	struct inode *inode, *ino;
>> -	struct nfs_client *clp;
>> -	struct cb_layoutrecallargs rl;
>> +	struct cb_layoutrecallargs *args = &cb_info->pcl_args;
>>  	struct nfs4_layoutreturn *lrp;
>> -	struct recall_layout_threadargs *args =
>> -		(struct recall_layout_threadargs *)data;
>> -	int status = 0;
>> -
>> -	daemonize("nfsv4-layoutreturn");
>> -
>> -	dprintk("%s: recall_type=%d fsid 0x%llx-0x%llx start\n",
>> -		__func__, args->rl->cbl_recall_type,
>> -		args->rl->cbl_fsid.major, args->rl->cbl_fsid.minor);
>> -
>> -	clp = args->clp;
>> -	inode = args->inode;
>> -	rl = *args->rl;
>> -
>> -	/* support whole file layouts only */
>> -	rl.cbl_range.offset = 0;
>> -	rl.cbl_range.length = NFS4_MAX_UINT64;
>> -
>> -	if (rl.cbl_recall_type == RETURN_FILE) {
>> -		if (pnfs_is_next_layout_stateid(NFS_I(inode)->layout,
>> -						rl.cbl_stateid))
>> -			status = pnfs_return_layout(inode, &rl.cbl_range,
>> -						    &rl.cbl_stateid, RETURN_FILE,
>> -						    false);
>> -		else
>> -			status = cpu_to_be32(NFS4ERR_DELAY);
>> -		if (status)
>> -			dprintk("%s RETURN_FILE error: %d\n", __func__, status);
>> -		else
>> -			status =  cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT);
>> -		args->result = status;
>> -		complete(&args->started);
>> -		goto out;
>> -	}
>> -
>> -	status = cpu_to_be32(NFS4_OK);
>> -	args->result = status;
>> -	complete(&args->started);
>> -	args = NULL;
>> -
>> -	/* IMPROVEME: This loop is inefficient, running in O(|s_inodes|^2) */
>> -	while ((ino = nfs_layoutrecall_find_inode(clp, &rl)) != NULL) {
>> -		/* FIXME: need to check status on pnfs_return_layout */
>> -		pnfs_return_layout(ino, &rl.cbl_range, NULL, RETURN_FILE, false);
>> -		iput(ino);
>> -	}
>>  
>>  	lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
>> -	if (!lrp) {
>> -		dprintk("%s: allocation failed. Cannot send last LAYOUTRETURN\n",
>> -			__func__);
>> -		goto out;
>> -	}
>> -
>> -	/* send final layoutreturn */
>> +	if (!lrp)
>> +		return -ENOMEM;
>>  	lrp->args.reclaim = 0;
>> -	lrp->args.layout_type = rl.cbl_layout_type;
>> -	lrp->args.return_type = rl.cbl_recall_type;
>> +	lrp->args.layout_type = args->cbl_layout_type;
>> +	lrp->args.return_type = args->cbl_recall_type;
>>  	lrp->clp = clp;
>> -	lrp->args.range = rl.cbl_range;
>> -	lrp->args.inode = inode;
>> -	nfs4_proc_layoutreturn(lrp, true);
>> -
>> -out:
>> -	clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state);
>> -	nfs_put_client(clp);
>> -	module_put_and_exit(0);
>> -	dprintk("%s: exit status %d\n", __func__, 0);
>> -	return 0;
>> +	if (args->cbl_recall_type == RETURN_FILE) {
>> +		lrp->args.range = args->cbl_range;
>> +		lrp->args.inode = cb_info->pcl_ino;
>> +	} else {
>> +		lrp->args.range.iomode = IOMODE_ANY;
>> +		lrp->args.inode = NULL;
>> +	}
>> +	return nfs4_proc_layoutreturn(lrp, true);
>>  }
>>  
>> -/*
>> - * Asynchronous layout recall!
>> +/* Called by state manager to finish CB_LAYOUTRECALLS initiated by
>> + * nfs4_callback_layoutrecall().
>>   */
>> -static int pnfs_async_return_layout(struct nfs_client *clp, struct inode *inode,
>> -				    struct cb_layoutrecallargs *rl)
>> +void nfs_client_return_layouts(struct nfs_client *clp)
>>  {
>> -	struct recall_layout_threadargs data = {
>> -		.clp = clp,
>> -		.inode = inode,
>> -		.rl = rl,
>> -	};
>> -	struct task_struct *t;
>> -	int status = -EAGAIN;
>> +	struct pnfs_cb_lrecall_info *cb_info;
>>  
>> -	dprintk("%s: -->\n", __func__);
>> +	spin_lock(&clp->cl_lock);
>> +	while (true) {
>> +		if (list_empty(&clp->cl_layoutrecalls)) {
>> +			spin_unlock(&clp->cl_lock);
>> +			break;
>> +		}
>> +		cb_info = list_first_entry(&clp->cl_layoutrecalls,
>> +					   struct pnfs_cb_lrecall_info,
>> +					   pcl_list);
>> +		spin_unlock(&clp->cl_lock);
>> +		if (atomic_read(&cb_info->pcl_count) != 0)
>> +			break;
>> +		/* What do on error return?  These layoutreturns are
>> +		 * required by the protocol.  So if do not get
>> +		 * successful reply, probably have to do something
>> +		 * more drastic.
>> +		 */
>> +		pnfs_send_layoutreturn(clp, cb_info);
>> +		spin_lock(&clp->cl_lock);
>> +		/* Removing from the list unblocks LAYOUTGETs */
>> +		list_del(&cb_info->pcl_list);
>> +		clp->cl_cb_lrecall_count--;
>> +		rpc_wake_up(&clp->cl_rpcwaitq_recall);
>> +		kfree(cb_info);
>> +	}
>> +}
>>  
>> -	/* FIXME: do not allow two concurrent layout recalls */
>> -	if (test_and_set_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state))
>> -		return status;
>> -
>> -	init_completion(&data.started);
>> -	__module_get(THIS_MODULE);
>> -	atomic_inc(&clp->cl_count);
>> -
>> -	t = kthread_run(pnfs_recall_layout, &data, "%s", "pnfs_recall_layout");
>> -	if (IS_ERR(t)) {
>> -		printk(KERN_INFO "NFS: Layout recall callback thread failed "
>> -			"for client (clientid %08x/%08x)\n",
>> -			(unsigned)(clp->cl_clientid >> 32),
>> -			(unsigned)(clp->cl_clientid));
>> -		status = PTR_ERR(t);
>> -		goto out_module_put;
>> +void notify_drained(struct pnfs_cb_lrecall_info *d)
>> +{
>> +	if (d && atomic_dec_and_test(&d->pcl_count)) {
>> +		set_bit(NFS4CLNT_LAYOUT_RECALL, &d->pcl_clp->cl_state);
>> +		nfs4_schedule_state_manager(d->pcl_clp);
>>  	}
>> -	wait_for_completion(&data.started);
>> -	return data.result;
>> -out_module_put:
>> -	nfs_put_client(clp);
>> -	clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state);
>> -	module_put(THIS_MODULE);
>> -	return status;
>>  }
>>  
>> -static int pnfs_recall_all_layouts(struct nfs_client *clp)
>> +static int initiate_layout_draining(struct pnfs_cb_lrecall_info *cb_info)
>>  {
>> -	struct cb_layoutrecallargs rl;
>> -	struct inode *inode;
>> -	int status = 0;
>> -
>> -	rl.cbl_recall_type = RETURN_ALL;
>> -	rl.cbl_range.iomode = IOMODE_ANY;
>> -	rl.cbl_range.offset = 0;
>> -	rl.cbl_range.length = NFS4_MAX_UINT64;
>> -
>> -	/* we need the inode to get the nfs_server struct */
>> -	inode = nfs_layoutrecall_find_inode(clp, &rl);
>> -	if (!inode)
>> -		return status;
>> -	status = pnfs_async_return_layout(clp, inode, &rl);
>> -	iput(inode);
>> +	struct nfs_client *clp = cb_info->pcl_clp;
>> +	struct pnfs_layout_hdr *lo;
>> +	int rv = NFS4ERR_NOMATCHING_LAYOUT;
>> +	struct cb_layoutrecallargs *args = &cb_info->pcl_args;
>> +
>> +	if (args->cbl_recall_type == RETURN_FILE) {
>> +		LIST_HEAD(free_me_list);
>> +
>> +		spin_lock(&clp->cl_lock);
>> +		list_for_each_entry(lo, &clp->cl_layouts, layouts) {
>> +			if (nfs_compare_fh(&args->cbl_fh,
>> +					   &NFS_I(lo->inode)->fh))
>> +				continue;
>> +			if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
>> +				rv = NFS4ERR_DELAY;
>> +			else {
>> +				/* FIXME I need to better understand igrab and
>> +				 * does having a layout ref keep ino around?
>> +				 *  It should.
>> +				 */
>> +				/* We need to hold the reference until any
>> +				 * potential LAYOUTRETURN is finished.
>> +				 */
>> +				get_layout_hdr(lo);
>> +				cb_info->pcl_ino = lo->inode;
>> +				rv = NFS4_OK;
>> +			}
>> +			break;
>> +		}
>> +		spin_unlock(&clp->cl_lock);
>> +
>> +		spin_lock(&lo->inode->i_lock);
>> +		if (rv == NFS4_OK) {
>> +			lo->plh_block_lgets++;
>> +			nfs4_asynch_forget_layouts(lo, &args->cbl_range,
>> +						   cb_info, &free_me_list);
>> +		}
>> +		pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
>> +		spin_unlock(&lo->inode->i_lock);
>> +		pnfs_free_lseg_list(&free_me_list);
>> +	} else {
>> +		struct pnfs_layout_hdr *tmp;
>> +		LIST_HEAD(recall_list);
>> +		LIST_HEAD(free_me_list);
>> +		struct pnfs_layout_range range = {
>> +			.iomode = IOMODE_ANY,
>> +			.offset = 0,
>> +			.length = NFS4_MAX_UINT64,
>> +		};
>> +
>> +		spin_lock(&clp->cl_lock);
>> +		/* Per RFC 5661, 12.5.5.2.1.5, bulk recall must be serialized */
>> +		if (!list_is_singular(&clp->cl_layoutrecalls)) {
>> +			spin_unlock(&clp->cl_lock);
>> +			return NFS4ERR_DELAY;
>> +		}
>> +		list_for_each_entry(lo, &clp->cl_layouts, layouts) {
>> +			if ((args->cbl_recall_type == RETURN_FSID) &&
>> +			    memcmp(&NFS_SERVER(lo->inode)->fsid,
>> +				   &args->cbl_fsid, sizeof(struct nfs_fsid)))
>> +				continue;
>> +			get_layout_hdr(lo);
>> +			/* We could list_del(&lo->layouts) here */
>> +			BUG_ON(!list_empty(&lo->plh_bulk_recall));
>> +			list_add(&lo->plh_bulk_recall, &recall_list);
>> +		}
>> +		spin_unlock(&clp->cl_lock);
>> +		list_for_each_entry_safe(lo, tmp,
>> +					 &recall_list, plh_bulk_recall) {
>> +			spin_lock(&lo->inode->i_lock);
>> +			set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
>> +			nfs4_asynch_forget_layouts(lo, &range, cb_info,
>> +						   &free_me_list);
>> +			list_del_init(&lo->plh_bulk_recall);
>> +			spin_unlock(&lo->inode->i_lock);
>> +			put_layout_hdr(lo->inode);
>> +			rv = NFS4_OK;
>> +		}
>> +		pnfs_free_lseg_list(&free_me_list);
>> +	}
>> +	return rv;
>> +}
>> +
>> +static u32 do_callback_layoutrecall(struct nfs_client *clp,
>> +				    struct cb_layoutrecallargs *args)
>> +{
>> +	struct pnfs_cb_lrecall_info *new;
>> +	u32 res;
>> +
>> +	dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type);
>> +	new = kmalloc(sizeof(*new), GFP_KERNEL);
>> +	if (!new) {
>> +		res = NFS4ERR_RESOURCE;
>> +		goto out;
>> +	}
>> +	memcpy(&new->pcl_args, args, sizeof(*args));
>> +	atomic_set(&new->pcl_count, 1);
>> +	new->pcl_clp = clp;
>> +	new->pcl_ino = NULL;
>> +	spin_lock(&clp->cl_lock);
>> +	if (clp->cl_cb_lrecall_count >= PNFS_MAX_CB_LRECALLS) {
>> +		kfree(new);
>> +		res = NFS4ERR_DELAY;
>> +		spin_unlock(&clp->cl_lock);
>> +		goto out;
>> +	}
>> +	clp->cl_cb_lrecall_count++;
>> +	/* Adding to the list will block conflicting LGET activity */
>> +	list_add_tail(&new->pcl_list, &clp->cl_layoutrecalls);
>> +	spin_unlock(&clp->cl_lock);
>> +	res = initiate_layout_draining(new);
>> +	if (res || atomic_dec_and_test(&new->pcl_count)) {
>> +		spin_lock(&clp->cl_lock);
>> +		list_del(&new->pcl_list);
>> +		clp->cl_cb_lrecall_count--;
>> +		rpc_wake_up(&clp->cl_rpcwaitq_recall);
>> +		spin_unlock(&clp->cl_lock);
>> +		if (res == NFS4_OK) {
>> +			if (args->cbl_recall_type == RETURN_FILE) {
>> +				struct pnfs_layout_hdr *lo;
>> +
>> +				lo = NFS_I(new->pcl_ino)->layout;
>> +				spin_lock(&lo->inode->i_lock);
>> +				lo->plh_block_lgets--;
>> +				if (!pnfs_layoutgets_blocked(lo, NULL))
>> +					rpc_wake_up(&NFS_I(lo->inode)->lo_rpcwaitq_stateid);
>> +				spin_unlock(&lo->inode->i_lock);
>> +				put_layout_hdr(new->pcl_ino);
>> +			}
>> +			res = NFS4ERR_NOMATCHING_LAYOUT;
>> +		}
>> +		kfree(new);
>> +	}
>> +out:
>> +	dprintk("%s returning %i\n", __func__, res);
>> +	return res;
>>  
>> -	return status;
>>  }
>>  
>>  __be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args,
>>  				  void *dummy, struct cb_process_state *cps)
>>  {
>>  	struct nfs_client *clp;
>> -	struct inode *inode = NULL;
>> -	__be32 res;
>> -	int status;
>> +	u32 res;
>>  
>>  	dprintk("%s: -->\n", __func__);
>>  
>> -	res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
>> -	if (cps->session) /* set in cb_sequence */
>> +	if (cps->session) { /* set in cb_sequence */
>>  		clp = cps->session->clp;
>> -	else
>> -		goto out;
>> +		res = do_callback_layoutrecall(clp, args);
>> +	} else
>> +		res = NFS4ERR_OP_NOT_IN_SESSION;
>>  
>> -	res = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT);
>> -	/*
>> -	 * In the _ALL or _FSID case, we need the inode to get
>> -	 * the nfs_server struct.
>> -	 */
>> -	inode = nfs_layoutrecall_find_inode(clp, args);
>> -	if (!inode)
>> -		goto out;
>> -	status = pnfs_async_return_layout(clp, inode, args);
>> -	if (status)
>> -		res = cpu_to_be32(NFS4ERR_DELAY);
>> -	iput(inode);
>> -out:
>> -	dprintk("%s: exit with status = %d\n", __func__, ntohl(res));
>> -	return res;
>> +	dprintk("%s: exit with status = %d\n", __func__, res);
>> +	return cpu_to_be32(res);
>> +}
>> +
>> +static void pnfs_recall_all_layouts(struct nfs_client *clp)
>> +{
>> +	struct cb_layoutrecallargs args;
>> +
>> +	/* Pretend we got a CB_LAYOUTRECALL(ALL) */
>> +	memset(&args, 0, sizeof(args));
>> +	args.cbl_recall_type = RETURN_ALL;
>> +	/* FIXME we ignore errors, what should we do? */
> 
> We're a forgetful client: we don't care...
> 

Well, CB_RECALL_ANY is generated in order to trim the server's state down
by allowing the client to *return* state that it needs less, or no longer
needs at all.  Just forgetting this state doesn't help the server with that
job!  There's no equivalent of NFS4ERR_NOMATCHING_LAYOUT for CB_RECALL_ANY.

>> +	do_callback_layoutrecall(clp, &args);
>>  }
> 
> 
> 
>>  
>>  int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
>> @@ -665,9 +683,7 @@ __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy,
>>  		flags |= FMODE_WRITE;
>>  	if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *)
>>  		     &args->craa_type_mask))
>> -		if (pnfs_recall_all_layouts(clp) == -EAGAIN)
>> -			status = cpu_to_be32(NFS4ERR_DELAY);
>> -
>> +		pnfs_recall_all_layouts(clp);
>>  	if (flags)
>>  		nfs_expire_all_delegation_types(clp, flags);
>>  out:
>> diff --git a/fs/nfs/client.c b/fs/nfs/client.c
>> index 3c8c841..dbf43e7 100644
>> --- a/fs/nfs/client.c
>> +++ b/fs/nfs/client.c
>> @@ -158,6 +158,9 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
>>  		clp->cl_machine_cred = cred;
>>  #if defined(CONFIG_NFS_V4_1)
>>  	INIT_LIST_HEAD(&clp->cl_layouts);
>> +	INIT_LIST_HEAD(&clp->cl_layoutrecalls);
>> +	rpc_init_wait_queue(&clp->cl_rpcwaitq_recall,
>> +			    "NFS client CB_LAYOUTRECALLS");
>>  #endif
>>  	nfs_fscache_get_client_cookie(clp);
>>  
>> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
>> index fe79872..6223c6a 100644
>> --- a/fs/nfs/nfs4proc.c
>> +++ b/fs/nfs/nfs4proc.c
>> @@ -5346,31 +5346,58 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
>>  	struct inode *ino = lgp->args.inode;
>>  	struct nfs_inode *nfsi = NFS_I(ino);
>>  	struct nfs_server *server = NFS_SERVER(ino);
>> +	struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
>>  
>>  	dprintk("--> %s\n", __func__);
>> +	spin_lock(&clp->cl_lock);
>> +	if (matches_outstanding_recall(ino, &lgp->args.range)) {
>> +		rpc_sleep_on(&clp->cl_rpcwaitq_recall, task, NULL);
>> +		spin_unlock(&clp->cl_lock);
>> +		return;
>> +	}
>> +	spin_unlock(&clp->cl_lock);
>> +	/* Note the is a race here, where a CB_LAYOUTRECALL can come in
>> +	 * right now covering the LAYOUTGET we are about to send.
>> +	 * However, that is not so catastrophic, and there seems
>> +	 * to be no way to prevent it completely.
>> +	 */
>>  	spin_lock(&ino->i_lock);
>> -	if (pnfs_layoutgets_blocked(nfsi->layout)) {
>> +	if (pnfs_layoutgets_blocked(nfsi->layout, NULL)) {
>>  		rpc_sleep_on(&nfsi->lo_rpcwaitq_stateid, task, NULL);
>>  		spin_unlock(&ino->i_lock);
>>  		return;
>>  	}
>> +	/* This needs after above check but atomic with it in order to properly
>> +	 * serialize openstateid LAYOUTGETs.
>> +	 */
>> +	nfsi->layout->plh_outstanding++;
>>  	spin_unlock(&ino->i_lock);
>> +
>>  	if (nfs4_setup_sequence(server, NULL, &lgp->args.seq_args,
>> -				&lgp->res.seq_res, 0, task))
>> +				&lgp->res.seq_res, 0, task)) {
>> +		spin_lock(&ino->i_lock);
>> +		nfsi->layout->plh_outstanding--;
>> +		spin_unlock(&ino->i_lock);
>>  		return;
>> +	}
>>  	rpc_call_start(task);
>>  }
>>  
>>  static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
>>  {
>>  	struct nfs4_layoutget *lgp = calldata;
>> -	struct nfs_server *server = NFS_SERVER(lgp->args.inode);
>> +	struct inode *ino = lgp->args.inode;
>>  
>>  	dprintk("--> %s\n", __func__);
>>  
>> -	if (!nfs4_sequence_done(task, &lgp->res.seq_res))
>> +	if (!nfs4_sequence_done(task, &lgp->res.seq_res)) {
>> +		/* layout code relies on fact that in this case
>> +		 * code falls back to tk_action=call_start, but not
>> +		 * back to rpc_prepare_task, to keep plh_outstanding
>> +		 * correct.
>> +		 */
>>  		return;
>> -
>> +	}
>>  	switch (task->tk_status) {
>>  	case 0:
>>  		break;
>> @@ -5379,7 +5406,11 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
>>  		task->tk_status = -NFS4ERR_DELAY;
>>  		/* Fall through */
>>  	default:
>> -		if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) {
>> +		if (nfs4_async_handle_error(task, NFS_SERVER(ino),
>> +					    NULL, NULL) == -EAGAIN) {
>> +			spin_lock(&ino->i_lock);
>> +			NFS_I(ino)->layout->plh_outstanding--;
>> +			spin_unlock(&ino->i_lock);
>>  			rpc_restart_call_prepare(task);
>>  			return;
>>  		}
>> @@ -5437,13 +5468,20 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
>>  	if (IS_ERR(task))
>>  		return PTR_ERR(task);
>>  	status = nfs4_wait_for_completion_rpc_task(task);
>> -	if (status != 0)
>> -		goto out;
>> -	status = task->tk_status;
>> -	if (status != 0)
>> -		goto out;
>> -	status = pnfs_layout_process(lgp);
>> -out:
>> +	if (status == 0)
>> +		status = task->tk_status;
>> +	if (status == 0)
>> +		status = pnfs_layout_process(lgp);
>> +	else {
>> +		struct inode *ino = lgp->args.inode;
>> +		struct pnfs_layout_hdr *lo = NFS_I(ino)->layout;
>> +
>> +		spin_lock(&ino->i_lock);
>> +		lo->plh_outstanding--;
>> +		if (!pnfs_layoutgets_blocked(lo, NULL))
>> +			rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq_stateid);
>> +		spin_unlock(&ino->i_lock);
>> +	}
>>  	rpc_put_task(task);
>>  	dprintk("<-- %s status=%d\n", __func__, status);
>>  	return status;
>> @@ -5587,9 +5625,9 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
>>  
>>  		spin_lock(&lo->inode->i_lock);
>>  		if (lrp->res.lrs_present)
>> -			pnfs_set_layout_stateid(lo, &lrp->res.stateid);
>> +			pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
>>  		else
>> -			pnfs_invalidate_layout_stateid(lo);
>> +			BUG_ON(!list_empty(&lo->segs));
>>  		spin_unlock(&lo->inode->i_lock);
>>  	}
>>  	dprintk("<-- %s\n", __func__);
>> @@ -5606,10 +5644,11 @@ static void nfs4_layoutreturn_release(void *calldata)
>>  
>>  		spin_lock(&ino->i_lock);
>>  		lo->plh_block_lgets--;
>> -		if (!pnfs_layoutgets_blocked(lo))
>> +		lo->plh_outstanding--;
>> +		if (!pnfs_layoutgets_blocked(lo, NULL))
>>  			rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq_stateid);
>>  		spin_unlock(&ino->i_lock);
>> -		put_layout_hdr(lrp->args.inode);
>> +		put_layout_hdr(ino);
>>  	}
>>  	kfree(calldata);
>>  	dprintk("<-- %s\n", __func__);
>> @@ -5639,6 +5678,14 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync)
>>  	int status = 0;
>>  
>>  	dprintk("--> %s\n", __func__);
>> +	if (lrp->args.return_type == RETURN_FILE) {
>> +		struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;
>> +		/* FIXME we should test for BULK here */
>> +		spin_lock(&lo->inode->i_lock);
>> +		BUG_ON(lo->plh_block_lgets == 0);
>> +		lo->plh_outstanding++;
>> +		spin_unlock(&lo->inode->i_lock);
>> +	}
>>  	task = rpc_run_task(&task_setup_data);
>>  	if (IS_ERR(task))
>>  		return PTR_ERR(task);
>> diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
>> index 00632f6..ceb0d66 100644
>> --- a/fs/nfs/nfs4state.c
>> +++ b/fs/nfs/nfs4state.c
>> @@ -1560,6 +1560,10 @@ static void nfs4_state_manager(struct nfs_client *clp)
>>  			nfs_client_return_marked_delegations(clp);
>>  			continue;
>>  		}
>> +		if (test_and_clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state)) {
>> +			nfs_client_return_layouts(clp);
>> +			continue;
>> +		}
>>  		/* Recall session slots */
>>  		if (test_and_clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state)
>>  		   && nfs4_has_session(clp)) {
>> diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
>> index 328cca5..f530c7e 100644
>> --- a/fs/nfs/nfs4xdr.c
>> +++ b/fs/nfs/nfs4xdr.c
>> @@ -1827,13 +1827,14 @@ encode_getdeviceinfo(struct xdr_stream *xdr,
>>  	hdr->replen += decode_getdeviceinfo_maxsz;
>>  }
>>  
>> -static void
>> +static int
>>  encode_layoutget(struct xdr_stream *xdr,
>>  		      const struct nfs4_layoutget_args *args,
>>  		      struct compound_hdr *hdr)
>>  {
>>  	nfs4_stateid stateid;
>>  	__be32 *p;
>> +	int status;
>>  
>>  	p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
>>  	*p++ = cpu_to_be32(OP_LAYOUTGET);
>> @@ -1843,8 +1844,11 @@ encode_layoutget(struct xdr_stream *xdr,
>>  	p = xdr_encode_hyper(p, args->range.offset);
>>  	p = xdr_encode_hyper(p, args->range.length);
>>  	p = xdr_encode_hyper(p, args->minlength);
>> -	pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout,
>> -				args->ctx->state);
>> +	status = pnfs_choose_layoutget_stateid(&stateid,
>> +					       NFS_I(args->inode)->layout,
>> +					       args->ctx->state);
>> +	if (status)
>> +		return status;
>>  	p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE);
>>  	*p = cpu_to_be32(args->maxcount);
>>  
>> @@ -1857,6 +1861,7 @@ encode_layoutget(struct xdr_stream *xdr,
>>  		args->maxcount);
>>  	hdr->nops++;
>>  	hdr->replen += decode_layoutget_maxsz;
>> +	return 0;
>>  }
>>  
>>  static int
>> @@ -2782,12 +2787,15 @@ static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p,
>>  	struct compound_hdr hdr = {
>>  		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
>>  	};
>> +	int status;
>>  
>>  	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
>>  	encode_compound_hdr(&xdr, req, &hdr);
>>  	encode_sequence(&xdr, &args->seq_args, &hdr);
>>  	encode_putfh(&xdr, NFS_FH(args->inode), &hdr);
>> -	encode_layoutget(&xdr, args, &hdr);
>> +	status = encode_layoutget(&xdr, args, &hdr);
>> +	if (status)
>> +		return status;
>>  	encode_nops(&hdr);
>>  	return 0;
>>  }
>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>> index 07b04e8..2d817be 100644
>> --- a/fs/nfs/pnfs.c
>> +++ b/fs/nfs/pnfs.c
>> @@ -233,7 +233,7 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
>>   */
>>  
>>  /* Need to hold i_lock if caller does not already hold reference */
>> -static void
>> +void
>>  get_layout_hdr(struct pnfs_layout_hdr *lo)
>>  {
>>  	atomic_inc(&lo->plh_refcount);
>> @@ -278,24 +278,29 @@ init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
>>  	smp_mb();
>>  	lseg->valid = true;
>>  	lseg->layout = lo;
>> +	lseg->drain_notification = NULL;
>>  }
>>  
>>  static void
>>  _put_lseg_common(struct pnfs_layout_segment *lseg)
>>  {
>> +	struct inode *ino = lseg->layout->inode;
>> +
>>  	BUG_ON(lseg->valid == true);
>>  	list_del(&lseg->fi_list);
>>  	if (list_empty(&lseg->layout->segs)) {
>>  		struct nfs_client *clp;
>>  
>> -		clp = NFS_SERVER(lseg->layout->inode)->nfs_client;
>> +		clp = NFS_SERVER(ino)->nfs_client;
>>  		spin_lock(&clp->cl_lock);
>>  		/* List does not take a reference, so no need for put here */
>>  		list_del_init(&lseg->layout->layouts);
>>  		spin_unlock(&clp->cl_lock);
>> -		pnfs_invalidate_layout_stateid(lseg->layout);
>> +		clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->layout->plh_flags);
>> +		if (!pnfs_layoutgets_blocked(lseg->layout, NULL))
>> +			rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq_stateid);
>>  	}
>> -	rpc_wake_up(&NFS_I(lseg->layout->inode)->lo_rpcwaitq);
>> +	rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq);
>>  }
>>  
>>  /* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg
>> @@ -325,9 +330,12 @@ put_lseg(struct pnfs_layout_segment *lseg)
>>  		atomic_read(&lseg->pls_refcount), lseg->valid);
>>  	ino = lseg->layout->inode;
>>  	if (atomic_dec_and_lock(&lseg->pls_refcount, &ino->i_lock)) {
>> +		struct pnfs_cb_lrecall_info *drain_info = lseg->drain_notification;
>> +
>>  		_put_lseg_common(lseg);
>>  		spin_unlock(&ino->i_lock);
>>  		NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
>> +		notify_drained(drain_info);
>>  		/* Matched by get_layout_hdr_locked in pnfs_insert_layout */
>>  		put_layout_hdr(ino);
>>  	}
>> @@ -345,7 +353,7 @@ EXPORT_SYMBOL_GPL(put_lseg);
>>   * READ		READ	true
>>   * READ		RW	false
>>   */
>> -static int
>> +bool
>>  should_free_lseg(struct pnfs_layout_range *lseg_range,
>>  		 struct pnfs_layout_range *recall_range)
>>  {
>> @@ -388,16 +396,19 @@ pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list,
>>  	dprintk("%s:Return\n", __func__);
>>  }
>>  
>> -static void
>> +void
>>  pnfs_free_lseg_list(struct list_head *free_me)
>>  {
>>  	struct pnfs_layout_segment *lseg, *tmp;
>>  	struct inode *ino;
>> +	struct pnfs_cb_lrecall_info *drain_info;
>>  
>>  	list_for_each_entry_safe(lseg, tmp, free_me, fi_list) {
>>  		BUG_ON(atomic_read(&lseg->pls_refcount) != 0);
>>  		ino = lseg->layout->inode;
>> +		drain_info = lseg->drain_notification;
>>  		NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
>> +		notify_drained(drain_info);
>>  		/* Matched by get_layout_hdr_locked in pnfs_insert_layout */
>>  		put_layout_hdr(ino);
>>  	}
>> @@ -453,40 +464,49 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
>>  	}
>>  }
>>  
>> -/* update lo->stateid with new if is more recent
>> - *
>> - * lo->stateid could be the open stateid, in which case we just use what given.
>> - */
>> +/* update lo->stateid with new if it is more recent */
>>  void
>> -pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
>> -			const nfs4_stateid *new)
>> +pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
>> +			bool update_barrier)
>>  {
>> -	nfs4_stateid *old = &lo->stateid;
>> -	bool overwrite = false;
>> +	u32 oldseq, newseq;
>>  
>>  	assert_spin_locked(&lo->inode->i_lock);
>> -	if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags) ||
>> -	    memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other)))
>> -		overwrite = true;
>> -	else {
>> -		u32 oldseq, newseq;
>> -
>> -		oldseq = be32_to_cpu(old->stateid.seqid);
>> -		newseq = be32_to_cpu(new->stateid.seqid);
>> -		if ((int)(newseq - oldseq) > 0)
>> -			overwrite = true;
>> +	oldseq = be32_to_cpu(lo->stateid.stateid.seqid);
>> +	newseq = be32_to_cpu(new->stateid.seqid);
>> +	if ((int)(newseq - oldseq) > 0) {
>> +		memcpy(&lo->stateid, &new->stateid, sizeof(new->stateid));
>> +		if (update_barrier)
>> +			lo->plh_barrier = be32_to_cpu(new->stateid.seqid);
>> +		else {
>> +			/* Because of wraparound, we want to keep the barrier
>> +			 * "close" to the current seqids.  It needs to be
>> +			 * within 2**31 to count as "behind", so if it
>> +			 * gets too near that limit, give us a little leeway
>> +			 * and bring it to within 2**30.
>> +			 * NOTE - and yes, this is all unsigned arithmetic.
>> +			 */
>> +			if (unlikely((newseq - lo->plh_barrier) > (3 << 29)))
>> +				lo->plh_barrier = newseq - (1 << 30);
>> +		}
>>  	}
>> -	if (overwrite)
>> -		memcpy(&old->stateid, &new->stateid, sizeof(new->stateid));
>>  }
>>  
>> -void
>> -pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
>> -			struct nfs4_state *open_state)
>> +int
>> +pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
>> +			      struct nfs4_state *open_state)
>>  {
>> +	int status = 0;
>> +
>>  	dprintk("--> %s\n", __func__);
>>  	spin_lock(&lo->inode->i_lock);
>> -	if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags)) {
>> +	if (lo->plh_block_lgets ||
>> +	    test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
>> +		/* We avoid -EAGAIN, as that has special meaning to
>> +		 * some callers.
>> +		 */
>> +		status = -NFS4ERR_LAYOUTTRYLATER;
>> +	} else if (list_empty(&lo->segs)) {
>>  		int seq;
>>  
>>  		do {
>> @@ -494,12 +514,11 @@ pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
>>  			memcpy(dst->data, open_state->stateid.data,
>>  			       sizeof(open_state->stateid.data));
>>  		} while (read_seqretry(&open_state->seqlock, seq));
>> -		set_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags);
>>  	} else
>> -		memcpy(dst->data, lo->stateid.data,
>> -		       sizeof(lo->stateid.data));
>> +		memcpy(dst->data, lo->stateid.data, sizeof(lo->stateid.data));
>>  	spin_unlock(&lo->inode->i_lock);
>>  	dprintk("<-- %s\n", __func__);
>> +	return status;
>>  }
>>  
>>  /*
>> @@ -566,6 +585,28 @@ has_layout_to_return(struct pnfs_layout_hdr *lo,
>>  	return out;
>>  }
>>  
>> +void nfs4_asynch_forget_layouts(struct pnfs_layout_hdr *lo,
>> +				struct pnfs_layout_range *range,
>> +				struct pnfs_cb_lrecall_info *drain_info,
>> +				struct list_head *tmp_list)
>> +{
>> +	struct pnfs_layout_segment *lseg, *tmp;
>> +
>> +	assert_spin_locked(&lo->inode->i_lock);
> 
> Poor practice. If you want to ensure the caller holds the inode->i_lock,
> then just call the function '*_locked'. That is a lot more helpful than
> these damned asserts.
> 

That makes sense.

Benny

>> +	list_for_each_entry_safe(lseg, tmp, &lo->segs, fi_list)
>> +		if (should_free_lseg(&lseg->range, range)) {
>> +			/* FIXME - need to change to something like a
>> +			 * notification bitmap to remove the restriction
>> +			 * of only being able to process a single
>> +			 * CB_LAYOUTRECALL at a time.
>> +			 */
>> +			BUG_ON(lseg->drain_notification);
>> +			lseg->drain_notification = drain_info;
>> +			atomic_inc(&drain_info->pcl_count);
>> +			mark_lseg_invalid(lseg, tmp_list);
>> +		}
>> +}
>> +
>>  /* Return true if there is layout based io in progress in the given range.
>>   * Assumes range has already been marked invalid, and layout marked to
>>   * prevent any new lseg from being inserted.
>> @@ -711,14 +752,6 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
>>  	dprintk("%s:Begin\n", __func__);
>>  
>>  	assert_spin_locked(&lo->inode->i_lock);
>> -	if (list_empty(&lo->segs)) {
>> -		struct nfs_client *clp = NFS_SERVER(lo->inode)->nfs_client;
>> -
>> -		spin_lock(&clp->cl_lock);
>> -		BUG_ON(!list_empty(&lo->layouts));
>> -		list_add_tail(&lo->layouts, &clp->cl_layouts);
>> -		spin_unlock(&clp->cl_lock);
>> -	}
>>  	list_for_each_entry(lp, &lo->segs, fi_list) {
>>  		if (cmp_layout(&lp->range, &lseg->range) > 0)
>>  			continue;
>> @@ -735,6 +768,9 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
>>  	}
>>  	if (!found) {
>>  		list_add_tail(&lseg->fi_list, &lo->segs);
>> +		if (list_is_singular(&lo->segs) &&
>> +		    !pnfs_layoutgets_blocked(lo, NULL))
>> +			rpc_wake_up(&NFS_I(lo->inode)->lo_rpcwaitq_stateid);
>>  		dprintk("%s: inserted lseg %p "
>>  			"iomode %d offset %llu length %llu at tail\n",
>>  			__func__, lseg, lseg->range.iomode,
>> @@ -756,6 +792,7 @@ alloc_init_layout_hdr(struct inode *ino)
>>  	atomic_set(&lo->plh_refcount, 1);
>>  	INIT_LIST_HEAD(&lo->layouts);
>>  	INIT_LIST_HEAD(&lo->segs);
>> +	INIT_LIST_HEAD(&lo->plh_bulk_recall);
>>  	lo->inode = ino;
>>  	return lo;
>>  }
>> @@ -843,6 +880,7 @@ pnfs_update_layout(struct inode *ino,
>>  		.length = NFS4_MAX_UINT64,
>>  	};
>>  	struct nfs_inode *nfsi = NFS_I(ino);
>> +	struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
>>  	struct pnfs_layout_hdr *lo;
>>  	struct pnfs_layout_segment *lseg = NULL;
>>  
>> @@ -878,9 +916,28 @@ pnfs_update_layout(struct inode *ino,
>>  		goto out_unlock;
>>  
>>  	get_layout_hdr(lo); /* Matched in pnfs_layoutget_release */
>> +	if (list_empty(&lo->segs)) {
>> +		/* The lo must be on the clp list if there is any
>> +		 * chance of a CB_LAYOUTRECALL(FILE) coming in.
>> +		 */
>> +		spin_lock(&clp->cl_lock);
>> +		BUG_ON(!list_empty(&lo->layouts));
>> +		list_add_tail(&lo->layouts, &clp->cl_layouts);
>> +		spin_unlock(&clp->cl_lock);
>> +	}
>>  	spin_unlock(&ino->i_lock);
>>  
>>  	lseg = send_layoutget(lo, ctx, &arg);
>> +	if (!lseg) {
>> +		spin_lock(&ino->i_lock);
>> +		if (list_empty(&lo->segs)) {
>> +			spin_lock(&clp->cl_lock);
>> +			list_del_init(&lo->layouts);
>> +			spin_unlock(&clp->cl_lock);
>> +			clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
>> +		}
>> +		spin_unlock(&ino->i_lock);
>> +	}
>>  out:
>>  	dprintk("%s end, state 0x%lx lseg %p\n", __func__,
>>  		nfsi->layout->plh_flags, lseg);
>> @@ -891,10 +948,15 @@ out_unlock:
>>  }
>>  
>>  bool
>> -pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo)
>> +pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid)
>>  {
>>  	assert_spin_locked(&lo->inode->i_lock);
>> -	return lo->plh_block_lgets;
>> +	if ((stateid) &&
>> +	    (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
>> +		return true;
>> +	return lo->plh_block_lgets ||
>> +		test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
>> +		(list_empty(&lo->segs) && lo->plh_outstanding);
>>  }
>>  
>>  int
>> @@ -904,6 +966,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
>>  	struct nfs4_layoutget_res *res = &lgp->res;
>>  	struct pnfs_layout_segment *lseg;
>>  	struct inode *ino = lo->inode;
>> +	struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
>>  	int status = 0;
>>  
>>  	/* Inject layout blob into I/O device driver */
>> @@ -915,10 +978,25 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
>>  			status = PTR_ERR(lseg);
>>  		dprintk("%s: Could not allocate layout: error %d\n",
>>  		       __func__, status);
>> +		spin_lock(&ino->i_lock);
>>  		goto out;
>>  	}
>>  
>>  	spin_lock(&ino->i_lock);
>> +	/* decrement needs to be done before call to pnfs_layoutgets_blocked */
>> +	lo->plh_outstanding--;
>> +	spin_lock(&clp->cl_lock);
>> +	if (matches_outstanding_recall(ino, &res->range)) {
>> +		spin_unlock(&clp->cl_lock);
>> +		dprintk("%s forget reply due to recall\n", __func__);
>> +		goto out_forget_reply;
>> +	}
>> +	spin_unlock(&clp->cl_lock);
>> +
>> +	if (pnfs_layoutgets_blocked(lo, &res->stateid)) {
>> +		dprintk("%s forget reply due to state\n", __func__);
>> +		goto out_forget_reply;
>> +	}
>>  	init_lseg(lo, lseg);
>>  	lseg->range = res->range;
>>  	get_lseg(lseg);
>> @@ -934,10 +1012,19 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
>>  	}
>>  
>>  	/* Done processing layoutget. Set the layout stateid */
>> -	pnfs_set_layout_stateid(lo, &res->stateid);
>> -	spin_unlock(&ino->i_lock);
>> +	pnfs_set_layout_stateid(lo, &res->stateid, false);
>>  out:
>> +	if (!pnfs_layoutgets_blocked(lo, NULL))
>> +		rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq_stateid);
>> +	spin_unlock(&ino->i_lock);
>>  	return status;
>> +
>> +out_forget_reply:
>> +	spin_unlock(&ino->i_lock);
>> +	lseg->layout = lo;
>> +	NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
>> +	spin_lock(&ino->i_lock);
>> +	goto out;
>>  }
>>  
>>  void
>> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
>> index 891aeab..7ea121f 100644
>> --- a/fs/nfs/pnfs.h
>> +++ b/fs/nfs/pnfs.h
>> @@ -31,6 +31,7 @@
>>  #define FS_NFS_PNFS_H
>>  
>>  #include <linux/nfs_page.h>
>> +#include "callback.h" /* for cb_layoutrecallargs */
>>  
>>  struct pnfs_layout_segment {
>>  	struct list_head fi_list;
>> @@ -38,6 +39,7 @@ struct pnfs_layout_segment {
>>  	atomic_t pls_refcount;
>>  	bool valid;
>>  	struct pnfs_layout_hdr *layout;
>> +	struct pnfs_cb_lrecall_info *drain_notification;
>>  };
>>  
>>  enum pnfs_try_status {
>> @@ -52,7 +54,7 @@ enum pnfs_try_status {
>>  enum {
>>  	NFS_LAYOUT_RO_FAILED = 0,	/* get ro layout failed stop trying */
>>  	NFS_LAYOUT_RW_FAILED,		/* get rw layout failed stop trying */
>> -	NFS_LAYOUT_STATEID_SET,		/* have a valid layout stateid */
>> +	NFS_LAYOUT_BULK_RECALL,		/* bulk recall affecting layout */
>>  	NFS_LAYOUT_NEED_LCOMMIT,	/* LAYOUTCOMMIT needed */
>>  };
>>  
>> @@ -94,10 +96,13 @@ struct pnfs_layoutdriver_type {
>>  struct pnfs_layout_hdr {
>>  	atomic_t		plh_refcount;
>>  	struct list_head	layouts;   /* other client layouts */
>> +	struct list_head	plh_bulk_recall; /* clnt list of bulk recalls */
>>  	struct list_head	segs;      /* layout segments list */
>>  	int			roc_iomode;/* return on close iomode, 0=none */
>>  	nfs4_stateid		stateid;
>> +	unsigned long		plh_outstanding; /* number of RPCs out */
>>  	unsigned long		plh_block_lgets; /* block LAYOUTGET if >0 */
>> +	u32			plh_barrier; /* ignore lower seqids */
>>  	unsigned long		plh_flags;
>>  	struct rpc_cred		*cred;     /* layoutcommit credential */
>>  	/* DH: These vars keep track of the maximum write range
>> @@ -118,6 +123,14 @@ struct pnfs_device {
>>  	unsigned int  pglen;
>>  };
>>  
>> +struct pnfs_cb_lrecall_info {
>> +	struct list_head	pcl_list; /* hook into cl_layoutrecalls list */
>> +	atomic_t		pcl_count;
>> +	struct nfs_client	*pcl_clp;
>> +	struct inode		*pcl_ino;
>> +	struct cb_layoutrecallargs pcl_args;
>> +};
>> +
>>  /*
>>   * Device ID RCU cache. A device ID is unique per client ID and layout type.
>>   */
>> @@ -176,7 +189,10 @@ extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data,
>>  extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool wait);
>>  
>>  /* pnfs.c */
>> +void get_layout_hdr(struct pnfs_layout_hdr *lo);
>>  void put_lseg(struct pnfs_layout_segment *lseg);
>> +bool should_free_lseg(struct pnfs_layout_range *lseg_range,
>> +		      struct pnfs_layout_range *recall_range);
>>  struct pnfs_layout_segment *
>>  pnfs_has_layout(struct pnfs_layout_hdr *lo, struct pnfs_layout_range *range);
>>  struct pnfs_layout_segment *
>> @@ -201,15 +217,24 @@ enum pnfs_try_status pnfs_try_to_commit(struct nfs_write_data *,
>>  void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *,
>>  			   struct nfs_open_context *, struct list_head *);
>>  void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
>> -bool pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo);
>> +bool pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid);
>>  int pnfs_layout_process(struct nfs4_layoutget *lgp);
>> +void pnfs_free_lseg_list(struct list_head *tmp_list);
>>  void pnfs_destroy_layout(struct nfs_inode *);
>>  void pnfs_destroy_all_layouts(struct nfs_client *);
>>  void put_layout_hdr(struct inode *inode);
>>  void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
>> -			     const nfs4_stateid *new);
>> -void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
>> -			     struct nfs4_state *open_state);
>> +			     const nfs4_stateid *new,
>> +			     bool update_barrier);
>> +int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
>> +				  struct pnfs_layout_hdr *lo,
>> +				  struct nfs4_state *open_state);
>> +void nfs4_asynch_forget_layouts(struct pnfs_layout_hdr *lo,
>> +				struct pnfs_layout_range *range,
>> +				struct pnfs_cb_lrecall_info *drain_info,
>> +				struct list_head *tmp_list);
>> +/* FIXME - this should be in callback.h, but pnfs_cb_lrecall_info needs to be there too */
>> +extern void notify_drained(struct pnfs_cb_lrecall_info *d);
>>  
>>  static inline bool
>>  has_layout(struct nfs_inode *nfsi)
>> @@ -223,12 +248,6 @@ static inline int lo_fail_bit(u32 iomode)
>>  			 NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
>>  }
>>  
>> -static inline void pnfs_invalidate_layout_stateid(struct pnfs_layout_hdr *lo)
>> -{
>> -	assert_spin_locked(&lo->inode->i_lock);
>> -	clear_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags);
>> -}
>> -
>>  static inline void get_lseg(struct pnfs_layout_segment *lseg)
>>  {
>>  	atomic_inc(&lseg->pls_refcount);
>> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
>> index 3cae408..80dcc00 100644
>> --- a/include/linux/nfs_fs_sb.h
>> +++ b/include/linux/nfs_fs_sb.h
>> @@ -83,6 +83,10 @@ struct nfs_client {
>>  	u32			cl_exchange_flags;
>>  	struct nfs4_session	*cl_session; 	/* sharred session */
>>  	struct list_head	cl_layouts;
>> +	struct list_head	cl_layoutrecalls;
>> +	unsigned long		cl_cb_lrecall_count;
>> +#define PNFS_MAX_CB_LRECALLS (1)
>> +	struct rpc_wait_queue	cl_rpcwaitq_recall;
>>  	struct pnfs_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
>>  #endif /* CONFIG_NFS_V4_1 */
>>  
> 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html