Re: [PATCH 7/8] pnfs-submit: forgetful client model

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, May 17, 2010 at 10:56:43AM -0700, Alexandros Batsakis wrote:
> Forgetful client model:
> 
> If we receive a CB_LAYOUTRECALL
> 	- we spawn a thread to handle the recall
> 	(xxx: now only one recall can be active at a time, else NFS4ERR_DELAY)
> 	- we check the stateid seqid
> 	if it does not match we return NFS4ERR_DELAY
> 	- we check for pending I/O
> 	if there is we return NFS4ERR_DELAY
> 	Else we return NO_MATCHING_LAYOUT.
> 	Note that for whole file layouts there is no need to serialize LAYOUTGETs/LAYOUTRETURNs
> For bulk layouts, if there is a layout active, we return NFS4_OK and we start
> cleaning the layouts asynchronously. At the end we send a bulk LAYOUTRETURN.
> Note that there is no need to prevent any new LAYOUTGETs explicitly as the server should
> reject them.

There a lot of discussion over the decision to adopt the forgetful
client, and also some over when (and whether) to return NFS4ERR_DELAY.
Would it be possible to also include a brief summary of the
justification for the choices made?

(And ideally to include it either in a block comment somewhere or in
Documentation/, not just in a changelog comment?)

--b.

> 
> Signed-off-by: Alexandros Batsakis <batsakis@xxxxxxxxxx>
> ---
>  fs/nfs/callback_proc.c |  131 ++++++++++++++++++++++++++++++++++--------------
>  fs/nfs/inode.c         |    2 +-
>  fs/nfs/nfs4_fs.h       |    1 +
>  fs/nfs/nfs4proc.c      |    4 +-
>  fs/nfs/nfs4state.c     |    2 +-
>  fs/nfs/pnfs.c          |   78 +++++++++++++---------------
>  fs/nfs/pnfs.h          |    8 ++-
>  7 files changed, 139 insertions(+), 87 deletions(-)
> 
> diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
> index 24e2571..419fee7 100644
> --- a/fs/nfs/callback_proc.c
> +++ b/fs/nfs/callback_proc.c
> @@ -129,6 +129,38 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf
>  
>  #if defined(CONFIG_NFS_V4_1)
>  
> +static int
> +pnfs_is_next_layout_stateid(const struct pnfs_layout_type *lo,
> +			    const nfs4_stateid stateid)
> +{
> +	int seq;
> +	int res;
> +	u32 oldseqid, newseqid;
> +
> +	do {
> +		seq = read_seqbegin(&lo->seqlock);
> +		oldseqid = ntohl(lo->stateid.u.stateid.seqid);
> +		newseqid = ntohl(stateid.u.stateid.seqid);
> +		res = !memcmp(lo->stateid.u.stateid.other,
> +			      stateid.u.stateid.other,
> +			      NFS4_STATEID_OTHER_SIZE);
> +		if (res) { /* comparing layout stateids */
> +			if (oldseqid == ~0)
> +				res = (newseqid == 1);
> +			else
> +				res = (newseqid == oldseqid + 1);
> +		} else { /* open stateid */
> +			res = !memcmp(lo->stateid.u.data,
> +				      &zero_stateid,
> +				      NFS4_STATEID_SIZE);
> +			if (res)
> +				res = (newseqid == 1);
> +		}
> +	} while (read_seqretry(&lo->seqlock, seq));
> +
> +	return res;
> +}
> +
>  /*
>   * Retrieve an inode based on layout recall parameters
>   *
> @@ -190,10 +222,11 @@ static int pnfs_recall_layout(void *data)
>  {
>  	struct inode *inode, *ino;
>  	struct nfs_client *clp;
> +	struct nfs4_pnfs_layoutreturn *lrp;
>  	struct cb_pnfs_layoutrecallargs rl;
>  	struct recall_layout_threadargs *args =
>  		(struct recall_layout_threadargs *)data;
> -	int status;
> +	int status = 0;
>  
>  	daemonize("nfsv4-layoutreturn");
>  
> @@ -204,46 +237,57 @@ static int pnfs_recall_layout(void *data)
>  	clp = args->clp;
>  	inode = args->inode;
>  	rl = *args->rl;
> -	args->result = 0;
> -	complete(&args->started);
> -	args = NULL;
> -	/* Note: args must not be used after this point!!! */
> -
> -/* FIXME: need barrier here:
> -   pause I/O to data servers
> -   pause layoutgets
> -   drain all outstanding writes to storage devices
> -   wait for any outstanding layoutreturns and layoutgets mentioned in
> -   cb_sequence.
> -   then return layouts, resume after layoutreturns complete
> - */
>  
>  	/* support whole file layouts only */
>  	rl.cbl_seg.offset = 0;
>  	rl.cbl_seg.length = NFS4_MAX_UINT64;
>  
>  	if (rl.cbl_recall_type == RETURN_FILE) {
> -		status = pnfs_return_layout(inode, &rl.cbl_seg, &rl.cbl_stateid,
> -					    RETURN_FILE);
> +		if (pnfs_is_next_layout_stateid(&NFS_I(inode)->layout, rl.cbl_stateid))
> +			status = pnfs_return_layout(inode, &rl.cbl_seg, &rl.cbl_stateid,
> +						    RETURN_FILE, 0);
> +		else
> +			status = htonl(NFS4ERR_DELAY);
> +		/* forgetful client */
>  		if (status)
> -			dprintk("%s RETURN_FILE error: %d\n", __func__, status);
> +			dprintk("%s RETURN_FILE : %d\n", __func__, status);
> +		else
> +			status =  htonl(NFS4ERR_NOMATCHING_LAYOUT);
> +		args->result = status;
> +		complete(&args->started);
>  		goto out;
>  	}
>  
> -	/* FIXME: This loop is inefficient, running in O(|s_inodes|^2) */
> +	status = htonl(NFS4_OK);
> +	args->result = status;
> +	complete(&args->started);
> +	args = NULL;
> +
>  	while ((ino = nfs_layoutrecall_find_inode(clp, &rl)) != NULL) {
>  		/* XXX need to check status on pnfs_return_layout */
> -		pnfs_return_layout(ino, &rl.cbl_seg, NULL, RETURN_FILE);
> +		pnfs_return_layout(ino, &rl.cbl_seg, NULL, RETURN_FILE, 0);
>  		iput(ino);
>  	}
>  
> -	/* send final layoutreturn */
> -	status = pnfs_return_layout(inode, &rl.cbl_seg, NULL, rl.cbl_recall_type);
> -	if (status)
> -		printk(KERN_INFO "%s: ignoring pnfs_return_layout status=%d\n",
> -				__func__, status);
> +	lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
> +	if (!lrp) {
> +		dprintk("%s: allocation failed. Cannot send last LAYOUTRETURN\n",
> +			__func__);
> +		goto out;
> +	}
> +
> +	/* send final layoutreturn (bulk) */
> +	lrp->args.reclaim = 0;
> +	lrp->args.layout_type = rl.cbl_layout_type;
> +	lrp->args.return_type = rl.cbl_recall_type;
> +	lrp->args.lseg = rl.cbl_seg;
> +	lrp->args.inode = inode;
> +	lrp->lo = NULL;
> +
> +	pnfs4_proc_layoutreturn(lrp);
>  out:
> -	iput(inode);
> +	clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state);
> +	nfs_put_client(clp);
>  	module_put_and_exit(0);
>  	dprintk("%s: exit status %d\n", __func__, 0);
>  	return 0;
> @@ -261,15 +305,18 @@ static int pnfs_async_return_layout(struct nfs_client *clp, struct inode *inode,
>  		.rl = rl,
>  	};
>  	struct task_struct *t;
> -	int status;
> -
> -	/* should have returned NFS4ERR_NOMATCHING_LAYOUT... */
> -	BUG_ON(inode == NULL);
> +	int status = htonl(NFS4ERR_DELAY);
>  
>  	dprintk("%s: -->\n", __func__);
>  
> +	/* XXX: do not allow two concurrent layout recalls */
> +	if (test_and_set_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state))
> +		return status;
> +
>  	init_completion(&data.started);
>  	__module_get(THIS_MODULE);
> +	if (!atomic_inc_not_zero(&clp->cl_count))
> +		goto out_put_no_client;
>  
>  	t = kthread_run(pnfs_recall_layout, &data, "%s", "pnfs_recall_layout");
>  	if (IS_ERR(t)) {
> @@ -283,6 +330,9 @@ static int pnfs_async_return_layout(struct nfs_client *clp, struct inode *inode,
>  	wait_for_completion(&data.started);
>  	return data.result;
>  out_module_put:
> +	nfs_put_client(clp);
> +out_put_no_client:
> +	clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state);
>  	module_put(THIS_MODULE);
>  	return status;
>  }
> @@ -309,19 +359,24 @@ __be32 pnfs_cb_layoutrecall(struct cb_pnfs_layoutrecallargs *args,
>  	do {
>  		struct nfs_client *prev = clp;
>  		num_client++;
> -		inode = nfs_layoutrecall_find_inode(clp, args);
> -		if (inode != NULL) {
> -			if (PNFS_LD(&NFS_I(inode)->layout)->id ==
> -			    args->cbl_layout_type) {
> -				/* Set up a helper thread to actually
> -				 * return the delegation */
> +		/* the callback must come from the MDS personality */
> +		if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS))
> +			goto loop;
> +		if (args->cbl_recall_type == RETURN_FILE) {
> +			inode = nfs_layoutrecall_find_inode(clp, args);
> +			if (inode != NULL) {
>  				res = pnfs_async_return_layout(clp, inode, args);
> -				if (res != 0)
> -					res = htonl(NFS4ERR_RESOURCE);
> -				break;
> +				iput(inode);
>  			}
> +		} else {
> +			/* we need the inode to get the nfs_server struct */
> +			inode = nfs_layoutrecall_find_inode(clp, args);
> +			if (!inode)
> +				goto loop;
> +			res = pnfs_async_return_layout(clp, inode, args);
>  			iput(inode);
>  		}
> +loop:
>  		clp = nfs_find_client_next(prev);
>  		nfs_put_client(prev);
>  	} while (clp != NULL);
> diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
> index adf3280..584c9b8 100644
> --- a/fs/nfs/inode.c
> +++ b/fs/nfs/inode.c
> @@ -1321,7 +1321,7 @@ void nfs4_clear_inode(struct inode *inode)
>  	/* First call standard NFS clear_inode() code */
>  	nfs_clear_inode(inode);
>  #ifdef CONFIG_NFS_V4_1
> -	pnfs_return_layout(inode, NULL, NULL, RETURN_FILE);
> +	pnfs_return_layout(inode, NULL, NULL, RETURN_FILE, 1);
>  #endif /* CONFIG_NFS_V4_1 */
>  }
>  #endif
> diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
> index 40ebe1d..90c57b3 100644
> --- a/fs/nfs/nfs4_fs.h
> +++ b/fs/nfs/nfs4_fs.h
> @@ -47,6 +47,7 @@ enum nfs4_client_state {
>  	NFS4CLNT_SESSION_RESET,
>  	NFS4CLNT_SESSION_DRAINING,
>  	NFS4CLNT_RECALL_SLOT,
> +	NFS4CLNT_LAYOUT_RECALL,
>  };
>  
>  /*
> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
> index 6739d15..14e90e1 100644
> --- a/fs/nfs/nfs4proc.c
> +++ b/fs/nfs/nfs4proc.c
> @@ -1085,7 +1085,7 @@ static void pnfs4_layout_reclaim(struct nfs4_state *state)
>  	/* FIXME: send gratuitous layout commits and return with the reclaim
>  	 * flag during grace period
>  	 */
> -	pnfs_return_layout(state->inode, NULL, &state->open_stateid, RETURN_FILE);
> +	pnfs_return_layout(state->inode, NULL, NULL, RETURN_FILE, 1);
>  	pnfs_set_layout_stateid(&NFS_I(state->inode)->layout, &zero_stateid);
>  #endif /* CONFIG_NFS_V4_1 */
>  }
> @@ -2382,7 +2382,7 @@ pnfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
>  
>  	if (pnfs_enabled_sb(server) && has_layout(nfsi) &&
>  	    pnfs_ld_layoutret_on_setattr(server->pnfs_curr_ld))
> -		pnfs_return_layout(inode, NULL, NULL, RETURN_FILE);
> +		pnfs_return_layout(inode, NULL, NULL, RETURN_FILE, 1);
>  	return nfs4_proc_setattr(dentry, fattr, sattr);
>  }
>  #endif /* CONFIG_NFS_V4_1 */
> diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
> index 2d52300..cd2ae83 100644
> --- a/fs/nfs/nfs4state.c
> +++ b/fs/nfs/nfs4state.c
> @@ -598,7 +598,7 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state, fmode_t fm
>  			range.offset = 0;
>  			range.length = NFS4_MAX_UINT64;
>  			pnfs_return_layout(state->inode, &range, NULL,
> -					   RETURN_FILE);
> +					   RETURN_FILE, 1);
>  		}
>  #endif /* CONFIG_NFS_V4_1 */
>  		nfs4_do_close(path, state, wait);
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index 3b4dea0..2c96723 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -610,7 +610,7 @@ get_layout(struct inode *ino,
>   */
>  static inline int
>  should_free_lseg(struct pnfs_layout_segment *lseg,
> -		   struct nfs4_pnfs_layout_segment *range)
> +		 struct nfs4_pnfs_layout_segment *range)
>  {
>  	return (range->iomode == IOMODE_ANY ||
>  		lseg->range.iomode == range->iomode) &&
> @@ -703,6 +703,8 @@ return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *range,
>  
>  	dprintk("--> %s\n", __func__);
>  
> +	BUG_ON(type != RETURN_FILE);
> +
>  	lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
>  	if (lrp == NULL) {
>  		if (lo && (type == RETURN_FILE))
> @@ -729,7 +731,8 @@ out:
>  int
>  _pnfs_return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *range,
>  		    const nfs4_stateid *stateid, /* optional */
> -		    enum pnfs_layoutreturn_type type)
> +		    enum pnfs_layoutreturn_type type,
> +		    int sendreturn)
>  {
>  	struct pnfs_layout_type *lo = NULL;
>  	struct nfs_inode *nfsi = NFS_I(ino);
> @@ -739,14 +742,19 @@ _pnfs_return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *range,
>  	dprintk("--> %s type %d\n", __func__, type);
>  
>  	if (range)
> -		arg = *range;
> -	else {
> +		arg.iomode = range->iomode;
> +	else
>  		arg.iomode = IOMODE_ANY;
> -		arg.offset = 0;
> -		arg.length = ~0;
> -	}
> +	arg.offset = 0;
> +	arg.length = ~0;
> +
>  	if (type == RETURN_FILE) {
>  		if (nfsi->layout.layoutcommit_ctx) {
> +			if (stateid && !sendreturn) { /* callback */
> +				dprintk("%s: layoutcommit pending\n", __func__);
> +				status = htonl(NFS4ERR_DELAY);
> +				goto out;
> +			}
>  			status = pnfs_layoutcommit_inode(ino, 1);
>  			if (status) {
>  				dprintk("%s: layoutcommit failed, status=%d. "
> @@ -763,11 +771,7 @@ _pnfs_return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *range,
>  		}
>  		if (!lo) {
>  			dprintk("%s: no layout segments to return\n", __func__);
> -			/* must send the LAYOUTRETURN in response to recall */
> -			if (stateid)
> -				goto send_return;
> -			else
> -				goto out;
> +			goto out;
>  		}
>  
>  		/* unlock w/o put rebalanced by eventual call to
> @@ -776,13 +780,22 @@ _pnfs_return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *range,
>  		unlock_current_layout(nfsi);
>  
>  		if (pnfs_return_layout_barrier(nfsi, &arg)) {
> +			if (stateid) { /* callback */
> +				status = htonl(NFS4ERR_DELAY);
> +				lock_current_layout(nfsi);
> +				put_unlock_current_layout(lo);
> +				goto out;
> +			}
>  			dprintk("%s: waiting\n", __func__);
>  			wait_event(nfsi->lo_waitq,
> -				!pnfs_return_layout_barrier(nfsi, &arg));
> +				   !pnfs_return_layout_barrier(nfsi, &arg));
>  		}
> +
> +		if (sendreturn)
> +			status = return_layout(ino, &arg, stateid, type, lo);
> +		else
> +			pnfs_layout_release(lo, &arg);
>  	}
> -send_return:
> -	status = return_layout(ino, &arg, stateid, type, lo);
>  out:
>  	dprintk("<-- %s status: %d\n", __func__, status);
>  	return status;
> @@ -1080,31 +1093,12 @@ pnfs_update_layout(struct inode *ino,
>  	/* Check to see if the layout for the given range already exists */
>  	lseg = pnfs_has_layout(lo, &arg, take_ref, !take_ref);
>  	if (lseg && !lseg->valid) {
> -		unlock_current_layout(nfsi);
>  		if (take_ref)
>  			put_lseg(lseg);
> -		for (;;) {
> -			prepare_to_wait(&nfsi->lo_waitq, &__wait,
> -					TASK_KILLABLE);
> -			lock_current_layout(nfsi);
> -			lseg = pnfs_has_layout(lo, &arg, take_ref, !take_ref);
> -			if (!lseg || lseg->valid)
> -				break;
> -			dprintk("%s: invalid lseg %p ref %d\n", __func__,
> -				lseg, atomic_read(&lseg->kref.refcount)-1);
> -			if (take_ref)
> -				put_lseg(lseg);
> -			if (signal_pending(current)) {
> -				lseg = NULL;
> -				result = -ERESTARTSYS;
> -				break;
> -			}
> -			unlock_current_layout(nfsi);
> -			schedule();
> -		}
> -		finish_wait(&nfsi->lo_waitq, &__wait);
> -		if (result)
> -			goto out_put;
> +
> +		/* someone is cleaning the layout */
> +		result = -EAGAIN; 
> +		goto out_put;
>  	}
>  
>  	if (lseg) {
> @@ -1434,7 +1428,7 @@ pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
>  
>  	if (count > 0 && !below_threshold(inode, count, 0)) {
>  		status = pnfs_update_layout(inode, ctx, count,
> -						loff, IOMODE_READ, NULL);
> +					    loff, IOMODE_READ, NULL);
>  		dprintk("%s *rsize %Zd virt update returned %d\n",
>  			__func__, *rsize, status);
>  		if (status != 0)
> @@ -1673,7 +1667,7 @@ pnfs_writeback_done(struct nfs_write_data *data)
>  			.length = data->args.count,
>  		};
>  		dprintk("%s: retrying\n", __func__);
> -		_pnfs_return_layout(data->inode, &range, NULL, RETURN_FILE);
> +		_pnfs_return_layout(data->inode, &range, NULL, RETURN_FILE, 1);
>  		pnfs_initiate_write(data, NFS_CLIENT(data->inode),
>  				    pdata->call_ops, pdata->how);
>  	}
> @@ -1804,7 +1798,7 @@ pnfs_read_done(struct nfs_read_data *data)
>  			.length = data->args.count,
>  		};
>  		dprintk("%s: retrying\n", __func__);
> -		_pnfs_return_layout(data->inode, &range, NULL, RETURN_FILE);
> +		_pnfs_return_layout(data->inode, &range, NULL, RETURN_FILE, 1);
>  		pnfs_initiate_read(data, NFS_CLIENT(data->inode),
>  				   pdata->call_ops);
>  	}
> @@ -2030,7 +2024,7 @@ pnfs_commit_done(struct nfs_write_data *data)
>  			.length = data->args.count,
>  		};
>  		dprintk("%s: retrying\n", __func__);
> -		_pnfs_return_layout(data->inode, &range, NULL, RETURN_FILE);
> +		_pnfs_return_layout(data->inode, &range, NULL, RETURN_FILE, 1);
>  		pnfs_initiate_commit(data, NFS_CLIENT(data->inode),
>  				     pdata->call_ops, pdata->how);
>  	}
> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> index 9e43973..794a2d3 100644
> --- a/fs/nfs/pnfs.h
> +++ b/fs/nfs/pnfs.h
> @@ -40,7 +40,8 @@ int pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
>  
>  int _pnfs_return_layout(struct inode *, struct nfs4_pnfs_layout_segment *,
>  			const nfs4_stateid *stateid, /* optional */
> -			enum pnfs_layoutreturn_type);
> +			enum pnfs_layoutreturn_type,
> +			int sendreturn);
>  void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *mntfh, u32 id);
>  void unmount_pnfs_layoutdriver(struct nfs_server *);
>  int pnfs_use_read(struct inode *inode, ssize_t count);
> @@ -236,14 +237,15 @@ static inline void pnfs_modify_new_request(struct nfs_page *req,
>  static inline int pnfs_return_layout(struct inode *ino,
>  				     struct nfs4_pnfs_layout_segment *lseg,
>  				     const nfs4_stateid *stateid, /* optional */
> -				     enum pnfs_layoutreturn_type type)
> +				     enum pnfs_layoutreturn_type type,
> +				     int sendreturn)
>  {
>  	struct nfs_inode *nfsi = NFS_I(ino);
>  	struct nfs_server *nfss = NFS_SERVER(ino);
>  
>  	if (pnfs_enabled_sb(nfss) &&
>  	    (type != RETURN_FILE || has_layout(nfsi)))
> -		return _pnfs_return_layout(ino, lseg, stateid, type);
> +		return _pnfs_return_layout(ino, lseg, stateid, type, sendreturn);
>  
>  	return 0;
>  }
> -- 
> 1.6.2.5
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux Filesystem Development]     [Linux USB Development]     [Linux Media Development]     [Video for Linux]     [Linux NILFS]     [Linux Audio Users]     [Yosemite Info]     [Linux SCSI]

  Powered by Linux