Re: [PATCH v5 5/5] nfsd: add the infrastructure to handle the cld upcall

Jeff Layton <jlayton@xxxxxxxxxx> · Sat, 4 Feb 2012 06:49:06 -0500

On Fri, 3 Feb 2012 17:57:36 -0500
"J. Bruce Fields" <bfields@xxxxxxxxxxxx> wrote:

> On Wed, Feb 01, 2012 at 10:44:12AM -0500, Jeff Layton wrote:
> > ...and add a mechanism for switching between the "legacy" tracker and
> > the new one. The decision is made by looking to see whether the
> > v4recoverydir exists. If it does, then the legacy client tracker is
> > used.
> > 
> > If it's not, then the kernel will create a "cld" pipe in rpc_pipefs.
> > That pipe is used to talk to a daemon for handling the upcall.
> > 
> > Signed-off-by: Jeff Layton <jlayton@xxxxxxxxxx>
> > ---
> >  fs/nfsd/nfs4recover.c |  364 ++++++++++++++++++++++++++++++++++++++++++++++++-
> >  1 files changed, 363 insertions(+), 1 deletions(-)
> > 
> > diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
> > index 3fbf1f4..592fe3d 100644
> > --- a/fs/nfsd/nfs4recover.c
> > +++ b/fs/nfsd/nfs4recover.c
> > @@ -1,5 +1,6 @@
> >  /*
> >  *  Copyright (c) 2004 The Regents of the University of Michigan.
> > +*  Copyright (c) 2011 Jeff Layton <jlayton@xxxxxxxxxx>
> >  *  All rights reserved.
> >  *
> >  *  Andy Adamson <andros@xxxxxxxxxxxxxx>
> > @@ -36,6 +37,10 @@
> >  #include <linux/namei.h>
> >  #include <linux/crypto.h>
> >  #include <linux/sched.h>
> > +#include <linux/fs.h>
> > +#include <linux/sunrpc/rpc_pipe_fs.h>
> > +#include <linux/sunrpc/clnt.h>
> > +#include <linux/nfsd/cld.h>
> >  
> >  #include "nfsd.h"
> >  #include "state.h"
> > @@ -472,12 +477,369 @@ static struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = {
> >  	.grace_done	= nfsd4_recdir_purge_old,
> >  };
> >  
> > +/* Globals */
> > +#define NFSD_PIPE_DIR		"/nfsd"
> > +
> > +static struct dentry *cld_pipe;
> > +
> > +/* list of cld_msg's that are currently in use */
> > +static DEFINE_SPINLOCK(cld_lock);
> > +static LIST_HEAD(cld_list);
> > +static unsigned int cld_xid;
> > +
> > +struct cld_upcall {
> > +	struct list_head	cu_list;
> > +	struct task_struct	*cu_task;
> > +	struct cld_msg	cu_msg;
> > +};
> > +
> > +static int
> > +__cld_pipe_upcall(struct cld_msg *cmsg)
> > +{
> > +	int ret;
> > +	struct rpc_pipe_msg msg;
> > +	struct inode *inode = cld_pipe->d_inode;
> > +
> > +	memset(&msg, 0, sizeof(msg));
> > +	msg.data = cmsg;
> > +	msg.len = sizeof(*cmsg);
> > +
> > +	/*
> > +	 * Set task state before we queue the upcall. That prevents
> > +	 * wake_up_process in the downcall from racing with schedule.
> > +	 */
> > +	set_current_state(TASK_UNINTERRUPTIBLE);
> 
> Is there a risk of nfsd being left unkillable if the daemon dies or
> becomes unresponsive?
> 

If the daemon dies, then the pipe will be closed and this will get
woken back up with msg.errno set to -EAGAIN. At that point, it'll
get requeued and then will eventually time out via the
RPC_PIPE_WAIT_FOR_OPEN workqueue job.

If the daemon is unresponsive but holds the pipe open then this will
sleep indefinitely. I suppose we could use a schedule_timeout() below
with a 30s timeout or so and then dequeue it if it does. I'll note
though that none of the other rpc_pipefs upcalls do that.

Hmm...now that I look though, I think I see a bug in the rpc_pipefs
code. When an upcall is read off the pipe, it moves to the in_upcall
list. If there's never a downcall for it, then I think it'll just
linger there forever until the pipe is removed. Even closing the pipe
won't unwedge it. It seems like we should return an error for those as
well. I'll see about spinning up a patch for that too...

> > +	ret = rpc_queue_upcall(inode, &msg);
> > +	if (ret < 0) {
> > +		set_current_state(TASK_RUNNING);
> > +		goto out;
> > +	}
> > +
> > +	schedule();
> > +	set_current_state(TASK_RUNNING);
> > +
> > +	if (msg.errno < 0)
> > +		ret = msg.errno;
> > +out:
> > +	return ret;
> > +}
> > +
> > +static int
> > +cld_pipe_upcall(struct cld_msg *cmsg)
> > +{
> > +	int ret;
> > +
> > +	/*
> > +	 * -EAGAIN occurs when pipe is closed an reopened while there are
> > +	 *  upcalls queued.
> > +	 */
> > +	do {
> > +		ret = __cld_pipe_upcall(cmsg);
> > +	} while (ret == -EAGAIN);
> > +
> > +	return ret;
> > +}
> > +
> > +static ssize_t
> > +cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
> > +{
> > +	struct cld_upcall *tmp, *cup;
> > +	struct cld_msg *cmsg = (struct cld_msg *)src;
> > +	uint32_t xid;
> > +
> > +	if (mlen != sizeof(*cmsg)) {
> > +		dprintk("%s: got %lu bytes, expected %lu\n", __func__, mlen,
> > +			sizeof(*cmsg));
> > +		return -EINVAL;
> > +	}
> > +
> > +	/* copy just the xid so we can try to find that */
> > +	if (copy_from_user(&xid, &cmsg->cm_xid, sizeof(xid)) != 0) {
> > +		dprintk("%s: error when copying xid from userspace", __func__);
> > +		return -EFAULT;
> > +	}
> > +
> > +	/* walk the list and find corresponding xid */
> > +	cup = NULL;
> > +	spin_lock(&cld_lock);
> > +	list_for_each_entry(tmp, &cld_list, cu_list) {
> > +		if (get_unaligned(&tmp->cu_msg.cm_xid) == xid) {
> > +			cup = tmp;
> > +			list_del_init(&cup->cu_list);
> > +			break;
> > +		}
> > +	}
> > +	spin_unlock(&cld_lock);
> > +
> > +	/* couldn't find upcall? */
> > +	if (!cup) {
> > +		dprintk("%s: couldn't find upcall -- xid=%u\n", __func__,
> > +			cup->cu_msg.cm_xid);
> > +		return -EINVAL;
> > +	}
> > +
> > +	if (copy_from_user(&cup->cu_msg, src, mlen) != 0)
> > +		return -EFAULT;
> > +
> > +	wake_up_process(cup->cu_task);
> > +	return mlen;
> > +}
> > +
> > +static void
> > +cld_pipe_destroy_msg(struct rpc_pipe_msg *msg)
> > +{
> > +	struct cld_msg *cmsg = msg->data;
> > +	struct cld_upcall *cup = container_of(cmsg, struct cld_upcall,
> > +						 cu_msg);
> > +
> > +	/* errno >= 0 means we got a downcall */
> > +	if (msg->errno >= 0)
> > +		return;
> > +
> > +	wake_up_process(cup->cu_task);
> > +}
> > +
> > +static const struct rpc_pipe_ops cld_upcall_ops = {
> > +	.upcall		= rpc_pipe_generic_upcall,
> > +	.downcall	= cld_pipe_downcall,
> > +	.destroy_msg	= cld_pipe_destroy_msg,
> > +};
> > +
> > +/* Initialize rpc_pipefs pipe for communication with client tracking daemon */
> > +static int
> > +nfsd4_init_cld_pipe(void)
> > +{
> > +	int ret;
> > +	struct path path;
> > +	struct vfsmount *mnt;
> > +
> > +	if (cld_pipe)
> > +		return 0;
> > +
> > +	mnt = rpc_get_mount();
> > +	if (IS_ERR(mnt))
> > +		return PTR_ERR(mnt);
> > +
> > +	ret = vfs_path_lookup(mnt->mnt_root, mnt, NFSD_PIPE_DIR, 0, &path);
> > +	if (ret)
> > +		goto err;
> > +
> > +	cld_pipe = rpc_mkpipe(path.dentry, "cld", NULL,
> > +				&cld_upcall_ops, RPC_PIPE_WAIT_FOR_OPEN);
> > +	path_put(&path);
> > +	if (!IS_ERR(cld_pipe))
> > +		return 0;
> > +
> > +	ret = PTR_ERR(cld_pipe);
> > +err:
> > +	rpc_put_mount();
> > +	printk(KERN_ERR "NFSD: unable to create nfsdcld upcall pipe (%d)\n",
> > +			ret);
> > +	return ret;
> > +}
> > +
> > +static void
> > +nfsd4_remove_cld_pipe(void)
> > +{
> > +	int ret;
> > +
> > +	ret = rpc_unlink(cld_pipe);
> > +	if (ret)
> > +		printk(KERN_ERR "NFSD: error removing cld pipe: %d\n", ret);
> > +	cld_pipe = NULL;
> > +	rpc_put_mount();
> > +}
> > +
> > +static struct cld_upcall *
> > +alloc_cld_upcall(void)
> > +{
> > +	struct cld_upcall *new, *tmp;
> > +
> > +	new = kzalloc(sizeof(*new), GFP_KERNEL);
> > +	if (!new)
> > +		return new;
> > +
> > +	/* FIXME: hard cap on number in flight? */
> > +restart_search:
> > +	spin_lock(&cld_lock);
> > +	list_for_each_entry(tmp, &cld_list, cu_list) {
> > +		if (tmp->cu_msg.cm_xid == cld_xid) {
> > +			cld_xid++;
> > +			spin_unlock(&cld_lock);
> > +			goto restart_search;
> > +		}
> > +	}
> > +	new->cu_task = current;
> > +	new->cu_msg.cm_vers = CLD_UPCALL_VERSION;
> > +	put_unaligned(cld_xid++, &new->cu_msg.cm_xid);
> > +	list_add(&new->cu_list, &cld_list);
> > +	spin_unlock(&cld_lock);
> > +
> > +	dprintk("%s: allocated xid %u\n", __func__, new->cu_msg.cm_xid);
> > +
> > +	return new;
> > +}
> > +
> > +static void
> > +free_cld_upcall(struct cld_upcall *victim)
> > +{
> > +	spin_lock(&cld_lock);
> > +	list_del(&victim->cu_list);
> > +	spin_unlock(&cld_lock);
> > +	kfree(victim);
> > +}
> > +
> > +/* Ask daemon to create a new record */
> > +static void
> > +nfsd4_cld_create(struct nfs4_client *clp)
> > +{
> > +	int ret;
> > +	struct cld_upcall *cup;
> > +
> > +	/* Don't upcall if it's already stored */
> > +	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
> > +		return;
> > +
> > +	cup = alloc_cld_upcall();
> > +	if (!cup) {
> > +		ret = -ENOMEM;
> > +		goto out_err;
> > +	}
> > +
> > +	cup->cu_msg.cm_cmd = Cld_Create;
> > +	cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
> > +	memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
> > +			clp->cl_name.len);
> > +
> > +	ret = cld_pipe_upcall(&cup->cu_msg);
> > +	if (!ret) {
> > +		ret = cup->cu_msg.cm_status;
> > +		set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
> > +	}
> > +
> > +	free_cld_upcall(cup);
> > +out_err:
> > +	if (ret)
> > +		printk(KERN_ERR "NFSD: Unable to create client "
> > +				"record on stable storage: %d\n", ret);
> > +}
> > +
> > +/* Ask daemon to create a new record */
> > +static void
> > +nfsd4_cld_remove(struct nfs4_client *clp)
> > +{
> > +	int ret;
> > +	struct cld_upcall *cup;
> > +
> > +	/* Don't upcall if it's already removed */
> > +	if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
> > +		return;
> > +
> > +	cup = alloc_cld_upcall();
> > +	if (!cup) {
> > +		ret = -ENOMEM;
> > +		goto out_err;
> > +	}
> > +
> > +	cup->cu_msg.cm_cmd = Cld_Remove;
> > +	cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
> > +	memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
> > +			clp->cl_name.len);
> > +
> > +	ret = cld_pipe_upcall(&cup->cu_msg);
> > +	if (!ret) {
> > +		ret = cup->cu_msg.cm_status;
> > +		clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
> > +	}
> > +
> > +	free_cld_upcall(cup);
> > +out_err:
> > +	if (ret)
> > +		printk(KERN_ERR "NFSD: Unable to remove client "
> > +				"record from stable storage: %d\n", ret);
> > +}
> > +/* Check for presence of a record, and update its timestamp */
> > +static int
> > +nfsd4_cld_check(struct nfs4_client *clp)
> > +{
> > +	int ret;
> > +	struct cld_upcall *cup;
> > +
> > +	/* Don't upcall if one was already stored during this grace pd */
> > +	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
> > +		return 0;
> > +
> > +	cup = alloc_cld_upcall();
> > +	if (!cup) {
> > +		printk(KERN_ERR "NFSD: Unable to check client record on "
> > +				"stable storage: %d\n", -ENOMEM);
> > +		return -ENOMEM;
> > +	}
> > +
> > +	cup->cu_msg.cm_cmd = Cld_Check;
> > +	cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
> > +	memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
> > +			clp->cl_name.len);
> > +
> > +	ret = cld_pipe_upcall(&cup->cu_msg);
> > +	if (!ret) {
> > +		ret = cup->cu_msg.cm_status;
> > +		set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
> > +	}
> 
> Most of this nfsd4_cld_*() code is *really* similar from one function to
> the next--would it make sense to share some more code?
> 
> Anyway, this basically all looks reasonable to me....
> 
> --b.
> 

It is very similar, but I think I'm sharing about all I can here. I'll
see if I can consolidate it some more, but the upcalls are just
different enough to make that difficult...

> > +
> > +	free_cld_upcall(cup);
> > +	return ret;
> > +}
> > +
> > +static void
> > +nfsd4_cld_grace_done(time_t boot_time)
> > +{
> > +	int ret;
> > +	struct cld_upcall *cup;
> > +
> > +	cup = alloc_cld_upcall();
> > +	if (!cup) {
> > +		ret = -ENOMEM;
> > +		goto out_err;
> > +	}
> > +
> > +	cup->cu_msg.cm_cmd = Cld_GraceDone;
> > +	cup->cu_msg.cm_u.cm_gracetime = (int64_t)boot_time;
> > +	ret = cld_pipe_upcall(&cup->cu_msg);
> > +	if (!ret)
> > +		ret = cup->cu_msg.cm_status;
> > +
> > +	free_cld_upcall(cup);
> > +out_err:
> > +	if (ret)
> > +		printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret);
> > +}
> > +
> > +static struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = {
> > +	.init		= nfsd4_init_cld_pipe,
> > +	.exit		= nfsd4_remove_cld_pipe,
> > +	.create		= nfsd4_cld_create,
> > +	.remove		= nfsd4_cld_remove,
> > +	.check		= nfsd4_cld_check,
> > +	.grace_done	= nfsd4_cld_grace_done,
> > +};
> > +
> >  int
> >  nfsd4_client_tracking_init(void)
> >  {
> >  	int status;
> > +	struct path path;
> >  
> > -	client_tracking_ops = &nfsd4_legacy_tracking_ops;
> > +	if (!client_tracking_ops) {
> > +		client_tracking_ops = &nfsd4_cld_tracking_ops;
> > +		status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path);
> > +		if (!status) {
> > +			if (S_ISDIR(path.dentry->d_inode->i_mode))
> > +				client_tracking_ops =
> > +						&nfsd4_legacy_tracking_ops;
> > +			path_put(&path);
> > +		}
> > +	}
> >  
> >  	if (!client_tracking_ops->init)
> >  		return 0;
> > -- 
> > 1.7.7.6
> > 

-- 
Jeff Layton <jlayton@xxxxxxxxxx>
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html