Re: [RFC] nconnect xprt stickiness for a file

> On Mar 17, 2021, at 9:56 PM, Nagendra Tomar <Nagendra.Tomar@xxxxxxxxxxxxx> wrote:
> 
> We have a clustered NFS server behind an L4 load balancer with the following
> characteristics (relevant to this discussion):
> 
> 1. RPC requests for the same file issued to different cluster nodes are not efficient.
>    Serving one file from one cluster node is efficient. This is particularly true for WRITEs.
> 2. Multiple nconnect xprts land on different cluster nodes because each
>    connection has a different source port.
> 
> Because of this, the default nconnect round-robin policy does not work well:
> it causes RPCs targeting the same file to be serviced by different cluster nodes.
> 
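
To make the failure mode concrete, here is a toy user-space sketch (made-up
hash and names, not the actual load-balancer logic) of why this happens: an
L4 load balancer typically picks a backend from a hash of the connection
tuple, and with nconnect the addresses and destination port are identical, so
the per-connection source port alone decides which cluster node a transport
reaches.

/* toy_lb.c - illustration only */
#include <stdint.h>
#include <stdio.h>

/* Any deterministic mix of the connection tuple will do for the example. */
static unsigned pick_node(uint32_t saddr, uint16_t sport,
			  uint32_t daddr, uint16_t dport, unsigned nr_nodes)
{
	uint32_t h = (saddr ^ daddr) * 2654435761u;
	h ^= ((uint32_t)sport << 16) | dport;
	return h % nr_nodes;
}

int main(void)
{
	uint16_t sport;

	/* nconnect=4: same client/server addresses and port 2049, but four
	 * different source ports, so the connections typically spread
	 * across the cluster nodes. */
	for (sport = 50000; sport < 50004; sport++)
		printf("source port %u -> node %u\n", (unsigned)sport,
		       pick_node(0x0a000001, sport, 0x0a000002, 2049, 4));
	return 0;
}
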
> To solve this, we tweaked the NFS multipath code to always choose the same xprt
> for the same file. We do that by adding a new integer field to rpc_message,
> rpc_xprt_hint, which is set by the NFS layer and used by the RPC layer to pick
> an xprt. The NFS layer sets it to a hash of the target file's filehandle,
> ensuring that requests for the same file always use the same xprt. This works well.
> 
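
Condensed, the selection rule the patch below implements looks roughly like
this sketch (it uses the nfs_fh_hash() helper the patch adds; illustration
only, not part of the patch):

/* Sketch: same filehandle -> same hash -> same transport index. */
static u32 sketch_pick_xprt_index(const struct nfs_fh *fh, u32 nactive)
{
	u32 hash = nfs_fh_hash(fh);	/* jhash of fh->data; 0 if fh is NULL */

	if (!hash || !nactive)
		return 0;		/* fall back to the first transport */
	return hash % nactive;
}
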
> I am interested in your thoughts on this: has anyone else come across a
> similar issue, and is there another way of solving this?

Would a pNFS file layout work? The MDS could direct I/O for
a particular file to a specific DS.


> The following patch just demonstrates the idea. It works, but covers only what I
> needed for the experiment. Based on the feedback, I can follow up with a formal
> patch if needed.
> 
> Thanks,
> Tomar
> 
> ---
> diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
> index 5c4e23abc..8f1cf03dc 100644
> --- a/fs/nfs/nfs3proc.c
> +++ b/fs/nfs/nfs3proc.c
> @@ -108,6 +108,7 @@ nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
> 		.rpc_proc	= &nfs3_procedures[NFS3PROC_GETATTR],
> 		.rpc_argp	= fhandle,
> 		.rpc_resp	= fattr,
> +		.rpc_xprt_hint	= nfs_fh_hash(fhandle),
> 	};
> 	int	status;
> 	unsigned short task_flags = 0;
> @@ -136,6 +137,7 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
> 		.rpc_proc	= &nfs3_procedures[NFS3PROC_SETATTR],
> 		.rpc_argp	= &arg,
> 		.rpc_resp	= fattr,
> +		.rpc_xprt_hint	= nfs_fh_hash(arg.fh),
> 	};
> 	int	status;
> 
> @@ -171,6 +173,7 @@ __nfs3_proc_lookup(struct inode *dir, const char *name, size_t len,
> 		.rpc_proc	= &nfs3_procedures[NFS3PROC_LOOKUP],
> 		.rpc_argp	= &arg,
> 		.rpc_resp	= &res,
> +		.rpc_xprt_hint	= nfs_fh_hash(arg.fh),
> 	};
> 	int			status;
> 
> @@ -235,6 +238,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
> 		.rpc_argp	= &arg,
> 		.rpc_resp	= &res,
> 		.rpc_cred	= entry->cred,
> +		.rpc_xprt_hint	= nfs_fh_hash(arg.fh),
> 	};
> 	int status = -ENOMEM;
> 
> @@ -266,6 +270,7 @@ static int nfs3_proc_readlink(struct inode *inode, struct page *page,
> 	struct rpc_message msg = {
> 		.rpc_proc	= &nfs3_procedures[NFS3PROC_READLINK],
> 		.rpc_argp	= &args,
> +		.rpc_xprt_hint	= nfs_fh_hash(args.fh),
> 	};
> 	int status = -ENOMEM;
> 
> @@ -355,6 +360,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
> 	data->arg.create.name = dentry->d_name.name;
> 	data->arg.create.len = dentry->d_name.len;
> 	data->arg.create.sattr = sattr;
> +	data->msg.rpc_xprt_hint = nfs_fh_hash(data->arg.create.fh);
> 
> 	data->arg.create.createmode = NFS3_CREATE_UNCHECKED;
> 	if (flags & O_EXCL) {
> @@ -442,6 +448,7 @@ nfs3_proc_remove(struct inode *dir, struct dentry *dentry)
> 		.rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE],
> 		.rpc_argp = &arg,
> 		.rpc_resp = &res,
> +		.rpc_xprt_hint = nfs_fh_hash(arg.fh),
> 	};
> 	int status = -ENOMEM;
> 
> @@ -524,6 +531,7 @@ nfs3_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name)
> 		.rpc_proc	= &nfs3_procedures[NFS3PROC_LINK],
> 		.rpc_argp	= &arg,
> 		.rpc_resp	= &res,
> +		.rpc_xprt_hint	= nfs_fh_hash(arg.tofh),
> 	};
> 	int status = -ENOMEM;
> 
> @@ -566,6 +574,7 @@ nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
> 	data->arg.symlink.pages = &page;
> 	data->arg.symlink.pathlen = len;
> 	data->arg.symlink.sattr = sattr;
> +	data->msg.rpc_xprt_hint = nfs_fh_hash(data->arg.symlink.fromfh);
> 
> 	d_alias = nfs3_do_create(dir, dentry, data);
> 	status = PTR_ERR_OR_ZERO(d_alias);
> @@ -602,6 +611,7 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
> 	data->arg.mkdir.name = dentry->d_name.name;
> 	data->arg.mkdir.len = dentry->d_name.len;
> 	data->arg.mkdir.sattr = sattr;
> +	data->msg.rpc_xprt_hint = nfs_fh_hash(data->arg.mkdir.fh);
> 
> 	d_alias = nfs3_do_create(dir, dentry, data);
> 	status = PTR_ERR_OR_ZERO(d_alias);
> @@ -636,6 +646,7 @@ nfs3_proc_rmdir(struct inode *dir, const struct qstr *name)
> 	struct rpc_message msg = {
> 		.rpc_proc	= &nfs3_procedures[NFS3PROC_RMDIR],
> 		.rpc_argp	= &arg,
> +		.rpc_xprt_hint	= nfs_fh_hash(arg.fh),
> 	};
> 	int status = -ENOMEM;
> 
> @@ -682,6 +693,7 @@ static int nfs3_proc_readdir(struct nfs_readdir_arg *nr_arg,
> 		.rpc_argp	= &arg,
> 		.rpc_resp	= &res,
> 		.rpc_cred	= nr_arg->cred,
> +		.rpc_xprt_hint	= nfs_fh_hash(arg.fh),
> 	};
> 	int status = -ENOMEM;
> 
> @@ -735,6 +747,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
> 	data->arg.mknod.len = dentry->d_name.len;
> 	data->arg.mknod.sattr = sattr;
> 	data->arg.mknod.rdev = rdev;
> +	data->msg.rpc_xprt_hint = nfs_fh_hash(data->arg.mknod.fh);
> 
> 	switch (sattr->ia_mode & S_IFMT) {
> 	case S_IFBLK:
> @@ -782,6 +795,7 @@ nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
> 		.rpc_proc	= &nfs3_procedures[NFS3PROC_FSSTAT],
> 		.rpc_argp	= fhandle,
> 		.rpc_resp	= stat,
> +		.rpc_xprt_hint	= nfs_fh_hash(fhandle),
> 	};
> 	int	status;
> 
> @@ -800,6 +814,7 @@ do_proc_fsinfo(struct rpc_clnt *client, struct nfs_fh *fhandle,
> 		.rpc_proc	= &nfs3_procedures[NFS3PROC_FSINFO],
> 		.rpc_argp	= fhandle,
> 		.rpc_resp	= info,
> +		.rpc_xprt_hint	= nfs_fh_hash(fhandle),
> 	};
> 	int	status;
> 
> @@ -834,6 +849,7 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
> 		.rpc_proc	= &nfs3_procedures[NFS3PROC_PATHCONF],
> 		.rpc_argp	= fhandle,
> 		.rpc_resp	= info,
> +		.rpc_xprt_hint	= nfs_fh_hash(fhandle),
> 	};
> 	int	status;
> 
> diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
> index 78c9c4bde..60578e9fd 100644
> --- a/fs/nfs/pagelist.c
> +++ b/fs/nfs/pagelist.c
> @@ -762,6 +762,7 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
> 		.rpc_argp = &hdr->args,
> 		.rpc_resp = &hdr->res,
> 		.rpc_cred = cred,
> +		.rpc_xprt_hint = nfs_fh_hash(hdr->args.fh),
> 	};
> 	struct rpc_task_setup task_setup_data = {
> 		.rpc_client = clnt,
> diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
> index b27ebdcce..b1713d6fb 100644
> --- a/fs/nfs/unlink.c
> +++ b/fs/nfs/unlink.c
> @@ -355,6 +355,11 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
> 	msg.rpc_resp = &data->res;
> 	msg.rpc_cred = data->cred;
> 
> +	if (data->args.new_dir)
> +		msg.rpc_xprt_hint = nfs_fh_hash(data->args.new_dir);
> +	else
> +		msg.rpc_xprt_hint = nfs_fh_hash(data->args.old_dir);
> +
> 	/* set up nfs_renamedata */
> 	data->old_dir = old_dir;
> 	ihold(old_dir);
> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> index 639c34fec..284b364a6 100644
> --- a/fs/nfs/write.c
> +++ b/fs/nfs/write.c
> @@ -1687,6 +1687,7 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
> 		.rpc_argp = &data->args,
> 		.rpc_resp = &data->res,
> 		.rpc_cred = data->cred,
> +		.rpc_xprt_hint = nfs_fh_hash(data->args.fh),
> 	};
> 	struct rpc_task_setup task_setup_data = {
> 		.task = &data->task,
> diff --git a/include/linux/nfs.h b/include/linux/nfs.h
> index 0dc7ad38a..5d5b2d20b 100644
> --- a/include/linux/nfs.h
> +++ b/include/linux/nfs.h
> @@ -10,6 +10,7 @@
> 
> #include <linux/sunrpc/msg_prot.h>
> #include <linux/string.h>
> +#include <linux/jhash.h>
> #include <uapi/linux/nfs.h>
> 
> /*
> @@ -36,6 +37,10 @@ static inline void nfs_copy_fh(struct nfs_fh *target, const struct nfs_fh *sourc
> 	memcpy(target->data, source->data, source->size);
> }
> 
> +static inline u32 nfs_fh_hash(const struct nfs_fh *fh)
> +{
> +	return (fh ? jhash(fh->data, fh->size, 0) : 0);
> +}
> 
> /*
>  * This is really a general kernel constant, but since nothing like
> diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
> index df696efdd..8f365280c 100644
> --- a/include/linux/sunrpc/sched.h
> +++ b/include/linux/sunrpc/sched.h
> @@ -27,6 +27,7 @@ struct rpc_message {
> 	void *			rpc_argp;	/* Arguments */
> 	void *			rpc_resp;	/* Result */
> 	const struct cred *	rpc_cred;	/* Credentials */
> +	u32			rpc_xprt_hint;
> };
> 
> struct rpc_call_ops;
> diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
> index 612f0a641..fcf8e7962 100644
> --- a/net/sunrpc/clnt.c
> +++ b/net/sunrpc/clnt.c
> @@ -1058,6 +1058,53 @@ rpc_task_get_next_xprt(struct rpc_clnt *clnt)
> 	return rpc_task_get_xprt(clnt, xprt_iter_get_next(&clnt->cl_xpi));
> }
> 
> +static
> +bool xprt_is_active(const struct rpc_xprt *xprt)
> +{
> +	return kref_read(&xprt->kref) != 0;
> +}
> +
> +static struct rpc_xprt *
> +rpc_task_get_hashed_xprt(struct rpc_clnt *clnt, const struct rpc_task *task)
> +{
> +	const struct rpc_xprt_switch *xps = NULL;
> +	struct rpc_xprt *xprt = NULL;
> +	const struct rpc_message *rpc_message = &task->tk_msg;
> +	const u32 hash = rpc_message->rpc_xprt_hint;
> +
> +	if (!hash)
> +		return rpc_task_get_next_xprt(clnt);
> +
> +	rcu_read_lock();
> +	xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch);
> +
> +	if (xps && hash) {
> +		const struct list_head *head = &xps->xps_xprt_list;
> +		struct rpc_xprt *pos;
> +		const u32 nactive = READ_ONCE(xps->xps_nactive);
> +		const u32 xprt_idx = nactive ? (hash % nactive) : 0;
> +		u32 idx = 0;
> +
> +		list_for_each_entry_rcu(pos, head, xprt_switch) {
> +			if (xprt_idx > idx++)
> +				continue;
> +			if (xprt_is_active(pos)) {
> +				xprt = xprt_get(pos);
> +				break;
> +			}
> +		}
> +	}
> +
> +	/*
> +	 * Fall back to the first transport if no match was found.
> +	 */
> +	if (!xprt)
> +		xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
> +	rcu_read_unlock();
> +
> +	return rpc_task_get_xprt(clnt, xprt);
> +}
> +
> static
> void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt)
> {
> @@ -1066,7 +1113,7 @@ void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt)
> 	if (task->tk_flags & RPC_TASK_NO_ROUND_ROBIN)
> 		task->tk_xprt = rpc_task_get_first_xprt(clnt);
> 	else
> -		task->tk_xprt = rpc_task_get_next_xprt(clnt);
> +		task->tk_xprt = rpc_task_get_hashed_xprt(clnt, task);
> }
> 
> static
> @@ -1100,6 +1147,7 @@ rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg)
> 		task->tk_msg.rpc_argp = msg->rpc_argp;
> 		task->tk_msg.rpc_resp = msg->rpc_resp;
> 		task->tk_msg.rpc_cred = msg->rpc_cred;
> +		task->tk_msg.rpc_xprt_hint = msg->rpc_xprt_hint;
> 		if (!(task->tk_flags & RPC_TASK_CRED_NOREF))
> 			get_cred(task->tk_msg.rpc_cred);
> 	}
> @@ -1130,8 +1178,8 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data)
> 	if (!RPC_IS_ASYNC(task))
> 		task->tk_flags |= RPC_TASK_CRED_NOREF;
> 
> -	rpc_task_set_client(task, task_setup_data->rpc_client);
> 	rpc_task_set_rpc_message(task, task_setup_data->rpc_message);
> +	rpc_task_set_client(task, task_setup_data->rpc_client);
> 
> 	if (task->tk_action == NULL)
> 		rpc_call_start(task);

--
Chuck Lever
