On Thu, 2021-03-18 at 01:56 +0000, Nagendra Tomar wrote: > We have a clustered NFS server behind a L4 load-balancer with the > following > Characteristics (relevant to this discussion): > > 1. RPC requests for the same file issued to different cluster nodes > are not efficient. > One file one cluster node is efficient. This is particularly true > for WRITEs. > 2. Multiple nconnect xprts land on different cluster nodes due to the > source > port being different for all. > > Due to this, the default nconnect roundrobin policy does not work > very well as > it results in RPCs targeted to the same file to be serviced by > different cluster nodes. > > To solve this, we tweaked the nfs multipath code to always choose the > same xprt > for the same file. We do that by adding a new integer field to > rpc_message, > rpc_xprt_hint, which is set by NFS layer and used by RPC layer to > pick a xprt. > NFS layer sets it to the hash of the target file's filehandle, thus > ensuring same file > requests always use the same xprt. This works well. > > I am interested in knowing your thoughts on this, has anyone else > also come across > similar issue, is there any other way of solving this, etc. > > Following patch is just to demonstrate the idea. It works but covers > only what I > needed for the experiment. Based on the feedback, I can follow it up > with a formal > patch if needed. 
> > Thanks, > Tomar > > --- > diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c > index 5c4e23abc..8f1cf03dc 100644 > --- a/fs/nfs/nfs3proc.c > +++ b/fs/nfs/nfs3proc.c > @@ -108,6 +108,7 @@ nfs3_proc_getattr(struct nfs_server *server, > struct nfs_fh *fhandle, > .rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR], > .rpc_argp = fhandle, > .rpc_resp = fattr, > + .rpc_xprt_hint = nfs_fh_hash(fhandle), > }; > int status; > unsigned short task_flags = 0; > @@ -136,6 +137,7 @@ nfs3_proc_setattr(struct dentry *dentry, struct > nfs_fattr *fattr, > .rpc_proc = &nfs3_procedures[NFS3PROC_SETATTR], > .rpc_argp = &arg, > .rpc_resp = fattr, > + .rpc_xprt_hint = nfs_fh_hash(arg.fh), > }; > int status; > > @@ -171,6 +173,7 @@ __nfs3_proc_lookup(struct inode *dir, const char > *name, size_t len, > .rpc_proc = &nfs3_procedures[NFS3PROC_LOOKUP], > .rpc_argp = &arg, > .rpc_resp = &res, > + .rpc_xprt_hint = nfs_fh_hash(arg.fh), > }; > int status; > > @@ -235,6 +238,7 @@ static int nfs3_proc_access(struct inode *inode, > struct nfs_access_entry *entry) > .rpc_argp = &arg, > .rpc_resp = &res, > .rpc_cred = entry->cred, > + .rpc_xprt_hint = nfs_fh_hash(arg.fh), > }; > int status = -ENOMEM; > > @@ -266,6 +270,7 @@ static int nfs3_proc_readlink(struct inode > *inode, struct page *page, > struct rpc_message msg = { > .rpc_proc = > &nfs3_procedures[NFS3PROC_READLINK], > .rpc_argp = &args, > + .rpc_xprt_hint = nfs_fh_hash(args.fh), > }; > int status = -ENOMEM; > > @@ -355,6 +360,7 @@ nfs3_proc_create(struct inode *dir, struct dentry > *dentry, struct iattr *sattr, > data->arg.create.name = dentry->d_name.name; > data->arg.create.len = dentry->d_name.len; > data->arg.create.sattr = sattr; > + data->msg.rpc_xprt_hint = nfs_fh_hash(data->arg.create.fh); > > data->arg.create.createmode = NFS3_CREATE_UNCHECKED; > if (flags & O_EXCL) { > @@ -442,6 +448,7 @@ nfs3_proc_remove(struct inode *dir, struct dentry > *dentry) > .rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE], > .rpc_argp = &arg, > .rpc_resp = 
&res, > + .rpc_xprt_hint = nfs_fh_hash(arg.fh), > }; > int status = -ENOMEM; > > @@ -524,6 +531,7 @@ nfs3_proc_link(struct inode *inode, struct inode > *dir, const struct qstr *name) > .rpc_proc = &nfs3_procedures[NFS3PROC_LINK], > .rpc_argp = &arg, > .rpc_resp = &res, > + .rpc_xprt_hint = nfs_fh_hash(arg.tofh), > }; > int status = -ENOMEM; > > @@ -566,6 +574,7 @@ nfs3_proc_symlink(struct inode *dir, struct > dentry *dentry, struct page *page, > data->arg.symlink.pages = &page; > data->arg.symlink.pathlen = len; > data->arg.symlink.sattr = sattr; > + data->msg.rpc_xprt_hint = nfs_fh_hash(data- > >arg.symlink.fromfh); > > d_alias = nfs3_do_create(dir, dentry, data); > status = PTR_ERR_OR_ZERO(d_alias); > @@ -602,6 +611,7 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry > *dentry, struct iattr *sattr) > data->arg.mkdir.name = dentry->d_name.name; > data->arg.mkdir.len = dentry->d_name.len; > data->arg.mkdir.sattr = sattr; > + data->msg.rpc_xprt_hint = nfs_fh_hash(data->arg.mkdir.fh); > > d_alias = nfs3_do_create(dir, dentry, data); > status = PTR_ERR_OR_ZERO(d_alias); > @@ -636,6 +646,7 @@ nfs3_proc_rmdir(struct inode *dir, const struct > qstr *name) > struct rpc_message msg = { > .rpc_proc = &nfs3_procedures[NFS3PROC_RMDIR], > .rpc_argp = &arg, > + .rpc_xprt_hint = nfs_fh_hash(arg.fh), > }; > int status = -ENOMEM; > > @@ -682,6 +693,7 @@ static int nfs3_proc_readdir(struct > nfs_readdir_arg *nr_arg, > .rpc_argp = &arg, > .rpc_resp = &res, > .rpc_cred = nr_arg->cred, > + .rpc_xprt_hint = nfs_fh_hash(arg.fh), > }; > int status = -ENOMEM; > > @@ -735,6 +747,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry > *dentry, struct iattr *sattr, > data->arg.mknod.len = dentry->d_name.len; > data->arg.mknod.sattr = sattr; > data->arg.mknod.rdev = rdev; > + data->msg.rpc_xprt_hint = nfs_fh_hash(data->arg.mknod.fh); > > switch (sattr->ia_mode & S_IFMT) { > case S_IFBLK: > @@ -782,6 +795,7 @@ nfs3_proc_statfs(struct nfs_server *server, > struct nfs_fh *fhandle, > .rpc_proc 
= &nfs3_procedures[NFS3PROC_FSSTAT], > .rpc_argp = fhandle, > .rpc_resp = stat, > + .rpc_xprt_hint = nfs_fh_hash(fhandle), > }; > int status; > > @@ -800,6 +814,7 @@ do_proc_fsinfo(struct rpc_clnt *client, struct > nfs_fh *fhandle, > .rpc_proc = &nfs3_procedures[NFS3PROC_FSINFO], > .rpc_argp = fhandle, > .rpc_resp = info, > + .rpc_xprt_hint = nfs_fh_hash(fhandle), > }; > int status; > > @@ -834,6 +849,7 @@ nfs3_proc_pathconf(struct nfs_server *server, > struct nfs_fh *fhandle, > .rpc_proc = > &nfs3_procedures[NFS3PROC_PATHCONF], > .rpc_argp = fhandle, > .rpc_resp = info, > + .rpc_xprt_hint = nfs_fh_hash(fhandle), > }; > int status; > > diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c > index 78c9c4bde..60578e9fd 100644 > --- a/fs/nfs/pagelist.c > +++ b/fs/nfs/pagelist.c > @@ -762,6 +762,7 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, > struct nfs_pgio_header *hdr, > .rpc_argp = &hdr->args, > .rpc_resp = &hdr->res, > .rpc_cred = cred, > + .rpc_xprt_hint = nfs_fh_hash(hdr->args.fh), > }; > struct rpc_task_setup task_setup_data = { > .rpc_client = clnt, > diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c > index b27ebdcce..b1713d6fb 100644 > --- a/fs/nfs/unlink.c > +++ b/fs/nfs/unlink.c > @@ -355,6 +355,12 @@ nfs_async_rename(struct inode *old_dir, struct > inode *new_dir, > msg.rpc_resp = &data->res; > msg.rpc_cred = data->cred; > > + if (data->args.new_dir) > + msg.rpc_xprt_hint = nfs_fh_hash(data->args.new_dir); > + else > + msg.rpc_xprt_hint = nfs_fh_hash(data->args.old_dir); > + > + > /* set up nfs_renamedata */ > data->old_dir = old_dir; > ihold(old_dir); > diff --git a/fs/nfs/write.c b/fs/nfs/write.c > index 639c34fec..284b364a6 100644 > --- a/fs/nfs/write.c > +++ b/fs/nfs/write.c > @@ -1687,6 +1687,7 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, > struct nfs_commit_data *data, > .rpc_argp = &data->args, > .rpc_resp = &data->res, > .rpc_cred = data->cred, > + .rpc_xprt_hint = nfs_fh_hash(data->args.fh), > }; > struct rpc_task_setup task_setup_data = { > 
.task = &data->task, > diff --git a/include/linux/nfs.h b/include/linux/nfs.h > index 0dc7ad38a..5d5b2d20b 100644 > --- a/include/linux/nfs.h > +++ b/include/linux/nfs.h > @@ -10,6 +10,7 @@ > > #include <linux/sunrpc/msg_prot.h> > #include <linux/string.h> > +#include <linux/jhash.h> > #include <uapi/linux/nfs.h> > > /* > @@ -36,6 +37,10 @@ static inline void nfs_copy_fh(struct nfs_fh > *target, const struct nfs_fh *sourc > memcpy(target->data, source->data, source->size); > } > > +static inline u32 nfs_fh_hash(const struct nfs_fh *fh) > +{ > + return (fh ? jhash(fh->data, fh->size, 0) : 0); > +} > > /* > * This is really a general kernel constant, but since nothing like > diff --git a/include/linux/sunrpc/sched.h > b/include/linux/sunrpc/sched.h > index df696efdd..8f365280c 100644 > --- a/include/linux/sunrpc/sched.h > +++ b/include/linux/sunrpc/sched.h > @@ -27,6 +27,7 @@ struct rpc_message { > void * rpc_argp; /* Arguments */ > void * rpc_resp; /* Result */ > const struct cred * rpc_cred; /* Credentials */ > + u32 rpc_xprt_hint; > }; > > struct rpc_call_ops; > diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c > index 612f0a641..fcf8e7962 100644 > --- a/net/sunrpc/clnt.c > +++ b/net/sunrpc/clnt.c > @@ -1058,6 +1058,53 @@ rpc_task_get_next_xprt(struct rpc_clnt *clnt) > return rpc_task_get_xprt(clnt, xprt_iter_get_next(&clnt- > >cl_xpi)); > } > > +static > +bool xprt_is_active(const struct rpc_xprt *xprt) > +{ > + return kref_read(&xprt->kref) != 0; > +} > + > +static struct rpc_xprt * > +rpc_task_get_hashed_xprt(struct rpc_clnt *clnt, const struct > rpc_task *task) > +{ > + const struct rpc_xprt_switch *xps = NULL; > + struct rpc_xprt *xprt = NULL; > + const struct rpc_message *rpc_message = &task->tk_msg; > + const u32 hash = rpc_message->rpc_xprt_hint; > + > + if (!hash) > + return rpc_task_get_next_xprt(clnt); > + > + rcu_read_lock(); > + xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch); > + > + if (xps && hash) { > + const struct list_head *head = 
&xps->xps_xprt_list; > + struct rpc_xprt *pos; > + const u32 nactive = READ_ONCE(xps->xps_nactive); > + const u32 xprt_idx = (hash % nactive); > + u32 idx = 0; > + > + list_for_each_entry_rcu(pos, head, xprt_switch) { > + if (xprt_idx > idx++) > + continue; > + if (xprt_is_active(pos)) { > + xprt = xprt_get(pos); > + break; > + } > + } > + } > + > + /* > + * Use first transport, if not found any. > + */ > + if (!xprt) > + xprt = xprt_get(rcu_dereference(clnt->cl_xprt)); > + rcu_read_unlock(); > + > + return rpc_task_get_xprt(clnt, xprt); > +} > + > static > void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt > *clnt) > { > @@ -1066,7 +1113,7 @@ void rpc_task_set_transport(struct rpc_task > *task, struct rpc_clnt *clnt) > if (task->tk_flags & RPC_TASK_NO_ROUND_ROBIN) > task->tk_xprt = rpc_task_get_first_xprt(clnt); > else > - task->tk_xprt = rpc_task_get_next_xprt(clnt); > + task->tk_xprt = rpc_task_get_hashed_xprt(clnt, task); > } > > static > @@ -1100,6 +1147,7 @@ rpc_task_set_rpc_message(struct rpc_task *task, > const struct rpc_message *msg) > task->tk_msg.rpc_argp = msg->rpc_argp; > task->tk_msg.rpc_resp = msg->rpc_resp; > task->tk_msg.rpc_cred = msg->rpc_cred; > + task->tk_msg.rpc_xprt_hint = msg->rpc_xprt_hint; > if (!(task->tk_flags & RPC_TASK_CRED_NOREF)) > get_cred(task->tk_msg.rpc_cred); > } > @@ -1130,8 +1178,8 @@ struct rpc_task *rpc_run_task(const struct > rpc_task_setup *task_setup_data) > if (!RPC_IS_ASYNC(task)) > task->tk_flags |= RPC_TASK_CRED_NOREF; > > - rpc_task_set_client(task, task_setup_data->rpc_client); > rpc_task_set_rpc_message(task, task_setup_data->rpc_message); > + rpc_task_set_client(task, task_setup_data->rpc_client); > > if (task->tk_action == NULL) > rpc_call_start(task); This is the wrong approach. It tries to shoehorn everyone into a problem that affects your server only. Worse, it puts all these Jenkins hash calls into what is supposed to be a series of fast paths for I/O. 
If you want customers to use nconnect (which is not a default option), but are not happy with a round-robin policy, then I suggest constructing a pluggable policy engine that does what you want it to do. I designed the code in net/sunrpc/xprtmultipath.c to allow for that possibility, should we want to implement it. -- Trond Myklebust Linux NFS client maintainer, Hammerspace trond.myklebust@xxxxxxxxxxxxxxx