Re: [RFC] nconnect xprt stickiness for a file

On Thu, 2021-03-18 at 01:56 +0000, Nagendra Tomar wrote:
> We have a clustered NFS server behind an L4 load-balancer, with the
> following characteristics (relevant to this discussion):
> 
> 1. RPC requests for the same file issued to different cluster nodes
>     are not efficient; serving one file from one cluster node is
>     efficient. This is particularly true for WRITEs.
> 2. Multiple nconnect xprts land on different cluster nodes because
>     each connection has a different source port.
> 
> Due to this, the default nconnect round-robin policy does not work
> very well, as it results in RPCs targeting the same file being
> serviced by different cluster nodes.
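> For example, with "-o nconnect=8" the client opens eight TCP
> connections, each from a different source port, so the load-balancer
> typically spreads them across different cluster nodes.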
> 
> To solve this, we tweaked the NFS multipath code to always choose the
> same xprt for the same file. We do that by adding a new integer field
> to rpc_message, rpc_xprt_hint, which is set by the NFS layer and used
> by the RPC layer to pick an xprt. The NFS layer sets it to a hash of
> the target file's filehandle, thus ensuring that requests for the
> same file always use the same xprt. This works well.
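> 
> In essence the whole change boils down to two steps (the following is
> just a condensed restatement of the helpers in the patch below):
> 
>     /* NFS layer: derive a stable per-file hint from the filehandle */
>     msg->rpc_xprt_hint = fh ? jhash(fh->data, fh->size, 0) : 0;
> 
>     /* RPC layer: map the hint onto one of the active nconnect xprts */
>     xprt_idx = msg->rpc_xprt_hint % READ_ONCE(xps->xps_nactive);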
> 
> I am interested in your thoughts on this: has anyone else come across
> a similar issue, and is there another way of solving it?
> 
> The following patch is just to demonstrate the idea. It works, but
> covers only what I needed for the experiment. Based on the feedback,
> I can follow it up with a formal patch if needed.
> 
> Thanks,
> Tomar
> 
> ---
> diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
> index 5c4e23abc..8f1cf03dc 100644
> --- a/fs/nfs/nfs3proc.c
> +++ b/fs/nfs/nfs3proc.c
> @@ -108,6 +108,7 @@ nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
>                 .rpc_proc       = &nfs3_procedures[NFS3PROC_GETATTR],
>                 .rpc_argp       = fhandle,
>                 .rpc_resp       = fattr,
> +               .rpc_xprt_hint  = nfs_fh_hash(fhandle),
>         };
>         int     status;
>         unsigned short task_flags = 0;
> @@ -136,6 +137,7 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
>                 .rpc_proc       = &nfs3_procedures[NFS3PROC_SETATTR],
>                 .rpc_argp       = &arg,
>                 .rpc_resp       = fattr,
> +               .rpc_xprt_hint  = nfs_fh_hash(arg.fh),
>         };
>         int     status;
>  
> @@ -171,6 +173,7 @@ __nfs3_proc_lookup(struct inode *dir, const char *name, size_t len,
>                 .rpc_proc       = &nfs3_procedures[NFS3PROC_LOOKUP],
>                 .rpc_argp       = &arg,
>                 .rpc_resp       = &res,
> +               .rpc_xprt_hint  = nfs_fh_hash(arg.fh),
>         };
>         int                     status;
>  
> @@ -235,6 +238,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
>                 .rpc_argp       = &arg,
>                 .rpc_resp       = &res,
>                 .rpc_cred       = entry->cred,
> +               .rpc_xprt_hint  = nfs_fh_hash(arg.fh),
>         };
>         int status = -ENOMEM;
>  
> @@ -266,6 +270,7 @@ static int nfs3_proc_readlink(struct inode *inode, struct page *page,
>         struct rpc_message msg = {
>                 .rpc_proc       = &nfs3_procedures[NFS3PROC_READLINK],
>                 .rpc_argp       = &args,
> +               .rpc_xprt_hint  = nfs_fh_hash(args.fh),
>         };
>         int status = -ENOMEM;
>  
> @@ -355,6 +360,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
>         data->arg.create.name = dentry->d_name.name;
>         data->arg.create.len = dentry->d_name.len;
>         data->arg.create.sattr = sattr;
> +       data->msg.rpc_xprt_hint = nfs_fh_hash(data->arg.create.fh);
>  
>         data->arg.create.createmode = NFS3_CREATE_UNCHECKED;
>         if (flags & O_EXCL) {
> @@ -442,6 +448,7 @@ nfs3_proc_remove(struct inode *dir, struct dentry *dentry)
>                 .rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE],
>                 .rpc_argp = &arg,
>                 .rpc_resp = &res,
> +               .rpc_xprt_hint = nfs_fh_hash(arg.fh),
>         };
>         int status = -ENOMEM;
>  
> @@ -524,6 +531,7 @@ nfs3_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name)
>                 .rpc_proc       = &nfs3_procedures[NFS3PROC_LINK],
>                 .rpc_argp       = &arg,
>                 .rpc_resp       = &res,
> +               .rpc_xprt_hint  = nfs_fh_hash(arg.tofh),
>         };
>         int status = -ENOMEM;
>  
> @@ -566,6 +574,7 @@ nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
>         data->arg.symlink.pages = &page;
>         data->arg.symlink.pathlen = len;
>         data->arg.symlink.sattr = sattr;
> +       data->msg.rpc_xprt_hint = nfs_fh_hash(data->arg.symlink.fromfh);
>  
>         d_alias = nfs3_do_create(dir, dentry, data);
>         status = PTR_ERR_OR_ZERO(d_alias);
> @@ -602,6 +611,7 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
>         data->arg.mkdir.name = dentry->d_name.name;
>         data->arg.mkdir.len = dentry->d_name.len;
>         data->arg.mkdir.sattr = sattr;
> +       data->msg.rpc_xprt_hint = nfs_fh_hash(data->arg.mkdir.fh);
>  
>         d_alias = nfs3_do_create(dir, dentry, data);
>         status = PTR_ERR_OR_ZERO(d_alias);
> @@ -636,6 +646,7 @@ nfs3_proc_rmdir(struct inode *dir, const struct qstr *name)
>         struct rpc_message msg = {
>                 .rpc_proc       = &nfs3_procedures[NFS3PROC_RMDIR],
>                 .rpc_argp       = &arg,
> +               .rpc_xprt_hint  = nfs_fh_hash(arg.fh),
>         };
>         int status = -ENOMEM;
>  
> @@ -682,6 +693,7 @@ static int nfs3_proc_readdir(struct nfs_readdir_arg *nr_arg,
>                 .rpc_argp       = &arg,
>                 .rpc_resp       = &res,
>                 .rpc_cred       = nr_arg->cred,
> +               .rpc_xprt_hint  = nfs_fh_hash(arg.fh),
>         };
>         int status = -ENOMEM;
>  
> @@ -735,6 +747,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
>         data->arg.mknod.len = dentry->d_name.len;
>         data->arg.mknod.sattr = sattr;
>         data->arg.mknod.rdev = rdev;
> +       data->msg.rpc_xprt_hint = nfs_fh_hash(data->arg.mknod.fh);
>  
>         switch (sattr->ia_mode & S_IFMT) {
>         case S_IFBLK:
> @@ -782,6 +795,7 @@ nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
>                 .rpc_proc       = &nfs3_procedures[NFS3PROC_FSSTAT],
>                 .rpc_argp       = fhandle,
>                 .rpc_resp       = stat,
> +               .rpc_xprt_hint  = nfs_fh_hash(fhandle),
>         };
>         int     status;
>  
> @@ -800,6 +814,7 @@ do_proc_fsinfo(struct rpc_clnt *client, struct nfs_fh *fhandle,
>                 .rpc_proc       = &nfs3_procedures[NFS3PROC_FSINFO],
>                 .rpc_argp       = fhandle,
>                 .rpc_resp       = info,
> +               .rpc_xprt_hint  = nfs_fh_hash(fhandle),
>         };
>         int     status;
>  
> @@ -834,6 +849,7 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
>                 .rpc_proc       = &nfs3_procedures[NFS3PROC_PATHCONF],
>                 .rpc_argp       = fhandle,
>                 .rpc_resp       = info,
> +               .rpc_xprt_hint  = nfs_fh_hash(fhandle),
>         };
>         int     status;
>  
> diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
> index 78c9c4bde..60578e9fd 100644
> --- a/fs/nfs/pagelist.c
> +++ b/fs/nfs/pagelist.c
> @@ -762,6 +762,7 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
>                 .rpc_argp = &hdr->args,
>                 .rpc_resp = &hdr->res,
>                 .rpc_cred = cred,
> +               .rpc_xprt_hint = nfs_fh_hash(hdr->args.fh),
>         };
>         struct rpc_task_setup task_setup_data = {
>                 .rpc_client = clnt,
> diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
> index b27ebdcce..b1713d6fb 100644
> --- a/fs/nfs/unlink.c
> +++ b/fs/nfs/unlink.c
> @@ -355,6 +355,12 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
>         msg.rpc_resp = &data->res;
>         msg.rpc_cred = data->cred;
>  
> +       if (data->args.new_dir)
> +               msg.rpc_xprt_hint = nfs_fh_hash(data->args.new_dir);
> +       else
> +               msg.rpc_xprt_hint = nfs_fh_hash(data->args.old_dir);
> +
> +
>         /* set up nfs_renamedata */
>         data->old_dir = old_dir;
>         ihold(old_dir);
> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> index 639c34fec..284b364a6 100644
> --- a/fs/nfs/write.c
> +++ b/fs/nfs/write.c
> @@ -1687,6 +1687,7 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
>                 .rpc_argp = &data->args,
>                 .rpc_resp = &data->res,
>                 .rpc_cred = data->cred,
> +               .rpc_xprt_hint = nfs_fh_hash(data->args.fh),
>         };
>         struct rpc_task_setup task_setup_data = {
>                 .task = &data->task,
> diff --git a/include/linux/nfs.h b/include/linux/nfs.h
> index 0dc7ad38a..5d5b2d20b 100644
> --- a/include/linux/nfs.h
> +++ b/include/linux/nfs.h
> @@ -10,6 +10,7 @@
>  
>  #include <linux/sunrpc/msg_prot.h>
>  #include <linux/string.h>
> +#include <linux/jhash.h>
>  #include <uapi/linux/nfs.h>
>  
>  /*
> @@ -36,6 +37,10 @@ static inline void nfs_copy_fh(struct nfs_fh *target, const struct nfs_fh *sourc
>         memcpy(target->data, source->data, source->size);
>  }
>  
> +static inline u32 nfs_fh_hash(const struct nfs_fh *fh)
> +{
> +       return (fh ? jhash(fh->data, fh->size, 0) : 0);
> +}
>  
>  /*
>   * This is really a general kernel constant, but since nothing like
> diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
> index df696efdd..8f365280c 100644
> --- a/include/linux/sunrpc/sched.h
> +++ b/include/linux/sunrpc/sched.h
> @@ -27,6 +27,7 @@ struct rpc_message {
>         void *                  rpc_argp;       /* Arguments */
>         void *                  rpc_resp;       /* Result */
>         const struct cred *     rpc_cred;       /* Credentials */
> +       u32                     rpc_xprt_hint;
>  };
>  
>  struct rpc_call_ops;
> diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
> index 612f0a641..fcf8e7962 100644
> --- a/net/sunrpc/clnt.c
> +++ b/net/sunrpc/clnt.c
> @@ -1058,6 +1058,53 @@ rpc_task_get_next_xprt(struct rpc_clnt *clnt)
>         return rpc_task_get_xprt(clnt, xprt_iter_get_next(&clnt->cl_xpi));
>  }
>  
> +static
> +bool xprt_is_active(const struct rpc_xprt *xprt)
> +{
> +       return kref_read(&xprt->kref) != 0;
> +}
> +
> +static struct rpc_xprt *
> +rpc_task_get_hashed_xprt(struct rpc_clnt *clnt, const struct rpc_task *task)
> +{
> +       const struct rpc_xprt_switch *xps = NULL;
> +       struct rpc_xprt *xprt = NULL;
> +       const struct rpc_message *rpc_message = &task->tk_msg;
> +       const u32 hash = rpc_message->rpc_xprt_hint;
> +
> +       if (!hash)
> +               return rpc_task_get_next_xprt(clnt);
> +
> +       rcu_read_lock();
> +       xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch);
> +
> +       if (xps && hash) {
> +               const struct list_head *head = &xps->xps_xprt_list;
> +               struct rpc_xprt *pos;
> +               const u32 nactive = READ_ONCE(xps->xps_nactive);
> +               const u32 xprt_idx = (hash % nactive);
> +               u32 idx = 0;
> +
> +               list_for_each_entry_rcu(pos, head, xprt_switch) {
> +                       if (xprt_idx > idx++)
> +                               continue;
> +                       if (xprt_is_active(pos)) {
> +                               xprt = xprt_get(pos);
> +                               break;
> +                       }
> +               }
> +       }
> +
> +       /*
> +        * Use the first transport if none was found.
> +        */
> +       if (!xprt)
> +               xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
> +       rcu_read_unlock();
> +
> +       return rpc_task_get_xprt(clnt, xprt);
> +}
> +
>  static
>  void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt)
>  {
> @@ -1066,7 +1113,7 @@ void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt)
>         if (task->tk_flags & RPC_TASK_NO_ROUND_ROBIN)
>                 task->tk_xprt = rpc_task_get_first_xprt(clnt);
>         else
> -               task->tk_xprt = rpc_task_get_next_xprt(clnt);
> +               task->tk_xprt = rpc_task_get_hashed_xprt(clnt, task);
>  }
>  
>  static
> @@ -1100,6 +1147,7 @@ rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg)
>                 task->tk_msg.rpc_argp = msg->rpc_argp;
>                 task->tk_msg.rpc_resp = msg->rpc_resp;
>                 task->tk_msg.rpc_cred = msg->rpc_cred;
> +               task->tk_msg.rpc_xprt_hint = msg->rpc_xprt_hint;
>                 if (!(task->tk_flags & RPC_TASK_CRED_NOREF))
>                         get_cred(task->tk_msg.rpc_cred);
>         }
> @@ -1130,8 +1178,8 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data)
>         if (!RPC_IS_ASYNC(task))
>                 task->tk_flags |= RPC_TASK_CRED_NOREF;
>  
> -       rpc_task_set_client(task, task_setup_data->rpc_client);
>         rpc_task_set_rpc_message(task, task_setup_data->rpc_message);
> +       rpc_task_set_client(task, task_setup_data->rpc_client);
>  
>         if (task->tk_action == NULL)
>                 rpc_call_start(task);


This is the wrong approach. It shoehorns everyone into a fix for a
problem that affects only your server. Worse, it puts all these Jenkins
hash calls into what are supposed to be fast paths for I/O.

If you want customers to use nconnect (which is not a default option),
but are not happy with a round robin policy, then I suggest
constructing a pluggable policy engine that does what you want it to
do. I designed the code in net/sunrpc/xprtmultipath.c to allow for that
possibility, should we want to implement it.
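
To be clear, no such policy engine exists today. Purely as an
illustration of the shape such a hook could take (none of these
identifiers exist in the kernel; a real implementation would plug into
the iterator machinery in net/sunrpc/xprtmultipath.c):

    /* Hypothetical per-xprt-switch selection policy -- sketch only. */
    struct rpc_xprt_select_policy {
            const char *name;
            /*
             * Return the transport to use for this task, or NULL to
             * fall back to the default round-robin iterator.
             */
            struct rpc_xprt *(*select)(struct rpc_xprt_switch *xps,
                                       const struct rpc_task *task);
    };

A file-affinity policy along the lines of this RFC would then do its
filehandle hashing inside ->select(), instead of adding a hash
computation to every rpc_message in the generic fast path.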

-- 
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trond.myklebust@xxxxxxxxxxxxxxx