We have a clustered NFS server behind a L4 load-balancer with the following Characteristics (relevant to this discussion): 1. RPC requests for the same file issued to different cluster nodes are not efficient. One file one cluster node is efficient. This is particularly true for WRITEs. 2. Multiple nconnect xprts land on different cluster nodes due to the source port being different for all. Due to this, the default nconnect roundrobin policy does not work very well as it results in RPCs targeted to the same file to be serviced by different cluster nodes. To solve this, we tweaked the nfs multipath code to always choose the same xprt for the same file. We do that by adding a new integer field to rpc_message, rpc_xprt_hint, which is set by NFS layer and used by RPC layer to pick a xprt. NFS layer sets it to the hash of the target file's filehandle, thus ensuring same file requests always use the same xprt. This works well. I am interested in knowing your thoughts on this, has anyone else also come across similar issue, is there any other way of solving this, etc. Following patch is just to demonstrate the idea. It works but covers only what I needed for the experiment. Based on the feedback, I can follow it up with a formal patch if needed. Thanks, Tomar --- diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 5c4e23abc..8f1cf03dc 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -108,6 +108,7 @@ nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR], .rpc_argp = fhandle, .rpc_resp = fattr, + .rpc_xprt_hint = nfs_fh_hash(fhandle), }; int status; unsigned short task_flags = 0; @@ -136,6 +137,7 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, .rpc_proc = &nfs3_procedures[NFS3PROC_SETATTR], .rpc_argp = &arg, .rpc_resp = fattr, + .rpc_xprt_hint = nfs_fh_hash(arg.fh), }; int status; @@ -171,6 +173,7 @@ __nfs3_proc_lookup(struct inode *dir, const char *name, size_t len, .rpc_proc = &nfs3_procedures[NFS3PROC_LOOKUP], .rpc_argp = &arg, .rpc_resp = &res, + .rpc_xprt_hint = nfs_fh_hash(arg.fh), }; int status; @@ -235,6 +238,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) .rpc_argp = &arg, .rpc_resp = &res, .rpc_cred = entry->cred, + .rpc_xprt_hint = nfs_fh_hash(arg.fh), }; int status = -ENOMEM; @@ -266,6 +270,7 @@ static int nfs3_proc_readlink(struct inode *inode, struct page *page, struct rpc_message msg = { .rpc_proc = &nfs3_procedures[NFS3PROC_READLINK], .rpc_argp = &args, + .rpc_xprt_hint = nfs_fh_hash(args.fh), }; int status = -ENOMEM; @@ -355,6 +360,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, data->arg.create.name = dentry->d_name.name; data->arg.create.len = dentry->d_name.len; data->arg.create.sattr = sattr; + data->msg.rpc_xprt_hint = nfs_fh_hash(data->arg.create.fh); data->arg.create.createmode = NFS3_CREATE_UNCHECKED; if (flags & O_EXCL) { @@ -442,6 +448,7 @@ nfs3_proc_remove(struct inode *dir, struct dentry *dentry) .rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE], .rpc_argp = &arg, .rpc_resp = &res, + .rpc_xprt_hint = nfs_fh_hash(arg.fh), }; int status = -ENOMEM; @@ -524,6 +531,7 @@ nfs3_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name) .rpc_proc = &nfs3_procedures[NFS3PROC_LINK], .rpc_argp = &arg, .rpc_resp = &res, + .rpc_xprt_hint = nfs_fh_hash(arg.tofh), }; int status = -ENOMEM; @@ -566,6 +574,7 @@ nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, data->arg.symlink.pages = &page; data->arg.symlink.pathlen = len; data->arg.symlink.sattr = sattr; + data->msg.rpc_xprt_hint = nfs_fh_hash(data->arg.symlink.fromfh); d_alias = nfs3_do_create(dir, dentry, data); status = PTR_ERR_OR_ZERO(d_alias); @@ -602,6 +611,7 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) data->arg.mkdir.name = dentry->d_name.name; data->arg.mkdir.len = dentry->d_name.len; data->arg.mkdir.sattr = sattr; + data->msg.rpc_xprt_hint = nfs_fh_hash(data->arg.mkdir.fh); d_alias = nfs3_do_create(dir, dentry, data); status = PTR_ERR_OR_ZERO(d_alias); @@ -636,6 +646,7 @@ nfs3_proc_rmdir(struct inode *dir, const struct qstr *name) struct rpc_message msg = { .rpc_proc = &nfs3_procedures[NFS3PROC_RMDIR], .rpc_argp = &arg, + .rpc_xprt_hint = nfs_fh_hash(arg.fh), }; int status = -ENOMEM; @@ -682,6 +693,7 @@ static int nfs3_proc_readdir(struct nfs_readdir_arg *nr_arg, .rpc_argp = &arg, .rpc_resp = &res, .rpc_cred = nr_arg->cred, + .rpc_xprt_hint = nfs_fh_hash(arg.fh), }; int status = -ENOMEM; @@ -735,6 +747,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, data->arg.mknod.len = dentry->d_name.len; data->arg.mknod.sattr = sattr; data->arg.mknod.rdev = rdev; + data->msg.rpc_xprt_hint = nfs_fh_hash(data->arg.mknod.fh); switch (sattr->ia_mode & S_IFMT) { case S_IFBLK: @@ -782,6 +795,7 @@ nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_proc = &nfs3_procedures[NFS3PROC_FSSTAT], .rpc_argp = fhandle, .rpc_resp = stat, + .rpc_xprt_hint = nfs_fh_hash(fhandle), }; int status; @@ -800,6 +814,7 @@ do_proc_fsinfo(struct rpc_clnt *client, struct nfs_fh *fhandle, .rpc_proc = &nfs3_procedures[NFS3PROC_FSINFO], .rpc_argp = fhandle, .rpc_resp = info, + .rpc_xprt_hint = nfs_fh_hash(fhandle), }; int status; @@ -834,6 +849,7 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_proc = &nfs3_procedures[NFS3PROC_PATHCONF], .rpc_argp = fhandle, .rpc_resp = info, + .rpc_xprt_hint = nfs_fh_hash(fhandle), }; int status; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 78c9c4bde..60578e9fd 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -762,6 +762,7 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, .rpc_argp = &hdr->args, .rpc_resp = &hdr->res, .rpc_cred = cred, + .rpc_xprt_hint = nfs_fh_hash(hdr->args.fh), }; struct rpc_task_setup task_setup_data = { .rpc_client = clnt, diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index b27ebdcce..b1713d6fb 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -355,6 +355,12 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir, msg.rpc_resp = &data->res; msg.rpc_cred = data->cred; + if (data->args.new_dir) + msg.rpc_xprt_hint = nfs_fh_hash(data->args.new_dir); + else + msg.rpc_xprt_hint = nfs_fh_hash(data->args.old_dir); + + /* set up nfs_renamedata */ data->old_dir = old_dir; ihold(old_dir); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 639c34fec..284b364a6 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1687,6 +1687,7 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, .rpc_argp = &data->args, .rpc_resp = &data->res, .rpc_cred = data->cred, + .rpc_xprt_hint = nfs_fh_hash(data->args.fh), }; struct rpc_task_setup task_setup_data = { .task = &data->task, diff --git a/include/linux/nfs.h b/include/linux/nfs.h index 0dc7ad38a..5d5b2d20b 100644 --- a/include/linux/nfs.h +++ b/include/linux/nfs.h @@ -10,6 +10,7 @@ #include <linux/sunrpc/msg_prot.h> #include <linux/string.h> +#include <linux/jhash.h> #include <uapi/linux/nfs.h> /* @@ -36,6 +37,10 @@ static inline void nfs_copy_fh(struct nfs_fh *target, const struct nfs_fh *sourc memcpy(target->data, source->data, source->size); } +static inline u32 nfs_fh_hash(const struct nfs_fh *fh) +{ + return (fh ? jhash(fh->data, fh->size, 0) : 0); +} /* * This is really a general kernel constant, but since nothing like diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index df696efdd..8f365280c 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -27,6 +27,7 @@ struct rpc_message { void * rpc_argp; /* Arguments */ void * rpc_resp; /* Result */ const struct cred * rpc_cred; /* Credentials */ + u32 rpc_xprt_hint; }; struct rpc_call_ops; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 612f0a641..fcf8e7962 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1058,6 +1058,53 @@ rpc_task_get_next_xprt(struct rpc_clnt *clnt) return rpc_task_get_xprt(clnt, xprt_iter_get_next(&clnt->cl_xpi)); } +static +bool xprt_is_active(const struct rpc_xprt *xprt) +{ + return kref_read(&xprt->kref) != 0; +} + +static struct rpc_xprt * +rpc_task_get_hashed_xprt(struct rpc_clnt *clnt, const struct rpc_task *task) +{ + const struct rpc_xprt_switch *xps = NULL; + struct rpc_xprt *xprt = NULL; + const struct rpc_message *rpc_message = &task->tk_msg; + const u32 hash = rpc_message->rpc_xprt_hint; + + if (!hash) + return rpc_task_get_next_xprt(clnt); + + rcu_read_lock(); + xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch); + + if (xps && hash) { + const struct list_head *head = &xps->xps_xprt_list; + struct rpc_xprt *pos; + const u32 nactive = READ_ONCE(xps->xps_nactive); + const u32 xprt_idx = (hash % nactive); + u32 idx = 0; + + list_for_each_entry_rcu(pos, head, xprt_switch) { + if (xprt_idx > idx++) + continue; + if (xprt_is_active(pos)) { + xprt = xprt_get(pos); + break; + } + } + } + + /* + * Use first transport, if not found any. + */ + if (!xprt) + xprt = xprt_get(rcu_dereference(clnt->cl_xprt)); + rcu_read_unlock(); + + return rpc_task_get_xprt(clnt, xprt); +} + static void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt) { @@ -1066,7 +1113,7 @@ void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt) if (task->tk_flags & RPC_TASK_NO_ROUND_ROBIN) task->tk_xprt = rpc_task_get_first_xprt(clnt); else - task->tk_xprt = rpc_task_get_next_xprt(clnt); + task->tk_xprt = rpc_task_get_hashed_xprt(clnt, task); } static @@ -1100,6 +1147,7 @@ rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg) task->tk_msg.rpc_argp = msg->rpc_argp; task->tk_msg.rpc_resp = msg->rpc_resp; task->tk_msg.rpc_cred = msg->rpc_cred; + task->tk_msg.rpc_xprt_hint = msg->rpc_xprt_hint; if (!(task->tk_flags & RPC_TASK_CRED_NOREF)) get_cred(task->tk_msg.rpc_cred); } @@ -1130,8 +1178,8 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data) if (!RPC_IS_ASYNC(task)) task->tk_flags |= RPC_TASK_CRED_NOREF; - rpc_task_set_client(task, task_setup_data->rpc_client); rpc_task_set_rpc_message(task, task_setup_data->rpc_message); + rpc_task_set_client(task, task_setup_data->rpc_client); if (task->tk_action == NULL) rpc_call_start(task);