On Mon, 2022-10-31 at 12:54 -0400, Jeff Layton wrote: > On Fri, 2022-10-28 at 10:48 -0400, Chuck Lever wrote: > > fh_match() is costly, especially when filehandles are large (as is > > the case for NFSv4). It needs to be used sparingly when searching > > data structures. Unfortunately, with common workloads, I see > > multiple thousands of objects stored in file_hashtbl[], which has > > just 256 buckets, making its bucket hash chains quite lengthy. > > > > Walking long hash chains with the state_lock held blocks other > > activity that needs that lock. Sizable hash chains are a common > > occurrance once the server has handed out some delegations, for > > example -- IIUC, each delegated file is held open on the server by > > an nfs4_file object. > > > > To help mitigate the cost of searching with fh_match(), replace the > > nfs4_file hash table with an rhashtable, which can dynamically > > resize its bucket array to minimize hash chain length. > > > > The result of this modification is an improvement in the latency of > > NFSv4 operations, and the reduction of nfsd CPU utilization due to > > eliminating the cost of multiple calls to fh_match() and reducing > > the CPU cache misses incurred while walking long hash chains in the > > nfs4_file hash table. > > > > Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx> > > Reviewed-by: NeilBrown <neilb@xxxxxxx> > > --- > > fs/nfsd/nfs4state.c | 77 ++++++++++++++++++++++++++------------------------- > > fs/nfsd/state.h | 4 --- > > 2 files changed, 40 insertions(+), 41 deletions(-) > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c > > index c2ef2db9c84c..c78b66e678dd 100644 > > --- a/fs/nfsd/nfs4state.c > > +++ b/fs/nfsd/nfs4state.c > > @@ -591,11 +591,8 @@ static void nfsd4_free_file_rcu(struct rcu_head *rcu) > > void > > put_nfs4_file(struct nfs4_file *fi) > > { > > - might_lock(&state_lock); > > - > > - if (refcount_dec_and_lock(&fi->fi_ref, &state_lock)) { > > + if (refcount_dec_and_test(&fi->fi_ref)) { > > nfsd4_file_hash_remove(fi); > > - spin_unlock(&state_lock); > > WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate)); > > WARN_ON_ONCE(!list_empty(&fi->fi_delegations)); > > call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu); > > @@ -709,20 +706,6 @@ static unsigned int ownerstr_hashval(struct xdr_netobj *ownername) > > return ret & OWNER_HASH_MASK; > > } > > > > -/* hash table for nfs4_file */ > > -#define FILE_HASH_BITS 8 > > -#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) > > - > > -static unsigned int file_hashval(const struct svc_fh *fh) > > -{ > > - struct inode *inode = d_inode(fh->fh_dentry); > > - > > - /* XXX: why not (here & in file cache) use inode? */ > > - return (unsigned int)hash_long(inode->i_ino, FILE_HASH_BITS); > > -} > > - > > -static struct hlist_head file_hashtbl[FILE_HASH_SIZE]; > > - > > static struct rhltable nfs4_file_rhltable ____cacheline_aligned_in_smp; > > > > static const struct rhashtable_params nfs4_file_rhash_params = { > > @@ -4687,12 +4670,14 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net) > > static noinline_for_stack struct nfs4_file * > > nfsd4_file_hash_lookup(const struct svc_fh *fhp) > > { > > - unsigned int hashval = file_hashval(fhp); > > + struct inode *inode = d_inode(fhp->fh_dentry); > > + struct rhlist_head *tmp, *list; > > struct nfs4_file *fi; > > > > rcu_read_lock(); > > - hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash, > > - lockdep_is_held(&state_lock)) { > > + list = rhltable_lookup(&nfs4_file_rhltable, &inode, > > + nfs4_file_rhash_params); > > + rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) { > > if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) { > > if (refcount_inc_not_zero(&fi->fi_ref)) { > > rcu_read_unlock(); > > @@ -4706,40 +4691,56 @@ nfsd4_file_hash_lookup(const struct svc_fh *fhp) > > > > /* > > * On hash insertion, identify entries with the same inode but > > - * distinct filehandles. They will all be in the same hash bucket > > - * because nfs4_file's are hashed by the address in the fi_inode > > - * field. > > + * distinct filehandles. They will all be on the list returned > > + * by rhltable_lookup(). > > + * > > + * inode->i_lock prevents racing insertions from adding an entry > > + * for the same inode/fhp pair twice. > > */ > > static noinline_for_stack struct nfs4_file * > > nfsd4_file_hash_insert(struct nfs4_file *new, const struct svc_fh *fhp) > > { > > - unsigned int hashval = file_hashval(fhp); > > + struct inode *inode = d_inode(fhp->fh_dentry); > > + struct rhlist_head *tmp, *list; > > struct nfs4_file *ret = NULL; > > bool alias_found = false; > > struct nfs4_file *fi; > > + int err; > > > > - spin_lock(&state_lock); > > - hlist_for_each_entry_rcu(fi, &file_hashtbl[hashval], fi_hash, > > - lockdep_is_held(&state_lock)) { > > + rcu_read_lock(); > > + spin_lock(&inode->i_lock); > > + > > + list = rhltable_lookup(&nfs4_file_rhltable, &inode, > > + nfs4_file_rhash_params); > > + rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) { > > if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) { > > if (refcount_inc_not_zero(&fi->fi_ref)) > > ret = fi; > > - } else if (d_inode(fhp->fh_dentry) == fi->fi_inode) > > + } else > > fi->fi_aliased = alias_found = true; > > } > > - if (likely(ret == NULL)) { > > - nfsd4_file_init(fhp, new); > > - hlist_add_head_rcu(&new->fi_hash, &file_hashtbl[hashval]); > > - new->fi_aliased = alias_found; > > - ret = new; > > - } > > - spin_unlock(&state_lock); > > + if (ret) > > + goto out_unlock; > > + > > + nfsd4_file_init(fhp, new); > > + err = rhltable_insert(&nfs4_file_rhltable, &new->fi_rlist, > > + nfs4_file_rhash_params); > > + if (err) > > + goto out_unlock; > > + > > + new->fi_aliased = alias_found; > > + ret = new; > > + > > +out_unlock: > > + spin_unlock(&inode->i_lock); > > + rcu_read_unlock(); > > return ret; > > } > > > > static noinline_for_stack void nfsd4_file_hash_remove(struct nfs4_file *fi) > > { > > - hlist_del_rcu(&fi->fi_hash); > > + rhltable_remove(&nfs4_file_rhltable, &fi->fi_rlist, > > + nfs4_file_rhash_params); > > } > > > > /* > > @@ -5629,6 +5630,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf > > * If not found, create the nfs4_file struct > > */ > > fp = nfsd4_file_hash_insert(open->op_file, current_fh); > > + if (unlikely(!fp)) > > + return nfserr_jukebox; > > if (fp != open->op_file) { > > status = nfs4_check_deleg(cl, open, &dp); > > if (status) > > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h > > index 190fc7e418a4..eadd7f465bf5 100644 > > --- a/fs/nfsd/state.h > > +++ b/fs/nfsd/state.h > > @@ -536,16 +536,12 @@ struct nfs4_clnt_odstate { > > * inode can have multiple filehandles associated with it, so there is > > * (potentially) a many to one relationship between this struct and struct > > * inode. > > - * > > - * These are hashed by filehandle in the file_hashtbl, which is protected by > > - * the global state_lock spinlock. > > */ > > struct nfs4_file { > > refcount_t fi_ref; > > struct inode * fi_inode; > > bool fi_aliased; > > spinlock_t fi_lock; > > - struct hlist_node fi_hash; /* hash on fi_fhandle */ > > struct rhlist_head fi_rlist; > > struct list_head fi_stateids; > > union { > > > > > > You can add this to 13 and 14: > > Reviewed-by: Jeff Layton <jlayton@xxxxxxxxxx> > > That said, I'd probably prefer to see the two squashed together, since > you're adding unused infrastructure in 13. > Erm, maybe make that my kernel.org address instead? I try to use that for upstream work. -- Jeff Layton <jlayton@xxxxxxxxxx>