Re: [PATCH v4 5/7] NFSD: Use rhashtable for managing nfs4_file objects

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




> On Oct 19, 2022, at 7:39 AM, Jeff Layton <jlayton@xxxxxxxxxxxxxxx> wrote:
> 
> On Tue, 2022-10-18 at 15:58 -0400, Chuck Lever wrote:
>> fh_match() is costly, especially when filehandles are large (as is
>> the case for NFSv4). It needs to be used sparingly when searching
>> data structures. Unfortunately, with common workloads, I see
>> multiple thousands of objects stored in file_hashtbl[], which always
>> has only 256 buckets, which makes the hash chains quite lengthy.
>> 
>> Walking long hash chains with the state_lock held blocks other
>> activity that needs that lock.
>> 
>> To help mitigate the cost of searching with fh_match(), replace the
>> nfs4_file hash table with an rhashtable, which can dynamically
>> resize its bucket array to minimize hash chain length. The ideal for
>> this use case is one bucket per inode.
>> 
>> The result is an improvement in the latency of NFSv4 operations
>> and the reduction of nfsd CPU utilization due to the cost of
>> fh_match() and the CPU cache misses incurred while walking long
>> hash chains in the nfs4_file hash table.
>> 
>> Note that hash removal is no longer protected by the state_lock.
>> This is because insert_nfs4_file() takes the RCU read lock then the
>> state_lock. rhltable_remove() takes the RCU read lock internally;
>> if remove_nfs4_file() took the state_lock before calling
>> rhltable_remove(), that would result in a lock inversion.
>> 
>> Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx>
>> ---
>> fs/nfsd/nfs4state.c |  215 +++++++++++++++++++++++++++++++++++----------------
>> fs/nfsd/state.h     |    5 -
>> 2 files changed, 147 insertions(+), 73 deletions(-)
>> 
>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
>> index 2b850de288cf..a63334ad61f6 100644
>> --- a/fs/nfsd/nfs4state.c
>> +++ b/fs/nfsd/nfs4state.c
>> @@ -44,7 +44,9 @@
>> #include <linux/jhash.h>
>> #include <linux/string_helpers.h>
>> #include <linux/fsnotify.h>
>> +#include <linux/rhashtable.h>
>> #include <linux/nfs_ssc.h>
>> +
>> #include "xdr4.h"
>> #include "xdr4cb.h"
>> #include "vfs.h"
>> @@ -84,6 +86,7 @@ static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
>> static void nfs4_free_ol_stateid(struct nfs4_stid *stid);
>> void nfsd4_end_grace(struct nfsd_net *nn);
>> static void _free_cpntf_state_locked(struct nfsd_net *nn, struct nfs4_cpntf_state *cps);
>> +static void remove_nfs4_file(struct nfs4_file *fi);
>> 
>> /* Locking: */
>> 
>> @@ -577,11 +580,8 @@ static void nfsd4_free_file_rcu(struct rcu_head *rcu)
>> void
>> put_nfs4_file(struct nfs4_file *fi)
>> {
>> -	might_lock(&state_lock);
>> -
>> -	if (refcount_dec_and_lock(&fi->fi_ref, &state_lock)) {
>> -		hlist_del_rcu(&fi->fi_hash);
>> -		spin_unlock(&state_lock);
>> +	if (refcount_dec_and_test(&fi->fi_ref)) {
>> +		remove_nfs4_file(fi);
>> 		WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate));
>> 		WARN_ON_ONCE(!list_empty(&fi->fi_delegations));
>> 		call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu);
>> @@ -695,19 +695,82 @@ static unsigned int ownerstr_hashval(struct xdr_netobj *ownername)
>> 	return ret & OWNER_HASH_MASK;
>> }
>> 
>> -/* hash table for nfs4_file */
>> -#define FILE_HASH_BITS                   8
>> -#define FILE_HASH_SIZE                  (1 << FILE_HASH_BITS)
>> +static struct rhltable nfs4_file_rhltable ____cacheline_aligned_in_smp;
>> +
>> +/*
>> + * The returned hash value is based solely on the address of an in-code
>> + * inode, a pointer to a slab-allocated object. The entropy in such a
>> + * pointer is concentrated in its middle bits.
>> + */
>> +static u32 nfs4_file_inode_hash(const struct inode *inode, u32 seed)
>> +{
>> +	u32 k;
>> +
>> +	k = ((unsigned long)inode) >> L1_CACHE_SHIFT;
>> +	return jhash2(&k, 1, seed);
>> +}
>> 
>> -static unsigned int file_hashval(struct svc_fh *fh)
>> +/**
>> + * nfs4_file_key_hashfn - Compute the hash value of a lookup key
>> + * @data: key on which to compute the hash value
>> + * @len: rhash table's key_len parameter (unused)
>> + * @seed: rhash table's random seed of the day
>> + *
>> + * Return value:
>> + *   Computed 32-bit hash value
>> + */
>> +static u32 nfs4_file_key_hashfn(const void *data, u32 len, u32 seed)
>> {
>> -	struct inode *inode = d_inode(fh->fh_dentry);
>> +	const struct svc_fh *fhp = data;
>> 
>> -	/* XXX: why not (here & in file cache) use inode? */
>> -	return (unsigned int)hash_long(inode->i_ino, FILE_HASH_BITS);
>> +	return nfs4_file_inode_hash(d_inode(fhp->fh_dentry), seed);
>> }
>> 
>> -static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
>> +/**
>> + * nfs4_file_obj_hashfn - Compute the hash value of an nfs4_file object
>> + * @data: object on which to compute the hash value
>> + * @len: rhash table's key_len parameter (unused)
>> + * @seed: rhash table's random seed of the day
>> + *
>> + * Return value:
>> + *   Computed 32-bit hash value
>> + */
>> +static u32 nfs4_file_obj_hashfn(const void *data, u32 len, u32 seed)
>> +{
>> +	const struct nfs4_file *fi = data;
>> +
>> +	return nfs4_file_inode_hash(fi->fi_inode, seed);
>> +}
>> +
>> +/**
>> + * nfs4_file_obj_cmpfn - Match a cache item against search criteria
>> + * @arg: search criteria
>> + * @ptr: cache item to check
>> + *
>> + * Return values:
>> + *   %0 - Success; item matches search criteria
>> + */
>> +static int nfs4_file_obj_cmpfn(struct rhashtable_compare_arg *arg,
>> +			       const void *ptr)
>> +{
>> +	const struct svc_fh *fhp = arg->key;
>> +	const struct nfs4_file *fi = ptr;
>> +
>> +	return fh_match(&fi->fi_fhandle, &fhp->fh_handle) ? 0 : 1;
>> +}
>> +
>> +static const struct rhashtable_params nfs4_file_rhash_params = {
>> +	.key_len		= sizeof_field(struct nfs4_file, fi_inode),
>> +	.key_offset		= offsetof(struct nfs4_file, fi_inode),
>> +	.head_offset		= offsetof(struct nfs4_file, fi_rhash),
>> +	.hashfn			= nfs4_file_key_hashfn,
>> +	.obj_hashfn		= nfs4_file_obj_hashfn,
>> +	.obj_cmpfn		= nfs4_file_obj_cmpfn,
>> +
>> +	/* Reduce resizing churn on light workloads */
>> +	.min_size		= 512,		/* buckets */
>> +	.automatic_shrinking	= true,
>> +};
>> 
>> /*
>>  * Check if courtesy clients have conflicting access and resolve it if possible
>> @@ -4251,11 +4314,8 @@ static struct nfs4_file *nfsd4_alloc_file(void)
>> }
>> 
>> /* OPEN Share state helper functions */
>> -static void nfsd4_init_file(struct svc_fh *fh, unsigned int hashval,
>> -				struct nfs4_file *fp)
>> +static void init_nfs4_file(const struct svc_fh *fh, struct nfs4_file *fp)
>> {
>> -	lockdep_assert_held(&state_lock);
>> -
>> 	refcount_set(&fp->fi_ref, 1);
>> 	spin_lock_init(&fp->fi_lock);
>> 	INIT_LIST_HEAD(&fp->fi_stateids);
>> @@ -4273,7 +4333,6 @@ static void nfsd4_init_file(struct svc_fh *fh, unsigned int hashval,
>> 	INIT_LIST_HEAD(&fp->fi_lo_states);
>> 	atomic_set(&fp->fi_lo_recalls, 0);
>> #endif
>> -	hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]);
>> }
>> 
>> void
>> @@ -4626,71 +4685,75 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
>> 		nfs4_put_stid(&last->st_stid);
>> }
>> 
>> -/* search file_hashtbl[] for file */
>> -static struct nfs4_file *
>> -find_file_locked(struct svc_fh *fh, unsigned int hashval)
>> +static struct nfs4_file *find_nfs4_file(const struct svc_fh *fhp)
>> {
>> -	struct nfs4_file *fp;
>> +	struct rhlist_head *tmp, *list;
>> +	struct nfs4_file *fi = NULL;
>> 
>> -	hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash,
>> -				lockdep_is_held(&state_lock)) {
>> -		if (fh_match(&fp->fi_fhandle, &fh->fh_handle)) {
>> -			if (refcount_inc_not_zero(&fp->fi_ref))
>> -				return fp;
>> +	list = rhltable_lookup(&nfs4_file_rhltable, fhp,
>> +			       nfs4_file_rhash_params);
>> +	rhl_for_each_entry_rcu(fi, tmp, list, fi_rhash)
>> +		if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) {
>> +			if (!refcount_inc_not_zero(&fi->fi_ref))
>> +				fi = NULL;
>> +			break;
>> 		}
>> -	}
>> -	return NULL;
>> +	return fi;
>> }
>> 
>> -static struct nfs4_file *insert_file(struct nfs4_file *new, struct svc_fh *fh,
>> -				     unsigned int hashval)
>> +/*
>> + * On hash insertion, identify entries with the same inode but
>> + * distinct filehandles. They will all be in the same hash bucket
>> + * because we hash by the address of the nfs4_file's struct inode.
>> + */
>> +static struct nfs4_file *insert_file(struct nfs4_file *new,
>> +				     const struct svc_fh *fhp)
>> {
>> -	struct nfs4_file *fp;
>> -	struct nfs4_file *ret = NULL;
>> -	bool alias_found = false;
>> +	struct rhlist_head *tmp, *list;
>> +	struct nfs4_file *fi;
>> +	int err;
>> 
>> 	spin_lock(&state_lock);
>> -	hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash,
>> -				 lockdep_is_held(&state_lock)) {
>> -		if (fh_match(&fp->fi_fhandle, &fh->fh_handle)) {
>> -			if (refcount_inc_not_zero(&fp->fi_ref))
>> -				ret = fp;
>> -		} else if (d_inode(fh->fh_dentry) == fp->fi_inode)
>> -			fp->fi_aliased = alias_found = true;
>> -	}
>> -	if (likely(ret == NULL)) {
>> -		nfsd4_init_file(fh, hashval, new);
>> -		new->fi_aliased = alias_found;
>> -		ret = new;
>> +
>> +	err = rhltable_insert_key(&nfs4_file_rhltable, fhp,
>> +				  &new->fi_rhash,
>> +				  nfs4_file_rhash_params);
>> +	if (err) {
>> +		spin_unlock(&state_lock);
>> +		return NULL;
>> 	}
>> +
>> +	list = rhltable_lookup(&nfs4_file_rhltable, fhp,
>> +			       nfs4_file_rhash_params);
>> +	rhl_for_each_entry_rcu(fi, tmp, list, fi_rhash)
>> +		if (fi->fi_inode == d_inode(fhp->fh_dentry) &&
>> +		    !fh_match(&fi->fi_fhandle, &fhp->fh_handle)) {
>> +			fi->fi_aliased = true;
>> +			new->fi_aliased = true;
>> +		}
>> +
>> 	spin_unlock(&state_lock);
>> -	return ret;
>> +	return new;
>> }
>> 
>> -static struct nfs4_file * find_file(struct svc_fh *fh)
>> +static noinline struct nfs4_file *
>> +insert_nfs4_file(struct nfs4_file *new, const struct svc_fh *fhp)
>> {
>> -	struct nfs4_file *fp;
>> -	unsigned int hashval = file_hashval(fh);
>> +	struct nfs4_file *fi;
>> 
>> -	rcu_read_lock();
>> -	fp = find_file_locked(fh, hashval);
>> -	rcu_read_unlock();
>> -	return fp;
>> +	/* Do not hash the same filehandle more than once */
>> +	fi = find_nfs4_file(fhp);
>> +	if (unlikely(fi))
>> +		return fi;
>> +
>> +	init_nfs4_file(fhp, new);
>> +	return insert_file(new, fhp);
>> }
>> 
>> -static struct nfs4_file *
>> -find_or_add_file(struct nfs4_file *new, struct svc_fh *fh)
>> +static noinline void remove_nfs4_file(struct nfs4_file *fi)
>> {
>> -	struct nfs4_file *fp;
>> -	unsigned int hashval = file_hashval(fh);
>> -
>> -	rcu_read_lock();
>> -	fp = find_file_locked(fh, hashval);
>> -	rcu_read_unlock();
>> -	if (fp)
>> -		return fp;
>> -
>> -	return insert_file(new, fh, hashval);
>> +	rhltable_remove(&nfs4_file_rhltable, &fi->fi_rhash,
>> +			nfs4_file_rhash_params);
>> }
>> 
>> /*
>> @@ -4703,9 +4766,12 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
>> 	struct nfs4_file *fp;
>> 	__be32 ret = nfs_ok;
>> 
>> -	fp = find_file(current_fh);
>> +	rcu_read_lock();
>> +	fp = find_nfs4_file(current_fh);
>> +	rcu_read_unlock();
>> 	if (!fp)
>> 		return ret;
>> +
>> 	/* Check for conflicting share reservations */
>> 	spin_lock(&fp->fi_lock);
>> 	if (fp->fi_share_deny & deny_type)
>> @@ -5548,7 +5614,11 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
>> 	 * and check for delegations in the process of being recalled.
>> 	 * If not found, create the nfs4_file struct
>> 	 */
>> -	fp = find_or_add_file(open->op_file, current_fh);
>> +	rcu_read_lock();
>> +	fp = insert_nfs4_file(open->op_file, current_fh);
>> +	rcu_read_unlock();
> 
> It'd probably be better to push this rcu_read_lock down into
> insert_nfs4_file. You don't need to hold it over the actual insertion,
> since that requires the state_lock.

I used this arrangement because:

insert_nfs4_file() invokes only find_nfs4_file() and the
insert_file() helper. Both find_nfs4_file() and the
insert_file() helper invoke rhltable_lookup(), which
must be called with the RCU read lock held.

And this is the reason why put_nfs4_file() no longer takes
the state_lock: it would take the state_lock first and
then the RCU read lock (which is implicitly taken in
rhltable_remove()), which results in a lock inversion
relative to insert_nfs4_file(), which takes the RCU read
lock first, then the state_lock.


I'm certainly not an expert, so I'm willing to listen to
alternative approaches. Can we rely on only the RCU read
lock for exclusion on hash insertion?


>> +	if (unlikely(!fp))
>> +		return nfserr_jukebox;
>> 	if (fp != open->op_file) {
>> 		status = nfs4_check_deleg(cl, open, &dp);
>> 		if (status)
>> @@ -7905,10 +7975,16 @@ nfs4_state_start(void)
>> {
>> 	int ret;
>> 
>> -	ret = nfsd4_create_callback_queue();
>> +	ret = rhltable_init(&nfs4_file_rhltable, &nfs4_file_rhash_params);
>> 	if (ret)
>> 		return ret;
>> 
>> +	ret = nfsd4_create_callback_queue();
>> +	if (ret) {
>> +		rhltable_destroy(&nfs4_file_rhltable);
>> +		return ret;
>> +	}
>> +
>> 	set_max_delegations();
>> 	return 0;
>> }
>> @@ -7939,6 +8015,7 @@ nfs4_state_shutdown_net(struct net *net)
>> 
>> 	nfsd4_client_tracking_exit(net);
>> 	nfs4_state_destroy_net(net);
>> +	rhltable_destroy(&nfs4_file_rhltable);
>> #ifdef CONFIG_NFSD_V4_2_INTER_SSC
>> 	nfsd4_ssc_shutdown_umount(nn);
>> #endif
>> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
>> index ae596dbf8667..ad20467c9dee 100644
>> --- a/fs/nfsd/state.h
>> +++ b/fs/nfsd/state.h
>> @@ -536,16 +536,13 @@ struct nfs4_clnt_odstate {
>>  * inode can have multiple filehandles associated with it, so there is
>>  * (potentially) a many to one relationship between this struct and struct
>>  * inode.
>> - *
>> - * These are hashed by filehandle in the file_hashtbl, which is protected by
>> - * the global state_lock spinlock.
>>  */
>> struct nfs4_file {
>> 	refcount_t		fi_ref;
>> 	struct inode *		fi_inode;
>> 	bool			fi_aliased;
>> 	spinlock_t		fi_lock;
>> -	struct hlist_node       fi_hash;	/* hash on fi_fhandle */
>> +	struct rhlist_head	fi_rhash;
>> 	struct list_head        fi_stateids;
>> 	union {
>> 		struct list_head	fi_delegations;
>> 
>> 
> 
> -- 
> Jeff Layton <jlayton@xxxxxxxxxxxxxxx>

--
Chuck Lever







[Index of Archives]     [Linux Filesystem Development]     [Linux USB Development]     [Linux Media Development]     [Video for Linux]     [Linux NILFS]     [Linux Audio Users]     [Yosemite Info]     [Linux SCSI]

  Powered by Linux