On Thu, 2023-01-05 at 09:58 -0500, Chuck Lever wrote: > From: Chuck Lever <chuck.lever@xxxxxxxxxx> > > While we were converting the nfs4_file hashtable to use the kernel's > resizable hashtable data structure, Neil Brown observed that the > list variant (rhltable) would be better for managing nfsd_file items > as well. The nfsd_file hash table will contain multiple entries for > the same inode -- these should be kept together on a list. And, it > could be possible for exotic or malicious client behavior to cause > the hash table to resize itself on every insertion. > > A nice simplification is that rhltable_lookup() can return a list > that contains only nfsd_file items that match a given inode, which > enables us to eliminate specialized hash table helper functions and > use the default functions provided by the rhashtable implementation). > > Since we are now storing nfsd_file items for the same inode on a > single list, that effectively reduces the number of hash entries > that have to be tracked in the hash table. The mininum bucket count > is therefore lowered. > > Suggested-by: Neil Brown <neilb@xxxxxxx> > Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx> > --- > fs/nfsd/filecache.c | 289 +++++++++++++++++++-------------------------------- > fs/nfsd/filecache.h | 9 +- > 2 files changed, 115 insertions(+), 183 deletions(-) > > Just to note that this work is still in the wings. It would need to > be rebased on Jeff's recent fixes and clean-ups. I'm reluctant to > apply this until there is more evidence that the instability in v6.0 > has been duly addressed. > > > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c > index 45b2c9e3f636..f04e722bb6bc 100644 > --- a/fs/nfsd/filecache.c > +++ b/fs/nfsd/filecache.c > @@ -74,70 +74,9 @@ static struct list_lru nfsd_file_lru; > static unsigned long nfsd_file_flags; > static struct fsnotify_group *nfsd_file_fsnotify_group; > static struct delayed_work nfsd_filecache_laundrette; > -static struct rhashtable nfsd_file_rhash_tbl > +static struct rhltable nfsd_file_rhltable > ____cacheline_aligned_in_smp; > > -enum nfsd_file_lookup_type { > - NFSD_FILE_KEY_INODE, > - NFSD_FILE_KEY_FULL, > -}; > - > -struct nfsd_file_lookup_key { > - struct inode *inode; > - struct net *net; > - const struct cred *cred; > - unsigned char need; > - bool gc; > - enum nfsd_file_lookup_type type; > -}; > - > -/* > - * The returned hash value is based solely on the address of an in-code > - * inode, a pointer to a slab-allocated object. The entropy in such a > - * pointer is concentrated in its middle bits. > - */ > -static u32 nfsd_file_inode_hash(const struct inode *inode, u32 seed) > -{ > - unsigned long ptr = (unsigned long)inode; > - u32 k; > - > - k = ptr >> L1_CACHE_SHIFT; > - k &= 0x00ffffff; > - return jhash2(&k, 1, seed); > -} > - > -/** > - * nfsd_file_key_hashfn - Compute the hash value of a lookup key > - * @data: key on which to compute the hash value > - * @len: rhash table's key_len parameter (unused) > - * @seed: rhash table's random seed of the day > - * > - * Return value: > - * Computed 32-bit hash value > - */ > -static u32 nfsd_file_key_hashfn(const void *data, u32 len, u32 seed) > -{ > - const struct nfsd_file_lookup_key *key = data; > - > - return nfsd_file_inode_hash(key->inode, seed); > -} > - > -/** > - * nfsd_file_obj_hashfn - Compute the hash value of an nfsd_file > - * @data: object on which to compute the hash value > - * @len: rhash table's key_len parameter (unused) > - * @seed: rhash table's random seed of the day > - * > - * Return value: > - * Computed 32-bit hash value > - */ > -static u32 nfsd_file_obj_hashfn(const void *data, u32 len, u32 seed) > -{ > - const struct nfsd_file *nf = data; > - > - return nfsd_file_inode_hash(nf->nf_inode, seed); > -} > - > static bool > nfsd_match_cred(const struct cred *c1, const struct cred *c2) > { > @@ -158,53 +97,16 @@ nfsd_match_cred(const struct cred *c1, const struct cred *c2) > return true; > } > > -/** > - * nfsd_file_obj_cmpfn - Match a cache item against search criteria > - * @arg: search criteria > - * @ptr: cache item to check > - * > - * Return values: > - * %0 - Item matches search criteria > - * %1 - Item does not match search criteria > - */ > -static int nfsd_file_obj_cmpfn(struct rhashtable_compare_arg *arg, > - const void *ptr) > -{ > - const struct nfsd_file_lookup_key *key = arg->key; > - const struct nfsd_file *nf = ptr; > - > - switch (key->type) { > - case NFSD_FILE_KEY_INODE: > - if (nf->nf_inode != key->inode) > - return 1; > - break; > - case NFSD_FILE_KEY_FULL: > - if (nf->nf_inode != key->inode) > - return 1; > - if (nf->nf_may != key->need) > - return 1; > - if (nf->nf_net != key->net) > - return 1; > - if (!nfsd_match_cred(nf->nf_cred, key->cred)) > - return 1; > - if (!!test_bit(NFSD_FILE_GC, &nf->nf_flags) != key->gc) > - return 1; > - if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) > - return 1; > - break; > - } > - return 0; > -} > - > static const struct rhashtable_params nfsd_file_rhash_params = { > .key_len = sizeof_field(struct nfsd_file, nf_inode), > .key_offset = offsetof(struct nfsd_file, nf_inode), > - .head_offset = offsetof(struct nfsd_file, nf_rhash), > - .hashfn = nfsd_file_key_hashfn, > - .obj_hashfn = nfsd_file_obj_hashfn, > - .obj_cmpfn = nfsd_file_obj_cmpfn, > - /* Reduce resizing churn on light workloads */ > - .min_size = 512, /* buckets */ > + .head_offset = offsetof(struct nfsd_file, nf_rlist), > + > + /* > + * Start with a single page hash table to reduce resizing churn > + * on light workloads. > + */ > + .min_size = 256, > .automatic_shrinking = true, > }; > > @@ -307,27 +209,27 @@ nfsd_file_mark_find_or_create(struct nfsd_file *nf, struct inode *inode) > } > > static struct nfsd_file * > -nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may) > +nfsd_file_alloc(struct net *net, struct inode *inode, unsigned char need, > + bool want_gc) > { > struct nfsd_file *nf; > > nf = kmem_cache_alloc(nfsd_file_slab, GFP_KERNEL); > - if (nf) { > - INIT_LIST_HEAD(&nf->nf_lru); > - nf->nf_birthtime = ktime_get(); > - nf->nf_file = NULL; > - nf->nf_cred = get_current_cred(); > - nf->nf_net = key->net; > - nf->nf_flags = 0; > - __set_bit(NFSD_FILE_HASHED, &nf->nf_flags); > - __set_bit(NFSD_FILE_PENDING, &nf->nf_flags); > - if (key->gc) > - __set_bit(NFSD_FILE_GC, &nf->nf_flags); > - nf->nf_inode = key->inode; > - refcount_set(&nf->nf_ref, 1); > - nf->nf_may = key->need; > - nf->nf_mark = NULL; > - } > + if (unlikely(!nf)) > + return NULL; > + > + INIT_LIST_HEAD(&nf->nf_lru); > + nf->nf_birthtime = ktime_get(); > + nf->nf_file = NULL; > + nf->nf_cred = get_current_cred(); > + nf->nf_net = net; > + nf->nf_flags = want_gc ? > + BIT(NFSD_FILE_HASHED) | BIT(NFSD_FILE_PENDING) | BIT(NFSD_FILE_GC) : > + BIT(NFSD_FILE_HASHED) | BIT(NFSD_FILE_PENDING); > + nf->nf_inode = inode; > + refcount_set(&nf->nf_ref, 1); > + nf->nf_may = need; > + nf->nf_mark = NULL; > return nf; > } > > @@ -362,8 +264,8 @@ nfsd_file_hash_remove(struct nfsd_file *nf) > > if (nfsd_file_check_write_error(nf)) > nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); > - rhashtable_remove_fast(&nfsd_file_rhash_tbl, &nf->nf_rhash, > - nfsd_file_rhash_params); > + rhltable_remove(&nfsd_file_rhltable, &nf->nf_rlist, > + nfsd_file_rhash_params); > } > > static bool > @@ -680,21 +582,19 @@ static struct shrinker nfsd_file_shrinker = { > static void > nfsd_file_queue_for_close(struct inode *inode, struct list_head *dispose) > { > - struct nfsd_file_lookup_key key = { > - .type = NFSD_FILE_KEY_INODE, > - .inode = inode, > - }; > - struct nfsd_file *nf; > - > rcu_read_lock(); > do { > + struct rhlist_head *list; > + struct nfsd_file *nf; > int decrement = 1; > > - nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key, > + list = rhltable_lookup(&nfsd_file_rhltable, &inode, > nfsd_file_rhash_params); > - if (!nf) > + if (!list) > break; > Rather than redriving the lookup multiple times in the loop, would it be better to just search once and then walk the resulting list with rhl_for_each_entry_rcu, all while holding the rcu_read_lock? > + nf = container_of(list, struct nfsd_file, nf_rlist); > + > /* If we raced with someone else unhashing, ignore it */ > if (!nfsd_file_unhash(nf)) > continue; > @@ -836,7 +736,7 @@ nfsd_file_cache_init(void) > if (test_and_set_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 1) > return 0; > > - ret = rhashtable_init(&nfsd_file_rhash_tbl, &nfsd_file_rhash_params); > + ret = rhltable_init(&nfsd_file_rhltable, &nfsd_file_rhash_params); > if (ret) > return ret; > > @@ -904,7 +804,7 @@ nfsd_file_cache_init(void) > nfsd_file_mark_slab = NULL; > destroy_workqueue(nfsd_filecache_wq); > nfsd_filecache_wq = NULL; > - rhashtable_destroy(&nfsd_file_rhash_tbl); > + rhltable_destroy(&nfsd_file_rhltable); > goto out; > } > > @@ -922,7 +822,7 @@ __nfsd_file_cache_purge(struct net *net) > struct nfsd_file *nf; > LIST_HEAD(dispose); > > - rhashtable_walk_enter(&nfsd_file_rhash_tbl, &iter); > + rhltable_walk_enter(&nfsd_file_rhltable, &iter); > do { > rhashtable_walk_start(&iter); > > @@ -1031,7 +931,7 @@ nfsd_file_cache_shutdown(void) > nfsd_file_mark_slab = NULL; > destroy_workqueue(nfsd_filecache_wq); > nfsd_filecache_wq = NULL; > - rhashtable_destroy(&nfsd_file_rhash_tbl); > + rhltable_destroy(&nfsd_file_rhltable); > > for_each_possible_cpu(i) { > per_cpu(nfsd_file_cache_hits, i) = 0; > @@ -1042,6 +942,36 @@ nfsd_file_cache_shutdown(void) > } > } > > +static struct nfsd_file * > +nfsd_file_lookup_locked(const struct net *net, const struct cred *cred, > + struct inode *inode, unsigned char need, > + bool want_gc) > +{ > + struct rhlist_head *tmp, *list; > + struct nfsd_file *nf; > + > + list = rhltable_lookup(&nfsd_file_rhltable, &inode, > + nfsd_file_rhash_params); > + rhl_for_each_entry_rcu(nf, tmp, list, nf_rlist) { > + if (nf->nf_may != need) > + continue; > + if (nf->nf_net != net) > + continue; > + if (!nfsd_match_cred(nf->nf_cred, cred)) > + continue; > + if (!!test_bit(NFSD_FILE_GC, &nf->nf_flags) != want_gc) > + continue; > + if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) > + continue; > + > + /* If it was on the LRU, reuse that reference. */ > + if (nfsd_file_lru_remove(nf)) > + WARN_ON_ONCE(refcount_dec_and_test(&nf->nf_ref)); > + return nf; > + } > + return NULL; > +} > + > /** > * nfsd_file_is_cached - are there any cached open files for this inode? > * @inode: inode to check > @@ -1056,15 +986,12 @@ nfsd_file_cache_shutdown(void) > bool > nfsd_file_is_cached(struct inode *inode) > { > - struct nfsd_file_lookup_key key = { > - .type = NFSD_FILE_KEY_INODE, > - .inode = inode, > - }; > - bool ret = false; > - > - if (rhashtable_lookup_fast(&nfsd_file_rhash_tbl, &key, > - nfsd_file_rhash_params) != NULL) > - ret = true; > + bool ret; > + > + rcu_read_lock(); > + ret = (rhltable_lookup(&nfsd_file_rhltable, &inode, > + nfsd_file_rhash_params) != NULL); > + rcu_read_unlock(); > trace_nfsd_file_is_cached(inode, (int)ret); > return ret; > } > @@ -1074,14 +1001,12 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, > unsigned int may_flags, struct nfsd_file **pnf, > bool open, bool want_gc) > { > - struct nfsd_file_lookup_key key = { > - .type = NFSD_FILE_KEY_FULL, > - .need = may_flags & NFSD_FILE_MAY_MASK, > - .net = SVC_NET(rqstp), > - .gc = want_gc, > - }; > + unsigned char need = may_flags & NFSD_FILE_MAY_MASK; > + struct net *net = SVC_NET(rqstp); > + struct nfsd_file *new, *nf; > + const struct cred *cred; > bool open_retry = true; > - struct nfsd_file *nf; > + struct inode *inode; > __be32 status; > int ret; > > @@ -1089,32 +1014,38 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, > may_flags|NFSD_MAY_OWNER_OVERRIDE); > if (status != nfs_ok) > return status; > - key.inode = d_inode(fhp->fh_dentry); > - key.cred = get_current_cred(); > + inode = d_inode(fhp->fh_dentry); > + cred = get_current_cred(); > > retry: > - rcu_read_lock(); > - nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key, > - nfsd_file_rhash_params); > - if (nf) > - nf = nfsd_file_get(nf); > - rcu_read_unlock(); > - > - if (nf) { > - if (nfsd_file_lru_remove(nf)) > - WARN_ON_ONCE(refcount_dec_and_test(&nf->nf_ref)); > - goto wait_for_construction; > + if (open) { > + rcu_read_lock(); > + nf = nfsd_file_lookup_locked(net, cred, inode, need, want_gc); > + rcu_read_unlock(); > + if (nf) > + goto wait_for_construction; > } > > - nf = nfsd_file_alloc(&key, may_flags); > - if (!nf) { > + new = nfsd_file_alloc(net, inode, need, want_gc); > + if (!new) { > status = nfserr_jukebox; > goto out_status; > } > + rcu_read_lock(); > + spin_lock(&inode->i_lock); > + nf = nfsd_file_lookup_locked(net, cred, inode, need, want_gc); > + if (unlikely(nf)) { > + spin_unlock(&inode->i_lock); > + rcu_read_unlock(); > + nfsd_file_slab_free(&new->nf_rcu); > + goto wait_for_construction; > + } > + nf = new; > + ret = rhltable_insert(&nfsd_file_rhltable, &nf->nf_rlist, > + nfsd_file_rhash_params); > + spin_unlock(&inode->i_lock); > + rcu_read_unlock(); > > - ret = rhashtable_lookup_insert_key(&nfsd_file_rhash_tbl, > - &key, &nf->nf_rhash, > - nfsd_file_rhash_params); > if (likely(ret == 0)) > goto open_file; > > @@ -1122,7 +1053,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, > nf = NULL; > if (ret == -EEXIST) > goto retry; > - trace_nfsd_file_insert_err(rqstp, key.inode, may_flags, ret); > + trace_nfsd_file_insert_err(rqstp, inode, may_flags, ret); > status = nfserr_jukebox; > goto out_status; > > @@ -1131,7 +1062,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, > > /* Did construction of this file fail? */ > if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { > - trace_nfsd_file_cons_err(rqstp, key.inode, may_flags, nf); > + trace_nfsd_file_cons_err(rqstp, inode, may_flags, nf); > if (!open_retry) { > status = nfserr_jukebox; > goto out; > @@ -1157,14 +1088,14 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, > } > > out_status: > - put_cred(key.cred); > + put_cred(cred); > if (open) > - trace_nfsd_file_acquire(rqstp, key.inode, may_flags, nf, status); > + trace_nfsd_file_acquire(rqstp, inode, may_flags, nf, status); > return status; > > open_file: > trace_nfsd_file_alloc(nf); > - nf->nf_mark = nfsd_file_mark_find_or_create(nf, key.inode); > + nf->nf_mark = nfsd_file_mark_find_or_create(nf, inode); > if (nf->nf_mark) { > if (open) { > status = nfsd_open_verified(rqstp, fhp, may_flags, > @@ -1178,7 +1109,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, > * If construction failed, or we raced with a call to unlink() > * then unhash. > */ > - if (status == nfs_ok && key.inode->i_nlink == 0) > + if (status != nfs_ok || inode->i_nlink == 0) > status = nfserr_jukebox; > if (status != nfs_ok) > nfsd_file_unhash(nf); > @@ -1273,7 +1204,7 @@ int nfsd_file_cache_stats_show(struct seq_file *m, void *v) > lru = list_lru_count(&nfsd_file_lru); > > rcu_read_lock(); > - ht = &nfsd_file_rhash_tbl; > + ht = &nfsd_file_rhltable.ht; > count = atomic_read(&ht->nelems); > tbl = rht_dereference_rcu(ht->tbl, ht); > buckets = tbl->size; > @@ -1289,7 +1220,7 @@ int nfsd_file_cache_stats_show(struct seq_file *m, void *v) > evictions += per_cpu(nfsd_file_evictions, i); > } > > - seq_printf(m, "total entries: %u\n", count); > + seq_printf(m, "total inodes: %u\n", count); > seq_printf(m, "hash buckets: %u\n", buckets); > seq_printf(m, "lru entries: %lu\n", lru); > seq_printf(m, "cache hits: %lu\n", hits); > diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h > index b7efb2c3ddb1..7d3b35771565 100644 > --- a/fs/nfsd/filecache.h > +++ b/fs/nfsd/filecache.h > @@ -29,9 +29,8 @@ struct nfsd_file_mark { > * never be dereferenced, only used for comparison. > */ > struct nfsd_file { > - struct rhash_head nf_rhash; > - struct list_head nf_lru; > - struct rcu_head nf_rcu; > + struct rhlist_head nf_rlist; > + void *nf_inode; > struct file *nf_file; > const struct cred *nf_cred; > struct net *nf_net; > @@ -40,10 +39,12 @@ struct nfsd_file { > #define NFSD_FILE_REFERENCED (2) > #define NFSD_FILE_GC (3) > unsigned long nf_flags; > - struct inode *nf_inode; /* don't deref */ > refcount_t nf_ref; > unsigned char nf_may; > + > struct nfsd_file_mark *nf_mark; > + struct list_head nf_lru; > + struct rcu_head nf_rcu; > ktime_t nf_birthtime; > }; > > > -- Jeff Layton <jlayton@xxxxxxxxxx>