On Mon, 2023-01-23 at 20:34 +0000, Chuck Lever III wrote: > > > On Jan 23, 2023, at 3:15 PM, Jeff Layton <jlayton@xxxxxxxxxx> wrote: > > > > On Thu, 2023-01-05 at 09:58 -0500, Chuck Lever wrote: > > > From: Chuck Lever <chuck.lever@xxxxxxxxxx> > > > > > > While we were converting the nfs4_file hashtable to use the kernel's > > > resizable hashtable data structure, Neil Brown observed that the > > > list variant (rhltable) would be better for managing nfsd_file items > > > as well. The nfsd_file hash table will contain multiple entries for > > > the same inode -- these should be kept together on a list. And, it > > > could be possible for exotic or malicious client behavior to cause > > > the hash table to resize itself on every insertion. > > > > > > A nice simplification is that rhltable_lookup() can return a list > > > that contains only nfsd_file items that match a given inode, which > > > enables us to eliminate specialized hash table helper functions and > > > use the default functions provided by the rhashtable implementation). > > > > > > Since we are now storing nfsd_file items for the same inode on a > > > single list, that effectively reduces the number of hash entries > > > that have to be tracked in the hash table. The mininum bucket count > > > is therefore lowered. > > > > > > Suggested-by: Neil Brown <neilb@xxxxxxx> > > > Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx> > > > --- > > > fs/nfsd/filecache.c | 289 +++++++++++++++++++-------------------------------- > > > fs/nfsd/filecache.h | 9 +- > > > 2 files changed, 115 insertions(+), 183 deletions(-) > > > > > > Just to note that this work is still in the wings. It would need to > > > be rebased on Jeff's recent fixes and clean-ups. I'm reluctant to > > > apply this until there is more evidence that the instability in v6.0 > > > has been duly addressed. > > > > > > > > > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c > > > index 45b2c9e3f636..f04e722bb6bc 100644 > > > --- a/fs/nfsd/filecache.c > > > +++ b/fs/nfsd/filecache.c > > > @@ -74,70 +74,9 @@ static struct list_lru nfsd_file_lru; > > > static unsigned long nfsd_file_flags; > > > static struct fsnotify_group *nfsd_file_fsnotify_group; > > > static struct delayed_work nfsd_filecache_laundrette; > > > -static struct rhashtable nfsd_file_rhash_tbl > > > +static struct rhltable nfsd_file_rhltable > > > ____cacheline_aligned_in_smp; > > > > > > -enum nfsd_file_lookup_type { > > > - NFSD_FILE_KEY_INODE, > > > - NFSD_FILE_KEY_FULL, > > > -}; > > > - > > > -struct nfsd_file_lookup_key { > > > - struct inode *inode; > > > - struct net *net; > > > - const struct cred *cred; > > > - unsigned char need; > > > - bool gc; > > > - enum nfsd_file_lookup_type type; > > > -}; > > > - > > > -/* > > > - * The returned hash value is based solely on the address of an in-code > > > - * inode, a pointer to a slab-allocated object. The entropy in such a > > > - * pointer is concentrated in its middle bits. > > > - */ > > > -static u32 nfsd_file_inode_hash(const struct inode *inode, u32 seed) > > > -{ > > > - unsigned long ptr = (unsigned long)inode; > > > - u32 k; > > > - > > > - k = ptr >> L1_CACHE_SHIFT; > > > - k &= 0x00ffffff; > > > - return jhash2(&k, 1, seed); > > > -} > > > - > > > -/** > > > - * nfsd_file_key_hashfn - Compute the hash value of a lookup key > > > - * @data: key on which to compute the hash value > > > - * @len: rhash table's key_len parameter (unused) > > > - * @seed: rhash table's random seed of the day > > > - * > > > - * Return value: > > > - * Computed 32-bit hash value > > > - */ > > > -static u32 nfsd_file_key_hashfn(const void *data, u32 len, u32 seed) > > > -{ > > > - const struct nfsd_file_lookup_key *key = data; > > > - > > > - return nfsd_file_inode_hash(key->inode, seed); > > > -} > > > - > > > -/** > > > - * nfsd_file_obj_hashfn - Compute the hash value of an nfsd_file > > > - * @data: object on which to compute the hash value > > > - * @len: rhash table's key_len parameter (unused) > > > - * @seed: rhash table's random seed of the day > > > - * > > > - * Return value: > > > - * Computed 32-bit hash value > > > - */ > > > -static u32 nfsd_file_obj_hashfn(const void *data, u32 len, u32 seed) > > > -{ > > > - const struct nfsd_file *nf = data; > > > - > > > - return nfsd_file_inode_hash(nf->nf_inode, seed); > > > -} > > > - > > > static bool > > > nfsd_match_cred(const struct cred *c1, const struct cred *c2) > > > { > > > @@ -158,53 +97,16 @@ nfsd_match_cred(const struct cred *c1, const struct cred *c2) > > > return true; > > > } > > > > > > -/** > > > - * nfsd_file_obj_cmpfn - Match a cache item against search criteria > > > - * @arg: search criteria > > > - * @ptr: cache item to check > > > - * > > > - * Return values: > > > - * %0 - Item matches search criteria > > > - * %1 - Item does not match search criteria > > > - */ > > > -static int nfsd_file_obj_cmpfn(struct rhashtable_compare_arg *arg, > > > - const void *ptr) > > > -{ > > > - const struct nfsd_file_lookup_key *key = arg->key; > > > - const struct nfsd_file *nf = ptr; > > > - > > > - switch (key->type) { > > > - case NFSD_FILE_KEY_INODE: > > > - if (nf->nf_inode != key->inode) > > > - return 1; > > > - break; > > > - case NFSD_FILE_KEY_FULL: > > > - if (nf->nf_inode != key->inode) > > > - return 1; > > > - if (nf->nf_may != key->need) > > > - return 1; > > > - if (nf->nf_net != key->net) > > > - return 1; > > > - if (!nfsd_match_cred(nf->nf_cred, key->cred)) > > > - return 1; > > > - if (!!test_bit(NFSD_FILE_GC, &nf->nf_flags) != key->gc) > > > - return 1; > > > - if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) > > > - return 1; > > > - break; > > > - } > > > - return 0; > > > -} > > > - > > > static const struct rhashtable_params nfsd_file_rhash_params = { > > > .key_len = sizeof_field(struct nfsd_file, nf_inode), > > > .key_offset = offsetof(struct nfsd_file, nf_inode), > > > - .head_offset = offsetof(struct nfsd_file, nf_rhash), > > > - .hashfn = nfsd_file_key_hashfn, > > > - .obj_hashfn = nfsd_file_obj_hashfn, > > > - .obj_cmpfn = nfsd_file_obj_cmpfn, > > > - /* Reduce resizing churn on light workloads */ > > > - .min_size = 512, /* buckets */ > > > + .head_offset = offsetof(struct nfsd_file, nf_rlist), > > > + > > > + /* > > > + * Start with a single page hash table to reduce resizing churn > > > + * on light workloads. > > > + */ > > > + .min_size = 256, > > > .automatic_shrinking = true, > > > }; > > > > > > @@ -307,27 +209,27 @@ nfsd_file_mark_find_or_create(struct nfsd_file *nf, struct inode *inode) > > > } > > > > > > static struct nfsd_file * > > > -nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may) > > > +nfsd_file_alloc(struct net *net, struct inode *inode, unsigned char need, > > > + bool want_gc) > > > { > > > struct nfsd_file *nf; > > > > > > nf = kmem_cache_alloc(nfsd_file_slab, GFP_KERNEL); > > > - if (nf) { > > > - INIT_LIST_HEAD(&nf->nf_lru); > > > - nf->nf_birthtime = ktime_get(); > > > - nf->nf_file = NULL; > > > - nf->nf_cred = get_current_cred(); > > > - nf->nf_net = key->net; > > > - nf->nf_flags = 0; > > > - __set_bit(NFSD_FILE_HASHED, &nf->nf_flags); > > > - __set_bit(NFSD_FILE_PENDING, &nf->nf_flags); > > > - if (key->gc) > > > - __set_bit(NFSD_FILE_GC, &nf->nf_flags); > > > - nf->nf_inode = key->inode; > > > - refcount_set(&nf->nf_ref, 1); > > > - nf->nf_may = key->need; > > > - nf->nf_mark = NULL; > > > - } > > > + if (unlikely(!nf)) > > > + return NULL; > > > + > > > + INIT_LIST_HEAD(&nf->nf_lru); > > > + nf->nf_birthtime = ktime_get(); > > > + nf->nf_file = NULL; > > > + nf->nf_cred = get_current_cred(); > > > + nf->nf_net = net; > > > + nf->nf_flags = want_gc ? > > > + BIT(NFSD_FILE_HASHED) | BIT(NFSD_FILE_PENDING) | BIT(NFSD_FILE_GC) : > > > + BIT(NFSD_FILE_HASHED) | BIT(NFSD_FILE_PENDING); > > > + nf->nf_inode = inode; > > > + refcount_set(&nf->nf_ref, 1); > > > + nf->nf_may = need; > > > + nf->nf_mark = NULL; > > > return nf; > > > } > > > > > > @@ -362,8 +264,8 @@ nfsd_file_hash_remove(struct nfsd_file *nf) > > > > > > if (nfsd_file_check_write_error(nf)) > > > nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); > > > - rhashtable_remove_fast(&nfsd_file_rhash_tbl, &nf->nf_rhash, > > > - nfsd_file_rhash_params); > > > + rhltable_remove(&nfsd_file_rhltable, &nf->nf_rlist, > > > + nfsd_file_rhash_params); > > > } > > > > > > static bool > > > @@ -680,21 +582,19 @@ static struct shrinker nfsd_file_shrinker = { > > > static void > > > nfsd_file_queue_for_close(struct inode *inode, struct list_head *dispose) > > > { > > > - struct nfsd_file_lookup_key key = { > > > - .type = NFSD_FILE_KEY_INODE, > > > - .inode = inode, > > > - }; > > > - struct nfsd_file *nf; > > > - > > > rcu_read_lock(); > > > do { > > > + struct rhlist_head *list; > > > + struct nfsd_file *nf; > > > int decrement = 1; > > > > > > - nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key, > > > + list = rhltable_lookup(&nfsd_file_rhltable, &inode, > > > nfsd_file_rhash_params); > > > - if (!nf) > > > + if (!list) > > > break; > > > > > > > Rather than redriving the lookup multiple times in the loop, would it be > > better to just search once and then walk the resulting list with > > rhl_for_each_entry_rcu, all while holding the rcu_read_lock? > > That would be nice, but as you and I have discussed before: > > On every iteration, we're possibly calling nfsd_file_unhash(), which > changes the list. So we have to invoke rhltable_lookup() again to get > the updated version of that list. > > As far as I can see there's no "_safe" version of rhl_for_each_entry. > > I think the best we can do is not redrive the lookup if we didn't > unhash anything. I'm not sure that will fit easily with the > nfsd_file_cond_queue thingie you just added in nfsd-fixes. > > Perhaps it should also drop the RCU read lock on each iteration in > case it finds a lot of inodes that match the lookup criteria. > I could be wrong, but it looks like you're safe to traverse the list even in the case of removals, assuming the objects themselves are rcu-freed. AFAICT, the object's ->next pointer is not changed when it's removed from the table. After all, we're not holding a "real" lock here so the object could be removed by another task at any time. It would be nice if this were documented though. > > > > + nf = container_of(list, struct nfsd_file, nf_rlist); > > > + > > > /* If we raced with someone else unhashing, ignore it */ > > > if (!nfsd_file_unhash(nf)) > > > continue; > > > @@ -836,7 +736,7 @@ nfsd_file_cache_init(void) > > > if (test_and_set_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 1) > > > return 0; > > > > > > - ret = rhashtable_init(&nfsd_file_rhash_tbl, &nfsd_file_rhash_params); > > > + ret = rhltable_init(&nfsd_file_rhltable, &nfsd_file_rhash_params); > > > if (ret) > > > return ret; > > > > > > @@ -904,7 +804,7 @@ nfsd_file_cache_init(void) > > > nfsd_file_mark_slab = NULL; > > > destroy_workqueue(nfsd_filecache_wq); > > > nfsd_filecache_wq = NULL; > > > - rhashtable_destroy(&nfsd_file_rhash_tbl); > > > + rhltable_destroy(&nfsd_file_rhltable); > > > goto out; > > > } > > > > > > @@ -922,7 +822,7 @@ __nfsd_file_cache_purge(struct net *net) > > > struct nfsd_file *nf; > > > LIST_HEAD(dispose); > > > > > > - rhashtable_walk_enter(&nfsd_file_rhash_tbl, &iter); > > > + rhltable_walk_enter(&nfsd_file_rhltable, &iter); > > > do { > > > rhashtable_walk_start(&iter); > > > > > > @@ -1031,7 +931,7 @@ nfsd_file_cache_shutdown(void) > > > nfsd_file_mark_slab = NULL; > > > destroy_workqueue(nfsd_filecache_wq); > > > nfsd_filecache_wq = NULL; > > > - rhashtable_destroy(&nfsd_file_rhash_tbl); > > > + rhltable_destroy(&nfsd_file_rhltable); > > > > > > for_each_possible_cpu(i) { > > > per_cpu(nfsd_file_cache_hits, i) = 0; > > > @@ -1042,6 +942,36 @@ nfsd_file_cache_shutdown(void) > > > } > > > } > > > > > > +static struct nfsd_file * > > > +nfsd_file_lookup_locked(const struct net *net, const struct cred *cred, > > > + struct inode *inode, unsigned char need, > > > + bool want_gc) > > > +{ > > > + struct rhlist_head *tmp, *list; > > > + struct nfsd_file *nf; > > > + > > > + list = rhltable_lookup(&nfsd_file_rhltable, &inode, > > > + nfsd_file_rhash_params); > > > + rhl_for_each_entry_rcu(nf, tmp, list, nf_rlist) { > > > + if (nf->nf_may != need) > > > + continue; > > > + if (nf->nf_net != net) > > > + continue; > > > + if (!nfsd_match_cred(nf->nf_cred, cred)) > > > + continue; > > > + if (!!test_bit(NFSD_FILE_GC, &nf->nf_flags) != want_gc) > > > + continue; > > > + if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) > > > + continue; > > > + > > > + /* If it was on the LRU, reuse that reference. */ > > > + if (nfsd_file_lru_remove(nf)) > > > + WARN_ON_ONCE(refcount_dec_and_test(&nf->nf_ref)); > > > + return nf; > > > + } > > > + return NULL; > > > +} > > > + > > > /** > > > * nfsd_file_is_cached - are there any cached open files for this inode? > > > * @inode: inode to check > > > @@ -1056,15 +986,12 @@ nfsd_file_cache_shutdown(void) > > > bool > > > nfsd_file_is_cached(struct inode *inode) > > > { > > > - struct nfsd_file_lookup_key key = { > > > - .type = NFSD_FILE_KEY_INODE, > > > - .inode = inode, > > > - }; > > > - bool ret = false; > > > - > > > - if (rhashtable_lookup_fast(&nfsd_file_rhash_tbl, &key, > > > - nfsd_file_rhash_params) != NULL) > > > - ret = true; > > > + bool ret; > > > + > > > + rcu_read_lock(); > > > + ret = (rhltable_lookup(&nfsd_file_rhltable, &inode, > > > + nfsd_file_rhash_params) != NULL); > > > + rcu_read_unlock(); > > > trace_nfsd_file_is_cached(inode, (int)ret); > > > return ret; > > > } > > > @@ -1074,14 +1001,12 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, > > > unsigned int may_flags, struct nfsd_file **pnf, > > > bool open, bool want_gc) > > > { > > > - struct nfsd_file_lookup_key key = { > > > - .type = NFSD_FILE_KEY_FULL, > > > - .need = may_flags & NFSD_FILE_MAY_MASK, > > > - .net = SVC_NET(rqstp), > > > - .gc = want_gc, > > > - }; > > > + unsigned char need = may_flags & NFSD_FILE_MAY_MASK; > > > + struct net *net = SVC_NET(rqstp); > > > + struct nfsd_file *new, *nf; > > > + const struct cred *cred; > > > bool open_retry = true; > > > - struct nfsd_file *nf; > > > + struct inode *inode; > > > __be32 status; > > > int ret; > > > > > > @@ -1089,32 +1014,38 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, > > > may_flags|NFSD_MAY_OWNER_OVERRIDE); > > > if (status != nfs_ok) > > > return status; > > > - key.inode = d_inode(fhp->fh_dentry); > > > - key.cred = get_current_cred(); > > > + inode = d_inode(fhp->fh_dentry); > > > + cred = get_current_cred(); > > > > > > retry: > > > - rcu_read_lock(); > > > - nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key, > > > - nfsd_file_rhash_params); > > > - if (nf) > > > - nf = nfsd_file_get(nf); > > > - rcu_read_unlock(); > > > - > > > - if (nf) { > > > - if (nfsd_file_lru_remove(nf)) > > > - WARN_ON_ONCE(refcount_dec_and_test(&nf->nf_ref)); > > > - goto wait_for_construction; > > > + if (open) { > > > + rcu_read_lock(); > > > + nf = nfsd_file_lookup_locked(net, cred, inode, need, want_gc); > > > + rcu_read_unlock(); > > > + if (nf) > > > + goto wait_for_construction; > > > } > > > > > > - nf = nfsd_file_alloc(&key, may_flags); > > > - if (!nf) { > > > + new = nfsd_file_alloc(net, inode, need, want_gc); > > > + if (!new) { > > > status = nfserr_jukebox; > > > goto out_status; > > > } > > > + rcu_read_lock(); > > > + spin_lock(&inode->i_lock); > > > + nf = nfsd_file_lookup_locked(net, cred, inode, need, want_gc); > > > + if (unlikely(nf)) { > > > + spin_unlock(&inode->i_lock); > > > + rcu_read_unlock(); > > > + nfsd_file_slab_free(&new->nf_rcu); > > > + goto wait_for_construction; > > > + } > > > + nf = new; > > > + ret = rhltable_insert(&nfsd_file_rhltable, &nf->nf_rlist, > > > + nfsd_file_rhash_params); > > > + spin_unlock(&inode->i_lock); > > > + rcu_read_unlock(); > > > > > > - ret = rhashtable_lookup_insert_key(&nfsd_file_rhash_tbl, > > > - &key, &nf->nf_rhash, > > > - nfsd_file_rhash_params); > > > if (likely(ret == 0)) > > > goto open_file; > > > > > > @@ -1122,7 +1053,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, > > > nf = NULL; > > > if (ret == -EEXIST) > > > goto retry; > > > - trace_nfsd_file_insert_err(rqstp, key.inode, may_flags, ret); > > > + trace_nfsd_file_insert_err(rqstp, inode, may_flags, ret); > > > status = nfserr_jukebox; > > > goto out_status; > > > > > > @@ -1131,7 +1062,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, > > > > > > /* Did construction of this file fail? */ > > > if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { > > > - trace_nfsd_file_cons_err(rqstp, key.inode, may_flags, nf); > > > + trace_nfsd_file_cons_err(rqstp, inode, may_flags, nf); > > > if (!open_retry) { > > > status = nfserr_jukebox; > > > goto out; > > > @@ -1157,14 +1088,14 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, > > > } > > > > > > out_status: > > > - put_cred(key.cred); > > > + put_cred(cred); > > > if (open) > > > - trace_nfsd_file_acquire(rqstp, key.inode, may_flags, nf, status); > > > + trace_nfsd_file_acquire(rqstp, inode, may_flags, nf, status); > > > return status; > > > > > > open_file: > > > trace_nfsd_file_alloc(nf); > > > - nf->nf_mark = nfsd_file_mark_find_or_create(nf, key.inode); > > > + nf->nf_mark = nfsd_file_mark_find_or_create(nf, inode); > > > if (nf->nf_mark) { > > > if (open) { > > > status = nfsd_open_verified(rqstp, fhp, may_flags, > > > @@ -1178,7 +1109,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, > > > * If construction failed, or we raced with a call to unlink() > > > * then unhash. > > > */ > > > - if (status == nfs_ok && key.inode->i_nlink == 0) > > > + if (status != nfs_ok || inode->i_nlink == 0) > > > status = nfserr_jukebox; > > > if (status != nfs_ok) > > > nfsd_file_unhash(nf); > > > @@ -1273,7 +1204,7 @@ int nfsd_file_cache_stats_show(struct seq_file *m, void *v) > > > lru = list_lru_count(&nfsd_file_lru); > > > > > > rcu_read_lock(); > > > - ht = &nfsd_file_rhash_tbl; > > > + ht = &nfsd_file_rhltable.ht; > > > count = atomic_read(&ht->nelems); > > > tbl = rht_dereference_rcu(ht->tbl, ht); > > > buckets = tbl->size; > > > @@ -1289,7 +1220,7 @@ int nfsd_file_cache_stats_show(struct seq_file *m, void *v) > > > evictions += per_cpu(nfsd_file_evictions, i); > > > } > > > > > > - seq_printf(m, "total entries: %u\n", count); > > > + seq_printf(m, "total inodes: %u\n", count); > > > seq_printf(m, "hash buckets: %u\n", buckets); > > > seq_printf(m, "lru entries: %lu\n", lru); > > > seq_printf(m, "cache hits: %lu\n", hits); > > > diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h > > > index b7efb2c3ddb1..7d3b35771565 100644 > > > --- a/fs/nfsd/filecache.h > > > +++ b/fs/nfsd/filecache.h > > > @@ -29,9 +29,8 @@ struct nfsd_file_mark { > > > * never be dereferenced, only used for comparison. > > > */ > > > struct nfsd_file { > > > - struct rhash_head nf_rhash; > > > - struct list_head nf_lru; > > > - struct rcu_head nf_rcu; > > > + struct rhlist_head nf_rlist; > > > + void *nf_inode; > > > struct file *nf_file; > > > const struct cred *nf_cred; > > > struct net *nf_net; > > > @@ -40,10 +39,12 @@ struct nfsd_file { > > > #define NFSD_FILE_REFERENCED (2) > > > #define NFSD_FILE_GC (3) > > > unsigned long nf_flags; > > > - struct inode *nf_inode; /* don't deref */ > > > refcount_t nf_ref; > > > unsigned char nf_may; > > > + > > > struct nfsd_file_mark *nf_mark; > > > + struct list_head nf_lru; > > > + struct rcu_head nf_rcu; > > > ktime_t nf_birthtime; > > > }; > > > > > > > > > > > > > -- > > Jeff Layton <jlayton@xxxxxxxxxx> > > -- > Chuck Lever > > > -- Jeff Layton <jlayton@xxxxxxxxxx>