On Wed, 2025-01-22 at 14:54 +1100, NeilBrown wrote: > The nfsd filecache currently uses list_lru for tracking files recently > used in NFSv3 requests which need to be "garbage collected" when they > have become idle - unused for 2-4 seconds. > > I do not believe list_lru is a good tool for this. It does not allow the > timeout which filecache requires so we have to add a timeout mechanism > which holds the list_lru lock while the whole list is scanned looking for > entries that haven't been recently accessed. When the list is largish > (even a few hundred) this can block new requests which need the lock to > remove a file to access it. > > This patch removes the list_lru and instead uses 2 simple linked lists. > When a file is accessed it is removed from whichever list it is on, > then added to the tail of the first list. Every 2 seconds the second > list is moved to the "freeme" list and the first list is moved to the > second list. This avoids any need to walk a list to find old entries. > > Previously a file would be unhashed before being moved to the freeme > list. We don't do that any more. The freeme list is much like the > other two lists (recent and older) in that they all hold a reference to > the file and the file is still hashed. When the nfsd thread processes > the freeme list it now uses the new nfsd_file_release_list() which uses > nfsd_file_cond_queue() to unhash and drop the refcount. > > We no longer have a precise count of the size of the lru (recent + > older) as we don't know how big "older" is when it is moved to "freeme". > However the shrinker can cope with an approximation. So we keep a > count of the number of files in the lru and when "older" is moved to "freeme" we > divide that count by 2. When we remove anything from the lru we > decrement that counter but ensure it never goes negative. Naturally > when we add to the lru we increase the counter. 
> > For the filecache stats file, which assumes a global lru, we keep a > separate counter which includes all files in all netns in recent or > older or freeme. > > We discard the nf_gc linkage in an nfsd_file and only use nf_lru. > We discard NFSD_FILE_REFERENCED. > > This patch drops the nfsd_file_gc_removed() trace point. I couldn't > think of useful information to provide. > > Signed-off-by: NeilBrown <neilb@xxxxxxx> > --- > fs/nfsd/filecache.c | 215 ++++++++++++++++++++++---------------------- > fs/nfsd/filecache.h | 4 +- > fs/nfsd/trace.h | 4 - > 3 files changed, 108 insertions(+), 115 deletions(-) > I like the basic approach! This is a better fit than list_lru. > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c > index 4f39f6632b35..552feba94f09 100644 > --- a/fs/nfsd/filecache.c > +++ b/fs/nfsd/filecache.c > @@ -34,7 +34,6 @@ > #include <linux/file.h> > #include <linux/pagemap.h> > #include <linux/sched.h> > -#include <linux/list_lru.h> > #include <linux/fsnotify_backend.h> > #include <linux/fsnotify.h> > #include <linux/seq_file.h> > @@ -63,10 +62,13 @@ static DEFINE_PER_CPU(unsigned long, nfsd_file_evictions); > > struct nfsd_fcache_disposal { > spinlock_t lock; > - struct list_lru file_lru; > - struct list_head freeme; > + struct list_head recent; /* have been used in last 0-2 seconds */ > + struct list_head older; /* haven't been used in last 0-2 seconds */ > + struct list_head freeme; /* ready to be discarded */ > + unsigned long num_gc; /* Approximate size of recent plus older */ > struct delayed_work filecache_laundrette; > struct shrinker *file_shrinker; > + struct nfsd_net *nn; > }; > > static struct kmem_cache *nfsd_file_slab; > @@ -227,7 +229,6 @@ nfsd_file_alloc(struct net *net, struct inode *inode, unsigned char need, > > this_cpu_inc(nfsd_file_allocations); > INIT_LIST_HEAD(&nf->nf_lru); > - INIT_LIST_HEAD(&nf->nf_gc); > nf->nf_birthtime = ktime_get(); > nf->nf_file = NULL; > nf->nf_cred = get_current_cred(); > @@ -332,12 +333,16 @@ 
static bool nfsd_file_lru_add(struct nfsd_file *nf) > struct nfsd_net *nn = net_generic(nf->nf_net, nfsd_net_id); > struct nfsd_fcache_disposal *l = nn->fcache_disposal; > > - set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); > - if (list_lru_add_obj(&l->file_lru, &nf->nf_lru)) { > + spin_lock(&l->lock); > + if (list_empty(&nf->nf_lru)) { > + list_add_tail(&nf->nf_lru, &l->recent); > + l->num_gc += 1; > atomic_long_inc(&nfsd_lru_total); > trace_nfsd_file_lru_add(nf); > + spin_unlock(&l->lock); > return true; > } > + spin_unlock(&l->lock); > return false; > } > > @@ -346,11 +351,17 @@ static bool nfsd_file_lru_remove(struct nfsd_file *nf) > struct nfsd_net *nn = net_generic(nf->nf_net, nfsd_net_id); > struct nfsd_fcache_disposal *l = nn->fcache_disposal; > > - if (list_lru_del_obj(&l->file_lru, &nf->nf_lru)) { > + spin_lock(&l->lock); > + if (!list_empty(&nf->nf_lru)) { > + list_del_init(&nf->nf_lru); > atomic_long_dec(&nfsd_lru_total); > + if (l->num_gc > 0) > + l->num_gc -= 1; > trace_nfsd_file_lru_del(nf); > + spin_unlock(&l->lock); > return true; > } > + spin_unlock(&l->lock); > return false; > } > > @@ -440,12 +451,26 @@ nfsd_file_dispose_list(struct list_head *dispose) > struct nfsd_file *nf; > > while (!list_empty(dispose)) { > - nf = list_first_entry(dispose, struct nfsd_file, nf_gc); > - list_del_init(&nf->nf_gc); > + nf = list_first_entry(dispose, struct nfsd_file, nf_lru); > + list_del_init(&nf->nf_lru); > nfsd_file_free(nf); > } > } > > +static void > +nfsd_file_cond_queue(struct nfsd_file *nf, struct list_head *dispose); > + > +static void > +nfsd_file_release_list(struct list_head *dispose) > +{ > + LIST_HEAD(dispose2); > + struct nfsd_file *nf, *nf2; > + > + list_for_each_entry_safe(nf, nf2, dispose, nf_lru) > + nfsd_file_cond_queue(nf, &dispose2); > + nfsd_file_dispose_list(&dispose2); > +} > + > /** > * nfsd_file_dispose_list_delayed - move list of dead files to net's freeme list > * @dispose: list of nfsd_files to be disposed > @@ -458,12 +483,12 @@ 
nfsd_file_dispose_list_delayed(struct list_head *dispose) > { > while(!list_empty(dispose)) { > struct nfsd_file *nf = list_first_entry(dispose, > - struct nfsd_file, nf_gc); > + struct nfsd_file, nf_lru); > struct nfsd_net *nn = net_generic(nf->nf_net, nfsd_net_id); > struct nfsd_fcache_disposal *l = nn->fcache_disposal; > > spin_lock(&l->lock); > - list_move_tail(&nf->nf_gc, &l->freeme); > + list_move_tail(&nf->nf_lru, &l->freeme); > spin_unlock(&l->lock); > svc_wake_up(nn->nfsd_serv); > } > @@ -487,88 +512,32 @@ void nfsd_file_net_dispose(struct nfsd_net *nn) > int i; > > spin_lock(&l->lock); > - for (i = 0; i < 8 && !list_empty(&l->freeme); i++) > - list_move(l->freeme.next, &dispose); While you're in here, could you document why we only take 8 at a time? Maybe even consider turning it into a named constant? > + for (i = 0; i < 8 && !list_empty(&l->freeme); i++) { > + struct nfsd_file *nf = list_first_entry( > + &l->freeme, struct nfsd_file, nf_lru); > + > + /* > + * Don't throw out files that are still > + * undergoing I/O or that have uncleared errors > + * pending. > + */ > + if (nfsd_file_check_writeback(nf)) { > + trace_nfsd_file_gc_writeback(nf); > + list_move(&nf->nf_lru, &l->recent); > + l->num_gc += 1; > + } else { > + trace_nfsd_file_gc_disposed(nf); > + list_move(&nf->nf_lru, &dispose); > + this_cpu_inc(nfsd_file_evictions); > + } > + } > spin_unlock(&l->lock); > if (!list_empty(&l->freeme)) > /* Wake up another thread to share the work > * *before* doing any actual disposing. 
> */ > svc_wake_up(nn->nfsd_serv); > - nfsd_file_dispose_list(&dispose); > - } > -} > - > -/** > - * nfsd_file_lru_cb - Examine an entry on the LRU list > - * @item: LRU entry to examine > - * @lru: controlling LRU > - * @arg: dispose list > - * > - * Return values: > - * %LRU_REMOVED: @item was removed from the LRU > - * %LRU_ROTATE: @item is to be moved to the LRU tail > - * %LRU_SKIP: @item cannot be evicted > - */ > -static enum lru_status > -nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru, > - void *arg) > -{ > - struct list_head *head = arg; > - struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru); > - > - /* We should only be dealing with GC entries here */ > - WARN_ON_ONCE(!test_bit(NFSD_FILE_GC, &nf->nf_flags)); > - > - /* > - * Don't throw out files that are still undergoing I/O or > - * that have uncleared errors pending. > - */ > - if (nfsd_file_check_writeback(nf)) { > - trace_nfsd_file_gc_writeback(nf); > - return LRU_SKIP; > - } > - > - /* If it was recently added to the list, skip it */ > - if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) { > - trace_nfsd_file_gc_referenced(nf); > - return LRU_ROTATE; > - } > - > - /* > - * Put the reference held on behalf of the LRU. If it wasn't the last > - * one, then just remove it from the LRU and ignore it. > - */ > - if (!refcount_dec_and_test(&nf->nf_ref)) { > - trace_nfsd_file_gc_in_use(nf); > - list_lru_isolate(lru, &nf->nf_lru); > - return LRU_REMOVED; > - } > - > - /* Refcount went to zero. 
Unhash it and queue it to the dispose list */ > - nfsd_file_unhash(nf); > - list_lru_isolate(lru, &nf->nf_lru); > - list_add(&nf->nf_gc, head); > - this_cpu_inc(nfsd_file_evictions); > - trace_nfsd_file_gc_disposed(nf); > - return LRU_REMOVED; > -} > - > -static void > -nfsd_file_gc(struct nfsd_fcache_disposal *l) > -{ > - unsigned long remaining = list_lru_count(&l->file_lru); > - LIST_HEAD(dispose); > - unsigned long ret; > - > - while (remaining > 0) { > - unsigned long num_to_scan = min(remaining, NFSD_FILE_GC_BATCH); > - > - ret = list_lru_walk(&l->file_lru, nfsd_file_lru_cb, > - &dispose, num_to_scan); > - trace_nfsd_file_gc_removed(ret, list_lru_count(&l->file_lru)); > - nfsd_file_dispose_list_delayed(&dispose); > - remaining -= num_to_scan; > + nfsd_file_release_list(&dispose); > } > } > > @@ -577,9 +546,20 @@ nfsd_file_gc_worker(struct work_struct *work) > { > struct nfsd_fcache_disposal *l = container_of( > work, struct nfsd_fcache_disposal, filecache_laundrette.work); > - nfsd_file_gc(l); > - if (list_lru_count(&l->file_lru)) > + > + spin_lock(&l->lock); > + list_splice_init(&l->older, &l->freeme); > + list_splice_init(&l->recent, &l->older); > + /* We don't know how many were moved to 'freeme' and don't want > + * to waste time counting - guess a half. > + */ > + l->num_gc /= 2; Given that you have to manipulate the lists under a spinlock, it wouldn't be difficult or expensive to keep accurate counts. nfsd workloads can be "spiky", so it seems like this method may be wildly inaccurate at times. 
> + if (!list_empty(&l->freeme)) > + svc_wake_up(l->nn->nfsd_serv); > + if (!list_empty(&l->older) || !list_empty(&l->recent)) > nfsd_file_schedule_laundrette(l); > + spin_unlock(&l->lock); > + nit: remove the extra newline above > } > > static unsigned long > @@ -587,22 +567,40 @@ nfsd_file_lru_count(struct shrinker *s, struct shrink_control *sc) > { > struct nfsd_fcache_disposal *l = s->private_data; > > - return list_lru_count(&l->file_lru); > + return l->num_gc; > } > > static unsigned long > nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc) > { > struct nfsd_fcache_disposal *l = s->private_data; > + struct nfsd_file *nf; > + int scanned = 0; > + > + spin_lock(&l->lock); > + while (scanned < sc->nr_to_scan && > + (nf = list_first_entry_or_null(&l->older, > + struct nfsd_file, nf_lru)) != NULL) { > + list_del_init(&nf->nf_lru); > + list_add_tail(&nf->nf_lru, &l->freeme); > + if (l->num_gc > 0) > + l->num_gc -= 1; > + scanned += 1; > + } > + if (scanned > 0) > + svc_wake_up(l->nn->nfsd_serv); > > - LIST_HEAD(dispose); > - unsigned long ret; > + trace_nfsd_file_shrinker_removed(scanned, l->num_gc); > > - ret = list_lru_shrink_walk(&l->file_lru, sc, > - nfsd_file_lru_cb, &dispose); > - trace_nfsd_file_shrinker_removed(ret, list_lru_count(&l->file_lru)); > - nfsd_file_dispose_list_delayed(&dispose); > - return ret; > + while (scanned < sc->nr_to_scan && > + (nf = list_first_entry_or_null(&l->recent, > + struct nfsd_file, nf_lru)) != NULL) { > + list_del_init(&nf->nf_lru); > + list_add_tail(&nf->nf_lru, &l->older); > + scanned += 1; > + } > + spin_unlock(&l->lock); > + return scanned; > } > > /** > @@ -615,7 +613,6 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc) > */ > static void > nfsd_file_cond_queue(struct nfsd_file *nf, struct list_head *dispose) > - __must_hold(RCU) > { > int decrement = 1; > > @@ -633,7 +630,7 @@ nfsd_file_cond_queue(struct nfsd_file *nf, struct list_head *dispose) > > /* If refcount goes to 0, then put on 
the dispose list */ > if (refcount_sub_and_test(decrement, &nf->nf_ref)) { > - list_add(&nf->nf_gc, dispose); > + list_add(&nf->nf_lru, dispose); > trace_nfsd_file_closing(nf); > } > } > @@ -858,7 +855,10 @@ nfsd_alloc_fcache_disposal(void) > return NULL; > spin_lock_init(&l->lock); > INIT_DELAYED_WORK(&l->filecache_laundrette, nfsd_file_gc_worker); > + INIT_LIST_HEAD(&l->recent); > + INIT_LIST_HEAD(&l->older); > INIT_LIST_HEAD(&l->freeme); > + l->num_gc = 0; > l->file_shrinker = shrinker_alloc(0, "nfsd-filecache"); > if (!l->file_shrinker) { > pr_err("nfsd: failed to allocate nfsd_file_shrinker\n"); > @@ -870,13 +870,6 @@ nfsd_alloc_fcache_disposal(void) > l->file_shrinker->seeks = 1; > l->file_shrinker->private_data = l; > > - if (list_lru_init(&l->file_lru)) { > - pr_err("nfsd: failed to init nfsd_file_lru\n"); > - shrinker_free(l->file_shrinker); > - kfree(l); > - return NULL; > - } > - > shrinker_register(l->file_shrinker); > return l; > } > @@ -886,8 +879,12 @@ nfsd_free_fcache_disposal(struct nfsd_fcache_disposal *l) > { > cancel_delayed_work_sync(&l->filecache_laundrette); > shrinker_free(l->file_shrinker); > - list_lru_destroy(&l->file_lru); > - nfsd_file_dispose_list(&l->freeme); > + nfsd_file_release_list(&l->recent); > + WARN_ON_ONCE(!list_empty(&l->recent)); > + nfsd_file_release_list(&l->older); > + WARN_ON_ONCE(!list_empty(&l->older)); > + nfsd_file_release_list(&l->freeme); > + WARN_ON_ONCE(!list_empty(&l->freeme)); > kfree(l); > } > > @@ -906,6 +903,8 @@ nfsd_file_cache_start_net(struct net *net) > struct nfsd_net *nn = net_generic(net, nfsd_net_id); > > nn->fcache_disposal = nfsd_alloc_fcache_disposal(); > + if (nn->fcache_disposal) > + nn->fcache_disposal->nn = nn; > return nn->fcache_disposal ? 
0 : -ENOMEM; > } > > diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h > index a09a851d510e..02059b6c5e9a 100644 > --- a/fs/nfsd/filecache.h > +++ b/fs/nfsd/filecache.h > @@ -42,15 +42,13 @@ struct nfsd_file { > struct net *nf_net; > #define NFSD_FILE_HASHED (0) > #define NFSD_FILE_PENDING (1) > -#define NFSD_FILE_REFERENCED (2) > -#define NFSD_FILE_GC (3) > +#define NFSD_FILE_GC (2) > unsigned long nf_flags; > refcount_t nf_ref; > unsigned char nf_may; > > struct nfsd_file_mark *nf_mark; > struct list_head nf_lru; > - struct list_head nf_gc; > struct rcu_head nf_rcu; > ktime_t nf_birthtime; > }; > diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h > index ad2c0c432d08..efa683541ed5 100644 > --- a/fs/nfsd/trace.h > +++ b/fs/nfsd/trace.h > @@ -1038,7 +1038,6 @@ DEFINE_CLID_EVENT(confirmed_r); > __print_flags(val, "|", \ > { 1 << NFSD_FILE_HASHED, "HASHED" }, \ > { 1 << NFSD_FILE_PENDING, "PENDING" }, \ > - { 1 << NFSD_FILE_REFERENCED, "REFERENCED" }, \ > { 1 << NFSD_FILE_GC, "GC" }) > > DECLARE_EVENT_CLASS(nfsd_file_class, > @@ -1314,9 +1313,7 @@ DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_add); > DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_add_disposed); > DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_del); > DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_del_disposed); > -DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_in_use); > DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_writeback); > -DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_referenced); > DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_disposed); > > DECLARE_EVENT_CLASS(nfsd_file_lruwalk_class, > @@ -1345,7 +1342,6 @@ DEFINE_EVENT(nfsd_file_lruwalk_class, name, \ > ), \ > TP_ARGS(removed, remaining)) > > -DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_gc_removed); > DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_shrinker_removed); > > TRACE_EVENT(nfsd_file_close, -- Jeff Layton <jlayton@xxxxxxxxxx>