Re: [PATCH 3/4] nfsd: filecache: change garbage collection list management.

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, 2025-01-22 at 14:54 +1100, NeilBrown wrote:
> The nfsd filecache currently uses list_lru for tracking files recently
> used in NFSv3 requests which need to be "garbage collected" when they
> have become idle - unused for 2-4 seconds.
> 
> I do not believe list_lru is a good tool for this.  It does not allow the
> timeout which filecache requires so we have to add a timeout mechanism
> which holds the list_lru lock while the whole list is scanned looking for
> entries that haven't been recently accessed.  When the list is largish
> (even a few hundred) this can block new requests which need the lock to
> remove a file to access it.
> 
> This patch removes the list_lru and instead uses 2 simple linked lists.
> When a file is accessed it is removed from whichever list it is on,
> then added to the tail of the first list.  Every 2 seconds the second
> list is moved to the "freeme" list and the first list is moved to the
> second list.  This avoids any need to walk a list to find old entries.
> 
> Previously a file would be unhashed before being moved to the freeme
> list.  We don't do that any more.  The freeme list is much like the
> other two lists (recent and older) in that they all hold a reference to
> the file and the file is still hashed.  When the nfsd thread processes
> the freeme list it now uses the new nfsd_file_release_list() which uses
> nfsd_file_cond_queue() to unhash and drop the refcount.
> 
> We no longer have a precise count of the size of the lru (recent +
> older) as we don't know how big "older" is when it is moved to "freeme".
> However the shrinker can cope with an approximation.  So we keep a
> count of the number of files in the lru and when "older" is moved to
> "freeme" we divide that count by 2.  When we remove anything from the
> lru we decrement that counter but ensure it never goes negative.
> Naturally when we add to the lru we increase the counter.
> 
> For the filecache stats file, which assumes a global lru, we keep a
> separate counter which includes all files in all netns in recent or
> older or freeme.
> 
> We discard the nf_gc linkage in an nfsd_file and only use nf_lru.
> We discard NFSD_FILE_REFERENCED.
> 
> This patch drops the nfsd_file_gc_removed() trace point.  I couldn't
> think of useful information to provide.
> 
> Signed-off-by: NeilBrown <neilb@xxxxxxx>
> ---
>  fs/nfsd/filecache.c | 215 ++++++++++++++++++++++----------------------
>  fs/nfsd/filecache.h |   4 +-
>  fs/nfsd/trace.h     |   4 -
>  3 files changed, 108 insertions(+), 115 deletions(-)
> 

I like the basic approach! This is a better fit than list_lru.

> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> index 4f39f6632b35..552feba94f09 100644
> --- a/fs/nfsd/filecache.c
> +++ b/fs/nfsd/filecache.c
> @@ -34,7 +34,6 @@
>  #include <linux/file.h>
>  #include <linux/pagemap.h>
>  #include <linux/sched.h>
> -#include <linux/list_lru.h>
>  #include <linux/fsnotify_backend.h>
>  #include <linux/fsnotify.h>
>  #include <linux/seq_file.h>
> @@ -63,10 +62,13 @@ static DEFINE_PER_CPU(unsigned long, nfsd_file_evictions);
>  
>  struct nfsd_fcache_disposal {
>  	spinlock_t lock;
> -	struct list_lru file_lru;
> -	struct list_head freeme;
> +	struct list_head recent; /* have been used in last 0-2 seconds */
> +	struct list_head older;	/* haven't been used in last 0-2 seconds */
> +	struct list_head freeme; /* ready to be discarded */
> +	unsigned long num_gc; /* Approximate size of recent plus older */
>  	struct delayed_work filecache_laundrette;
>  	struct shrinker *file_shrinker;
> +	struct nfsd_net *nn;
>  };
>  
>  static struct kmem_cache		*nfsd_file_slab;
> @@ -227,7 +229,6 @@ nfsd_file_alloc(struct net *net, struct inode *inode, unsigned char need,
>  
>  	this_cpu_inc(nfsd_file_allocations);
>  	INIT_LIST_HEAD(&nf->nf_lru);
> -	INIT_LIST_HEAD(&nf->nf_gc);
>  	nf->nf_birthtime = ktime_get();
>  	nf->nf_file = NULL;
>  	nf->nf_cred = get_current_cred();
> @@ -332,12 +333,16 @@ static bool nfsd_file_lru_add(struct nfsd_file *nf)
>  	struct nfsd_net *nn = net_generic(nf->nf_net, nfsd_net_id);
>  	struct nfsd_fcache_disposal *l = nn->fcache_disposal;
>  
> -	set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
> -	if (list_lru_add_obj(&l->file_lru, &nf->nf_lru)) {
> +	spin_lock(&l->lock);
> +	if (list_empty(&nf->nf_lru)) {
> +		list_add_tail(&nf->nf_lru, &l->recent);
> +		l->num_gc += 1;
>  		atomic_long_inc(&nfsd_lru_total);
>  		trace_nfsd_file_lru_add(nf);
> +		spin_unlock(&l->lock);
>  		return true;
>  	}
> +	spin_unlock(&l->lock);
>  	return false;
>  }
>  
> @@ -346,11 +351,17 @@ static bool nfsd_file_lru_remove(struct nfsd_file *nf)
>  	struct nfsd_net *nn = net_generic(nf->nf_net, nfsd_net_id);
>  	struct nfsd_fcache_disposal *l = nn->fcache_disposal;
>  
> -	if (list_lru_del_obj(&l->file_lru, &nf->nf_lru)) {
> +	spin_lock(&l->lock);
> +	if (!list_empty(&nf->nf_lru)) {
> +		list_del_init(&nf->nf_lru);
>  		atomic_long_dec(&nfsd_lru_total);
> +		if (l->num_gc > 0)
> +			l->num_gc -= 1;
>  		trace_nfsd_file_lru_del(nf);
> +		spin_unlock(&l->lock);
>  		return true;
>  	}
> +	spin_unlock(&l->lock);
>  	return false;
>  }
>  
> @@ -440,12 +451,26 @@ nfsd_file_dispose_list(struct list_head *dispose)
>  	struct nfsd_file *nf;
>  
>  	while (!list_empty(dispose)) {
> -		nf = list_first_entry(dispose, struct nfsd_file, nf_gc);
> -		list_del_init(&nf->nf_gc);
> +		nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
> +		list_del_init(&nf->nf_lru);
>  		nfsd_file_free(nf);
>  	}
>  }
>  
> +static void
> +nfsd_file_cond_queue(struct nfsd_file *nf, struct list_head *dispose);
> +
> +static void
> +nfsd_file_release_list(struct list_head *dispose)
> +{
> +	LIST_HEAD(dispose2);
> +	struct nfsd_file *nf, *nf2;
> +
> +	list_for_each_entry_safe(nf, nf2, dispose, nf_lru)
> +		nfsd_file_cond_queue(nf, &dispose2);
> +	nfsd_file_dispose_list(&dispose2);
> +}
> +
>  /**
>   * nfsd_file_dispose_list_delayed - move list of dead files to net's freeme list
>   * @dispose: list of nfsd_files to be disposed
> @@ -458,12 +483,12 @@ nfsd_file_dispose_list_delayed(struct list_head *dispose)
>  {
>  	while(!list_empty(dispose)) {
>  		struct nfsd_file *nf = list_first_entry(dispose,
> -						struct nfsd_file, nf_gc);
> +						struct nfsd_file, nf_lru);
>  		struct nfsd_net *nn = net_generic(nf->nf_net, nfsd_net_id);
>  		struct nfsd_fcache_disposal *l = nn->fcache_disposal;
>  
>  		spin_lock(&l->lock);
> -		list_move_tail(&nf->nf_gc, &l->freeme);
> +		list_move_tail(&nf->nf_lru, &l->freeme);
>  		spin_unlock(&l->lock);
>  		svc_wake_up(nn->nfsd_serv);
>  	}
> @@ -487,88 +512,32 @@ void nfsd_file_net_dispose(struct nfsd_net *nn)
>  		int i;
>  
>  		spin_lock(&l->lock);
> -		for (i = 0; i < 8 && !list_empty(&l->freeme); i++)
> -			list_move(l->freeme.next, &dispose);

While you're in here, could you document why we only take 8 at a time?
Maybe even consider turning it into a named constant?

> +		for (i = 0; i < 8 && !list_empty(&l->freeme); i++) {
> +			struct nfsd_file *nf = list_first_entry(
> +				&l->freeme, struct nfsd_file, nf_lru);
> +
> +			/*
> +			 * Don't throw out files that are still
> +			 * undergoing I/O or that have uncleared errors
> +			 * pending.
> +			 */
> +			if (nfsd_file_check_writeback(nf)) {
> +				trace_nfsd_file_gc_writeback(nf);
> +				list_move(&nf->nf_lru, &l->recent);
> +				l->num_gc += 1;
> +			} else {
> +				trace_nfsd_file_gc_disposed(nf);
> +				list_move(&nf->nf_lru, &dispose);
> +				this_cpu_inc(nfsd_file_evictions);
> +			}
> +		}
>  		spin_unlock(&l->lock);
>  		if (!list_empty(&l->freeme))
>  			/* Wake up another thread to share the work
>  			 * *before* doing any actual disposing.
>  			 */
>  			svc_wake_up(nn->nfsd_serv);
> -		nfsd_file_dispose_list(&dispose);
> -	}
> -}
> -
> -/**
> - * nfsd_file_lru_cb - Examine an entry on the LRU list
> - * @item: LRU entry to examine
> - * @lru: controlling LRU
> - * @arg: dispose list
> - *
> - * Return values:
> - *   %LRU_REMOVED: @item was removed from the LRU
> - *   %LRU_ROTATE: @item is to be moved to the LRU tail
> - *   %LRU_SKIP: @item cannot be evicted
> - */
> -static enum lru_status
> -nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
> -		 void *arg)
> -{
> -	struct list_head *head = arg;
> -	struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru);
> -
> -	/* We should only be dealing with GC entries here */
> -	WARN_ON_ONCE(!test_bit(NFSD_FILE_GC, &nf->nf_flags));
> -
> -	/*
> -	 * Don't throw out files that are still undergoing I/O or
> -	 * that have uncleared errors pending.
> -	 */
> -	if (nfsd_file_check_writeback(nf)) {
> -		trace_nfsd_file_gc_writeback(nf);
> -		return LRU_SKIP;
> -	}
> -
> -	/* If it was recently added to the list, skip it */
> -	if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) {
> -		trace_nfsd_file_gc_referenced(nf);
> -		return LRU_ROTATE;
> -	}
> -
> -	/*
> -	 * Put the reference held on behalf of the LRU. If it wasn't the last
> -	 * one, then just remove it from the LRU and ignore it.
> -	 */
> -	if (!refcount_dec_and_test(&nf->nf_ref)) {
> -		trace_nfsd_file_gc_in_use(nf);
> -		list_lru_isolate(lru, &nf->nf_lru);
> -		return LRU_REMOVED;
> -	}
> -
> -	/* Refcount went to zero. Unhash it and queue it to the dispose list */
> -	nfsd_file_unhash(nf);
> -	list_lru_isolate(lru, &nf->nf_lru);
> -	list_add(&nf->nf_gc, head);
> -	this_cpu_inc(nfsd_file_evictions);
> -	trace_nfsd_file_gc_disposed(nf);
> -	return LRU_REMOVED;
> -}
> -
> -static void
> -nfsd_file_gc(struct nfsd_fcache_disposal *l)
> -{
> -	unsigned long remaining = list_lru_count(&l->file_lru);
> -	LIST_HEAD(dispose);
> -	unsigned long ret;
> -
> -	while (remaining > 0) {
> -		unsigned long num_to_scan = min(remaining, NFSD_FILE_GC_BATCH);
> -
> -		ret = list_lru_walk(&l->file_lru, nfsd_file_lru_cb,
> -				    &dispose, num_to_scan);
> -		trace_nfsd_file_gc_removed(ret, list_lru_count(&l->file_lru));
> -		nfsd_file_dispose_list_delayed(&dispose);
> -		remaining -= num_to_scan;
> +		nfsd_file_release_list(&dispose);
>  	}
>  }
>  
> @@ -577,9 +546,20 @@ nfsd_file_gc_worker(struct work_struct *work)
>  {
>  	struct nfsd_fcache_disposal *l = container_of(
>  		work, struct nfsd_fcache_disposal, filecache_laundrette.work);
> -	nfsd_file_gc(l);
> -	if (list_lru_count(&l->file_lru))
> +
> +	spin_lock(&l->lock);
> +	list_splice_init(&l->older, &l->freeme);
> +	list_splice_init(&l->recent, &l->older);
> +	/* We don't know how many were moved to 'freeme' and don't want
> +	 * to waste time counting - guess a half.
> +	 */
> +	l->num_gc /= 2;

Given that you have to manipulate the lists under a spinlock, it
wouldn't be difficult or expensive to keep accurate counts. nfsd
workloads can be "spiky", so it seems like this method may be wildly
inaccurate at times.

> +	if (!list_empty(&l->freeme))
> +		svc_wake_up(l->nn->nfsd_serv);
> +	if (!list_empty(&l->older) || !list_empty(&l->recent))
>  		nfsd_file_schedule_laundrette(l);
> +	spin_unlock(&l->lock);
> +

nit: remove the extra newline above

>  }
>  
>  static unsigned long
> @@ -587,22 +567,40 @@ nfsd_file_lru_count(struct shrinker *s, struct shrink_control *sc)
>  {
>  	struct nfsd_fcache_disposal *l = s->private_data;
>  
> -	return list_lru_count(&l->file_lru);
> +	return l->num_gc;
>  }
>  
>  static unsigned long
>  nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
>  {
>  	struct nfsd_fcache_disposal *l = s->private_data;
> +	struct nfsd_file *nf;
> +	int scanned = 0;
> +
> +	spin_lock(&l->lock);
> +	while (scanned < sc->nr_to_scan &&
> +	       (nf = list_first_entry_or_null(&l->older,
> +					      struct nfsd_file, nf_lru)) != NULL) {
> +		list_del_init(&nf->nf_lru);
> +		list_add_tail(&nf->nf_lru, &l->freeme);
> +		if (l->num_gc > 0)
> +			l->num_gc -= 1;
> +		scanned += 1;
> +	}
> +	if (scanned > 0)
> +		svc_wake_up(l->nn->nfsd_serv);
>  
> -	LIST_HEAD(dispose);
> -	unsigned long ret;
> +	trace_nfsd_file_shrinker_removed(scanned, l->num_gc);
>  
> -	ret = list_lru_shrink_walk(&l->file_lru, sc,
> -				   nfsd_file_lru_cb, &dispose);
> -	trace_nfsd_file_shrinker_removed(ret, list_lru_count(&l->file_lru));
> -	nfsd_file_dispose_list_delayed(&dispose);
> -	return ret;
> +	while (scanned < sc->nr_to_scan &&
> +	       (nf = list_first_entry_or_null(&l->recent,
> +					      struct nfsd_file, nf_lru)) != NULL) {
> +		list_del_init(&nf->nf_lru);
> +		list_add_tail(&nf->nf_lru, &l->older);
> +		scanned += 1;
> +	}
> +	spin_unlock(&l->lock);
> +	return scanned;
>  }
>  
>  /**
> @@ -615,7 +613,6 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
>   */
>  static void
>  nfsd_file_cond_queue(struct nfsd_file *nf, struct list_head *dispose)
> -	__must_hold(RCU)
>  {
>  	int decrement = 1;
>  
> @@ -633,7 +630,7 @@ nfsd_file_cond_queue(struct nfsd_file *nf, struct list_head *dispose)
>  
>  	/* If refcount goes to 0, then put on the dispose list */
>  	if (refcount_sub_and_test(decrement, &nf->nf_ref)) {
> -		list_add(&nf->nf_gc, dispose);
> +		list_add(&nf->nf_lru, dispose);
>  		trace_nfsd_file_closing(nf);
>  	}
>  }
> @@ -858,7 +855,10 @@ nfsd_alloc_fcache_disposal(void)
>  		return NULL;
>  	spin_lock_init(&l->lock);
>  	INIT_DELAYED_WORK(&l->filecache_laundrette, nfsd_file_gc_worker);
> +	INIT_LIST_HEAD(&l->recent);
> +	INIT_LIST_HEAD(&l->older);
>  	INIT_LIST_HEAD(&l->freeme);
> +	l->num_gc = 0;
>  	l->file_shrinker = shrinker_alloc(0, "nfsd-filecache");
>  	if (!l->file_shrinker) {
>  		pr_err("nfsd: failed to allocate nfsd_file_shrinker\n");
> @@ -870,13 +870,6 @@ nfsd_alloc_fcache_disposal(void)
>  	l->file_shrinker->seeks = 1;
>  	l->file_shrinker->private_data = l;
>  
> -	if (list_lru_init(&l->file_lru)) {
> -		pr_err("nfsd: failed to init nfsd_file_lru\n");
> -		shrinker_free(l->file_shrinker);
> -		kfree(l);
> -		return NULL;
> -	}
> -
>  	shrinker_register(l->file_shrinker);
>  	return l;
>  }
> @@ -886,8 +879,12 @@ nfsd_free_fcache_disposal(struct nfsd_fcache_disposal *l)
>  {
>  	cancel_delayed_work_sync(&l->filecache_laundrette);
>  	shrinker_free(l->file_shrinker);
> -	list_lru_destroy(&l->file_lru);
> -	nfsd_file_dispose_list(&l->freeme);
> +	nfsd_file_release_list(&l->recent);
> +	WARN_ON_ONCE(!list_empty(&l->recent));
> +	nfsd_file_release_list(&l->older);
> +	WARN_ON_ONCE(!list_empty(&l->older));
> +	nfsd_file_release_list(&l->freeme);
> +	WARN_ON_ONCE(!list_empty(&l->freeme));
>  	kfree(l);
>  }
>  
> @@ -906,6 +903,8 @@ nfsd_file_cache_start_net(struct net *net)
>  	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
>  
>  	nn->fcache_disposal = nfsd_alloc_fcache_disposal();
> +	if (nn->fcache_disposal)
> +		nn->fcache_disposal->nn = nn;
>  	return nn->fcache_disposal ? 0 : -ENOMEM;
>  }
>  
> diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
> index a09a851d510e..02059b6c5e9a 100644
> --- a/fs/nfsd/filecache.h
> +++ b/fs/nfsd/filecache.h
> @@ -42,15 +42,13 @@ struct nfsd_file {
>  	struct net		*nf_net;
>  #define NFSD_FILE_HASHED	(0)
>  #define NFSD_FILE_PENDING	(1)
> -#define NFSD_FILE_REFERENCED	(2)
> -#define NFSD_FILE_GC		(3)
> +#define NFSD_FILE_GC		(2)
>  	unsigned long		nf_flags;
>  	refcount_t		nf_ref;
>  	unsigned char		nf_may;
>  
>  	struct nfsd_file_mark	*nf_mark;
>  	struct list_head	nf_lru;
> -	struct list_head	nf_gc;
>  	struct rcu_head		nf_rcu;
>  	ktime_t			nf_birthtime;
>  };
> diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> index ad2c0c432d08..efa683541ed5 100644
> --- a/fs/nfsd/trace.h
> +++ b/fs/nfsd/trace.h
> @@ -1038,7 +1038,6 @@ DEFINE_CLID_EVENT(confirmed_r);
>  	__print_flags(val, "|",						\
>  		{ 1 << NFSD_FILE_HASHED,	"HASHED" },		\
>  		{ 1 << NFSD_FILE_PENDING,	"PENDING" },		\
> -		{ 1 << NFSD_FILE_REFERENCED,	"REFERENCED" },		\
>  		{ 1 << NFSD_FILE_GC,		"GC" })
>  
>  DECLARE_EVENT_CLASS(nfsd_file_class,
> @@ -1314,9 +1313,7 @@ DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_add);
>  DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_add_disposed);
>  DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_del);
>  DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_del_disposed);
> -DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_in_use);
>  DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_writeback);
> -DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_referenced);
>  DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_disposed);
>  
>  DECLARE_EVENT_CLASS(nfsd_file_lruwalk_class,
> @@ -1345,7 +1342,6 @@ DEFINE_EVENT(nfsd_file_lruwalk_class, name,				\
>  	),								\
>  	TP_ARGS(removed, remaining))
>  
> -DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_gc_removed);
>  DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_shrinker_removed);
>  
>  TRACE_EVENT(nfsd_file_close,



-- 
Jeff Layton <jlayton@xxxxxxxxxx>





[Index of Archives]     [Linux Filesystem Development]     [Linux USB Development]     [Linux Media Development]     [Video for Linux]     [Linux NILFS]     [Linux Audio Users]     [Yosemite Info]     [Linux SCSI]

  Powered by Linux