Re: [PATCH v4 3/5] nfsd: rework refcounting in filecache

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, 2022-10-31 at 20:45 +0000, Chuck Lever III wrote:
> 
> > On Oct 31, 2022, at 7:37 AM, Jeff Layton <jlayton@xxxxxxxxxx> wrote:
> > 
> > The filecache refcounting is a bit non-standard for something searchable
> > by RCU, in that we maintain a sentinel reference while it's hashed. This
> > in turn requires that we have to do things differently in the "put"
> > depending on whether its hashed, which we believe to have led to races.
> > 
> > There are other problems in here too. nfsd_file_close_inode_sync can end
> > up freeing an nfsd_file while there are still outstanding references to
> > it, and there are a number of subtle ToC/ToU races.
> > 
> > Rework the code so that the refcount is what drives the lifecycle. When
> > the refcount goes to zero, then unhash and rcu free the object.
> > 
> > With this change, the LRU carries a reference. Take special care to
> > deal with it when removing an entry from the list.
> 
> ....removing an entry from the dispose list, or the LRU?
> 
> I spent today reviewing and testing this series, and
> comparing its behavior to the behavior of the current
> code base. A few general comments below:
> 

Thanks for the review!

> 
> > Signed-off-by: Jeff Layton <jlayton@xxxxxxxxxx>
> > ---
> > fs/nfsd/filecache.c | 247 +++++++++++++++++++++-----------------------
> > fs/nfsd/trace.h     |   1 +
> > 2 files changed, 121 insertions(+), 127 deletions(-)
> > 
> > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> > index ce80b6a80ffc..d928c5e38eeb 100644
> > --- a/fs/nfsd/filecache.c
> > +++ b/fs/nfsd/filecache.c
> > @@ -1,6 +1,12 @@
> > // SPDX-License-Identifier: GPL-2.0
> > /*
> >  * The NFSD open file cache.
> > + *
> > + * Each nfsd_file is created in response to client activity -- either regular
> > + * file I/O for v2/v3, or opening a file for v4. Files opened via v4 are
> > + * cleaned up as soon as their refcount goes to 0.  Entries for v2/v3 are
> > + * flagged with NFSD_FILE_GC. On their last put, they are added to the LRU for
> > + * eventual disposal if they aren't used again within a short time period.
> 
> Trond and I discussed intentionally not using the terms "v4-only"
> and "v2/3" in the filecache comments and function names. That's
> why the flag is called GC and not, say, NFSV3. Can the new comments
> stick with gc and nogc?
> 

This comment is explaining that v2/v3 _are_ GC entries. I can just
remove this comment I guess, but converting all the mentions to gc/non-
gc in this comment will make it nonsensical.

> 
> >  */
> > 
> > #include <linux/hash.h>
> > @@ -301,8 +307,7 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
> > 		if (key->gc)
> > 			__set_bit(NFSD_FILE_GC, &nf->nf_flags);
> > 		nf->nf_inode = key->inode;
> > -		/* nf_ref is pre-incremented for hash table */
> > -		refcount_set(&nf->nf_ref, 2);
> > +		refcount_set(&nf->nf_ref, 1);
> > 		nf->nf_may = key->need;
> > 		nf->nf_mark = NULL;
> > 	}
> > @@ -351,23 +356,25 @@ nfsd_file_unhash(struct nfsd_file *nf)
> > 	return false;
> > }
> > 
> 
> There's still no explanation in the code of why writeback is
> needed for GC files. I can write a patch to add that, say,
> right here as a documenting comment for nfsd_file_free, if
> you agree.
> 
> 

I'll add something here.

> > -static bool
> > +static void
> > nfsd_file_free(struct nfsd_file *nf)
> > {
> > 	s64 age = ktime_to_ms(ktime_sub(ktime_get(), nf->nf_birthtime));
> > -	bool flush = false;
> > +
> > +	trace_nfsd_file_free(nf);
> 
> Maybe move this tracepoint in 2/5 instead? Or just leave it
> where it was. Up to you.
> 

Sure, done.

> 
> > 
> > 	this_cpu_inc(nfsd_file_releases);
> > 	this_cpu_add(nfsd_file_total_age, age);
> > 
> > -	trace_nfsd_file_free(nf);
> > +	nfsd_file_unhash(nf);
> > +	nfsd_file_fsync(nf);
> > +
> > 	if (nf->nf_mark)
> > 		nfsd_file_mark_put(nf->nf_mark);
> > 	if (nf->nf_file) {
> > 		get_file(nf->nf_file);
> > 		filp_close(nf->nf_file, NULL);
> > 		fput(nf->nf_file);
> > -		flush = true;
> > 	}
> > 
> > 	/*
> > @@ -375,10 +382,9 @@ nfsd_file_free(struct nfsd_file *nf)
> > 	 * WARN and leak it to preserve system stability.
> > 	 */
> > 	if (WARN_ON_ONCE(!list_empty(&nf->nf_lru)))
> > -		return flush;
> > +		return;
> > 
> > 	call_rcu(&nf->nf_rcu, nfsd_file_slab_free);
> > -	return flush;
> > }
> > 
> > static bool
> > @@ -394,17 +400,23 @@ nfsd_file_check_writeback(struct nfsd_file *nf)
> > 		mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK);
> > }
> > 
> > -static void nfsd_file_lru_add(struct nfsd_file *nf)
> > +static bool nfsd_file_lru_add(struct nfsd_file *nf)
> > {
> > 	set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
> > -	if (list_lru_add(&nfsd_file_lru, &nf->nf_lru))
> > +	if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) {
> > 		trace_nfsd_file_lru_add(nf);
> > +		return true;
> > +	}
> > +	return false;
> > }
> > 
> > -static void nfsd_file_lru_remove(struct nfsd_file *nf)
> > +static bool nfsd_file_lru_remove(struct nfsd_file *nf)
> > {
> > -	if (list_lru_del(&nfsd_file_lru, &nf->nf_lru))
> > +	if (list_lru_del(&nfsd_file_lru, &nf->nf_lru)) {
> > 		trace_nfsd_file_lru_del(nf);
> > +		return true;
> > +	}
> > +	return false;
> > }
> > 
> > struct nfsd_file *
> > @@ -415,86 +427,77 @@ nfsd_file_get(struct nfsd_file *nf)
> > 	return NULL;
> > }
> > 
> > -static void
> > +/**
> > + * nfsd_file_unhash_and_queue - unhash a file and queue it to the dispose list
> > + * @nf: nfsd_file to be unhashed and queued
> > + * @dispose: list to which it should be queued
> > + *
> > + * Attempt to unhash a nfsd_file and queue it to the given list. Each file
> > + * will have a reference held on behalf of the list. That reference may come
> > + * from the LRU, or we may need to take one. If we can't get a reference,
> > + * ignore it altogether.
> > + */
> > +static bool
> > nfsd_file_unhash_and_queue(struct nfsd_file *nf, struct list_head *dispose)
> > {
> > 	trace_nfsd_file_unhash_and_queue(nf);
> > 	if (nfsd_file_unhash(nf)) {
> > -		/* caller must call nfsd_file_dispose_list() later */
> > -		nfsd_file_lru_remove(nf);
> > +		/*
> > +		 * If we remove it from the LRU, then just use that
> > +		 * reference for the dispose list. Otherwise, we need
> > +		 * to take a reference. If that fails, just ignore
> > +		 * the file altogether.
> > +		 */
> > +		if (!nfsd_file_lru_remove(nf) && !nfsd_file_get(nf))
> > +			return false;
> > 		list_add(&nf->nf_lru, dispose);
> > +		return true;
> > 	}
> > +	return false;
> > }
> > 
> > -static void
> > -nfsd_file_put_noref(struct nfsd_file *nf)
> > -{
> > -	trace_nfsd_file_put(nf);
> > -
> > -	if (refcount_dec_and_test(&nf->nf_ref)) {
> > -		WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags));
> > -		nfsd_file_lru_remove(nf);
> > -		nfsd_file_free(nf);
> > -	}
> > -}
> > -
> > -static void
> > -nfsd_file_unhash_and_put(struct nfsd_file *nf)
> > -{
> > -	if (nfsd_file_unhash(nf))
> > -		nfsd_file_put_noref(nf);
> > -}
> > -
> > +/**
> > + * nfsd_file_put - put the reference to a nfsd_file
> > + * @nf: nfsd_file of which to put the reference
> > + *
> > + * Put a reference to a nfsd_file. In the v4 case, we just put the
> > + * reference immediately. In the v2/3 case, if the reference would be
> > + * the last one, the put it on the LRU instead to be cleaned up later.
> > + */
> > void
> > nfsd_file_put(struct nfsd_file *nf)
> > {
> > -	might_sleep();
> 
> The might_sleep() annotation should be added back to nfsd_file_fsync()
> or its callers. It might give more useful warnings if you were to
> annotate the external API functions (as was done here) rather than
> internal helper functions.
> 

Actually, it's probably better in nfsd_file_put(). That'll give us more
coverage.

> 
> > -
> > -	if (test_bit(NFSD_FILE_GC, &nf->nf_flags))
> > -		nfsd_file_lru_add(nf);
> > -	else if (refcount_read(&nf->nf_ref) == 2)
> > -		nfsd_file_unhash_and_put(nf);
> > -
> > -	if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
> > -		nfsd_file_fsync(nf);
> > -		nfsd_file_put_noref(nf);
> > -	} else if (nf->nf_file && test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
> > -		nfsd_file_put_noref(nf);
> > -		nfsd_file_schedule_laundrette();
> 
> This might be the only caller of nfsd_file_schedule_laundrette.
> How does the laundrette get scheduled now?
> 

Good point. I think we need to schedule it any time an entry is added to
the LRU.

> > -	} else
> > -		nfsd_file_put_noref(nf);
> > -}
> > -
> > -static void
> > -nfsd_file_dispose_list(struct list_head *dispose)
> > -{
> > -	struct nfsd_file *nf;
> > +	trace_nfsd_file_put(nf);
> > 
> > -	while(!list_empty(dispose)) {
> > -		nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
> > -		list_del_init(&nf->nf_lru);
> > -		nfsd_file_fsync(nf);
> > -		nfsd_file_put_noref(nf);
> > +	/*
> > +	 * The HASHED check is racy. We may end up with the occasional
> > +	 * unhashed entry on the LRU, but they should get cleaned up
> > +	 * like any other.
> > +	 */
> > +	if (test_bit(NFSD_FILE_GC, &nf->nf_flags) &&
> > +	    test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
> > +		/*
> > +		 * If this is the last reference (nf_ref == 1), then transfer
> > +		 * it to the LRU. If the add to the LRU fails, just put it as
> > +		 * usual.
> > +		 */
> > +		if (refcount_dec_not_one(&nf->nf_ref) || nfsd_file_lru_add(nf))
> > +			return;
> > 	}
> > +	if (refcount_dec_and_test(&nf->nf_ref))
> > +		nfsd_file_free(nf);
> > }
> > 
> > static void
> > -nfsd_file_dispose_list_sync(struct list_head *dispose)
> > +nfsd_file_dispose_list(struct list_head *dispose)
> > {
> > -	bool flush = false;
> > 	struct nfsd_file *nf;
> > 
> > 	while(!list_empty(dispose)) {
> > 		nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
> > 		list_del_init(&nf->nf_lru);
> > -		nfsd_file_fsync(nf);
> > -		if (!refcount_dec_and_test(&nf->nf_ref))
> > -			continue;
> > -		if (nfsd_file_free(nf))
> > -			flush = true;
> > +		nfsd_file_free(nf);
> > 	}
> > -	if (flush)
> > -		flush_delayed_fput();
> > }
> > 
> > static void
> > @@ -564,21 +567,8 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
> > 	struct list_head *head = arg;
> > 	struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru);
> > 
> > -	/*
> > -	 * Do a lockless refcount check. The hashtable holds one reference, so
> > -	 * we look to see if anything else has a reference, or if any have
> > -	 * been put since the shrinker last ran. Those don't get unhashed and
> > -	 * released.
> > -	 *
> > -	 * Note that in the put path, we set the flag and then decrement the
> > -	 * counter. Here we check the counter and then test and clear the flag.
> > -	 * That order is deliberate to ensure that we can do this locklessly.
> > -	 */
> > -	if (refcount_read(&nf->nf_ref) > 1) {
> > -		list_lru_isolate(lru, &nf->nf_lru);
> > -		trace_nfsd_file_gc_in_use(nf);
> > -		return LRU_REMOVED;
> > -	}
> > +	/* We should only be dealing with v2/3 entries here */
> > +	WARN_ON_ONCE(!test_bit(NFSD_FILE_GC, &nf->nf_flags));
> > 
> > 	/*
> > 	 * Don't throw out files that are still undergoing I/O or
> > @@ -589,40 +579,30 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
> > 		return LRU_SKIP;
> > 	}
> > 
> > +	/* If it was recently added to the list, skip it */
> > 	if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) {
> > 		trace_nfsd_file_gc_referenced(nf);
> > 		return LRU_ROTATE;
> > 	}
> > 
> > -	if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
> > -		trace_nfsd_file_gc_hashed(nf);
> > -		return LRU_SKIP;
> > +	/*
> > +	 * Put the reference held on behalf of the LRU. If it wasn't the last
> > +	 * one, then just remove it from the LRU and ignore it.
> > +	 */
> > +	if (!refcount_dec_and_test(&nf->nf_ref)) {
> > +		trace_nfsd_file_gc_in_use(nf);
> > +		list_lru_isolate(lru, &nf->nf_lru);
> > +		return LRU_REMOVED;
> > 	}
> > 
> > +	/* Refcount went to zero. Unhash it and queue it to the dispose list */
> > +	nfsd_file_unhash(nf);
> > 	list_lru_isolate_move(lru, &nf->nf_lru, head);
> > 	this_cpu_inc(nfsd_file_evictions);
> > 	trace_nfsd_file_gc_disposed(nf);
> > 	return LRU_REMOVED;
> > }
> > 
> > -/*
> > - * Unhash items on @dispose immediately, then queue them on the
> > - * disposal workqueue to finish releasing them in the background.
> > - *
> > - * cel: Note that between the time list_lru_shrink_walk runs and
> > - * now, these items are in the hash table but marked unhashed.
> > - * Why release these outside of lru_cb ? There's no lock ordering
> > - * problem since lru_cb currently takes no lock.
> > - */
> > -static void nfsd_file_gc_dispose_list(struct list_head *dispose)
> > -{
> > -	struct nfsd_file *nf;
> > -
> > -	list_for_each_entry(nf, dispose, nf_lru)
> > -		nfsd_file_hash_remove(nf);
> > -	nfsd_file_dispose_list_delayed(dispose);
> > -}
> > -
> > static void
> > nfsd_file_gc(void)
> > {
> > @@ -632,7 +612,7 @@ nfsd_file_gc(void)
> > 	ret = list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb,
> > 			    &dispose, list_lru_count(&nfsd_file_lru));
> > 	trace_nfsd_file_gc_removed(ret, list_lru_count(&nfsd_file_lru));
> > -	nfsd_file_gc_dispose_list(&dispose);
> > +	nfsd_file_dispose_list_delayed(&dispose);
> > }
> > 
> > static void
> > @@ -657,7 +637,7 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
> > 	ret = list_lru_shrink_walk(&nfsd_file_lru, sc,
> > 				   nfsd_file_lru_cb, &dispose);
> > 	trace_nfsd_file_shrinker_removed(ret, list_lru_count(&nfsd_file_lru));
> > -	nfsd_file_gc_dispose_list(&dispose);
> > +	nfsd_file_dispose_list_delayed(&dispose);
> > 	return ret;
> > }
> > 
> > @@ -668,8 +648,11 @@ static struct shrinker	nfsd_file_shrinker = {
> > };
> > 
> > /*
> > - * Find all cache items across all net namespaces that match @inode and
> > - * move them to @dispose. The lookup is atomic wrt nfsd_file_acquire().
> > + * Find all cache items across all net namespaces that match @inode, unhash
> > + * them, take references and then put them on @dispose if that was successful.
> > + *
> > + * The nfsd_file objects on the list will be unhashed, and each will have a
> > + * reference taken.
> >  */
> > static unsigned int
> > __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
> > @@ -687,8 +670,9 @@ __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
> > 				       nfsd_file_rhash_params);
> > 		if (!nf)
> > 			break;
> > -		nfsd_file_unhash_and_queue(nf, dispose);
> > -		count++;
> > +
> > +		if (nfsd_file_unhash_and_queue(nf, dispose))
> > +			count++;
> > 	} while (1);
> > 	rcu_read_unlock();
> > 	return count;
> > @@ -700,15 +684,25 @@ __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
> >  *
> >  * Unhash and put all cache item associated with @inode.
> >  */
> > -static void
> > +static unsigned int
> > nfsd_file_close_inode(struct inode *inode)
> > {
> > -	LIST_HEAD(dispose);
> > +	struct nfsd_file *nf;
> > 	unsigned int count;
> > +	LIST_HEAD(dispose);
> > 
> > 	count = __nfsd_file_close_inode(inode, &dispose);
> > 	trace_nfsd_file_close_inode(inode, count);
> > -	nfsd_file_dispose_list_delayed(&dispose);
> > +	if (count) {
> 
> Why is it necessary to check @count? I would think that
> the the empty list check would be sufficient.
> 
> 

It's not necessary. I'll change it.

> > +		while(!list_empty(&dispose)) {
> > +			nf = list_first_entry(&dispose, struct nfsd_file, nf_lru);
> > +			list_del_init(&nf->nf_lru);
> > +			trace_nfsd_file_closing(nf);
> > +			if (refcount_dec_and_test(&nf->nf_ref))
> > +				nfsd_file_free(nf);
> > +		}
> > +	}
> > +	return count;
> > }
> > 
> > /**
> > @@ -720,19 +714,15 @@ nfsd_file_close_inode(struct inode *inode)
> > void
> > nfsd_file_close_inode_sync(struct inode *inode)
> > {
> > -	LIST_HEAD(dispose);
> > -	unsigned int count;
> > -
> > -	count = __nfsd_file_close_inode(inode, &dispose);
> > -	trace_nfsd_file_close_inode_sync(inode, count);
> > -	nfsd_file_dispose_list_sync(&dispose);
> > +	if (nfsd_file_close_inode(inode))
> > +		flush_delayed_fput();
> > }
> > 
> > /**
> >  * nfsd_file_delayed_close - close unused nfsd_files
> >  * @work: dummy
> >  *
> > - * Walk the LRU list and close any entries that have not been used since
> > + * Walk the LRU list and destroy any entries that have not been used since
> >  * the last scan.
> >  */
> > static void
> > @@ -1054,8 +1044,10 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > 	rcu_read_lock();
> > 	nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key,
> > 			       nfsd_file_rhash_params);
> > -	if (nf)
> > -		nf = nfsd_file_get(nf);
> > +	if (nf) {
> > +		if (!nfsd_file_lru_remove(nf))
> > +			nf = nfsd_file_get(nf);
> > +	}
> > 	rcu_read_unlock();
> > 	if (nf)
> > 		goto wait_for_construction;
> > @@ -1090,11 +1082,11 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > 			goto out;
> > 		}
> > 		open_retry = false;
> > -		nfsd_file_put_noref(nf);
> > +		if (refcount_dec_and_test(&nf->nf_ref))
> > +			nfsd_file_free(nf);
> > 		goto retry;
> > 	}
> > 
> > -	nfsd_file_lru_remove(nf);
> > 	this_cpu_inc(nfsd_file_cache_hits);
> > 
> > 	status = nfserrno(nfsd_open_break_lease(file_inode(nf->nf_file), may_flags));
> > @@ -1104,7 +1096,8 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > 			this_cpu_inc(nfsd_file_acquisitions);
> > 		*pnf = nf;
> > 	} else {
> > -		nfsd_file_put(nf);
> > +		if (refcount_dec_and_test(&nf->nf_ref))
> > +			nfsd_file_free(nf);
> > 		nf = NULL;
> > 	}
> > 
> > @@ -1131,7 +1124,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> > 	 * then unhash.
> > 	 */
> > 	if (status != nfs_ok || key.inode->i_nlink == 0)
> > -		nfsd_file_unhash_and_put(nf);
> > +		nfsd_file_unhash(nf);
> > 	clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags);
> > 	smp_mb__after_atomic();
> > 	wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);
> > diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> > index 940252482fd4..a44ded06af87 100644
> > --- a/fs/nfsd/trace.h
> > +++ b/fs/nfsd/trace.h
> > @@ -906,6 +906,7 @@ DEFINE_EVENT(nfsd_file_class, name, \
> > DEFINE_NFSD_FILE_EVENT(nfsd_file_free);
> > DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash);
> > DEFINE_NFSD_FILE_EVENT(nfsd_file_put);
> > +DEFINE_NFSD_FILE_EVENT(nfsd_file_closing);
> > DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_queue);
> > 
> > TRACE_EVENT(nfsd_file_alloc,
> > -- 
> > 2.38.1
> > 

Thanks again for the review!
-- 
Jeff Layton <jlayton@xxxxxxxxxx>




[Index of Archives]     [Linux Filesystem Development]     [Linux USB Development]     [Linux Media Development]     [Video for Linux]     [Linux NILFS]     [Linux Audio Users]     [Yosemite Info]     [Linux SCSI]

  Powered by Linux