Re: [PATCH v2] nfsd: don't fsync nfsd_files on last close

Chuck Lever III <chuck.lever@xxxxxxxxxx> · Tue, 7 Feb 2023 17:34:09 +0000

> On Feb 7, 2023, at 12:02 PM, Jeff Layton <jlayton@xxxxxxxxxx> wrote:
> 
> Most of the time, NFSv4 clients issue a COMMIT before the final CLOSE of
> an open stateid, so with NFSv4, the fsync in the nfsd_file_free path is
> usually a no-op and doesn't block.
> 
> We have a customer running knfsd over very slow storage (XFS over Ceph
> RBD). They were using the "async" export option because performance was
> more important than data integrity for this application. That export
> option turns NFSv4 COMMIT calls into no-ops. Due to the fsync in this
> codepath however, their final CLOSE calls would still stall (since a
> CLOSE effectively became a COMMIT).
> 
> I think this fsync is not strictly necessary. We only use that result to
> reset the write verifier. Instead of fsync'ing all of the data when we
> free an nfsd_file, we can just check for writeback errors when one is
> acquired and when it is freed.
> 
> If the client never comes back, then it'll never see the error anyway
> and there is no point in resetting it. If an error occurs after the
> nfsd_file is removed from the cache but before the inode is evicted,
> then it will reset the write verifier on the next nfsd_file_acquire,
> (since there will be an unseen error).
> 
> The only exception here is if something else opens and fsyncs the file
> during that window. Given that local applications work with this
> limitation today, I don't see that as an issue.
> 
> Link: https://bugzilla.redhat.com/show_bug.cgi?id=2166658
> Fixes: ac3a2585f018 (nfsd: rework refcounting in filecache)
> Reported-and-Tested-by: Pierguido Lambri <plambri@xxxxxxxxxx>
> Signed-off-by: Jeff Layton <jlayton@xxxxxxxxxx>
> ---
> fs/nfsd/filecache.c | 44 ++++++++++++--------------------------------
> fs/nfsd/trace.h     | 31 -------------------------------
> 2 files changed, 12 insertions(+), 63 deletions(-)
> 
> v2:
> - rebase onto nfsd-next branch
> - add Fixes and Link tags

v2 applied internally for testing. Should make it out to
nfsd-next in a few days, and then appear in the v6.3 NFSD
PR in a couple weeks.

> diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
> index ecc32105c3ab..6e8712bd7c99 100644
> --- a/fs/nfsd/filecache.c
> +++ b/fs/nfsd/filecache.c
> @@ -331,37 +331,27 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
> 	return nf;
> }
> 
> +/**
> + * nfsd_file_check_write_error - check for writeback errors on a file
> + * @nf: nfsd_file to check for writeback errors
> + *
> + * Check whether a nfsd_file has an unseen error. Reset the write
> + * verifier if so.
> + */
> static void
> -nfsd_file_fsync(struct nfsd_file *nf)
> -{
> -	struct file *file = nf->nf_file;
> -	int ret;
> -
> -	if (!file || !(file->f_mode & FMODE_WRITE))
> -		return;
> -	ret = vfs_fsync(file, 1);
> -	trace_nfsd_file_fsync(nf, ret);
> -	if (ret)
> -		nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
> -}
> -
> -static int
> nfsd_file_check_write_error(struct nfsd_file *nf)
> {
> 	struct file *file = nf->nf_file;
> 
> -	if (!file || !(file->f_mode & FMODE_WRITE))
> -		return 0;
> -	return filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err));
> +	if ((file->f_mode & FMODE_WRITE) &&
> +	    filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err)))
> +		nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
> }
> 
> static void
> nfsd_file_hash_remove(struct nfsd_file *nf)
> {
> 	trace_nfsd_file_unhash(nf);
> -
> -	if (nfsd_file_check_write_error(nf))
> -		nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
> 	rhashtable_remove_fast(&nfsd_file_rhash_tbl, &nf->nf_rhash,
> 			       nfsd_file_rhash_params);
> }
> @@ -387,23 +377,12 @@ nfsd_file_free(struct nfsd_file *nf)
> 	this_cpu_add(nfsd_file_total_age, age);
> 
> 	nfsd_file_unhash(nf);
> -
> -	/*
> -	 * We call fsync here in order to catch writeback errors. It's not
> -	 * strictly required by the protocol, but an nfsd_file could get
> -	 * evicted from the cache before a COMMIT comes in. If another
> -	 * task were to open that file in the interim and scrape the error,
> -	 * then the client may never see it. By calling fsync here, we ensure
> -	 * that writeback happens before the entry is freed, and that any
> -	 * errors reported result in the write verifier changing.
> -	 */
> -	nfsd_file_fsync(nf);
> -
> 	if (nf->nf_mark)
> 		nfsd_file_mark_put(nf->nf_mark);
> 	if (nf->nf_file) {
> 		get_file(nf->nf_file);
> 		filp_close(nf->nf_file, NULL);
> +		nfsd_file_check_write_error(nf);
> 		fput(nf->nf_file);
> 	}
> 
> @@ -1158,6 +1137,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
> out:
> 	if (status == nfs_ok) {
> 		this_cpu_inc(nfsd_file_acquisitions);
> +		nfsd_file_check_write_error(nf);
> 		*pnf = nf;
> 	} else {
> 		if (refcount_dec_and_test(&nf->nf_ref))
> diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> index 8f9c82d9e075..4183819ea082 100644
> --- a/fs/nfsd/trace.h
> +++ b/fs/nfsd/trace.h
> @@ -1202,37 +1202,6 @@ TRACE_EVENT(nfsd_file_close,
> 	)
> );
> 
> -TRACE_EVENT(nfsd_file_fsync,
> -	TP_PROTO(
> -		const struct nfsd_file *nf,
> -		int ret
> -	),
> -	TP_ARGS(nf, ret),
> -	TP_STRUCT__entry(
> -		__field(void *, nf_inode)
> -		__field(int, nf_ref)
> -		__field(int, ret)
> -		__field(unsigned long, nf_flags)
> -		__field(unsigned char, nf_may)
> -		__field(struct file *, nf_file)
> -	),
> -	TP_fast_assign(
> -		__entry->nf_inode = nf->nf_inode;
> -		__entry->nf_ref = refcount_read(&nf->nf_ref);
> -		__entry->ret = ret;
> -		__entry->nf_flags = nf->nf_flags;
> -		__entry->nf_may = nf->nf_may;
> -		__entry->nf_file = nf->nf_file;
> -	),
> -	TP_printk("inode=%p ref=%d flags=%s may=%s nf_file=%p ret=%d",
> -		__entry->nf_inode,
> -		__entry->nf_ref,
> -		show_nf_flags(__entry->nf_flags),
> -		show_nfsd_may_flags(__entry->nf_may),
> -		__entry->nf_file, __entry->ret
> -	)
> -);
> -
> #include "cache.h"
> 
> TRACE_DEFINE_ENUM(RC_DROPIT);
> -- 
> 2.39.1
> 

--
Chuck Lever