Re: [PATCH v6 6/9] nfsd: use the getattr operation to fetch i_version

Jeff Layton <jlayton@xxxxxxxxxx> · Wed, 05 Oct 2022 06:06:48 -0400

On Tue, 2022-10-04 at 10:39 +1100, NeilBrown wrote:
> On Fri, 30 Sep 2022, Jeff Layton wrote:
> > Now that we can call into vfs_getattr to get the i_version field, use
> > that facility to fetch it instead of doing it in nfsd4_change_attribute.
> > 
> > Neil also pointed out recently that IS_I_VERSION directory operations
> > are always logged, and so we only need to mitigate the rollback problem
> > on regular files. Also, we don't need to factor in the ctime when
> > reexporting NFS or Ceph.
> > 
> > Set the STATX_VERSION (and BTIME) bits in the request when we're dealing
> > with a v4 request. Then, instead of looking at IS_I_VERSION when
> > generating the change attr, look at the result mask and only use it if
> > STATX_VERSION is set. With this change, we can drop the fetch_iversion
> > export operation as well.
> > 
> > Move nfsd4_change_attribute into nfsfh.c, and change it to only factor
> > in the ctime if it's a regular file and the fs doesn't advertise
> > STATX_ATTR_VERSION_MONOTONIC.
> > 
> > Signed-off-by: Jeff Layton <jlayton@xxxxxxxxxx>
> > ---
> >  fs/nfs/export.c          |  7 -------
> >  fs/nfsd/nfs4xdr.c        |  4 +++-
> >  fs/nfsd/nfsfh.c          | 40 ++++++++++++++++++++++++++++++++++++++++
> >  fs/nfsd/nfsfh.h          | 29 +----------------------------
> >  fs/nfsd/vfs.h            |  7 ++++++-
> >  include/linux/exportfs.h |  1 -
> >  6 files changed, 50 insertions(+), 38 deletions(-)
> > 
> > diff --git a/fs/nfs/export.c b/fs/nfs/export.c
> > index 01596f2d0a1e..1a9d5aa51dfb 100644
> > --- a/fs/nfs/export.c
> > +++ b/fs/nfs/export.c
> > @@ -145,17 +145,10 @@ nfs_get_parent(struct dentry *dentry)
> >  	return parent;
> >  }
> >  
> > -static u64 nfs_fetch_iversion(struct inode *inode)
> > -{
> > -	nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
> > -	return inode_peek_iversion_raw(inode);
> > -}
> > -
> >  const struct export_operations nfs_export_ops = {
> >  	.encode_fh = nfs_encode_fh,
> >  	.fh_to_dentry = nfs_fh_to_dentry,
> >  	.get_parent = nfs_get_parent,
> > -	.fetch_iversion = nfs_fetch_iversion,
> >  	.flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK|
> >  		EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS|
> >  		EXPORT_OP_NOATOMIC_ATTR,
> > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> > index 1e9690a061ec..779c009314c6 100644
> > --- a/fs/nfsd/nfs4xdr.c
> > +++ b/fs/nfsd/nfs4xdr.c
> > @@ -2869,7 +2869,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
> >  			goto out;
> >  	}
> >  
> > -	err = vfs_getattr(&path, &stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT);
> > +	err = vfs_getattr(&path, &stat,
> > +			  STATX_BASIC_STATS | STATX_BTIME | STATX_VERSION,
> > +			  AT_STATX_SYNC_AS_STAT);
> >  	if (err)
> >  		goto out_nfserr;
> >  	if (!(stat.result_mask & STATX_BTIME))
> > diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
> > index a5b71526cee0..9168bc657378 100644
> > --- a/fs/nfsd/nfsfh.c
> > +++ b/fs/nfsd/nfsfh.c
> > @@ -634,6 +634,10 @@ void fh_fill_pre_attrs(struct svc_fh *fhp)
> >  		stat.mtime = inode->i_mtime;
> >  		stat.ctime = inode->i_ctime;
> >  		stat.size  = inode->i_size;
> > +		if (v4 && IS_I_VERSION(inode)) {
> > +			stat.version = inode_query_iversion(inode);
> > +			stat.result_mask |= STATX_VERSION;
> > +		}
> 
> This is increasingly ugly.  I wonder if it is justified at all...
> 

I'm fine with dropping that. So if the getattrs fail, we should just not
offer up pre/post attrs?

> >  	}
> >  	if (v4)
> >  		fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode);
> > @@ -665,6 +669,8 @@ void fh_fill_post_attrs(struct svc_fh *fhp)
> >  	if (err) {
> >  		fhp->fh_post_saved = false;
> >  		fhp->fh_post_attr.ctime = inode->i_ctime;
> > +		if (v4 && IS_I_VERSION(inode))
> > +			fhp->fh_post_attr.version = inode_query_iversion(inode);
> 
> ... ditto ...
> 
> >  	} else
> >  		fhp->fh_post_saved = true;
> >  	if (v4)
> > @@ -754,3 +760,37 @@ enum fsid_source fsid_source(const struct svc_fh *fhp)
> >  		return FSIDSOURCE_UUID;
> >  	return FSIDSOURCE_DEV;
> >  }
> > +
> > +/*
> > + * We could use i_version alone as the change attribute.  However, i_version
> > + * can go backwards on a regular file after an unclean shutdown.  On its own
> > + * that doesn't necessarily cause a problem, but if i_version goes backwards
> > + * and then is incremented again it could reuse a value that was previously
> > + * used before boot, and a client who queried the two values might incorrectly
> > + * assume nothing changed.
> > + *
> > + * By using both ctime and the i_version counter we guarantee that as long as
> > + * time doesn't go backwards we never reuse an old value. If the filesystem
> > + * advertises STATX_ATTR_VERSION_MONOTONIC, then this mitigation is not needed.
> > + *
> > + * We only need to do this for regular files as well. For directories, we
> > + * assume that the new change attr is always logged to stable storage in some
> > + * fashion before the results can be seen.
> > + */
> > +u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode)
> > +{
> > +	u64 chattr;
> > +
> > +	if (stat->result_mask & STATX_VERSION) {
> > +		chattr = stat->version;
> > +
> > +		if (S_ISREG(inode->i_mode) &&
> > +		    !(stat->attributes & STATX_ATTR_VERSION_MONOTONIC)) {
> 
> I would really rather that the fs got to make this decision.
> If it can guarantee that the i_version is monotonic even over a crash
> (which is probably can for directory, and might need changes to do for
> files) then it sets STATX_ATTR_VERSION_MONOTONIC and nfsd trusts it
> completely.
> If it cannot, then it doesn't set the flag.
> i.e. the S_ISREG() test should be in the filesystem, not in nfsd.
> 

This sounds reasonable, but for one thing.

>From RFC 7862:

   While Section 5.4 of [RFC5661] discusses
   per-file system attributes, it is expected that the value of
   change_attr_type will not depend on the value of "homogeneous" and
   will only change in the event of a migration.

The change_attr_type4 must be the same for all filehandles under a
particular filesystem.

If we do what you suggest though, then it's easily possible for the fs
to set STATX_ATTR_VERSION_MONOTONIC on directories but not files. If we
later want to allow nfsd to advertise a change_attr_type4, we won't be
able to rely on the STATX_ATTR_VERSION_MONOTONIC to tell us how to fill
that out.

Maybe that's ok. I suppose we could add a new field to the export
options that filesystems can set to advertise what sort of change attr
they offer?

> 
> > +			chattr += (u64)stat->ctime.tv_sec << 30;
> > +			chattr += stat->ctime.tv_nsec;
> > +		}
> > +	} else {
> > +		chattr = time_to_chattr(&stat->ctime);
> > +	}
> > +	return chattr;
> > +}
> > diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
> > index c3ae6414fc5c..4c223a7a91d4 100644
> > --- a/fs/nfsd/nfsfh.h
> > +++ b/fs/nfsd/nfsfh.h
> > @@ -291,34 +291,7 @@ static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp)
> >  	fhp->fh_pre_saved = false;
> >  }
> >  
> > -/*
> > - * We could use i_version alone as the change attribute.  However,
> > - * i_version can go backwards after a reboot.  On its own that doesn't
> > - * necessarily cause a problem, but if i_version goes backwards and then
> > - * is incremented again it could reuse a value that was previously used
> > - * before boot, and a client who queried the two values might
> > - * incorrectly assume nothing changed.
> > - *
> > - * By using both ctime and the i_version counter we guarantee that as
> > - * long as time doesn't go backwards we never reuse an old value.
> > - */
> > -static inline u64 nfsd4_change_attribute(struct kstat *stat,
> > -					 struct inode *inode)
> > -{
> > -	if (inode->i_sb->s_export_op->fetch_iversion)
> > -		return inode->i_sb->s_export_op->fetch_iversion(inode);
> > -	else if (IS_I_VERSION(inode)) {
> > -		u64 chattr;
> > -
> > -		chattr =  stat->ctime.tv_sec;
> > -		chattr <<= 30;
> > -		chattr += stat->ctime.tv_nsec;
> > -		chattr += inode_query_iversion(inode);
> > -		return chattr;
> > -	} else
> > -		return time_to_chattr(&stat->ctime);
> > -}
> > -
> > +u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode);
> >  extern void fh_fill_pre_attrs(struct svc_fh *fhp);
> >  extern void fh_fill_post_attrs(struct svc_fh *fhp);
> >  extern void fh_fill_both_attrs(struct svc_fh *fhp);
> > diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
> > index c95cd414b4bb..a905f59481ee 100644
> > --- a/fs/nfsd/vfs.h
> > +++ b/fs/nfsd/vfs.h
> > @@ -168,9 +168,14 @@ static inline void fh_drop_write(struct svc_fh *fh)
> >  
> >  static inline __be32 fh_getattr(const struct svc_fh *fh, struct kstat *stat)
> >  {
> > +	u32 request_mask = STATX_BASIC_STATS;
> >  	struct path p = {.mnt = fh->fh_export->ex_path.mnt,
> >  			 .dentry = fh->fh_dentry};
> > -	return nfserrno(vfs_getattr(&p, stat, STATX_BASIC_STATS,
> > +
> > +	if (fh->fh_maxsize == NFS4_FHSIZE)
> > +		request_mask |= (STATX_BTIME | STATX_VERSION);
> > +
> > +	return nfserrno(vfs_getattr(&p, stat, request_mask,
> >  				    AT_STATX_SYNC_AS_STAT));
> >  }
> >  
> > diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
> > index fe848901fcc3..9f4d4bcbf251 100644
> > --- a/include/linux/exportfs.h
> > +++ b/include/linux/exportfs.h
> > @@ -213,7 +213,6 @@ struct export_operations {
> >  			  bool write, u32 *device_generation);
> >  	int (*commit_blocks)(struct inode *inode, struct iomap *iomaps,
> >  			     int nr_iomaps, struct iattr *iattr);
> > -	u64 (*fetch_iversion)(struct inode *);
> >  #define	EXPORT_OP_NOWCC			(0x1) /* don't collect v3 wcc data */
> >  #define	EXPORT_OP_NOSUBTREECHK		(0x2) /* no subtree checking */
> >  #define	EXPORT_OP_CLOSE_BEFORE_UNLINK	(0x4) /* close files before unlink */
> > -- 
> > 2.37.3
> > 
> > 
> 
> Definitely more to like than to dislike here, so
> 
> Reviewed-by: NeilBrown <neilb@xxxxxxx>
> 
> Thanks,
> NeilBrown

-- 
Jeff Layton <jlayton@xxxxxxxxxx>