On Mon, 2024-05-27 at 13:04 +1000, NeilBrown wrote: > > dentry->d_fsdata is set to NFS_FSDATA_BLOCKED while unlinking or > renaming-over a file to ensure that no open succeeds while the NFS > operation progressed on the server. > > Setting dentry->d_fsdata to NFS_FSDATA_BLOCKED is done under ->d_lock > after checking the refcount is not elevated. Any attempt to open the > file (through that name) will go through lookp_open() which will take > ->d_lock while incrementing the refcount, we can be sure that once > the > new value is set, __nfs_lookup_revalidate() *will* see the new value > and > will block. > > We don't have any locking guarantee that when we set ->d_fsdata to > NULL, > the wait_var_event() in __nfs_lookup_revalidate() will notice. > wait/wake primitives do NOT provide barriers to guarantee order. We > must use smp_load_acquire() in wait_var_event() to ensure we look at > an > up-to-date value, and must use smp_store_release() before > wake_up_var(). > > This patch adds those barrier functions and factors out > block_revalidate() and unblock_revalidate() far clarity. > > There is also a hypothetical bug in that if memory allocation fails > (which never happens in practice) we might leave ->d_fsdata locked. > This patch adds the missing call to unblock_revalidate(). > > Reported-and-tested-by: Richard Kojedzinszky > <richard+debian+bugreport@xxxxxxxxx> > Closes: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1071501 > Fixes: 3c59366c207e ("NFS: don't unhash dentry during unlink/rename") > Signed-off-by: NeilBrown <neilb@xxxxxxx> > --- > fs/nfs/dir.c | 44 +++++++++++++++++++++++++++++--------------- > 1 file changed, 29 insertions(+), 15 deletions(-) > > diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c > index ac505671efbd..c91dc36d41cc 100644 > --- a/fs/nfs/dir.c > +++ b/fs/nfs/dir.c > @@ -1802,9 +1802,10 @@ __nfs_lookup_revalidate(struct dentry *dentry, > unsigned int flags, > if (parent != READ_ONCE(dentry->d_parent)) > return -ECHILD; > } else { > - /* Wait for unlink to complete */ > + /* Wait for unlink to complete - see > unblock_revalidate() */ > wait_var_event(&dentry->d_fsdata, > - dentry->d_fsdata != > NFS_FSDATA_BLOCKED); > + smp_load_acquire(&dentry->d_fsdata) > + != NFS_FSDATA_BLOCKED); Doesn't this end up being a reversed ACQUIRE+RELEASE as described in the "LOCK ACQUISITION FUNCTIONS" section of Documentation/memory- barriers.txt? IOW: Shouldn't the above rather be using READ_ONCE()? > parent = dget_parent(dentry); > ret = reval(d_inode(parent), dentry, flags); > dput(parent); > @@ -1817,6 +1818,26 @@ static int nfs_lookup_revalidate(struct dentry > *dentry, unsigned int flags) > return __nfs_lookup_revalidate(dentry, flags, > nfs_do_lookup_revalidate); > } > > +static void block_revalidate(struct dentry *dentry) > +{ > + /* old devname - just in case */ > + kfree(dentry->d_fsdata); > + > + /* Any new reference that could lead to an open > + * will take ->d_lock in lookup_open() -> d_lookup(). > + */ > + lockdep_assert_held(&dentry->d_lock); > + > + dentry->d_fsdata = NULL; Why are you doing a barrier free change to dentry->d_fsdata here when you have the memory barrier protected change in unblock_revalidate()? > +} > + > +static void unblock_revalidate(struct dentry *dentry) > +{ > + /* store_release ensures wait_var_event() sees the update */ > + smp_store_release(&dentry->d_fsdata, NULL); Shouldn't this be a WRITE_ONCE(), for the same reason as above? > + wake_up_var(&dentry->d_fsdata); > +} > + > /* > * A weaker form of d_revalidate for revalidating just the > d_inode(dentry) > * when we don't really care about the dentry name. This is called > when a > @@ -2501,15 +2522,12 @@ int nfs_unlink(struct inode *dir, struct > dentry *dentry) > spin_unlock(&dentry->d_lock); > goto out; > } > - /* old devname */ > - kfree(dentry->d_fsdata); > - dentry->d_fsdata = NFS_FSDATA_BLOCKED; > + block_revalidate(dentry); > > spin_unlock(&dentry->d_lock); > error = nfs_safe_remove(dentry); > nfs_dentry_remove_handle_error(dir, dentry, error); > - dentry->d_fsdata = NULL; > - wake_up_var(&dentry->d_fsdata); > + unblock_revalidate(dentry); > out: > trace_nfs_unlink_exit(dir, dentry, error); > return error; > @@ -2616,8 +2634,7 @@ nfs_unblock_rename(struct rpc_task *task, > struct nfs_renamedata *data) > { > struct dentry *new_dentry = data->new_dentry; > > - new_dentry->d_fsdata = NULL; > - wake_up_var(&new_dentry->d_fsdata); > + unblock_revalidate(new_dentry); > } > > /* > @@ -2679,11 +2696,6 @@ int nfs_rename(struct mnt_idmap *idmap, struct > inode *old_dir, > if (WARN_ON(new_dentry->d_flags & > DCACHE_NFSFS_RENAMED) || > WARN_ON(new_dentry->d_fsdata == > NFS_FSDATA_BLOCKED)) > goto out; > - if (new_dentry->d_fsdata) { > - /* old devname */ > - kfree(new_dentry->d_fsdata); > - new_dentry->d_fsdata = NULL; > - } > > spin_lock(&new_dentry->d_lock); > if (d_count(new_dentry) > 2) { > @@ -2705,7 +2717,7 @@ int nfs_rename(struct mnt_idmap *idmap, struct > inode *old_dir, > new_dentry = dentry; > new_inode = NULL; > } else { > - new_dentry->d_fsdata = NFS_FSDATA_BLOCKED; > + block_revalidate(new_dentry); > must_unblock = true; > spin_unlock(&new_dentry->d_lock); > } > @@ -2717,6 +2729,8 @@ int nfs_rename(struct mnt_idmap *idmap, struct > inode *old_dir, > task = nfs_async_rename(old_dir, new_dir, old_dentry, > new_dentry, > must_unblock ? nfs_unblock_rename : > NULL); > if (IS_ERR(task)) { > + if (must_unblock) > + unblock_revalidate(new_dentry); > error = PTR_ERR(task); > goto out; > } -- Trond Myklebust Linux NFS client maintainer, Hammerspace trond.myklebust@xxxxxxxxxxxxxxx