On Mon, 21 Feb 2022, Miklos Szeredi wrote: > There has been a longstanding race condition between rename(2) and link(2), > when those operations are done in parallel: > > 1. Moving a file to an existing target file (eg. mv file target) > 2. Creating a link from the target file to a third file (eg. ln target > link) > > By the time vfs_link() locks the target inode, it might already be unlinked > by rename. This results in vfs_link() returning -ENOENT in order to > prevent linking to already unlinked files. This check was introduced in > v2.6.39 by commit aae8a97d3ec3 ("fs: Don't allow to create hardlink for > deleted file"). > > This breaks apparent atomicity of rename(2), which is described in > standards and the man page: > > "If newpath already exists, it will be atomically replaced, so that > there is no point at which another process attempting to access > newpath will find it missing." > > The simplest fix is to exclude renames for the complete link operation. Alternately, lock the "from" directory as well as the "to" directory. That would mean using lock_rename() and generally copying the structure of do_renameat2() into do_linkat() I wonder if you could get a similar race trying to create a file in (empty directory) /tmp/foo while /tmp/bar was being renamed over it. NeilBrown > > This patch introduces a global rw_semaphore that is locked for read in > rename and for write in link. To prevent excessive contention, do not take > the lock in link on the first try. If the source of the link was found to > be unlinked, then retry with the lock held. > > Reuse the lock_rename()/unlock_rename() helpers for the rename part. This > however needs special treatment for stacking fs (overlayfs, ecryptfs) to > prevent possible deadlocks. Introduce [un]lock_rename_stacked() for this > purpose. > > Reproducer can be found at: > > https://lore.kernel.org/all/20220216131814.GA2463301@xavier-xps/ > > Reported-by: Xavier Roche <xavier.roche@xxxxxxxxxxx> > Link: https://lore.kernel.org/all/20220214210708.GA2167841@xavier-xps/ > Fixes: aae8a97d3ec3 ("fs: Don't allow to create hardlink for deleted file") > Tested-by: Xavier Roche <xavier.roche@xxxxxxxxxxx> > Signed-off-by: Miklos Szeredi <mszeredi@xxxxxxxxxx> > --- > fs/ecryptfs/inode.c | 4 ++-- > fs/namei.c | 35 ++++++++++++++++++++++++++++++++--- > fs/overlayfs/copy_up.c | 4 ++-- > fs/overlayfs/dir.c | 12 ++++++------ > fs/overlayfs/util.c | 4 ++-- > include/linux/namei.h | 4 ++++ > 6 files changed, 48 insertions(+), 15 deletions(-) > > diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c > index 16d50dface59..f5c37599bd40 100644 > --- a/fs/ecryptfs/inode.c > +++ b/fs/ecryptfs/inode.c > @@ -596,7 +596,7 @@ ecryptfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, > > target_inode = d_inode(new_dentry); > > - trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry); > + trap = lock_rename_stacked(lower_old_dir_dentry, lower_new_dir_dentry); > dget(lower_new_dentry); > rc = -EINVAL; > if (lower_old_dentry->d_parent != lower_old_dir_dentry) > @@ -631,7 +631,7 @@ ecryptfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, > fsstack_copy_attr_all(old_dir, d_inode(lower_old_dir_dentry)); > out_lock: > dput(lower_new_dentry); > - unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); > + unlock_rename_stacked(lower_old_dir_dentry, lower_new_dir_dentry); > return rc; > } > > diff --git a/fs/namei.c b/fs/namei.c > index 3f1829b3ab5b..27e671c354a6 100644 > --- a/fs/namei.c > +++ b/fs/namei.c > @@ -122,6 +122,8 @@ > * PATH_MAX includes the nul terminator --RR. > */ > > +static DECLARE_RWSEM(link_rwsem); > + > #define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname)) > > struct filename * > @@ -2954,10 +2956,11 @@ static inline int may_create(struct user_namespace *mnt_userns, > return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); > } > > + > /* > * p1 and p2 should be directories on the same fs. > */ > -struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) > +struct dentry *lock_rename_stacked(struct dentry *p1, struct dentry *p2) > { > struct dentry *p; > > @@ -2986,9 +2989,16 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) > inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); > return NULL; > } > +EXPORT_SYMBOL(lock_rename_stacked); > + > +struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) > +{ > + down_read(&link_rwsem); > + return lock_rename_stacked(p1, p2); > +} > EXPORT_SYMBOL(lock_rename); > > -void unlock_rename(struct dentry *p1, struct dentry *p2) > +void unlock_rename_stacked(struct dentry *p1, struct dentry *p2) > { > inode_unlock(p1->d_inode); > if (p1 != p2) { > @@ -2996,6 +3006,13 @@ void unlock_rename(struct dentry *p1, struct dentry *p2) > mutex_unlock(&p1->d_sb->s_vfs_rename_mutex); > } > } > +EXPORT_SYMBOL(unlock_rename_stacked); > + > +void unlock_rename(struct dentry *p1, struct dentry *p2) > +{ > + unlock_rename_stacked(p1, p2); > + up_read(&link_rwsem); > +} > EXPORT_SYMBOL(unlock_rename); > > /** > @@ -4456,6 +4473,7 @@ int do_linkat(int olddfd, struct filename *old, int newdfd, > struct path old_path, new_path; > struct inode *delegated_inode = NULL; > int how = 0; > + bool lock = false; > int error; > > if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) { > @@ -4474,10 +4492,13 @@ int do_linkat(int olddfd, struct filename *old, int newdfd, > > if (flags & AT_SYMLINK_FOLLOW) > how |= LOOKUP_FOLLOW; > +retry_lock: > + if (lock) > + down_write(&link_rwsem); > retry: > error = filename_lookup(olddfd, old, how, &old_path, NULL); > if (error) > - goto out_putnames; > + goto out_unlock_link; > > new_dentry = filename_create(newdfd, new, &new_path, > (how & LOOKUP_REVAL)); > @@ -4511,8 +4532,16 @@ int do_linkat(int olddfd, struct filename *old, int newdfd, > how |= LOOKUP_REVAL; > goto retry; > } > + if (!lock && error == -ENOENT) { > + path_put(&old_path); > + lock = true; > + goto retry_lock; > + } > out_putpath: > path_put(&old_path); > +out_unlock_link: > + if (lock) > + up_write(&link_rwsem); > out_putnames: > putname(old); > putname(new); > diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c > index e040970408d4..911c3cec43c2 100644 > --- a/fs/overlayfs/copy_up.c > +++ b/fs/overlayfs/copy_up.c > @@ -670,7 +670,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) > > /* workdir and destdir could be the same when copying up to indexdir */ > err = -EIO; > - if (lock_rename(c->workdir, c->destdir) != NULL) > + if (lock_rename_stacked(c->workdir, c->destdir) != NULL) > goto unlock; > > err = ovl_prep_cu_creds(c->dentry, &cc); > @@ -711,7 +711,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) > if (S_ISDIR(inode->i_mode)) > ovl_set_flag(OVL_WHITEOUTS, inode); > unlock: > - unlock_rename(c->workdir, c->destdir); > + unlock_rename_stacked(c->workdir, c->destdir); > > return err; > > diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c > index f18490813170..fea397666174 100644 > --- a/fs/overlayfs/dir.c > +++ b/fs/overlayfs/dir.c > @@ -416,7 +416,7 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, > > ovl_cleanup_whiteouts(upper, list); > ovl_cleanup(wdir, upper); > - unlock_rename(workdir, upperdir); > + unlock_rename_stacked(workdir, upperdir); > > /* dentry's upper doesn't match now, get rid of it */ > d_drop(dentry); > @@ -427,7 +427,7 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, > ovl_cleanup(wdir, opaquedir); > dput(opaquedir); > out_unlock: > - unlock_rename(workdir, upperdir); > + unlock_rename_stacked(workdir, upperdir); > out: > return ERR_PTR(err); > } > @@ -551,7 +551,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, > out_dput: > dput(upper); > out_unlock: > - unlock_rename(workdir, upperdir); > + unlock_rename_stacked(workdir, upperdir); > out: > if (!hardlink) { > posix_acl_release(acl); > @@ -790,7 +790,7 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, > out_dput_upper: > dput(upper); > out_unlock: > - unlock_rename(workdir, upperdir); > + unlock_rename_stacked(workdir, upperdir); > out_dput: > dput(opaquedir); > out: > @@ -1187,7 +1187,7 @@ static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir, > } > } > > - trap = lock_rename(new_upperdir, old_upperdir); > + trap = lock_rename_stacked(new_upperdir, old_upperdir); > > olddentry = lookup_one_len(old->d_name.name, old_upperdir, > old->d_name.len); > @@ -1281,7 +1281,7 @@ static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir, > out_dput_old: > dput(olddentry); > out_unlock: > - unlock_rename(new_upperdir, old_upperdir); > + unlock_rename_stacked(new_upperdir, old_upperdir); > out_revert_creds: > revert_creds(old_cred); > if (update_nlink) > diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c > index f48284a2a896..9358282278b1 100644 > --- a/fs/overlayfs/util.c > +++ b/fs/overlayfs/util.c > @@ -930,13 +930,13 @@ int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir) > goto err; > > /* Workdir should not be subdir of upperdir and vice versa */ > - if (lock_rename(workdir, upperdir) != NULL) > + if (lock_rename_stacked(workdir, upperdir) != NULL) > goto err_unlock; > > return 0; > > err_unlock: > - unlock_rename(workdir, upperdir); > + unlock_rename_stacked(workdir, upperdir); > err: > pr_err("failed to lock workdir+upperdir\n"); > return -EIO; > diff --git a/include/linux/namei.h b/include/linux/namei.h > index e89329bb3134..0a87bb0d56ce 100644 > --- a/include/linux/namei.h > +++ b/include/linux/namei.h > @@ -77,6 +77,10 @@ extern int follow_up(struct path *); > extern struct dentry *lock_rename(struct dentry *, struct dentry *); > extern void unlock_rename(struct dentry *, struct dentry *); > > +/* Special version of the above for stacking filesystems */ > +extern struct dentry *lock_rename_stacked(struct dentry *, struct dentry *); > +extern void unlock_rename_stacked(struct dentry *, struct dentry *); > + > extern int __must_check nd_jump_link(struct path *path); > > static inline void nd_terminate_link(void *name, size_t len, size_t maxlen) > -- > 2.34.1 > >