On Mon, 2025-02-17 at 16:30 +1100, NeilBrown wrote: > vfs_mkdir() does not currently guarantee to leave the child dentry > hashed or make it positive on success. It may leave it unhashed and > negative and then the caller needs to perform a lookup to find the > target dentry, if it needs it. > > This patch changes vfs_mkdir() to return the dentry provided by the > filesystems which is hashed and positive when provided. This reduces > the need for lookup code which is now included in vfs_mkdir() rather > than at various call-sites. The included lookup does not include the > permission check that some of the existing code included in e.g. > lookup_one_len(). This should not be necessary for lookup up a > directory which has just be successfully created. > > If a different dentry is returned, the first one is put. If necessary > the fact that it is new can be determined by comparing pointers. A new > dentry will certainly have a new pointer (as the old is put after the > new is obtained). > > The dentry returned by vfs_mkdir(), when not an error, *is* now > guaranteed to be hashed and positive. > > A few callers do not need the dentry, but will now sometimes perform the > lookup anyway. This should be cheap except possibly in the case of cifs. > > Signed-off-by: NeilBrown <neilb@xxxxxxx> > --- > drivers/base/devtmpfs.c | 7 ++-- > fs/cachefiles/namei.c | 10 +++--- > fs/ecryptfs/inode.c | 14 +++++--- > fs/init.c | 7 ++-- > fs/namei.c | 70 +++++++++++++++++++++++++++++++--------- > fs/nfsd/nfs4recover.c | 7 ++-- > fs/nfsd/vfs.c | 28 +++++----------- > fs/overlayfs/dir.c | 37 +++------------------ > fs/overlayfs/overlayfs.h | 15 ++++----- > fs/overlayfs/super.c | 7 ++-- > fs/smb/server/vfs.c | 31 ++++++------------ > fs/xfs/scrub/orphanage.c | 9 +++--- > include/linux/fs.h | 4 +-- > 13 files changed, 121 insertions(+), 125 deletions(-) > > diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c > index c9e34842139f..8ec756b5dec4 100644 > --- a/drivers/base/devtmpfs.c > +++ b/drivers/base/devtmpfs.c > @@ -160,18 +160,17 @@ static int dev_mkdir(const char *name, umode_t mode) > { > struct dentry *dentry; > struct path path; > - int err; > > dentry = kern_path_create(AT_FDCWD, name, &path, LOOKUP_DIRECTORY); > if (IS_ERR(dentry)) > return PTR_ERR(dentry); > > - err = vfs_mkdir(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode); > - if (!err) > + dentry = vfs_mkdir(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode); > + if (!IS_ERR(dentry)) > /* mark as kernel-created inode */ > d_inode(dentry)->i_private = &thread; > done_path_create(&path, dentry); > - return err; > + return PTR_ERR_OR_ZERO(dentry); > } > > static int create_path(const char *nodepath) > diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c > index 7cf59713f0f7..8a8337d1be05 100644 > --- a/fs/cachefiles/namei.c > +++ b/fs/cachefiles/namei.c > @@ -95,7 +95,6 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, > /* search the current directory for the element name */ > inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); > > -retry: > ret = cachefiles_inject_read_error(); > if (ret == 0) > subdir = lookup_one_len(dirname, dir, strlen(dirname)); > @@ -128,10 +127,11 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, > ret = security_path_mkdir(&path, subdir, 0700); > if (ret < 0) > goto mkdir_error; > - ret = cachefiles_inject_write_error(); > - if (ret == 0) > - ret = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700); > - if (ret < 0) { > + subdir = ERR_PTR(cachefiles_inject_write_error()); > + if (!IS_ERR(subdir)) > + subdir = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700); > + ret = PTR_ERR(subdir); > + if (IS_ERR(subdir)) { > trace_cachefiles_vfs_error(NULL, d_inode(dir), ret, > cachefiles_trace_mkdir_error); > goto mkdir_error; > diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c > index 6315dd194228..51a5c54eb740 100644 > --- a/fs/ecryptfs/inode.c > +++ b/fs/ecryptfs/inode.c > @@ -511,10 +511,16 @@ static struct dentry *ecryptfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, > struct inode *lower_dir; > > rc = lock_parent(dentry, &lower_dentry, &lower_dir); > - if (!rc) > - rc = vfs_mkdir(&nop_mnt_idmap, lower_dir, > - lower_dentry, mode); > - if (rc || d_really_is_negative(lower_dentry)) > + if (rc) > + goto out; > + > + lower_dentry = vfs_mkdir(&nop_mnt_idmap, lower_dir, > + lower_dentry, mode); > + rc = PTR_ERR(lower_dentry); > + if (IS_ERR(lower_dentry)) > + goto out; > + rc = 0; > + if (d_unhashed(lower_dentry)) > goto out; > rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); > if (rc) > diff --git a/fs/init.c b/fs/init.c > index e9387b6c4f30..eef5124885e3 100644 > --- a/fs/init.c > +++ b/fs/init.c > @@ -230,9 +230,12 @@ int __init init_mkdir(const char *pathname, umode_t mode) > return PTR_ERR(dentry); > mode = mode_strip_umask(d_inode(path.dentry), mode); > error = security_path_mkdir(&path, dentry, mode); > - if (!error) > - error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, > + if (!error) { > + dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, > dentry, mode); > + if (IS_ERR(dentry)) > + error = PTR_ERR(dentry); > + } > done_path_create(&path, dentry); > return error; > } > diff --git a/fs/namei.c b/fs/namei.c > index 19d5ea340a18..f76fee6df369 100644 > --- a/fs/namei.c > +++ b/fs/namei.c > @@ -4132,7 +4132,8 @@ EXPORT_SYMBOL(kern_path_create); > > void done_path_create(struct path *path, struct dentry *dentry) > { > - dput(dentry); > + if (!IS_ERR(dentry)) > + dput(dentry); > inode_unlock(path->dentry->d_inode); > mnt_drop_write(path->mnt); > path_put(path); > @@ -4278,7 +4279,7 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d > } > > /** > - * vfs_mkdir - create directory > + * vfs_mkdir - create directory returning correct dentry is possible > * @idmap: idmap of the mount the inode was found from > * @dir: inode of the parent directory > * @dentry: dentry of the child directory > @@ -4291,9 +4292,20 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d > * care to map the inode according to @idmap before checking permissions. > * On non-idmapped mounts or if permission checking is to be performed on the > * raw inode simply pass @nop_mnt_idmap. > + * > + * In the event that the filesystem does not use the *@dentry but leaves it > + * negative or unhashes it and possibly splices a different one returning it, > + * the original dentry is dput() and the alternate is returned. > + * > + * If the file-system reports success but leaves the dentry unhashed or > + * negative, a lookup is performed and providing that is positive and > + * a directory, it will be returned. If the lookup is not successful > + * the original dentry is unhashed and returned. > + * > + * In case of an error the dentry is dput() and an ERR_PTR() is returned. > */ > -int vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, > - struct dentry *dentry, umode_t mode) > +struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, > + struct dentry *dentry, umode_t mode) > { > int error; > unsigned max_links = dir->i_sb->s_max_links; > @@ -4301,30 +4313,54 @@ int vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, > > error = may_create(idmap, dir, dentry); > if (error) > - return error; > + goto err; > > + error = -EPERM; > if (!dir->i_op->mkdir) > - return -EPERM; > + goto err; > > mode = vfs_prepare_mode(idmap, dir, mode, S_IRWXUGO | S_ISVTX, 0); > error = security_inode_mkdir(dir, dentry, mode); > if (error) > - return error; > + goto err; > > + error = -EMLINK; > if (max_links && dir->i_nlink >= max_links) > - return -EMLINK; > + goto err; > > de = dir->i_op->mkdir(idmap, dir, dentry, mode); > + error = PTR_ERR(de); > if (IS_ERR(de)) > - return PTR_ERR(de); > + goto err; > + if (!de && (d_unhashed(dentry) || d_is_negative(dentry))) { Would it be better to push this down into the callers that need it and make returning a hashed, positive dentry a requirement for the mkdir operation? You could just turn this block of code into a helper function that those four filesystems could call, which would mean that you could avoid the d_unhashed() and d_is_negative() checks on the other filesystems. > + /* A few filesystems leave the dentry negative on success, > + * currently cifs(with posix extensions), kernfs, tracefs, hostfs. > + * They rely on a subsequent lookup which we perform here. > + */ > + const struct qstr *d_name = (void*)&dentry->d_name; > + de = lookup_dcache(d_name, dentry->d_parent, 0); > + if (!de) > + de = __lookup_slow(d_name, dentry->d_parent, 0); > + if (IS_ERR(de)) > + de = NULL; > + else if (unlikely(d_is_negative(de) || !d_is_dir(de))) { > + dput(de); > + de = NULL; > + } > + if (!de) > + /* Ensure caller can easily detect that dentry is not useful */ > + d_drop(dentry); > + } > if (de) { > - fsnotify_mkdir(dir, de); > - /* Cannot return de yet */ > - dput(de); > - } else > - fsnotify_mkdir(dir, dentry); > + dput(dentry); > + dentry = de; > + } > + fsnotify_mkdir(dir, dentry); > + return dentry; > > - return 0; > +err: > + dput(dentry); > + return ERR_PTR(error); > } > EXPORT_SYMBOL(vfs_mkdir); > > @@ -4344,8 +4380,10 @@ int do_mkdirat(int dfd, struct filename *name, umode_t mode) > error = security_path_mkdir(&path, dentry, > mode_strip_umask(path.dentry->d_inode, mode)); > if (!error) { > - error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, > + dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, > dentry, mode); > + if (IS_ERR(dentry)) > + error = PTR_ERR(dentry); > } > done_path_create(&path, dentry); > if (retry_estale(error, lookup_flags)) { > diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c > index 28f4d5311c40..c1d9bd07285f 100644 > --- a/fs/nfsd/nfs4recover.c > +++ b/fs/nfsd/nfs4recover.c > @@ -233,9 +233,12 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) > * as well be forgiving and just succeed silently. > */ > goto out_put; > - status = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, S_IRWXU); > + dentry = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, S_IRWXU); > + if (IS_ERR(dentry)) > + status = PTR_ERR(dentry); > out_put: > - dput(dentry); > + if (!status) > + dput(dentry); > out_unlock: > inode_unlock(d_inode(dir)); > if (status == 0) { > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c > index 29cb7b812d71..205b07f21e8d 100644 > --- a/fs/nfsd/vfs.c > +++ b/fs/nfsd/vfs.c > @@ -1461,7 +1461,7 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, > struct inode *dirp; > struct iattr *iap = attrs->na_iattr; > __be32 err; > - int host_err; > + int host_err = 0; > > dentry = fhp->fh_dentry; > dirp = d_inode(dentry); > @@ -1488,26 +1488,13 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, > nfsd_check_ignore_resizing(iap); > break; > case S_IFDIR: > - host_err = vfs_mkdir(&nop_mnt_idmap, dirp, dchild, iap->ia_mode); > - if (!host_err && unlikely(d_unhashed(dchild))) { > - struct dentry *d; > - d = lookup_one_len(dchild->d_name.name, > - dchild->d_parent, > - dchild->d_name.len); > - if (IS_ERR(d)) { > - host_err = PTR_ERR(d); > - break; > - } > - if (unlikely(d_is_negative(d))) { > - dput(d); > - err = nfserr_serverfault; > - goto out; > - } > + dchild = vfs_mkdir(&nop_mnt_idmap, dirp, dchild, iap->ia_mode); > + if (IS_ERR(dchild)) > + host_err = PTR_ERR(dchild); > + else if (unlikely(dchild != resfhp->fh_dentry)) { > dput(resfhp->fh_dentry); > - resfhp->fh_dentry = dget(d); > + resfhp->fh_dentry = dget(dchild); > err = fh_update(resfhp); > - dput(dchild); > - dchild = d; > if (err) > goto out; > } > @@ -1530,7 +1517,8 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, > err = nfsd_create_setattr(rqstp, fhp, resfhp, attrs); > > out: > - dput(dchild); > + if (!IS_ERR(dchild)) > + dput(dchild); > return err; > > out_nfserr: > diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c > index 21c3aaf7b274..fe493f3ed6b6 100644 > --- a/fs/overlayfs/dir.c > +++ b/fs/overlayfs/dir.c > @@ -138,37 +138,6 @@ int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct inode *dir, > goto out; > } > > -int ovl_mkdir_real(struct ovl_fs *ofs, struct inode *dir, > - struct dentry **newdentry, umode_t mode) > -{ > - int err; > - struct dentry *d, *dentry = *newdentry; > - > - err = ovl_do_mkdir(ofs, dir, dentry, mode); > - if (err) > - return err; > - > - if (likely(!d_unhashed(dentry))) > - return 0; > - > - /* > - * vfs_mkdir() may succeed and leave the dentry passed > - * to it unhashed and negative. If that happens, try to > - * lookup a new hashed and positive dentry. > - */ > - d = ovl_lookup_upper(ofs, dentry->d_name.name, dentry->d_parent, > - dentry->d_name.len); > - if (IS_ERR(d)) { > - pr_warn("failed lookup after mkdir (%pd2, err=%i).\n", > - dentry, err); > - return PTR_ERR(d); > - } > - dput(dentry); > - *newdentry = d; > - > - return 0; > -} > - > struct dentry *ovl_create_real(struct ovl_fs *ofs, struct inode *dir, > struct dentry *newdentry, struct ovl_cattr *attr) > { > @@ -191,7 +160,8 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct inode *dir, > > case S_IFDIR: > /* mkdir is special... */ > - err = ovl_mkdir_real(ofs, dir, &newdentry, attr->mode); > + newdentry = ovl_do_mkdir(ofs, dir, newdentry, attr->mode); > + err = PTR_ERR_OR_ZERO(newdentry); > break; > > case S_IFCHR: > @@ -219,7 +189,8 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct inode *dir, > } > out: > if (err) { > - dput(newdentry); > + if (!IS_ERR(newdentry)) > + dput(newdentry); > return ERR_PTR(err); > } > return newdentry; > diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h > index 0021e2025020..6f2f8f4cfbbc 100644 > --- a/fs/overlayfs/overlayfs.h > +++ b/fs/overlayfs/overlayfs.h > @@ -241,13 +241,14 @@ static inline int ovl_do_create(struct ovl_fs *ofs, > return err; > } > > -static inline int ovl_do_mkdir(struct ovl_fs *ofs, > - struct inode *dir, struct dentry *dentry, > - umode_t mode) > +static inline struct dentry *ovl_do_mkdir(struct ovl_fs *ofs, > + struct inode *dir, > + struct dentry *dentry, > + umode_t mode) > { > - int err = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode); > - pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err); > - return err; > + dentry = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode); > + pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, PTR_ERR_OR_ZERO(dentry)); > + return dentry; > } > > static inline int ovl_do_mknod(struct ovl_fs *ofs, > @@ -838,8 +839,6 @@ struct ovl_cattr { > > #define OVL_CATTR(m) (&(struct ovl_cattr) { .mode = (m) }) > > -int ovl_mkdir_real(struct ovl_fs *ofs, struct inode *dir, > - struct dentry **newdentry, umode_t mode); > struct dentry *ovl_create_real(struct ovl_fs *ofs, > struct inode *dir, struct dentry *newdentry, > struct ovl_cattr *attr); > diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c > index 86ae6f6da36b..a381765802e9 100644 > --- a/fs/overlayfs/super.c > +++ b/fs/overlayfs/super.c > @@ -327,9 +327,10 @@ static struct dentry *ovl_workdir_create(struct ovl_fs *ofs, > goto retry; > } > > - err = ovl_mkdir_real(ofs, dir, &work, attr.ia_mode); > - if (err) > - goto out_dput; > + work = ovl_do_mkdir(ofs, dir, work, attr.ia_mode); > + err = PTR_ERR(work); > + if (IS_ERR(work)) > + goto out_err; > > /* Weird filesystem returning with hashed negative (kernfs)? */ > err = -EINVAL; > diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c > index fe29acef5872..d682443b14ac 100644 > --- a/fs/smb/server/vfs.c > +++ b/fs/smb/server/vfs.c > @@ -206,7 +206,7 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) > { > struct mnt_idmap *idmap; > struct path path; > - struct dentry *dentry; > + struct dentry *dentry, *d; > int err; > > dentry = ksmbd_vfs_kern_path_create(work, name, > @@ -222,31 +222,18 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) > > idmap = mnt_idmap(path.mnt); > mode |= S_IFDIR; > - err = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode); > - if (!err && d_unhashed(dentry)) { > - struct dentry *d; > - > - d = lookup_one(idmap, dentry->d_name.name, dentry->d_parent, > - dentry->d_name.len); > - if (IS_ERR(d)) { > - err = PTR_ERR(d); > - goto out_err; > - } > - if (unlikely(d_is_negative(d))) { > - dput(d); > - err = -ENOENT; > - goto out_err; > - } > - > + d = dentry; > + entry = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode); > + err = PTR_ERR(dentry); > + if (!IS_ERR(dentry) && dentry != d) > ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(d)); > - dput(d); > - } > > -out_err: > done_path_create(&path, dentry); > - if (err) > + if (IS_ERR(dentry)) { > pr_err("mkdir(%s): creation failed (err:%d)\n", name, err); > - return err; > + return err; > + } > + return 0; > } > > static ssize_t ksmbd_vfs_getcasexattr(struct mnt_idmap *idmap, > diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c > index c287c755f2c5..3537f3cca6d5 100644 > --- a/fs/xfs/scrub/orphanage.c > +++ b/fs/xfs/scrub/orphanage.c > @@ -167,10 +167,11 @@ xrep_orphanage_create( > * directory to control access to a file we put in here. > */ > if (d_really_is_negative(orphanage_dentry)) { > - error = vfs_mkdir(&nop_mnt_idmap, root_inode, orphanage_dentry, > - 0750); > - if (error) > - goto out_dput_orphanage; > + orphanage_dentry = vfs_mkdir(&nop_mnt_idmap, root_inode, > + orphanage_dentry, 0750); > + error = PTR_ERR(orphanage_dentry); > + if (IS_ERR(orphanage_dentry)) > + goto out_unlock_root; > } > > /* Not a directory? Bail out. */ > diff --git a/include/linux/fs.h b/include/linux/fs.h > index 45269608ee23..beae24bc990d 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -1981,8 +1981,8 @@ bool inode_owner_or_capable(struct mnt_idmap *idmap, > */ > int vfs_create(struct mnt_idmap *, struct inode *, > struct dentry *, umode_t, bool); > -int vfs_mkdir(struct mnt_idmap *, struct inode *, > - struct dentry *, umode_t); > +struct dentry *vfs_mkdir(struct mnt_idmap *, struct inode *, > + struct dentry *, umode_t); > int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, > umode_t, dev_t); > int vfs_symlink(struct mnt_idmap *, struct inode *, -- Jeff Layton <jlayton@xxxxxxxxxx>