Opening a file grabs a reference on the terminal dentry in __legitimize_path(), then another one in do_dentry_open() and finally drops the initial reference in terminate_walk(). That's 2 modifications which don't need to be there -- do_dentry_open() can consume the already held reference instead. In order to facilitate some debugging a dedicated namei state flag was added to denote this happened. When benchmarking on a 20-core vm using will-it-scale's open3_processes ("Same file open/close"), the results are (ops/s): before: 3087010 after: 4173977 (+35%) Signed-off-by: Mateusz Guzik <mjguzik@xxxxxxxxx> --- The flag thing is optional and can be dropped, but I think the general direction should be to add *more* asserts and whatnot (even if they are to land separately). A debug-only variant would not hurt. The entire "always consume regardless of error" ordeal stems from the corner case where open succeeds with a fully populated file object and then returns an error anyway. While odd, it does mean error handling does not get more complicated at least. fs/internal.h | 3 ++- fs/namei.c | 30 +++++++++++++++++++++++------- fs/open.c | 31 ++++++++++++++++++++++++++++--- 3 files changed, 53 insertions(+), 11 deletions(-) diff --git a/fs/internal.h b/fs/internal.h index cdd73209eecb..9eeb7e03f81d 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -193,7 +193,8 @@ int chmod_common(const struct path *path, umode_t mode); int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag); int chown_common(const struct path *path, uid_t user, gid_t group); -extern int vfs_open(const struct path *, struct file *); +int vfs_open_consume(const struct path *, struct file *); +int vfs_open(const struct path *, struct file *); /* * inode.c diff --git a/fs/namei.c b/fs/namei.c index 1bf081959066..20c5823d34dc 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -595,9 +595,10 @@ struct nameidata { umode_t dir_mode; } __randomize_layout; -#define ND_ROOT_PRESET 1 -#define ND_ROOT_GRABBED 2 -#define ND_JUMPED 4 +#define ND_ROOT_PRESET 0x00000001 +#define ND_ROOT_GRABBED 0x00000002 +#define ND_JUMPED 0x00000004 +#define ND_PATH_CONSUMED 0x00000008 static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name) { @@ -697,6 +698,7 @@ static void terminate_walk(struct nameidata *nd) nd->state &= ~ND_ROOT_GRABBED; } } else { + BUG_ON(nd->state & ND_PATH_CONSUMED); leave_rcu(nd); } nd->depth = 0; @@ -3683,6 +3685,7 @@ static const char *open_last_lookups(struct nameidata *nd, static int do_open(struct nameidata *nd, struct file *file, const struct open_flags *op) { + struct vfsmount *mnt; struct mnt_idmap *idmap; int open_flag = op->open_flag; bool do_truncate; @@ -3720,11 +3723,22 @@ static int do_open(struct nameidata *nd, error = mnt_want_write(nd->path.mnt); if (error) return error; + /* + * We grab an additional reference here because vfs_open_consume() + * may error out and free the mount from under us, while we need + * to undo write access below. + */ + mnt = mntget(nd->path.mnt); do_truncate = true; } error = may_open(idmap, &nd->path, acc_mode, open_flag); - if (!error && !(file->f_mode & FMODE_OPENED)) - error = vfs_open(&nd->path, file); + if (!error && !(file->f_mode & FMODE_OPENED)) { + BUG_ON(nd->state & ND_PATH_CONSUMED); + error = vfs_open_consume(&nd->path, file); + nd->state |= ND_PATH_CONSUMED; + nd->path.mnt = NULL; + nd->path.dentry = NULL; + } if (!error) error = security_file_post_open(file, op->acc_mode); if (!error && do_truncate) @@ -3733,8 +3747,10 @@ static int do_open(struct nameidata *nd, WARN_ON(1); error = -EINVAL; } - if (do_truncate) - mnt_drop_write(nd->path.mnt); + if (do_truncate) { + mnt_drop_write(mnt); + mntput(mnt); + } return error; } diff --git a/fs/open.c b/fs/open.c index 22adbef7ecc2..eb69af3676e3 100644 --- a/fs/open.c +++ b/fs/open.c @@ -905,6 +905,15 @@ static inline int file_get_write_access(struct file *f) return error; } +/* + * Populate struct file + * + * NOTE: it assumes f_path is populated and consumes the caller's reference. + * + * The caller must not path_put on it regardless of the error code -- the + * routine will either clean it up on its own or rely on fput, which must + * be issued anyway. + */ static int do_dentry_open(struct file *f, int (*open)(struct inode *, struct file *)) { @@ -912,7 +921,6 @@ static int do_dentry_open(struct file *f, struct inode *inode = f->f_path.dentry->d_inode; int error; - path_get(&f->f_path); f->f_inode = inode; f->f_mapping = inode->i_mapping; f->f_wb_err = filemap_sample_wb_err(f->f_mapping); @@ -1045,6 +1053,7 @@ int finish_open(struct file *file, struct dentry *dentry, BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */ file->f_path.dentry = dentry; + path_get(&file->f_path); return do_dentry_open(file, open); } EXPORT_SYMBOL(finish_open); @@ -1077,15 +1086,19 @@ char *file_path(struct file *filp, char *buf, int buflen) EXPORT_SYMBOL(file_path); /** - * vfs_open - open the file at the given path + * vfs_open_consume - open the file at the given path and consume the reference * @path: path to open * @file: newly allocated file with f_flag initialized */ -int vfs_open(const struct path *path, struct file *file) +int vfs_open_consume(const struct path *path, struct file *file) { int ret; file->f_path = *path; + /* + * do_dentry_open() does the referene consuming regardless of its return + * value + */ ret = do_dentry_open(file, NULL); if (!ret) { /* @@ -1098,6 +1111,17 @@ int vfs_open(const struct path *path, struct file *file) return ret; } +/** + * vfs_open - open the file at the given path + * @path: path to open + * @file: newly allocated file with f_flag initialized + */ +int vfs_open(const struct path *path, struct file *file) +{ + path_get(path); + return vfs_open_consume(path, file); +} + struct file *dentry_open(const struct path *path, int flags, const struct cred *cred) { @@ -1183,6 +1207,7 @@ struct file *kernel_file_open(const struct path *path, int flags, return f; f->f_path = *path; + path_get(&f->f_path); error = do_dentry_open(f, NULL); if (error) { fput(f); -- 2.43.0