On Tue, 2024-08-06 at 10:32 -0400, Jeff Layton wrote: > Today, when opening a file we'll typically do a fast lookup, but if > O_CREAT is set, the kernel always takes the exclusive inode lock. I > assume this was done with the expectation that O_CREAT means that we > always expect to do the create, but that's often not the case. Many > programs set O_CREAT even in scenarios where the file already exists. > > This patch rearranges the pathwalk-for-open code to also attempt a > fast_lookup in certain O_CREAT cases. If a positive dentry is found, the > inode_lock can be avoided altogether, and if auditing isn't enabled, it > can stay in rcuwalk mode for the last step_into. > > One notable exception that is hopefully temporary: if we're doing an > rcuwalk and auditing is enabled, skip the lookup_fast. Legitimizing the > dentry in that case is more expensive than taking the i_rwsem for now. > > Signed-off-by: Jeff Layton <jlayton@xxxxxxxxxx> > --- > Here's a revised patch that does a fast_lookup in the O_CREAT codepath > too. The main difference here is that if a positive dentry is found and > audit_dummy_context is true, then we keep the walk lazy for the last > component, which avoids having to take any locks on the parent (just > like with non-O_CREAT opens). > > The testcase below runs in about 18s on v6.10 (on an 80 CPU machine). > With this patch, it runs in about 1s: > > #define _GNU_SOURCE 1 > #include <stdio.h> > #include <unistd.h> > #include <errno.h> > #include <fcntl.h> > #include <stdlib.h> > #include <sys/wait.h> > > #define PROCS 70 > #define LOOPS 500000 > > static int openloop(int tnum) > { > char *file; > int i, ret; > > ret = asprintf(&file, "./testfile%d", tnum); > if (ret < 0) { > printf("asprintf failed for proc %d", tnum); > return 1; > } > > for (i = 0; i < LOOPS; ++i) { > int fd = open(file, O_RDWR|O_CREAT, 0644); > > if (fd < 0) { > perror("open"); > return 1; > } > close(fd); > } > unlink(file); > free(file); > return 0; > } > > int main(int argc, char **argv) { > pid_t kids[PROCS]; > int i, ret = 0; > > for (i = 0; i < PROCS; ++i) { > kids[i] = fork(); > if (kids[i] > 0) > return openloop(i); > if (kids[i] < 0) > perror("fork"); > } > > for (i = 0; i < PROCS; ++i) { > int ret2; > > if (kids[i] > 0) { > wait(&ret2); > if (ret2 != 0) > ret = ret2; > } > } > return ret; > } > --- > Changes in v2: > - drop the lockref patch since Mateusz is working on a better approach > - add trailing_slashes helper function > - add a lookup_fast_for_open helper function > - make lookup_fast_for_open skip the lookup if auditing is enabled > - if we find a positive dentry and auditing is disabled, don't unlazy > - Link to v1: https://lore.kernel.org/r/20240802-openfast-v1-0-a1cff2a33063@xxxxxxxxxx > --- > fs/namei.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++--------- > 1 file changed, 53 insertions(+), 9 deletions(-) > > diff --git a/fs/namei.c b/fs/namei.c > index 1e05a0f3f04d..2d716fb114c9 100644 > --- a/fs/namei.c > +++ b/fs/namei.c > @@ -3518,6 +3518,47 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, > return ERR_PTR(error); > } > > +static inline bool trailing_slashes(struct nameidata *nd) > +{ > + return (bool)nd->last.name[nd->last.len]; > +} > + > +static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag) > +{ > + struct dentry *dentry; > + > + if (open_flag & O_CREAT) { > + /* Don't bother on an O_EXCL create */ > + if (open_flag & O_EXCL) > + return NULL; > + > + /* > + * FIXME: If auditing is enabled, then we'll have to unlazy to > + * use the dentry. For now, don't do this, since it shifts > + * contention from parent's i_rwsem to its d_lockref spinlock. > + * Reconsider this once dentry refcounting handles heavy > + * contention better. > + */ > + if ((nd->flags & LOOKUP_RCU) && !audit_dummy_context()) > + return NULL; > + } > + > + if (trailing_slashes(nd)) > + nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; > + > + dentry = lookup_fast(nd); Self-NAK on this patch. We have to test for IS_ERR on the returned dentry here. I'll send a v3 along after I've retested it. > + > + if (open_flag & O_CREAT) { > + /* Discard negative dentries. Need inode_lock to do the create */ > + if (dentry && !dentry->d_inode) { > + if (!(nd->flags & LOOKUP_RCU)) > + dput(dentry); > + dentry = NULL; > + } > + } > + return dentry; > +} > + > static const char *open_last_lookups(struct nameidata *nd, > struct file *file, const struct open_flags *op) > { > @@ -3535,28 +3576,31 @@ static const char *open_last_lookups(struct nameidata *nd, > return handle_dots(nd, nd->last_type); > } > > + /* We _can_ be in RCU mode here */ > + dentry = lookup_fast_for_open(nd, open_flag); > + if (IS_ERR(dentry)) > + return ERR_CAST(dentry); > + > if (!(open_flag & O_CREAT)) { > - if (nd->last.name[nd->last.len]) > - nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; > - /* we _can_ be in RCU mode here */ > - dentry = lookup_fast(nd); > - if (IS_ERR(dentry)) > - return ERR_CAST(dentry); > if (likely(dentry)) > goto finish_lookup; > > if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU)) > return ERR_PTR(-ECHILD); > } else { > - /* create side of things */ > if (nd->flags & LOOKUP_RCU) { > + /* can stay in rcuwalk if not auditing */ > + if (dentry && audit_dummy_context()) > + goto check_slashes; > if (!try_to_unlazy(nd)) > return ERR_PTR(-ECHILD); > } > audit_inode(nd->name, dir, AUDIT_INODE_PARENT); > - /* trailing slashes? */ > - if (unlikely(nd->last.name[nd->last.len])) > +check_slashes: > + if (trailing_slashes(nd)) > return ERR_PTR(-EISDIR); > + if (dentry) > + goto finish_lookup; > } > > if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { > > --- > base-commit: 0c3836482481200ead7b416ca80c68a29cfdaabd > change-id: 20240723-openfast-ac49a7b6ade2 > > Best regards, -- Jeff Layton <jlayton@xxxxxxxxxx>