It's cheap to check if the path is empty in the userspace, but expensive to check if a userspace string is empty from the kernel. So it seems a waste to delay the check into the kernel, and using statx(AT_EMPTY_PATH) to implement fstat is slower than a "native" fstat call. In the past there was a similar performance issue with several Glibc releases using fstatat(AT_EMPTY_PATH) for fstat. That issue was fixed by Glibc with reverting back to plain fstat, and worked around by the kernel with a special fast code path for fstatat(AT_EMPTY_PATH) at commit 9013c51c630a ("vfs: mostly undo glibc turning 'fstat()' into 'fstatat(AT_EMPTY_PATH)'"). But for arch/loongarch fstat does not exist, so we have to use statx. And on all 32-bit architectures we must use statx for fstat after 2037 since the plain fstat call uses 32-bit timestamp. Thus Glibc uses statx for fstat on LoongArch and all 32-bit platforms, and these platforms still suffer the performance issue. So port the fstatat(AT_EMPTY_PATH) fast path to statx(AT_EMPTY_PATH) as well, and add AT_NO_PATH (the name is suggested by Mateusz) which makes statx and fstatat completely skip the path check and assume the path is empty. Benchmark on LoongArch Loongson 3A6000: 1. Unpatched kernel, Glibc fstat, actually statx(AT_EMPTY_PATH): 2575328 ops/s 2. Patched kernel, Glibc fstat, actually statx(AT_EMPTY_PATH): 5434782 ops/s, +111% from 1 3. Patched kernel, statx(AT_NO_PATH): 5773672 ops/s, +124% from 1, +6.2% from 2. Seccomp sandboxes can also green light fstatat/statx(AT_NO_PATH) easier than fstatat/statx(AT_EMPTY_PATH) for which the audition needs to check 5434782543478254347825434782the path but seccomp BPF program cannot do that now. Link: https://sourceware.org/pipermail/libc-alpha/2023-September/151364.html Link: https://sourceware.org/git/?p=glibc.git;a=commit;h=e6547d635b99 Link: https://sourceware.org/git/?p=glibc.git;a=commit;h=551101e8240b Link: https://lore.kernel.org/loongarch/599df4a3-47a4-49be-9c81-8e21ea1f988a@xxxxxxxxxx/ Cc: Christian Brauner <brauner@xxxxxxxxxx> Cc: Alexander Viro <viro@xxxxxxxxxxxxxxxxxx> Cc: Jan Kara <jack@xxxxxxx> Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> Cc: Mateusz Guzik <mjguzik@xxxxxxxxx> Cc: Alejandro Colomar <alx@xxxxxxxxxx> Cc: Arnd Bergmann <arnd@xxxxxxxx> Cc: Huacai Chen <chenhuacai@xxxxxxxxxxx> Cc: Xuerui Wang <kernel@xxxxxxxxxx> Cc: Jiaxun Yang <jiaxun.yang@xxxxxxxxxxx> Cc: Icenowy Zheng <uwu@xxxxxxxxxx> Cc: linux-fsdevel@xxxxxxxxxxxxxxx Cc: linux-arch@xxxxxxxxxxxxxxx Cc: loongarch@xxxxxxxxxxxxxxx Cc: linux-kernel@xxxxxxxxxxxxxxx Signed-off-by: Xi Ruoyao <xry111@xxxxxxxxxxx> --- Superseds https://lore.kernel.org/all/20240622105621.7922-1-xry111@xxxxxxxxxxx/. fs/stat.c | 103 +++++++++++++++++++++++++------------ include/uapi/linux/fcntl.h | 3 ++ 2 files changed, 74 insertions(+), 32 deletions(-) diff --git a/fs/stat.c b/fs/stat.c index 70bd3e888cfa..2b7d4a22f971 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -208,7 +208,7 @@ int getname_statx_lookup_flags(int flags) lookup_flags |= LOOKUP_FOLLOW; if (!(flags & AT_NO_AUTOMOUNT)) lookup_flags |= LOOKUP_AUTOMOUNT; - if (flags & AT_EMPTY_PATH) + if (flags & (AT_EMPTY_PATH | AT_NO_PATH)) lookup_flags |= LOOKUP_EMPTY; return lookup_flags; @@ -217,7 +217,8 @@ int getname_statx_lookup_flags(int flags) /** * vfs_statx - Get basic and extra attributes by filename * @dfd: A file descriptor representing the base dir for a relative filename - * @filename: The name of the file of interest + * @filename: The name of the file of interest, or NULL if the file of + interest is dfd itself and dfd isn't AT_FDCWD * @flags: Flags to control the query * @stat: The result structure to fill in. * @request_mask: STATX_xxx flags indicating what the caller wants @@ -232,42 +233,56 @@ int getname_statx_lookup_flags(int flags) static int vfs_statx(int dfd, struct filename *filename, int flags, struct kstat *stat, u32 request_mask) { - struct path path; + struct path path, *p; + struct fd f; unsigned int lookup_flags = getname_statx_lookup_flags(flags); int error; if (flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | AT_EMPTY_PATH | - AT_STATX_SYNC_TYPE)) + AT_STATX_SYNC_TYPE | AT_NO_PATH)) return -EINVAL; retry: - error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); - if (error) - goto out; + if (filename) { + p = &path; + error = filename_lookup(dfd, filename, lookup_flags, p, + NULL); + if (error) + goto out; + } else { + f = fdget_raw(dfd); + if (!f.file) + return -EBADF; + p = &f.file->f_path; + } - error = vfs_getattr(&path, stat, request_mask, flags); + error = vfs_getattr(p, stat, request_mask, flags); if (request_mask & STATX_MNT_ID_UNIQUE) { - stat->mnt_id = real_mount(path.mnt)->mnt_id_unique; + stat->mnt_id = real_mount(p->mnt)->mnt_id_unique; stat->result_mask |= STATX_MNT_ID_UNIQUE; } else { - stat->mnt_id = real_mount(path.mnt)->mnt_id; + stat->mnt_id = real_mount(p->mnt)->mnt_id; stat->result_mask |= STATX_MNT_ID; } - if (path.mnt->mnt_root == path.dentry) + if (p->mnt->mnt_root == p->dentry) stat->attributes |= STATX_ATTR_MOUNT_ROOT; stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT; /* Handle STATX_DIOALIGN for block devices. */ if (request_mask & STATX_DIOALIGN) { - struct inode *inode = d_backing_inode(path.dentry); + struct inode *inode = d_backing_inode(p->dentry); if (S_ISBLK(inode->i_mode)) bdev_statx_dioalign(inode, stat); } - path_put(&path); + if (filename) + path_put(p); + else + fdput(f); + if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; @@ -276,33 +291,53 @@ static int vfs_statx(int dfd, struct filename *filename, int flags, return error; } -int vfs_fstatat(int dfd, const char __user *filename, - struct kstat *stat, int flags) +static struct filename *getname_statx(int dfd, const char __user *filename, + int flags) { - int ret; - int statx_flags = flags | AT_NO_AUTOMOUNT; - struct filename *name; + int r; + char c; + bool no_path = false; + + if (dfd < 0 && dfd != AT_FDCWD) + return ERR_PTR(-EBADF); /* - * Work around glibc turning fstat() into fstatat(AT_EMPTY_PATH) + * Work around glibc turning fstat() into fstatat(AT_EMPTY_PATH) or + * statx(AT_EMPTY_PATH) * * If AT_EMPTY_PATH is set, we expect the common case to be that * empty path, and avoid doing all the extra pathname work. */ - if (dfd >= 0 && flags == AT_EMPTY_PATH) { - char c; + if (flags & AT_NO_PATH) + no_path = true; + else if (flags & AT_EMPTY_PATH) { + r = get_user(c, filename); + if (unlikely(r)) + return ERR_PTR(r); + no_path = likely(!c); + } - ret = get_user(c, filename); - if (unlikely(ret)) - return ret; + if (no_path) + return dfd == AT_FDCWD ? getname_kernel("") : NULL; - if (likely(!c)) - return vfs_fstat(dfd, stat); - } + return getname_flags(filename, getname_statx_lookup_flags(flags), + NULL); +} + +int vfs_fstatat(int dfd, const char __user *filename, + struct kstat *stat, int flags) +{ + int ret; + int statx_flags = flags | AT_NO_AUTOMOUNT; + struct filename *name = getname_statx(dfd, filename, flags); + + if (IS_ERR(name)) + return PTR_ERR(name); - name = getname_flags(filename, getname_statx_lookup_flags(statx_flags), NULL); ret = vfs_statx(dfd, name, statx_flags, stat, STATX_BASIC_STATS); - putname(name); + + if (name) + putname(name); return ret; } @@ -703,11 +738,15 @@ SYSCALL_DEFINE5(statx, struct statx __user *, buffer) { int ret; - struct filename *name; + struct filename *name = getname_statx(dfd, filename, flags); + + if (IS_ERR(name)) + return PTR_ERR(name); - name = getname_flags(filename, getname_statx_lookup_flags(flags), NULL); ret = do_statx(dfd, name, flags, mask, buffer); - putname(name); + + if (name) + putname(name); return ret; } diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index c0bcc185fa48..470b5c77000c 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -113,6 +113,9 @@ #define AT_STATX_DONT_SYNC 0x4000 /* - Don't sync attributes with the server */ #define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */ +#define AT_NO_PATH 0x10000 /* Ignore relative pathname, + behave as if AT_EMPTY_PATH and + the relative pathname is empty */ /* Flags for name_to_handle_at(2). We reuse AT_ flag space to save bits... */ #define AT_HANDLE_FID AT_REMOVEDIR /* file handle is needed to -- 2.45.2