Okay, I put in a tracepoint (patch attached) and got a trace from the life of the offending mount object. I've cropped non-useful information out of the lines, inserted a blank line every time the usage count goes down to 2 and dropped most of the lines generated by fsnotify. Most of the lines are irrelevant. You can see system calls being issued and commands being run that make no difference on balance. What matters are the first four lines, the two mounts and the umount. You can see it go splat on the last line when the usage count has become poisoned. bash-3597 M=93785f8a u=1 ALC sp=clone_mnt+0x31/0x30a bash-3597 M=93785f8a u=2 GET sp=do_dentry_open+0x24/0x2d3 bash-3597 M=93785f8a u=1 PUT sp=__se_sys_open_tree+0x195/0x1da ^--- These three lines look like the open_tree() syscall done by test-fsmount. bash-3597 M=93785f8a u=2 GET sp=set_fs_pwd+0x37/0xdb ^--- And this the fchdir() syscall from test-fsmount. At this point u=2 would seem correct as the subtree isn't actually mounted anywhere (1 for pwd, 1 for fd). v--- test-fsmount then called execl() on bash, which did some stat'ing to find the executable... bash-3597 M=93785f8a u=3 GET sp=legitimize_path.isra.7+0x16/0x50 bash-3597 M=93785f8a u=2 PUT sp=vfs_statx+0x95/0xcc bash-3597 M=93785f8a u=3 GET sp=legitimize_path.isra.7+0x16/0x50 bash-3597 M=93785f8a u=2 PUT sp=vfs_statx+0x95/0xcc v--- and then exec'd it. bash-3597 M=93785f8a u=3 GET sp=legitimize_path.isra.7+0x16/0x50 bash-3597 M=93785f8a u=4 GET sp=do_dentry_open+0x24/0x2d3 bash-3597 M=93785f8a u=3 PUT sp=terminate_walk+0x20/0xda bash-3597 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50 bash-3597 M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc bash-3597 M=93785f8a u=2 PUT sp=__fput+0x180/0x1c1 v--- bash then did stuff: bash-3597 M=93785f8a u=3 GET sp=legitimize_path.isra.7+0x16/0x50 bash-3597 M=93785f8a u=2 PUT sp=vfs_statx+0x95/0xcc bash-3597 M=93785f8a u=3 GET sp=copy_fs_struct+0xcc/0xde grepconf.sh-3598 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50 grepconf.sh-3598 M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc grepconf.sh-3598 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50 grepconf.sh-3598 M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc grepconf.sh-3598 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50 grepconf.sh-3598 M=93785f8a u=5 GET sp=do_dentry_open+0x24/0x2d3 grepconf.sh-3598 M=93785f8a u=4 PUT sp=terminate_walk+0x20/0xda grepconf.sh-3598 M=93785f8a u=5 GET sp=legitimize_path.isra.7+0x16/0x50 grepconf.sh-3598 M=93785f8a u=4 PUT sp=vfs_statx+0x95/0xcc grepconf.sh-3598 M=93785f8a u=3 PUT sp=__fput+0x180/0x1c1 grepconf.sh-3598 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50 grepconf.sh-3598 M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc grepconf.sh-3598 M=93785f8a u=4 GET sp=copy_fs_struct+0xcc/0xde grep-3599 M=93785f8a u=3 PUT sp=free_fs_struct+0x1e/0x2e grepconf.sh-3598 M=93785f8a u=2 PUT sp=free_fs_struct+0x1e/0x2e bash-3597 M=93785f8a u=3 GET sp=copy_fs_struct+0xcc/0xde bash-3600 M=93785f8a u=4 GET sp=copy_fs_struct+0xcc/0xde tty-3601 M=93785f8a u=3 PUT sp=free_fs_struct+0x1e/0x2e bash-3600 M=93785f8a u=4 GET sp=copy_fs_struct+0xcc/0xde tput-3602 M=93785f8a u=3 PUT sp=free_fs_struct+0x1e/0x2e bash-3600 M=93785f8a u=2 PUT sp=free_fs_struct+0x1e/0x2e bash-3597 M=93785f8a u=3 GET sp=copy_fs_struct+0xcc/0xde bash-3603 M=93785f8a u=4 GET sp=copy_fs_struct+0xcc/0xde dircolors-3604 M=93785f8a u=3 PUT sp=free_fs_struct+0x1e/0x2e bash-3603 M=93785f8a u=2 PUT sp=free_fs_struct+0x1e/0x2e bash-3597 M=93785f8a u=3 GET sp=copy_fs_struct+0xcc/0xde grep-3605 M=93785f8a u=2 PUT sp=free_fs_struct+0x1e/0x2e bash-3597 M=93785f8a u=3 GET sp=copy_fs_struct+0xcc/0xde grepconf.sh-3606 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50 grepconf.sh-3606 M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc grepconf.sh-3606 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50 grepconf.sh-3606 M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc grepconf.sh-3606 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50 grepconf.sh-3606 M=93785f8a u=5 GET sp=do_dentry_open+0x24/0x2d3 grepconf.sh-3606 M=93785f8a u=4 PUT sp=terminate_walk+0x20/0xda grepconf.sh-3606 M=93785f8a u=5 GET sp=legitimize_path.isra.7+0x16/0x50 grepconf.sh-3606 M=93785f8a u=4 PUT sp=vfs_statx+0x95/0xcc grepconf.sh-3606 M=93785f8a u=3 PUT sp=__fput+0x180/0x1c1 grepconf.sh-3606 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50 grepconf.sh-3606 M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc grepconf.sh-3606 M=93785f8a u=4 GET sp=copy_fs_struct+0xcc/0xde grep-3607 M=93785f8a u=3 PUT sp=free_fs_struct+0x1e/0x2e grepconf.sh-3606 M=93785f8a u=2 PUT sp=free_fs_struct+0x1e/0x2e bash-3597 M=93785f8a u=3 GET sp=copy_fs_struct+0xcc/0xde grepconf.sh-3608 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50 grepconf.sh-3608 M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc grepconf.sh-3608 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50 grepconf.sh-3608 M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc grepconf.sh-3608 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50 grepconf.sh-3608 M=93785f8a u=5 GET sp=do_dentry_open+0x24/0x2d3 grepconf.sh-3608 M=93785f8a u=4 PUT sp=terminate_walk+0x20/0xda grepconf.sh-3608 M=93785f8a u=5 GET sp=legitimize_path.isra.7+0x16/0x50 grepconf.sh-3608 M=93785f8a u=4 PUT sp=vfs_statx+0x95/0xcc grepconf.sh-3608 M=93785f8a u=3 PUT sp=__fput+0x180/0x1c1 grepconf.sh-3608 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50 grepconf.sh-3608 M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc grepconf.sh-3608 M=93785f8a u=4 GET sp=copy_fs_struct+0xcc/0xde grep-3609 M=93785f8a u=3 PUT sp=free_fs_struct+0x1e/0x2e grepconf.sh-3608 M=93785f8a u=2 PUT sp=free_fs_struct+0x1e/0x2e bash-3597 M=93785f8a u=3 GET sp=legitimize_path.isra.7+0x16/0x50 bash-3597 M=93785f8a u=2 PUT sp=vfs_statx+0x95/0xcc I ran "mount --move . /mnt": bash-3597 M=93785f8a u=3 GET sp=copy_fs_struct+0xcc/0xde mount-3610 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50 mount-3610 M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc mount-3610 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50 mount-3610 M=93785f8a u=5 GET sp=do_dentry_open+0x24/0x2d3 mount-3610 M=93785f8a u=4 PUT sp=terminate_walk+0x20/0xda mount-3610 M=93785f8a u=5 GET sp=legitimize_path.isra.7+0x16/0x50 mount-3610 M=93785f8a u=4 PUT sp=vfs_statx+0x95/0xcc mount-3610 M=93785f8a u=3 PUT sp=__fput+0x180/0x1c1 mount-3610 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50 mount-3610 M=93785f8a u=3 PUT sp=do_mount+0x715/0x929 mount-3610 M=93785f8a u=2 PUT sp=free_fs_struct+0x1e/0x2e which worked. Herein lieth the problem. The usage count should be 3 now (1 for pwd, 1 for fd, 1 for mount) - but how does VFS know that this mount object isn't mounted yet? I ran "mount --move /mnt /mnt": bash-3597 M=93785f8a u=3 GET sp=copy_fs_struct+0xcc/0xde mount-3611 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50 mount-3611 M=93785f8a u=3 PUT sp=vfs_statx+0x95/0xcc mount-3611 M=93785f8a u=4 GET sp=legitimize_path.isra.7+0x16/0x50 mount-3611 M=93785f8a u=5 GET sp=legitimize_path.isra.7+0x16/0x50 mount-3611 M=93785f8a u=4 PUT sp=do_mount+0x715/0x929 mount-3611 M=93785f8a u=3 PUT sp=do_mount+0x1cf/0x929 mount-3611 M=93785f8a u=2 PUT sp=free_fs_struct+0x1e/0x2e which failed with ELOOP. I ran "cd": bash-3597 M=93785f8a u=1 PUT sp=set_fs_pwd+0xb8/0xdb I ran "umount /mnt": umount-3612 M=93785f8a u=2 GET sp=legitimize_path.isra.7+0x16/0x50 umount-3612 M=93785f8a u=1 PUT sp=vfs_statx+0x95/0xcc umount-3612 M=93785f8a u=2 GET sp=legitimize_path.isra.7+0x16/0x50 umount-3612 M=93785f8a u=1 PUT sp=vfs_statx+0x95/0xcc umount-3612 M=93785f8a u=2 GET sp=legitimize_path.isra.7+0x16/0x50 umount-3612 M=93785f8a u=1 PUT sp=user_statfs+0x61/0x98 umount-3612 M=93785f8a u=2 GET sp=legitimize_mnt+0x12/0x108 umount-3612 M=93785f8a u=1 PUT sp=pin_kill+0x11c/0x325 umount-3612 M=93785f8a u=0 PUT sp=ksys_umount+0x3e8/0x40e umount-3612 M=93785f8a u=0 FRE sp=cleanup_mnt+0x4d/0x5e And finally, I exited the shell, which then tried to release the fd inherited from open_tree(): bash-3597 M=93785f8a u=1802201963 NFY sp=__fput+0xac/0x1c1 Note that the subtree attached to the fd has not at this point been directly mounted by move_mount(), but implicitly mounted by fchdir() into it and then using mount(MS_MOVE) of "." to "/mnt". Note also that if I run the commands all as one go rather than pasting them individually, a crash occurs in pin_kill() instead. So, I'm not sure how to proceed from here. There's no easy way to remove the FMODE_NEED_UNMOUNT flag left by open_tree(). David --- commit e7c8e6590aa0dd3bf2e10450b8992d496c44be8b Author: David Howells <dhowells@xxxxxxxxxx> Date: Fri Oct 19 10:38:35 2018 +0100 mnt_count trace diff --git a/fs/mount.h b/fs/mount.h index f39bc9da4d73..124a6dd73936 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -20,7 +20,7 @@ struct mnt_namespace { } __randomize_layout; struct mnt_pcp { - int mnt_count; + int mnt_countxxx; int mnt_writers; }; @@ -46,6 +46,7 @@ struct mount { int mnt_count; int mnt_writers; #endif + atomic_t mnt_count; struct list_head mnt_mounts; /* list of children, anchored here */ struct list_head mnt_child; /* and going through their mnt_child */ struct list_head mnt_instance; /* mount instance on sb->s_mounts */ diff --git a/fs/namei.c b/fs/namei.c index fb913148d4d1..da1489f6925c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -460,32 +460,6 @@ int inode_permission(struct inode *inode, int mask) } EXPORT_SYMBOL(inode_permission); -/** - * path_get - get a reference to a path - * @path: path to get the reference to - * - * Given a path increment the reference count to the dentry and the vfsmount. - */ -void path_get(const struct path *path) -{ - mntget(path->mnt); - dget(path->dentry); -} -EXPORT_SYMBOL(path_get); - -/** - * path_put - put a reference to a path - * @path: path to put the reference to - * - * Given a path decrement the reference count to the dentry and the vfsmount. - */ -void path_put(const struct path *path) -{ - dput(path->dentry); - mntput(path->mnt); -} -EXPORT_SYMBOL(path_put); - #define EMBEDDED_LEVELS 2 struct nameidata { struct path path; diff --git a/fs/namespace.c b/fs/namespace.c index 6370bfabec99..389e806e1a65 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -29,6 +29,8 @@ #include <linux/sched/task.h> #include <uapi/linux/mount.h> #include <linux/fs_context.h> +#define CREATE_TRACE_POINTS +#include <trace/events/mnt.h> #include "pnode.h" #include "internal.h" @@ -109,8 +111,10 @@ static int mnt_alloc_id(struct mount *mnt) return 0; } -static void mnt_free_id(struct mount *mnt) +static noinline void mnt_free_id(struct mount *mnt) { + trace_mnt_count(mnt, mnt->mnt_id, atomic_read(&mnt->mnt_count), 99, + __builtin_return_address(0)); ida_free(&mnt_id_ida, mnt->mnt_id); } @@ -141,6 +145,9 @@ void mnt_release_group_id(struct mount *mnt) */ static inline void mnt_add_count(struct mount *mnt, int n) { + int u; + +#if 0 #ifdef CONFIG_SMP this_cpu_add(mnt->mnt_pcp->mnt_count, n); #else @@ -148,6 +155,9 @@ static inline void mnt_add_count(struct mount *mnt, int n) mnt->mnt_count += n; preempt_enable(); #endif +#endif + u = atomic_add_return(n, &mnt->mnt_count); + trace_mnt_count(mnt, mnt->mnt_id, u, n, __builtin_return_address(0)); } /* @@ -155,6 +165,7 @@ static inline void mnt_add_count(struct mount *mnt, int n) */ unsigned int mnt_get_count(struct mount *mnt) { +#if 0 #ifdef CONFIG_SMP unsigned int count = 0; int cpu; @@ -167,6 +178,8 @@ unsigned int mnt_get_count(struct mount *mnt) #else return mnt->mnt_count; #endif +#endif + return atomic_read(&mnt->mnt_count); } static void drop_mountpoint(struct fs_pin *p) @@ -198,11 +211,15 @@ static struct mount *alloc_vfsmnt(const char *name) if (!mnt->mnt_pcp) goto out_free_devname; +#if 0 this_cpu_add(mnt->mnt_pcp->mnt_count, 1); +#endif #else mnt->mnt_count = 1; mnt->mnt_writers = 0; #endif + atomic_set(&mnt->mnt_count, 1); + trace_mnt_count(mnt, mnt->mnt_id, 1, 0, __builtin_return_address(0)); INIT_HLIST_NODE(&mnt->mnt_hash); INIT_LIST_HEAD(&mnt->mnt_child); @@ -1128,7 +1145,7 @@ static void mntput_no_expire(struct mount *mnt) cleanup_mnt(mnt); } -void mntput(struct vfsmount *mnt) +inline void mntput(struct vfsmount *mnt) { if (mnt) { struct mount *m = real_mount(mnt); @@ -1140,7 +1157,7 @@ void mntput(struct vfsmount *mnt) } EXPORT_SYMBOL(mntput); -struct vfsmount *mntget(struct vfsmount *mnt) +inline struct vfsmount *mntget(struct vfsmount *mnt) { if (mnt) mnt_add_count(real_mount(mnt), 1); @@ -3970,3 +3987,29 @@ const struct proc_ns_operations mntns_operations = { .install = mntns_install, .owner = mntns_owner, }; + +/** + * path_get - get a reference to a path + * @path: path to get the reference to + * + * Given a path increment the reference count to the dentry and the vfsmount. + */ +void path_get(const struct path *path) +{ + mntget(path->mnt); + dget(path->dentry); +} +EXPORT_SYMBOL(path_get); + +/** + * path_put - put a reference to a path + * @path: path to put the reference to + * + * Given a path decrement the reference count to the dentry and the vfsmount. + */ +void path_put(const struct path *path) +{ + dput(path->dentry); + mntput(path->mnt); +} +EXPORT_SYMBOL(path_put); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index ababdbfab537..aaef44d6204c 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -23,6 +23,7 @@ #include <linux/module.h> #include <linux/mount.h> #include <linux/srcu.h> +#include <trace/events/mnt.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" @@ -324,10 +325,13 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, /* global tests shouldn't care about events on child only the specific event */ __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); - if (data_is == FSNOTIFY_EVENT_PATH) + if (data_is == FSNOTIFY_EVENT_PATH) { mnt = real_mount(((const struct path *)data)->mnt); - else + trace_mnt_count(mnt, mnt->mnt_id, atomic_read(&mnt->mnt_count), 88, + __builtin_return_address(0)); + } else { mnt = NULL; + } /* * Optimization: srcu_read_lock() has a memory barrier which can diff --git a/include/trace/events/mnt.h b/include/trace/events/mnt.h new file mode 100644 index 000000000000..da1a981f1a61 --- /dev/null +++ b/include/trace/events/mnt.h @@ -0,0 +1,57 @@ +/* Mount tracepoints + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@xxxxxxxxxx) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM mnt + +#if !defined(_TRACE_MNT_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_MNT_H + +#include <linux/tracepoint.h> + +TRACE_EVENT(mnt_count, + TP_PROTO(const void *mnt, int mnt_id, int mnt_count, + int delta, const void *where), + + TP_ARGS(mnt, mnt_id, mnt_count, delta, where), + + TP_STRUCT__entry( + __field(int, mnt_id ) + __field(int, mnt_count ) + __field(int, delta ) + __field(const void *, mnt ) + __field(const void *, where ) + ), + + TP_fast_assign( + __entry->mnt_id = mnt_id; + __entry->mnt_count = mnt_count; + __entry->delta = delta; + __entry->mnt = mnt; + __entry->where = where; + ), + + TP_printk("M=%p m=%08x u=%d %s sp=%pSR", + __entry->mnt, + __entry->mnt_id, + __entry->mnt_count, + __print_symbolic(__entry->delta, + { 0, "ALC" }, + { 1, "GET" }, + { -1, "PUT" }, + { 88, "NFY" }, + { 99, "FRE" }), + __entry->where) + ); + +#endif /* _TRACE_MNT_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h>