On Wed, Dec 11, 2024 at 04:37:08PM +0100, Miklos Szeredi wrote: > Add notifications for attaching and detaching mounts. The following new > event masks are added: > > FAN_MNT_ATTACH - Mount was attached > FAN_MNT_DETACH - Mount was detached > > If a mount is moved, then the event is reported with (FAN_MNT_ATTACH | > FAN_MNT_DETACH). > > These events add an info record of type FAN_EVENT_INFO_TYPE_MNT containing > these fields identifying the affected mounts: > > __u64 mnt_id - the ID of the mount (see statmount(2)) > > FAN_REPORT_MNT must be supplied to fanotify_init() to receive these events > and no other type of event can be received with this report type. > > Marks are added with FAN_MARK_MNTNS, which records the mount namespace > belonging to the supplied path. > > Prior to this patch mount namespace changes could be monitored by polling > /proc/self/mountinfo, which did not convey any information about what > changed. > > Signed-off-by: Miklos Szeredi <mszeredi@xxxxxxxxxx> > --- I think this starts to look nice and simple for the vfs bits. A lot less violent if we had to reinvent all of the infrastructure. Just a few comments. > > v3: > > - use a global list protected for temporarily storing (Christian) > - move fsnotify_* calls to namespace_unlock() (Christian) > - downgrade namespace_sem to read for fsnotify_* calls (Christian) > - add notification for reparenting in propagate_umount (Christian) > - require nsfs file (/proc/PID/ns/mnt) in fanotify_mark(2) (Christian) > - cleaner check for fsnotify being initialized (Amir) > - fix stub __fsnotify_mntns_delete (kernel test robot) > - don't add FANOTIFY_MOUNT_EVENTS to FANOTIFY_FD_EVENTS (Amir) > > v2: > - notify for whole namespace as this seems to be what people prefer > - move fsnotify() calls outside of mount_lock > - only report mnt_id, not parent_id > > fs/mount.h | 21 ++++++++++ > fs/namespace.c | 62 ++++++++++++++++++++++++++--- > fs/notify/fanotify/fanotify.c | 37 +++++++++++++++-- > fs/notify/fanotify/fanotify.h | 18 +++++++++ > fs/notify/fanotify/fanotify_user.c | 64 ++++++++++++++++++++++++++++-- > fs/notify/fdinfo.c | 2 + > fs/notify/fsnotify.c | 47 ++++++++++++++++++---- > fs/notify/fsnotify.h | 11 +++++ > fs/notify/mark.c | 14 +++++-- > fs/pnode.c | 4 +- > include/linux/fanotify.h | 12 ++++-- > include/linux/fsnotify.h | 20 ++++++++++ > include/linux/fsnotify_backend.h | 40 ++++++++++++++++++- > include/linux/mnt_namespace.h | 1 + > include/uapi/linux/fanotify.h | 10 +++++ > security/selinux/hooks.c | 4 ++ > 16 files changed, 340 insertions(+), 27 deletions(-) > > diff --git a/fs/mount.h b/fs/mount.h > index 185fc56afc13..0ad68c90e7e2 100644 > --- a/fs/mount.h > +++ b/fs/mount.h > @@ -5,6 +5,8 @@ > #include <linux/ns_common.h> > #include <linux/fs_pin.h> > > +extern struct list_head notify_list; > + > struct mnt_namespace { > struct ns_common ns; > struct mount * root; > @@ -14,6 +16,10 @@ struct mnt_namespace { > u64 seq; /* Sequence number to prevent loops */ > wait_queue_head_t poll; > u64 event; > +#ifdef CONFIG_FSNOTIFY > + __u32 n_fsnotify_mask; > + struct fsnotify_mark_connector __rcu *n_fsnotify_marks; > +#endif > unsigned int nr_mounts; /* # of mounts in the namespace */ > unsigned int pending_mounts; > struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */ > @@ -77,6 +83,9 @@ struct mount { > int mnt_expiry_mark; /* true if marked for expiry */ > struct hlist_head mnt_pins; > struct hlist_head mnt_stuck_children; > + > + struct list_head to_notify; /* need to queue notification */ > + struct mnt_namespace *prev_ns; /* previous namespace (NULL if none) */ > } __randomize_layout; > > #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */ > @@ -167,3 +176,15 @@ static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns) > { > return container_of(ns, struct mnt_namespace, ns); > } > + > +static inline void add_notify(struct mount *m) Can we please name this mnt_add_notify() or mnt_notify_add()? > +{ > + /* Optimize the case where there are no watches */ > + if ((m->mnt_ns && m->mnt_ns->n_fsnotify_marks) || > + (m->prev_ns && m->prev_ns->n_fsnotify_marks)) > + list_add_tail(&m->to_notify, ¬ify_list); > + else > + m->prev_ns = m->mnt_ns; > +} > + > +struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry); > diff --git a/fs/namespace.c b/fs/namespace.c > index 6eec7794f707..186f660abd60 100644 > --- a/fs/namespace.c > +++ b/fs/namespace.c > @@ -79,6 +79,7 @@ static struct kmem_cache *mnt_cache __ro_after_init; > static DECLARE_RWSEM(namespace_sem); > static HLIST_HEAD(unmounted); /* protected by namespace_sem */ > static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ > +LIST_HEAD(notify_list); /* protected by namespace_sem */ > static DEFINE_RWLOCK(mnt_ns_tree_lock); > static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */ > > @@ -145,6 +146,7 @@ static void mnt_ns_release(struct mnt_namespace *ns) > > /* keep alive for {list,stat}mount() */ > if (refcount_dec_and_test(&ns->passive)) { > + fsnotify_mntns_delete(ns); > put_user_ns(ns->user_ns); > kfree(ns); > } > @@ -1136,6 +1138,8 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) > rb_link_node(&mnt->mnt_node, parent, link); > rb_insert_color(&mnt->mnt_node, &ns->mounts); > mnt->mnt.mnt_flags |= MNT_ONRB; > + > + add_notify(mnt); > } > > /* > @@ -1683,17 +1687,53 @@ int may_umount(struct vfsmount *mnt) > > EXPORT_SYMBOL(may_umount); > > +static void notify_mount(struct mount *p) Similarly this should please become mnt_notify(). Then we have a nice symmetry: mnt_notify_add() mnt_notify() > +{ > + if (!p->prev_ns && p->mnt_ns) { > + fsnotify_mnt_attach(p->mnt_ns, &p->mnt); > + } else if (p->prev_ns && !p->mnt_ns) { > + fsnotify_mnt_detach(p->prev_ns, &p->mnt); > + } else if (p->prev_ns == p->mnt_ns) { > + fsnotify_mnt_move(p->mnt_ns, &p->mnt); > + } else { > + fsnotify_mnt_detach(p->prev_ns, &p->mnt); > + fsnotify_mnt_attach(p->mnt_ns, &p->mnt); > + } > + p->prev_ns = p->mnt_ns; > +} > + > static void namespace_unlock(void) > { > struct hlist_head head; > struct hlist_node *p; > - struct mount *m; > + struct mount *m, *tmp; > LIST_HEAD(list); > + bool notify = !list_empty(¬ify_list); > > hlist_move_list(&unmounted, &head); > list_splice_init(&ex_mountpoints, &list); > > - up_write(&namespace_sem); > + /* > + * No point blocking out concurrent readers while notifications are > + * sent. This will also allow statmount()/listmount() to run > + * concurrently. > + */ > + if (unlikely(notify)) > + downgrade_write(&namespace_sem); > + > + /* > + * Notify about mounts that were added/reparented/remain connected after > + * unmount. This doesn't mention unmounted mounts in the comment. :) > + */ > + list_for_each_entry_safe(m, tmp, ¬ify_list, to_notify) { > + notify_mount(m); > + list_del_init(&m->to_notify); > + } > + > + if (unlikely(notify)) > + up_read(&namespace_sem); > + else > + up_write(&namespace_sem); > > shrink_dentry_list(&list); > > @@ -1806,6 +1846,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) > change_mnt_propagation(p, MS_PRIVATE); > if (disconnect) > hlist_add_head(&p->mnt_umount, &unmounted); > + add_notify(p); So afaict, calling umount_tree() unconditionally is ok because: (1) copy_tree() will leave mnt->mnt_ns NULL and mnt->prev_ns will be NULL and thus m->prev_ns will be NULL. (2) fanotify will ignore anonymous mount namespaces even though dissolve_on_fput() will call umount_tree() and will end up registering all mounts that are umount if OPEN_TREE_CLONE | AT_RECURSIVE was done. Imho, that is all mighty subtle. So please either makes this explicit by adding the UMOUNT_NOTIFY flag that I had in my diff I sent as a reply to v2 or please add detailed explanatory comments what guarantees and assumptions are made by that function. Thanks! Christian