Re: [PATCH v2] fanotify: notify on mount attach and detach

Christian Brauner <brauner@xxxxxxxxxx> · Sat, 7 Dec 2024 22:17:13 +0100

On Fri, Dec 06, 2024 at 05:00:56PM +0100, Christian Brauner wrote:
> On Fri, Dec 06, 2024 at 04:11:52PM +0100, Miklos Szeredi wrote:
> > Add notifications for attaching and detaching mounts.  The following new
> > event masks are added:
> > 
> >   FAN_MNT_ATTACH  - Mount was attached
> >   FAN_MNT_DETACH  - Mount was detached
> > 
> > If a mount is moved, then the event is reported with (FAN_MNT_ATTACH |
> > FAN_MNT_DETACH).
> > 
> > These events add an info record of type FAN_EVENT_INFO_TYPE_MNT containing
> > these fields identifying the affected mounts:
> > 
> >   __u64 mnt_id    - the ID of the mount (see statmount(2))
> > 
> > FAN_REPORT_MNT must be supplied to fanotify_init() to receive these events
> > and no other type of event can be received with this report type.
> > 
> > Marks are added with FAN_MARK_MNTNS, which records the mount namespace
> > belonging to the supplied path.
> > 
> > Prior to this patch mount namespace changes could be monitored by polling
> > /proc/self/mountinfo, which did not convey any information about what
> > changed.
> > 
> > Signed-off-by: Miklos Szeredi <mszeredi@xxxxxxxxxx>
> > ---
> >  fs/mount.h                         |  11 +++
> >  fs/namespace.c                     | 120 ++++++++++++++++++++++-------
> >  fs/notify/fanotify/fanotify.c      |  56 ++++++++++----
> >  fs/notify/fanotify/fanotify.h      |  18 +++++
> >  fs/notify/fanotify/fanotify_user.c |  70 ++++++++++++++++-
> >  fs/notify/fdinfo.c                 |   2 +
> >  fs/notify/fsnotify.c               |  44 +++++++++--
> >  fs/notify/fsnotify.h               |  11 +++
> >  fs/notify/mark.c                   |  14 +++-
> >  fs/pnode.c                         |   4 +-
> >  include/linux/fanotify.h           |  14 ++--
> >  include/linux/fsnotify.h           |  20 +++++
> >  include/linux/fsnotify_backend.h   |  40 +++++++++-
> >  include/linux/mnt_namespace.h      |   5 ++
> >  include/uapi/linux/fanotify.h      |  10 +++
> >  security/selinux/hooks.c           |   4 +
> >  16 files changed, 384 insertions(+), 59 deletions(-)
> > 
> > v2:
> > 	- notify for whole namespace as this seems to be what people prefer
> > 	- move fsnotify() calls outside of mount_lock
> > 	- only report mnt_id, not parent_id
> > 
> > diff --git a/fs/mount.h b/fs/mount.h
> > index 185fc56afc13..a79232a8c908 100644
> > --- a/fs/mount.h
> > +++ b/fs/mount.h
> > @@ -14,6 +14,10 @@ struct mnt_namespace {
> >  	u64			seq;	/* Sequence number to prevent loops */
> >  	wait_queue_head_t poll;
> >  	u64 event;
> > +#ifdef CONFIG_FSNOTIFY
> > +	__u32			n_fsnotify_mask;
> > +	struct fsnotify_mark_connector __rcu *n_fsnotify_marks;
> > +#endif
> >  	unsigned int		nr_mounts; /* # of mounts in the namespace */
> >  	unsigned int		pending_mounts;
> >  	struct rb_node		mnt_ns_tree_node; /* node in the mnt_ns_tree */
> > @@ -77,6 +81,13 @@ struct mount {
> >  	int mnt_expiry_mark;		/* true if marked for expiry */
> >  	struct hlist_head mnt_pins;
> >  	struct hlist_head mnt_stuck_children;
> > +
> > +	/*
> > +	 * for mount notification
> > +	 * FIXME: maybe move to a union with some other fields?
> > +	 */
> > +	struct list_head to_notify; /* singly linked list? */
> > +	struct mnt_namespace *prev_ns;
> >  } __randomize_layout;
> >  
> >  #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */
> > diff --git a/fs/namespace.c b/fs/namespace.c
> > index 23e81c2a1e3f..b376570544a7 100644
> > --- a/fs/namespace.c
> > +++ b/fs/namespace.c
> > @@ -139,12 +139,13 @@ static void mnt_ns_tree_add(struct mnt_namespace *ns)
> >  	rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
> >  }
> >  
> > -static void mnt_ns_release(struct mnt_namespace *ns)
> > +void mnt_ns_release(struct mnt_namespace *ns)
> >  {
> >  	lockdep_assert_not_held(&mnt_ns_tree_lock);
> >  
> >  	/* keep alive for {list,stat}mount() */
> >  	if (refcount_dec_and_test(&ns->passive)) {
> > +		fsnotify_mntns_delete(ns);
> >  		put_user_ns(ns->user_ns);
> >  		kfree(ns);
> >  	}
> > @@ -1119,7 +1120,16 @@ static inline struct mount *node_to_mount(struct rb_node *node)
> >  	return node ? rb_entry(node, struct mount, mnt_node) : NULL;
> >  }
> >  
> > -static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
> > +static void queue_notify(struct mnt_namespace *ns, struct mount *m, struct list_head *notif)
> > +{
> > +	/* Optimize the case where there are no watches */
> > +	if (ns->n_fsnotify_marks)
> > +		list_add_tail(&m->to_notify, notif);
> > +	else
> > +		m->prev_ns = m->mnt_ns;
> > +}
> > +
> > +static void __mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
> >  {
> >  	struct rb_node **link = &ns->mounts.rb_node;
> >  	struct rb_node *parent = NULL;
> > @@ -1138,10 +1148,37 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
> >  	mnt->mnt.mnt_flags |= MNT_ONRB;
> >  }
> >  
> > +static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt, struct list_head *notif)
> > +{
> > +	__mnt_add_to_ns(ns, mnt);
> > +	queue_notify(ns, mnt, notif);
> > +}
> > +
> > +static void notify_mounts(struct list_head *head)
> > +{
> > +	struct mount *p;
> > +
> > +	while (!list_empty(head)) {
> > +		p = list_first_entry(head, struct mount, to_notify);
> > +		if (!p->prev_ns && p->mnt_ns) {
> > +			fsnotify_mnt_attach(p->mnt_ns, &p->mnt);
> > +		} else if (p->prev_ns && !p->mnt_ns) {
> > +			fsnotify_mnt_detach(p->prev_ns, &p->mnt);
> > +		} else if (p->prev_ns == p->mnt_ns) {
> > +			fsnotify_mnt_move(p->mnt_ns, &p->mnt);
> > +		} else	{
> > +			fsnotify_mnt_detach(p->prev_ns, &p->mnt);
> > +			fsnotify_mnt_attach(p->mnt_ns, &p->mnt);
> > +		}
> > +		p->prev_ns = p->mnt_ns;
> > +		list_del_init(&p->to_notify);
> > +	}
> > +}
> > +
> >  /*
> >   * vfsmount lock must be held for write
> >   */
> > -static void commit_tree(struct mount *mnt)
> > +static void commit_tree(struct mount *mnt, struct list_head *notif)
> >  {
> >  	struct mount *parent = mnt->mnt_parent;
> >  	struct mount *m;
> > @@ -1155,7 +1192,7 @@ static void commit_tree(struct mount *mnt)
> >  		m = list_first_entry(&head, typeof(*m), mnt_list);
> >  		list_del(&m->mnt_list);
> >  
> > -		mnt_add_to_ns(n, m);
> > +		mnt_add_to_ns(n, m, notif);
> >  	}
> >  	n->nr_mounts += n->pending_mounts;
> >  	n->pending_mounts = 0;
> > @@ -1752,7 +1789,7 @@ static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how)
> >   * mount_lock must be held
> >   * namespace_sem must be held for write
> >   */
> > -static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
> > +static void umount_tree(struct mount *mnt, struct list_head *notif, enum umount_tree_flags how)
> >  {
> >  	LIST_HEAD(tmp_list);
> >  	struct mount *p;
> > @@ -1785,11 +1822,12 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
> >  		list_del_init(&p->mnt_expire);
> >  		list_del_init(&p->mnt_list);
> >  		ns = p->mnt_ns;
> > +		p->mnt_ns = NULL;
> >  		if (ns) {
> >  			ns->nr_mounts--;
> >  			__touch_mnt_namespace(ns);
> > +			queue_notify(ns, p, notif);
> >  		}
> > -		p->mnt_ns = NULL;
> >  		if (how & UMOUNT_SYNC)
> >  			p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
> >  
> > @@ -1809,7 +1847,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
> >  	}
> >  }
> >  
> > -static void shrink_submounts(struct mount *mnt);
> > +static void shrink_submounts(struct mount *mnt, struct list_head *notif);
> >  
> >  static int do_umount_root(struct super_block *sb)
> >  {
> > @@ -1837,6 +1875,7 @@ static int do_umount_root(struct super_block *sb)
> >  static int do_umount(struct mount *mnt, int flags)
> >  {
> >  	struct super_block *sb = mnt->mnt.mnt_sb;
> > +	LIST_HEAD(notif);
> >  	int retval;
> >  
> >  	retval = security_sb_umount(&mnt->mnt, flags);
> > @@ -1914,20 +1953,21 @@ static int do_umount(struct mount *mnt, int flags)
> >  	if (flags & MNT_DETACH) {
> >  		if (mnt->mnt.mnt_flags & MNT_ONRB ||
> >  		    !list_empty(&mnt->mnt_list))
> > -			umount_tree(mnt, UMOUNT_PROPAGATE);
> > +			umount_tree(mnt, &notif, UMOUNT_PROPAGATE);
> >  		retval = 0;
> >  	} else {
> > -		shrink_submounts(mnt);
> > +		shrink_submounts(mnt, &notif);
> >  		retval = -EBUSY;
> >  		if (!propagate_mount_busy(mnt, 2)) {
> >  			if (mnt->mnt.mnt_flags & MNT_ONRB ||
> >  			    !list_empty(&mnt->mnt_list))
> > -				umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
> > +				umount_tree(mnt, &notif, UMOUNT_PROPAGATE|UMOUNT_SYNC);
> >  			retval = 0;
> >  		}
> >  	}
> >  out:
> >  	unlock_mount_hash();
> > +	notify_mounts(&notif);
> >  	namespace_unlock();
> >  	return retval;
> >  }
> > @@ -1946,6 +1986,7 @@ void __detach_mounts(struct dentry *dentry)
> >  {
> >  	struct mountpoint *mp;
> >  	struct mount *mnt;
> > +	LIST_HEAD(notif);
> >  
> >  	namespace_lock();
> >  	lock_mount_hash();
> > @@ -1960,11 +2001,12 @@ void __detach_mounts(struct dentry *dentry)
> >  			umount_mnt(mnt);
> >  			hlist_add_head(&mnt->mnt_umount, &unmounted);
> >  		}
> > -		else umount_tree(mnt, UMOUNT_CONNECTED);
> > +		else umount_tree(mnt, &notif, UMOUNT_CONNECTED);
> >  	}
> >  	put_mountpoint(mp);
> >  out_unlock:
> >  	unlock_mount_hash();
> > +	notify_mounts(&notif);
> 
> For unmount we shouldn't need to do this under the namespace semaphore.
> Instead we should be able to do this in namespace_unlock() after we've
> given up the namespace semaphore and making the umount fsnotify
> notification list global. So in the end you can then do sm like:
> 
> diff --git a/fs/namespace.c b/fs/namespace.c
> index b376570544a7..70e67f1490eb 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -1733,6 +1733,7 @@ static void namespace_unlock(void)
>         up_write(&namespace_sem);
> 
>         shrink_dentry_list(&list);
> +       notify_mounts(&notif);
> 
>         if (likely(hlist_empty(&head)))
>                 return;
> 
> Maybe you can even use the @unmounted list for this directly.
> 
> I wonder if we can avoid doing the notifications for mount events
> without calling copy_to_user() under the namespace semaphore as well.
> Something we've so far managed to avoid and it would let us sleep
> easier.

I took another look at f{a,s}notify. There's no copy_to_user() happening
when adding events via fsnotify(). It happens when the caller retrieves
events via read() from the relevant notify file descriptor. We should
still move calls to notify_mounts() out of the namespace semaphore
whenever we can though.