On Mon, 2024-06-24 at 11:49 -0400, Josef Bacik wrote:
> In order to allow listmount() to be used on different namespaces, we
> need a way to look up a mount ns by its id. Keep an rbtree of the
> current !anonymous mount namespaces indexed by ID that we can use to
> look up the namespace.
>
> Co-developed-by: Christian Brauner <brauner@xxxxxxxxxx>
> Signed-off-by: Josef Bacik <josef@xxxxxxxxxxxxxx>
> Signed-off-by: Christian Brauner <brauner@xxxxxxxxxx>
> ---
>  fs/mount.h     |   2 +
>  fs/namespace.c | 113 ++++++++++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 113 insertions(+), 2 deletions(-)
>
> diff --git a/fs/mount.h b/fs/mount.h
> index 4adce73211ae..ad4b1ddebb54 100644
> --- a/fs/mount.h
> +++ b/fs/mount.h
> @@ -16,6 +16,8 @@ struct mnt_namespace {
>          u64 event;
>          unsigned int nr_mounts; /* # of mounts in the namespace */
>          unsigned int pending_mounts;
> +        struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
> +        refcount_t passive; /* number of references not pinning @mounts */
>  } __randomize_layout;
>
>  struct mnt_pcp {
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 45df82f2a059..babdebdb0a9c 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -78,6 +78,8 @@ static struct kmem_cache *mnt_cache __ro_after_init;
>  static DECLARE_RWSEM(namespace_sem);
>  static HLIST_HEAD(unmounted); /* protected by namespace_sem */
>  static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
> +static DEFINE_RWLOCK(mnt_ns_tree_lock);
> +static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
>
>  struct mount_kattr {
>          unsigned int attr_set;
> @@ -103,6 +105,109 @@ EXPORT_SYMBOL_GPL(fs_kobj);
>   */
>  __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
>
> +static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
> +{
> +        u64 seq_b = ns->seq;
> +
> +        if (seq < seq_b)
> +                return -1;
> +        if (seq > seq_b)
> +                return 1;
> +        return 0;
> +}
> +
> +static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
> +{
> +        if (!node)
> +                return NULL;
> +        return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
> +}
> +
> +static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
> +{
> +        struct mnt_namespace *ns_a = node_to_mnt_ns(a);
> +        struct mnt_namespace *ns_b = node_to_mnt_ns(b);
> +        u64 seq_a = ns_a->seq;
> +
> +        return mnt_ns_cmp(seq_a, ns_b) < 0;
> +}
> +
> +static void mnt_ns_tree_add(struct mnt_namespace *ns)
> +{
> +        guard(write_lock)(&mnt_ns_tree_lock);
> +        rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
> +}
> +
> +static void mnt_ns_release(struct mnt_namespace *ns)
> +{
> +        lockdep_assert_not_held(&mnt_ns_tree_lock);
> +

Why is it bad to hold this lock here? AFAICT, put_user_ns() just does a
schedule_work() when the counter goes to 0. Granted, I don't see a
reason why you would want to hold it here, but usually that sort of
assertion means that it _must_ be forbidden.
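FWIW, my reading of current mainline (so take this as my assumption, not
a claim about this series) is that the final put of a user namespace is
punted to a workqueue, so nothing in this path sleeps even while holding
a spinning rwlock:

/* include/linux/user_namespace.h */
static inline void put_user_ns(struct user_namespace *ns)
{
        if (ns && refcount_dec_and_test(&ns->ns.count))
                __put_user_ns(ns);
}

/* kernel/user_namespace.c */
void __put_user_ns(struct user_namespace *ns)
{
        schedule_work(&ns->work);
}

So kfree() and put_user_ns() both look safe under the lock, which is why
I'm asking whether the assertion documents a hard requirement or just a
convention.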
> +        /* keep alive for {list,stat}mount() */
> +        if (refcount_dec_and_test(&ns->passive)) {
> +                put_user_ns(ns->user_ns);
> +                kfree(ns);
> +        }
> +}
> +DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
> +
> +static void mnt_ns_tree_remove(struct mnt_namespace *ns)
> +{
> +        /* remove from global mount namespace list */
> +        if (!is_anon_ns(ns)) {
> +                guard(write_lock)(&mnt_ns_tree_lock);
> +                rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
> +        }
> +
> +        mnt_ns_release(ns);
> +}
> +
> +/*
> + * Returns the mount namespace which either has the specified id, or has the
> + * next smallest id after the specified one.
> + */
> +static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
> +{
> +        struct rb_node *node = mnt_ns_tree.rb_node;
> +        struct mnt_namespace *ret = NULL;
> +
> +        lockdep_assert_held(&mnt_ns_tree_lock);
> +
> +        while (node) {
> +                struct mnt_namespace *n = node_to_mnt_ns(node);
> +
> +                if (mnt_ns_id <= n->seq) {
> +                        ret = node_to_mnt_ns(node);
> +                        if (mnt_ns_id == n->seq)
> +                                break;
> +                        node = node->rb_left;
> +                } else {
> +                        node = node->rb_right;
> +                }
> +        }
> +        return ret;
> +}
> +
> +/*
> + * Lookup a mount namespace by id and take a passive reference count. Taking a
> + * passive reference means the mount namespace can be emptied if, e.g., the
> + * last task holding an active reference exits. To access the mounts of the
> + * namespace, @namespace_sem must first be acquired. If the namespace has
> + * already shut down before acquiring @namespace_sem, {list,stat}mount() will
> + * see that the mount rbtree of the namespace is empty.
> + */
> +static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
> +{
> +        struct mnt_namespace *ns;
> +
> +        guard(read_lock)(&mnt_ns_tree_lock);
> +        ns = mnt_ns_find_id_at(mnt_ns_id);
> +        if (!ns || ns->seq != mnt_ns_id)
> +                return NULL;
> +
> +        refcount_inc(&ns->passive);
> +        return ns;
> +}
> +
>  static inline void lock_mount_hash(void)
>  {
>          write_seqlock(&mount_lock);
> @@ -3736,8 +3841,7 @@ static void free_mnt_ns(struct mnt_namespace *ns)
>          if (!is_anon_ns(ns))
>                  ns_free_inum(&ns->ns);
>          dec_mnt_namespaces(ns->ucounts);
> -        put_user_ns(ns->user_ns);
> -        kfree(ns);
> +        mnt_ns_tree_remove(ns);
>  }
>
>  /*
> @@ -3776,7 +3880,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
>          if (!anon)
>                  new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
>          refcount_set(&new_ns->ns.count, 1);
> +        refcount_set(&new_ns->passive, 1);
>          new_ns->mounts = RB_ROOT;
> +        RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node);
>          init_waitqueue_head(&new_ns->poll);
>          new_ns->user_ns = get_user_ns(user_ns);
>          new_ns->ucounts = ucounts;
> @@ -3853,6 +3959,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
>                  while (p->mnt.mnt_root != q->mnt.mnt_root)
>                          p = next_mnt(skip_mnt_tree(p), old);
>          }
> +        mnt_ns_tree_add(new_ns);
>          namespace_unlock();
>
>          if (rootmnt)
> @@ -5208,6 +5315,8 @@ static void __init init_mount_tree(void)
>
>          set_fs_pwd(current->fs, &root);
>          set_fs_root(current->fs, &root);
> +
> +        mnt_ns_tree_add(ns);
>  }
>
>  void __init mnt_init(void)

-- 
Jeff Layton <jlayton@xxxxxxxxxx>
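P.S. To check my understanding of the passive reference lifecycle,
here's a minimal sketch of how I'd expect a {list,stat}mount()-style
caller in fs/namespace.c to use this. The caller itself is hypothetical;
only lookup_mnt_ns(), mnt_ns_release(), the DEFINE_FREE() hook, and
namespace_sem come from the patch and the surrounding code:

/* Hypothetical caller, for illustration only. */
static int walk_mnt_ns(u64 mnt_ns_id)
{
        /* __free() drops the passive ref when ns goes out of scope. */
        struct mnt_namespace *ns __free(mnt_ns_release) = NULL;

        ns = lookup_mnt_ns(mnt_ns_id);  /* takes a passive reference */
        if (!ns)
                return -ENOENT;

        /*
         * A passive reference doesn't pin the mounts; they're only
         * stable under namespace_sem. If the namespace was torn down
         * in the meantime, its mount rbtree is simply empty.
         */
        down_read(&namespace_sem);
        /* ... walk ns->mounts here ... */
        up_read(&namespace_sem);

        return 0;
}

Is that the intended usage?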