On Fri, 2024-12-13 at 19:44 +0100, Christian Brauner wrote:
> On Fri, Dec 13, 2024 at 09:11:41AM -0500, Jeff Layton wrote:
> > On Fri, 2024-12-13 at 00:03 +0100, Christian Brauner wrote:
> > > Currently we use a read-write lock but for the simple search case we can
> > > make this lockless. Creating a new mount namespace is a rather rare
> > > event compared with querying mounts in a foreign mount namespace. Once
> > > this is picked up by e.g., systemd to list mounts in another mount
> > > namespace in its isolated services or in containers, this will be used
> > > a lot, so this seems worthwhile doing.
> > >
> > > Signed-off-by: Christian Brauner <brauner@xxxxxxxxxx>
> > > ---
> > >  fs/mount.h     |   5 ++-
> > >  fs/namespace.c | 119 +++++++++++++++++++++++++++++++++++----------------------
> > >  2 files changed, 77 insertions(+), 47 deletions(-)
> > >
> > > diff --git a/fs/mount.h b/fs/mount.h
> > > index 185fc56afc13338f8185fe818051444d540cbd5b..36ead0e45e8aa7614c00001102563a711d9dae6e 100644
> > > --- a/fs/mount.h
> > > +++ b/fs/mount.h
> > > @@ -12,7 +12,10 @@ struct mnt_namespace {
> > >          struct user_namespace *user_ns;
> > >          struct ucounts *ucounts;
> > >          u64 seq; /* Sequence number to prevent loops */
> > > -        wait_queue_head_t poll;
> > > +        union {
> > > +                wait_queue_head_t poll;
> > > +                struct rcu_head mnt_ns_rcu;
> > > +        };
> > >          u64 event;
> > >          unsigned int nr_mounts; /* # of mounts in the namespace */
> > >          unsigned int pending_mounts;
> > > diff --git a/fs/namespace.c b/fs/namespace.c
> > > index 10fa18dd66018fadfdc9d18c59a851eed7bd55ad..52adee787eb1b6ee8831705b2b121854c3370fb3 100644
> > > --- a/fs/namespace.c
> > > +++ b/fs/namespace.c
> > > @@ -79,6 +79,8 @@ static DECLARE_RWSEM(namespace_sem);
> > >  static HLIST_HEAD(unmounted); /* protected by namespace_sem */
> > >  static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
> > >  static DEFINE_RWLOCK(mnt_ns_tree_lock);
> > > +static seqcount_rwlock_t mnt_ns_tree_seqcount = SEQCNT_RWLOCK_ZERO(mnt_ns_tree_seqcount, &mnt_ns_tree_lock);
> > > +
> > >  static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
> > >
> > >  struct mount_kattr {
> > > @@ -105,17 +107,6 @@ EXPORT_SYMBOL_GPL(fs_kobj);
> > >   */
> > >  __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
> > >
> > > -static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
> > > -{
> > > -        u64 seq_b = ns->seq;
> > > -
> > > -        if (seq < seq_b)
> > > -                return -1;
> > > -        if (seq > seq_b)
> > > -                return 1;
> > > -        return 0;
> > > -}
> > > -
> > >  static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
> > >  {
> > >          if (!node)
> > > @@ -123,19 +114,41 @@ static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
> > >          return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
> > >  }
> > >
> > > -static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
> > > +static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b)
> > >  {
> > >          struct mnt_namespace *ns_a = node_to_mnt_ns(a);
> > >          struct mnt_namespace *ns_b = node_to_mnt_ns(b);
> > >          u64 seq_a = ns_a->seq;
> > > +        u64 seq_b = ns_b->seq;
> > > +
> > > +        if (seq_a < seq_b)
> > > +                return -1;
> > > +        if (seq_a > seq_b)
> > > +                return 1;
> > > +        return 0;
> > > +}
> > >
> > > -        return mnt_ns_cmp(seq_a, ns_b) < 0;
> > > +static inline void mnt_ns_tree_write_lock(void)
> > > +{
> > > +        write_lock(&mnt_ns_tree_lock);
> > > +        write_seqcount_begin(&mnt_ns_tree_seqcount);
> > > +}
> > > +
> > > +static inline void mnt_ns_tree_write_unlock(void)
> > > +{
> > > +        write_seqcount_end(&mnt_ns_tree_seqcount);
> > > +        write_unlock(&mnt_ns_tree_lock);
> > >  }
> > >
> > >  static void mnt_ns_tree_add(struct mnt_namespace *ns)
> > >  {
> > > -        guard(write_lock)(&mnt_ns_tree_lock);
> > > -        rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
> > > +        struct rb_node *node;
> > > +
> > > +        mnt_ns_tree_write_lock();
> > > +        node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
> > > +        mnt_ns_tree_write_unlock();
> > > +
> > > +        WARN_ON_ONCE(node);
> > >  }
> > >
> > >  static void mnt_ns_release(struct mnt_namespace *ns)
> > > @@ -150,41 +163,36 @@ static void mnt_ns_release(struct mnt_namespace *ns)
> > >  }
> > >  DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
> > >
> > > +static void mnt_ns_release_rcu(struct rcu_head *rcu)
> > > +{
> > > +        struct mnt_namespace *mnt_ns;
> > > +
> > > +        mnt_ns = container_of(rcu, struct mnt_namespace, mnt_ns_rcu);
> > > +        mnt_ns_release(mnt_ns);
> > > +}
> > > +
> > >  static void mnt_ns_tree_remove(struct mnt_namespace *ns)
> > >  {
> > >          /* remove from global mount namespace list */
> > >          if (!is_anon_ns(ns)) {
> > > -                guard(write_lock)(&mnt_ns_tree_lock);
> > > +                mnt_ns_tree_write_lock();
> > >                  rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
> > > +                mnt_ns_tree_write_unlock();
> > >          }
> > >
> > > -        mnt_ns_release(ns);
> > > +        call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
> > >  }
> > >
> > > -/*
> > > - * Returns the mount namespace which either has the specified id, or has the
> > > - * next smallest id afer the specified one.
> > > - */
> > > -static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
> > > +static int mnt_ns_find(const void *key, const struct rb_node *node)
> > >  {
> > > -        struct rb_node *node = mnt_ns_tree.rb_node;
> > > -        struct mnt_namespace *ret = NULL;
> > > -
> > > -        lockdep_assert_held(&mnt_ns_tree_lock);
> > > -
> > > -        while (node) {
> > > -                struct mnt_namespace *n = node_to_mnt_ns(node);
> > > +        const u64 mnt_ns_id = *(u64 *)key;
> > > +        const struct mnt_namespace *ns = node_to_mnt_ns(node);
> > >
> > > -                if (mnt_ns_id <= n->seq) {
> > > -                        ret = node_to_mnt_ns(node);
> > > -                        if (mnt_ns_id == n->seq)
> > > -                                break;
> > > -                        node = node->rb_left;
> > > -                } else {
> > > -                        node = node->rb_right;
> > > -                }
> > > -        }
> > > -        return ret;
> > > +        if (mnt_ns_id < ns->seq)
> > > +                return -1;
> > > +        if (mnt_ns_id > ns->seq)
> > > +                return 1;
> > > +        return 0;
> > >  }
> > >
> > >  /*
> > > @@ -194,18 +202,37 @@ static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
> > >   * namespace the @namespace_sem must first be acquired. If the namespace has
> > >   * already shut down before acquiring @namespace_sem, {list,stat}mount() will
> > >   * see that the mount rbtree of the namespace is empty.
> > > + *
> > > + * Note the lookup is lockless protected by a sequence counter. We only
> > > + * need to guard against false negatives as false positives aren't
> > > + * possible. So if we didn't find a mount namespace and the sequence
> > > + * counter has changed we need to retry. If the sequence counter is
> > > + * still the same we know the search actually failed.
> > > */ > > > static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id) > > > { > > > - struct mnt_namespace *ns; > > > + struct mnt_namespace *ns; > > > + struct rb_node *node; > > > + unsigned int seq; > > > + > > > + guard(rcu)(); > > > + do { > > > + seq = read_seqcount_begin(&mnt_ns_tree_seqcount); > > > + node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find); > > > + if (node) > > > + break; > > > + } while (read_seqcount_retry(&mnt_ns_tree_seqcount, seq)); > > > > > > - guard(read_lock)(&mnt_ns_tree_lock); > > > - ns = mnt_ns_find_id_at(mnt_ns_id); > > > - if (!ns || ns->seq != mnt_ns_id) > > > - return NULL; > > > + if (!node) > > > + return NULL; > > > > > > - refcount_inc(&ns->passive); > > > - return ns; > > > + /* > > > + * The last reference count is put with RCU delay so we can > > > + * unconditonally acquire a reference here. > > > + */ > > > + ns = node_to_mnt_ns(node); > > > + refcount_inc(&ns->passive); > > > > I'm a little uneasy with the unconditional refcount_inc() here. It > > seems quite possible that this could to a 0->1 transition here. You may > > be right that that technically won't cause a problem with the rcu lock > > held, but at the very least, that will cause a refcount warning to pop. > > > > Maybe this should be a refcount_inc_not_zero() and then you just return > > NULL if the increment doesn't occur? > > So this shouldn't be possible (and Paul is on the thread and can tell me > if I'm wrong) because: > > call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu); > -> mnt_ns_release_rcu() > -> mnt_ns_release() > -> refcount_dec_and_test() > > Which means that decrements are RCU delayed. Any reader must walk the > list holding the RCU lock. If they find the mount namespace still on the > list then mnt_ns_release() will be deferred until they are done. > > > In order for what you describe to happen a reader must find that mount > namespace still in the list or rbtree and mnt_ns_release() is called > directly. > > But afaict that doesn't happen. mnt_ns_release() is only called directly > when the mount namespace has never been on any of the lists. > > refcount_inc() will already WARN() if the previous value was 0 and > splat. If we use refcount_inc_not_zero() we're adding additional memory > barriers when we simply don't need them. Ahh ok, I missed the detail that the refcount decrement was being done in the RCU callback. With that, I guess that can't ever happen. Sorry for the noise! > But if that's important to you though than I'd rather switch the passive > count to atomic_t and use atomic_dec_and_test() in mnt_ns_release() and > then use similar logic as I used for file_ref_inc(): > > unsigned long prior = atomic_fetch_inc_relaxed(&ns->passive); > WARN_ONCE(!prior, "increment from zero UAF condition present"); > > This would give us the opportunity to catch a "shouldn't happen > condition". But using refcount_inc_not_zero() would e.g., confuse me a > few months down the line (Yes, we could probably add a comment.). > I don't feel strongly about it, but a comment explaining why you can unconditionally bump it there would be helpful. > > > > > + return ns; > > > } > > > > > > static inline void lock_mount_hash(void) > > > -- Jeff Layton <jlayton@xxxxxxxxxx>