Hey Peter,

I had a question for you and meant to Cc you but forgot. This makes one
of our rbtree uses lockless using the seqlock pattern. See below.

I saw that in 50a38035ed5c ("rbtree: provide rb_find_rcu() /
rb_find_add_rcu()") you added new _rcu() variants. We're using another
search function that allows us to walk the tree in either direction:

        guard(read_lock)(&mnt_ns_tree_lock);
        for (;;) {
                struct rb_node *node;

                if (previous)
                        node = rb_prev(&mntns->mnt_ns_tree_node);
                else
                        node = rb_next(&mntns->mnt_ns_tree_node);
                if (!node)
                        return ERR_PTR(-ENOENT);

                mntns = node_to_mnt_ns(node);
                node = &mntns->mnt_ns_tree_node;

But afaict neither rb_prev() nor rb_next() is RCU-safe. Have you ever
considered adding RCU-safe variants for those two as well? A rough
sketch of how I'd want to use them is below the quoted patch.

Thanks!
Christian

On Tue, Dec 10, 2024 at 09:57:59PM +0100, Christian Brauner wrote:
> Currently we use a read-write lock but for the simple search case we can
> make this lockless. Creating a new mount namespace is a rather rare
> event compared with querying mounts in a foreign mount namespace. Once
> this is picked up by e.g., systemd to list mounts in another mount
> namespace in its isolated services or in containers this will be used a
> lot so this seems worthwhile doing.
>
> Signed-off-by: Christian Brauner <brauner@xxxxxxxxxx>
> ---
>  fs/mount.h     |  5 ++-
>  fs/namespace.c | 99 ++++++++++++++++++++++++++++++++++++++++------------------
>  2 files changed, 73 insertions(+), 31 deletions(-)
>
> diff --git a/fs/mount.h b/fs/mount.h
> index 185fc56afc13338f8185fe818051444d540cbd5b..3c3763d8ae821d6a117c528808dbc94d0251f964 100644
> --- a/fs/mount.h
> +++ b/fs/mount.h
> @@ -16,7 +16,10 @@ struct mnt_namespace {
>          u64 event;
>          unsigned int nr_mounts; /* # of mounts in the namespace */
>          unsigned int pending_mounts;
> -        struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
> +        union {
> +                struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
> +                struct rcu_head mnt_ns_rcu;
> +        };
>          refcount_t passive; /* number references not pinning @mounts */
>  } __randomize_layout;
>
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 10fa18dd66018fadfdc9d18c59a851eed7bd55ad..21e990482c5b2e1844d17413b55b58803fa7b008 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -79,6 +79,8 @@ static DECLARE_RWSEM(namespace_sem);
>  static HLIST_HEAD(unmounted); /* protected by namespace_sem */
>  static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
>  static DEFINE_RWLOCK(mnt_ns_tree_lock);
> +static seqcount_rwlock_t mnt_ns_tree_seqcount = SEQCNT_RWLOCK_ZERO(mnt_ns_tree_seqcount, &mnt_ns_tree_lock);
> +
>  static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
>
>  struct mount_kattr {
> @@ -105,17 +107,6 @@ EXPORT_SYMBOL_GPL(fs_kobj);
>   */
>  __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
>
> -static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
> -{
> -        u64 seq_b = ns->seq;
> -
> -        if (seq < seq_b)
> -                return -1;
> -        if (seq > seq_b)
> -                return 1;
> -        return 0;
> -}
> -
>  static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
>  {
>          if (!node)
> @@ -123,19 +114,41 @@ static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
>          return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
>  }
>
> -static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
> +static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b)
>  {
>          struct mnt_namespace *ns_a = node_to_mnt_ns(a);
>          struct mnt_namespace *ns_b = node_to_mnt_ns(b);
>          u64 seq_a = ns_a->seq;
> +        u64 seq_b = ns_b->seq;
>
> -        return mnt_ns_cmp(seq_a, ns_b) < 0;
> +        if (seq_a < seq_b)
> +                return -1;
> +        if (seq_a > seq_b)
> +                return 1;
> +        return 0;
> +}
> +
> +static inline void mnt_ns_tree_write_lock(void)
> +{
> +        write_lock(&mnt_ns_tree_lock);
> +        write_seqcount_begin(&mnt_ns_tree_seqcount);
> +}
> +
> +static inline void mnt_ns_tree_write_unlock(void)
> +{
> +        write_seqcount_end(&mnt_ns_tree_seqcount);
> +        write_unlock(&mnt_ns_tree_lock);
>  }
>
>  static void mnt_ns_tree_add(struct mnt_namespace *ns)
>  {
> -        guard(write_lock)(&mnt_ns_tree_lock);
> -        rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
> +        struct rb_node *node;
> +
> +        mnt_ns_tree_write_lock();
> +        node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
> +        mnt_ns_tree_write_unlock();
> +
> +        WARN_ON_ONCE(node);
>  }
>
>  static void mnt_ns_release(struct mnt_namespace *ns)
> @@ -150,15 +163,24 @@ static void mnt_ns_release(struct mnt_namespace *ns)
>  }
>  DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
>
> +static void mnt_ns_release_rcu(struct rcu_head *rcu)
> +{
> +        struct mnt_namespace *mnt_ns;
> +
> +        mnt_ns = container_of(rcu, struct mnt_namespace, mnt_ns_rcu);
> +        mnt_ns_release(mnt_ns);
> +}
> +
>  static void mnt_ns_tree_remove(struct mnt_namespace *ns)
>  {
>          /* remove from global mount namespace list */
>          if (!is_anon_ns(ns)) {
> -                guard(write_lock)(&mnt_ns_tree_lock);
> +                mnt_ns_tree_write_lock();
>                  rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
> +                mnt_ns_tree_write_unlock();
>          }
>
> -        mnt_ns_release(ns);
> +        call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
>  }
>
>  /*
> @@ -168,23 +190,23 @@ static void mnt_ns_tree_remove(struct mnt_namespace *ns)
>  static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
>  {
>          struct rb_node *node = mnt_ns_tree.rb_node;
> -        struct mnt_namespace *ret = NULL;
> +        struct mnt_namespace *mnt_ns = NULL;
>
> -        lockdep_assert_held(&mnt_ns_tree_lock);
> +        lockdep_assert(rcu_read_lock_held());
>
>          while (node) {
>                  struct mnt_namespace *n = node_to_mnt_ns(node);
>
>                  if (mnt_ns_id <= n->seq) {
> -                        ret = node_to_mnt_ns(node);
> +                        mnt_ns = node_to_mnt_ns(node);
>                          if (mnt_ns_id == n->seq)
>                                  break;
> -                        node = node->rb_left;
> +                        node = rcu_dereference_raw(node->rb_left);
>                  } else {
> -                        node = node->rb_right;
> +                        node = rcu_dereference_raw(node->rb_right);
>                  }
>          }
> -        return ret;
> +        return mnt_ns;
>  }
>
>  /*
> @@ -194,18 +216,35 @@ static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
>   * namespace the @namespace_sem must first be acquired. If the namespace has
>   * already shut down before acquiring @namespace_sem, {list,stat}mount() will
>   * see that the mount rbtree of the namespace is empty.
> + *
> + * Note the lookup is lockless protected by a sequence counter. We only
> + * need to guard against false negatives as false positives aren't
> + * possible. So if we didn't find a mount namespace and the sequence
> + * counter has changed we need to retry. If the sequence counter is
> + * still the same we know the search actually failed.
>   */
>  static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
>  {
> -        struct mnt_namespace *ns;
> +        struct mnt_namespace *ns;
> +        unsigned int seq;
>
> -        guard(read_lock)(&mnt_ns_tree_lock);
> -        ns = mnt_ns_find_id_at(mnt_ns_id);
> -        if (!ns || ns->seq != mnt_ns_id)
> -                return NULL;
> +        guard(rcu)();
> +        do {
> +                seq = read_seqcount_begin(&mnt_ns_tree_seqcount);
> +                ns = mnt_ns_find_id_at(mnt_ns_id);
> +                if (ns)
> +                        break;
> +        } while (read_seqcount_retry(&mnt_ns_tree_seqcount, seq));
>
> -        refcount_inc(&ns->passive);
> -        return ns;
> +        if (!ns || ns->seq != mnt_ns_id)
> +                return NULL;
> +
> +        /*
> +         * The last reference count is put after an RCU delay so we
> +         * don't need to use refcount_inc_not_zero().
> +         */
> +        refcount_inc(&ns->passive);
> +        return ns;
>  }
>
>  static inline void lock_mount_hash(void)
>
> --
> 2.45.2
>
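
To make the question concrete, here's roughly how I'd want to use such
helpers for the directional walk, reusing the rcu + seqcount retry from
lookup_mnt_ns() in the patch above. To be clear: rb_prev_rcu() and
rb_next_rcu() don't exist, they're placeholders for what I'm asking
about, and the function name is made up; everything else is taken from
the patch. The real walk also does more than a single step, so this is
simplified:

/*
 * Sketch only: rb_prev_rcu()/rb_next_rcu() are hypothetical RCU-safe
 * counterparts to rb_prev()/rb_next(); the rest mirrors lookup_mnt_ns().
 */
static struct mnt_namespace *lookup_adjacent_mnt_ns(struct mnt_namespace *mntns,
                                                    bool previous)
{
        struct rb_node *node;
        unsigned int seq;

        guard(rcu)();
        do {
                seq = read_seqcount_begin(&mnt_ns_tree_seqcount);
                if (previous)
                        node = rb_prev_rcu(&mntns->mnt_ns_tree_node);
                else
                        node = rb_next_rcu(&mntns->mnt_ns_tree_node);
                if (node)
                        break;
                /* Like lookup_mnt_ns(): only false negatives need the retry. */
        } while (read_seqcount_retry(&mnt_ns_tree_seqcount, seq));

        if (!node)
                return ERR_PTR(-ENOENT);
        return node_to_mnt_ns(node);
}

The part I'm unsure about is the walk itself: rb_prev()/rb_next() go
through parent pointers, so I don't know whether that can be made safe
against concurrent rebalancing or whether the whole walk would have to
be validated against the sequence counter instead.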