Re: [RFC PATCH 1/2] fs: allow detached mounts in clone_private_mount()

Mike Baynton <mike@xxxxxxxxxxxx> · Thu, 23 Jan 2025 23:42:51 -0600

On 1/23/25 13:19, Christian Brauner wrote:
> In container workloads idmapped mounts are often used as layers for
> overlayfs. Recently I added the ability to specify layers in overlayfs
> as file descriptors instead of path names. It should be possible to
> simply use the detached mounts directly when specifying layers instead
> of having to attach them beforehand. They are discarded after overlayfs
> is mounted anyway so it's pointless system calls for userspace and
> pointless locking for the kernel.
> 
> This just recently come up again in [1]. So enable clone_private_mount()
> to use detached mounts directly. Following conditions must be met:
> 
> - Provided path must be the root of a detached mount tree.
> - Provided path may not create mount namespace loops.
> - Provided path must be mounted.
> 
> It would be possible to be stricter and require that the caller must
> have CAP_SYS_ADMIN in the owning user namespace of the anonymous mount
> namespace but since this restriction isn't enforced for move_mount()
> there's no point in enforcing it for clone_private_mount().
> 
> Link: https://lore.kernel.org/r/fd8f6574-f737-4743-b220-79c815ee1554@xxxxxxxxxxxx [1]
> Signed-off-by: Christian Brauner <brauner@xxxxxxxxxx>
> ---
>  fs/namespace.c | 78 ++++++++++++++++++++++++++++----------------------
>  1 file changed, 43 insertions(+), 35 deletions(-)
> 
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 4013fbac354a..3985a695d373 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -2287,6 +2287,28 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry)
>  	return false;
>  }
>  
> +/*
> + * Check that there aren't references to earlier/same mount namespaces in the
> + * specified subtree.  Such references can act as pins for mount namespaces
> + * that aren't checked by the mount-cycle checking code, thereby allowing
> + * cycles to be made.
> + */
> +static bool check_for_nsfs_mounts(struct mount *subtree)
> +{
> +	struct mount *p;
> +	bool ret = false;
> +
> +	lock_mount_hash();
> +	for (p = subtree; p; p = next_mnt(p, subtree))
> +		if (mnt_ns_loop(p->mnt.mnt_root))
> +			goto out;
> +
> +	ret = true;
> +out:
> +	unlock_mount_hash();
> +	return ret;
> +}
> +
>  /**
>   * clone_private_mount - create a private clone of a path
>   * @path: path to clone
> @@ -2295,6 +2317,8 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry)
>   * will not be attached anywhere in the namespace and will be private (i.e.
>   * changes to the originating mount won't be propagated into this).
>   *
> + * This assumes caller has called or done the equivalent of may_mount().
> + *
>   * Release with mntput().
>   */
>  struct vfsmount *clone_private_mount(const struct path *path)
> @@ -2302,30 +2326,36 @@ struct vfsmount *clone_private_mount(const struct path *path)
>  	struct mount *old_mnt = real_mount(path->mnt);
>  	struct mount *new_mnt;
>  
> -	down_read(&namespace_sem);
> +	scoped_guard(rwsem_read, &namespace_sem)
>  	if (IS_MNT_UNBINDABLE(old_mnt))
> -		goto invalid;
> +		return ERR_PTR(-EINVAL);
> +
> +	if (mnt_has_parent(old_mnt)) {
> +		if (!check_mnt(old_mnt))
> +			return ERR_PTR(-EINVAL);
> +	} else {
> +		/* Make sure this isn't something purely kernel internal. */
> +		if (!is_anon_ns(old_mnt->mnt_ns))
> +			return ERR_PTR(-EINVAL);
>  
> -	if (!check_mnt(old_mnt))
> -		goto invalid;
> +		/* Make sure we don't create mount namespace loops. */
> +		if (!check_for_nsfs_mounts(old_mnt))
> +			return ERR_PTR(-EINVAL);
> +
> +		if (!path_mounted(path))
> +			return ERR_PTR(-EINVAL);
> +	}
>  
>  	if (has_locked_children(old_mnt, path->dentry))
> -		goto invalid;
> +		return ERR_PTR(-EINVAL);
>  
>  	new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
> -	up_read(&namespace_sem);
> -
>  	if (IS_ERR(new_mnt))
> -		return ERR_CAST(new_mnt);
> +		return ERR_PTR(-EINVAL);
>  
>  	/* Longterm mount to be removed by kern_unmount*() */
>  	new_mnt->mnt_ns = MNT_NS_INTERNAL;
> -
>  	return &new_mnt->mnt;
> -
> -invalid:
> -	up_read(&namespace_sem);
> -	return ERR_PTR(-EINVAL);
>  }
>  EXPORT_SYMBOL_GPL(clone_private_mount);
>  
> @@ -3123,28 +3153,6 @@ static inline int tree_contains_unbindable(struct mount *mnt)
>  	return 0;
>  }
>  
> -/*
> - * Check that there aren't references to earlier/same mount namespaces in the
> - * specified subtree.  Such references can act as pins for mount namespaces
> - * that aren't checked by the mount-cycle checking code, thereby allowing
> - * cycles to be made.
> - */
> -static bool check_for_nsfs_mounts(struct mount *subtree)
> -{
> -	struct mount *p;
> -	bool ret = false;
> -
> -	lock_mount_hash();
> -	for (p = subtree; p; p = next_mnt(p, subtree))
> -		if (mnt_ns_loop(p->mnt.mnt_root))
> -			goto out;
> -
> -	ret = true;
> -out:
> -	unlock_mount_hash();
> -	return ret;
> -}
> -
>  static int do_set_group(struct path *from_path, struct path *to_path)
>  {
>  	struct mount *from, *to;

Confirmed this works for the use case I'm interested in of directly
passing idmapped mounts to lower layers.

Tested-by: Mike Baynton <mike@xxxxxxxxxxxx>