Re: [PATCH RFC 5/5] fs: allow to tuck mounts explicitly

Al Viro <viro@xxxxxxxxxxxxxxxxxx> · Fri, 21 Apr 2023 07:29:58 +0100

On Sat, Mar 18, 2023 at 04:52:01PM +0100, Christian Brauner wrote:

> +/**
> + * mnt_tuck_mountpoint - tuck a mount beneath another one
> + *
> + * @tucked_mnt: the mount to tuck
> + * @top_mnt:	the mount to tuck @tucked_mnt under
> + * @tucked_mp:	the new mountpoint of @top_mnt on @tucked_mnt
> + *
> + * Remove @top_mnt from its current mountpoint @top_mnt->mnt_mp and
> + * parent @top_mnt->mnt_parent and mount it on top of @tucked_mnt at
> + * @tucked_mp. And mount @tucked_mnt on the old parent and old
> + * mountpoint of @top_mnt.
> + *
> + * Note that we keep the reference count in tact when we remove @top_mnt
> + * from its old mountpoint and parent to prevent UAF issues. Once we've
> + * mounted @top_mnt on @tucked_mnt the reference count gets bumped once
> + * more. So make sure that we drop it to not leak the mount and
> + * mountpoint.
> + */
> +static void mnt_tuck_mountpoint(struct mount *tucked_mnt, struct mount *top_mnt,
> +				struct mountpoint *tucked_mp)
> +{
> +	struct mount *old_top_parent = top_mnt->mnt_parent;
> +	struct mountpoint *old_top_mp;
> +
> +	old_top_mp = unhash_mnt(top_mnt);
> +	attach_mnt(top_mnt, tucked_mnt, tucked_mp);
> +	mnt_set_mountpoint(old_top_parent, old_top_mp, tucked_mnt);
> +	put_mountpoint(old_top_mp);
> +	mnt_add_count(old_top_parent, -1);
> +}

Umm...  And if something is mounted on top of your tucked_mnt?  Right
on top of its root, I mean...  BTW, why not make it

static void mnt_tuck_mountpoint(struct mount *tucked_mnt, struct mount *top_mnt,
                                struct mountpoint *tucked_mp)
{
        struct mount *old_parent = top_mnt->mnt_parent;
        struct mountpoint *old_mp = top_mnt->mnt_mp;

        mnt_set_mountpoint(old_parent, old_mp, tucked_mnt);
        mnt_change_mountpoint(tucked_mnt, tucked_mp, top_mnt);
}

I mean, look at the existing caller of mnt_change_mountpoint() nearby -
that's precisely how that "tucking" is done right now.  And I'm rather
afraid that it's vulnerable to the same problem - if the tree we are
copying has the root of its root mount overmounted by something...

> +static void tuck_mnt(struct mount *tucked_mnt,
> +		     struct mount *top_mnt,
> +		     struct mountpoint *tucked_mp)
> +{
> +	mnt_tuck_mountpoint(tucked_mnt, top_mnt, tucked_mp);
> +	__attach_mnt(tucked_mnt, tucked_mnt->mnt_parent);
> +}

Not sure it's worth bothering with - only one caller and it might be
cleaner to expand attach_mnt() there, turning it into set or tuck,
followed by unconditional __attach...

> +typedef enum mnt_tree_flags_t {
> +	MNT_TREE_MOVE	= BIT(0),
> +	MNT_TREE_TUCK	= BIT(1),
> +} mnt_tree_flags_t;

What combinations are possible?

>  static int attach_recursive_mnt(struct mount *source_mnt,
> -			struct mount *dest_mnt,
> -			struct mountpoint *dest_mp,
> -			bool moving)
> +				struct mount *top_mnt,
> +				struct mountpoint *dest_mp,
> +				mnt_tree_flags_t flags)
>  {
>  	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
>  	HLIST_HEAD(tree_list);
> -	struct mnt_namespace *ns = dest_mnt->mnt_ns;
> +	struct mnt_namespace *ns = top_mnt->mnt_ns;
>  	struct mountpoint *smp;
> -	struct mount *child, *p;
> +	struct mount *child, *dest_mnt, *p;
>  	struct hlist_node *n;
> -	int err;
> +	int err = 0;
> +	bool moving = flags & MNT_TREE_MOVE, tuck = flags & MNT_TREE_TUCK;
>  
>  	/* Preallocate a mountpoint in case the new mounts need
>  	 * to be tucked under other mounts.
> @@ -2244,29 +2305,48 @@ static int attach_recursive_mnt(struct mount *source_mnt,
>  			goto out;
>  	}
>  
> +	if (tuck)
> +		dest_mnt = top_mnt->mnt_parent;
> +	else
> +		dest_mnt = top_mnt;
> +
>  	if (IS_MNT_SHARED(dest_mnt)) {
>  		err = invent_group_ids(source_mnt, true);
>  		if (err)
>  			goto out;
>  		err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
> -		lock_mount_hash();
> -		if (err)
> -			goto out_cleanup_ids;
> +	}
> +	lock_mount_hash();
> +	if (err)
> +		goto out_cleanup_ids;
> +
> +	/* Recheck with lock_mount_hash() held. */
> +	if (tuck && IS_MNT_LOCKED(top_mnt)) {
> +		err = -EINVAL;
> +		goto out_cleanup_ids;
> +	}
> +
> +	if (IS_MNT_SHARED(dest_mnt)) {
>  		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
>  			set_mnt_shared(p);
> -	} else {
> -		lock_mount_hash();
>  	}
> +
>  	if (moving) {
>  		unhash_mnt(source_mnt);
> -		attach_mnt(source_mnt, dest_mnt, dest_mp);
> +		if (tuck)
> +			tuck_mnt(source_mnt, top_mnt, smp);
> +		else
> +			attach_mnt(source_mnt, dest_mnt, dest_mp);
>  		touch_mnt_namespace(source_mnt->mnt_ns);
>  	} else {
>  		if (source_mnt->mnt_ns) {
>  			/* move from anon - the caller will destroy */
>  			list_del_init(&source_mnt->mnt_ns->list);
>  		}
> -		mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
> +		if (tuck)
> +			mnt_tuck_mountpoint(source_mnt, top_mnt, smp);
> +		else
> +			mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
>  		commit_tree(source_mnt);
>  	}
>  
> @@ -2306,14 +2386,35 @@ static int attach_recursive_mnt(struct mount *source_mnt,
>  	return err;
>  }
>  
> -static struct mountpoint *lock_mount(struct path *path)
> +/**
> + * lock_mount_mountpoint - lock mount and mountpoint
> + * @path: target path
> + * @tuck: whether we intend to tuck a mount beneath @path
> + *
> + * Follow the mount stack on @path until the top mount is found.
> + *
> + * If we intend to mount on top of @path->mnt acquire the inode_lock()
> + * for the top mount's ->mnt_root to protect against concurrent removal
> + * of our prospective mountpoint from another mount namespace.
> + *
> + * If we intend to tuck beneath the top mount @m acquire the
> + * inode_lock() on @m's mountpoint @mp on @m->mnt_parent. Otherwise we
> + * risk racing with someone who unlinked @mp from another mount
> + * namespace where @m doesn't have a child mount mounted @mp. We don't
> + * care if @m->mnt_root/@path->dentry is removed (as long as
> + * @path->dentry isn't equal to @m->mnt_mountpoint of course).
> + *
> + * Return: Either the target mountpoint on the top mount or the top
> + *         mount's mountpoint.
> + */
> +static struct mountpoint *lock_mount_mountpoint(struct path *path, bool tuck)
>  {
>  	struct vfsmount *mnt = path->mnt;
>  	struct dentry *dentry;
>  	struct mountpoint *mp;
>  
>  	for (;;) {
> -		dentry = path->dentry;
> +		dentry = tuck ? real_mount(mnt)->mnt_mountpoint : path->dentry;
>  		inode_lock(dentry->d_inode);

What happens if another mount --move changes ->mnt_mountpoint of that sucker
just as we fetch it and we end up with inode_lock(dentry->d_inode) of a dentry
that is not pinned by anything?  path->dentry *is* pinned, so it's safe to
work with; this one, OTOH...

> +/**
> + * can_tuck_mount - check that we can tuck a mount
> + * @from: mount to tuck under
> + * @to:   mount under which to tuck
> + *
> + * - Make sure that the mount to tuck under isn't a shared mount so we
> + *   force the kernel to allocate a new peer group id. This simplifies
> + *   the mount trees that can be created and limits propagation events
> + *   in cases where @to, and/or @to->mnt_parent are in the same peer
> + *   group. Something that's a nuisance already today.
> + * - Make sure that @to->dentry is actually the root of a mount under
> + *   which we can tuck another mount.
> + * - Make sure that nothing can be tucked under the caller's current
> + *   root or the rootfs of the namespace.
> + * - Make sure that the caller can unmount the topmost mount ensuring
> + *   that the caller could reveal the underlying mountpoint.
> + *
> + * Return: On success 0, and on error a negative error code is returned.
> + */
> +static int can_tuck_mount(struct path *from, struct path *to)
> +{
> +	struct mount *mnt_from = real_mount(from->mnt),
> +		     *mnt_to = real_mount(to->mnt);
> +
> +	if (!check_mnt(mnt_to))
> +		return -EINVAL;
> +
> +	if (!mnt_has_parent(mnt_to))
> +		return -EINVAL;
> +
> +	if (IS_MNT_SHARED(mnt_from))
> +		return -EINVAL;

What if it's not shared, but gets propagation from whatever we
are going to attach it to?

> +	if (!path_mounted(to))
> +		return -EINVAL;
> +
> +	if (mnt_from == mnt_to)
> +		return -EINVAL;
> +
> +	/*
> +	 * Tucking a mount beneath the rootfs only makes sense when the
> +	 * tuck semantics of pivot_root(".", ".") are used.
> +	 */
> +	if (&mnt_to->mnt == current->fs->root.mnt)
> +		return -EINVAL;
> +	if (mnt_to->mnt_parent == current->nsproxy->mnt_ns->root)
> +		return -EINVAL;
> +
> +	for (struct mount *p = mnt_from; mnt_has_parent(p); p = p->mnt_parent)
> +		if (p == mnt_to)
> +			return -EINVAL;

Umm...  Can we have !mnt_has_parent(mnt_from), BTW?

> +	/* Ensure the caller could reveal the underlying mount. */
> +	return can_umount(to, 0);

That's really odd.  It duplicates quite a bit of the tests above and
the whole thing is rather hard to follow, especially since you've got
tests done after the can_tuck_mount() call in do_move_mount()...

> +}
> +
> +static int do_move_mount(struct path *old_path, struct path *new_path,
> +			 bool tuck)
>  {
>  	struct mnt_namespace *ns;
>  	struct mount *p;
> @@ -2858,8 +3021,9 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
>  	struct mountpoint *mp, *old_mp;
>  	int err;
>  	bool attached;
> +	mnt_tree_flags_t flags = 0;
>  
> -	mp = lock_mount(new_path);
> +	mp = lock_mount_mountpoint(new_path, tuck);
>  	if (IS_ERR(mp))
>  		return PTR_ERR(mp);
>  
> @@ -2867,9 +3031,20 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
>  	p = real_mount(new_path->mnt);
>  	parent = old->mnt_parent;
>  	attached = mnt_has_parent(old);
> +	if (attached)
> +		flags |= MNT_TREE_MOVE;
>  	old_mp = old->mnt_mp;
>  	ns = old->mnt_ns;
>  
> +	if (tuck) {
> +		err = can_tuck_mount(old_path, new_path);
> +		if (err)
> +			goto out;
> +
> +		p = p->mnt_parent;
> +		flags |= MNT_TREE_TUCK;
> +	}
> +
>  	err = -EINVAL;
>  	/* The mountpoint must be in our namespace. */
>  	if (!check_mnt(p))

Why not check it first?  That'd kill the corresponding
check_mnt(to) in can_tuck_mount() (check_mnt() on parent is
the same as on child).  Confused...

> @@ -2910,8 +3085,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
>  		if (p == old)
>  			goto out;
>  
> -	err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp,
> -				   attached);
> +	err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, flags);
>  	if (err)
>  		goto out;
>  
> @@ -2943,7 +3117,7 @@ static int do_move_mount_old(struct path *path, const char *old_name)
>  	if (err)
>  		return err;
>  
> -	err = do_move_mount(&old_path, path);
> +	err = do_move_mount(&old_path, path, false);
>  	path_put(&old_path);
>  	return err;
>  }
> @@ -3807,6 +3981,10 @@ SYSCALL_DEFINE5(move_mount,
>  	if (flags & ~MOVE_MOUNT__MASK)
>  		return -EINVAL;
>  
> +	if ((flags & (MOVE_MOUNT_TUCK | MOVE_MOUNT_SET_GROUP)) ==
> +	    (MOVE_MOUNT_TUCK | MOVE_MOUNT_SET_GROUP))
> +		return -EINVAL;
> +
>  	/* If someone gives a pathname, they aren't permitted to move
>  	 * from an fd that requires unmount as we can't get at the flag
>  	 * to clear it afterwards.
> @@ -3836,7 +4014,8 @@ SYSCALL_DEFINE5(move_mount,
>  	if (flags & MOVE_MOUNT_SET_GROUP)
>  		ret = do_set_group(&from_path, &to_path);
>  	else
> -		ret = do_move_mount(&from_path, &to_path);
> +		ret = do_move_mount(&from_path, &to_path,
> +				    (flags & MOVE_MOUNT_TUCK));

OK, so you want to be able to use that on an anonymous tree, then?