Re: [PATCH v2 10/10] fs: support mapped mounts of mapped filesystems

Seth Forshee <sforshee@xxxxxxxxxxxxxxxx> · Thu, 2 Dec 2021 11:50:43 -0600

On Tue, Nov 30, 2021 at 01:10:32PM +0100, Christian Brauner wrote:
> From: Christian Brauner <christian.brauner@xxxxxxxxxx>
> 
> In previous patches we added new and modified existing helpers to handle
> idmapped mounts of filesystems mounted with an idmapping. In this final
> patch we convert all relevant places in the vfs to actually pass the
> filesystem's idmapping into these helpers.
> 
> With this the vfs is in shape to handle idmapped mounts of filesystems
> mounted with an idmapping. Note that this is just the generic
> infrastructure. Actually adding support for idmapped mounts to a
> filesystem mountable with an idmapping is follow-up work.
> 
> In this patch we extend the definition of an idmapped mount from a mount
> that that has the initial idmapping attached to it to a mount that has
> an idmapping attached to it which is not the same as the idmapping the
> filesystem was mounted with.
> 
> As before we do not allow the initial idmapping to be attached to a
> mount. In addition this patch prevents that the idmapping the filesystem
> was mounted with can be attached to a mount created based on this
> filesystem.
> 
> This has multiple reasons and advantages. First, attaching the initial
> idmapping or the filesystem's idmapping doesn't make much sense as in
> both cases the values of the i_{g,u}id and other places where k{g,u}ids
> are used do not change. Second, a user that really wants to do this for
> whatever reason can just create a separate dedicated identical idmapping
> to attach to the mount. Third, we can continue to use the initial
> idmapping as an indicator that a mount is not idmapped allowing us to
> continue to keep passing the initial idmapping into the mapping helpers
> to tell them that something isn't an idmapped mount even if the
> filesystem is mounted with an idmapping.
> 
> Link: https://lore.kernel.org/r/20211123114227.3124056-11-brauner@xxxxxxxxxx (v1)
> Cc: Seth Forshee <sforshee@xxxxxxxxxxxxxxxx>
> Cc: Amir Goldstein <amir73il@xxxxxxxxx>
> Cc: Christoph Hellwig <hch@xxxxxx>
> Cc: Al Viro <viro@xxxxxxxxxxxxxxxxxx>
> CC: linux-fsdevel@xxxxxxxxxxxxxxx
> Signed-off-by: Christian Brauner <christian.brauner@xxxxxxxxxx>
> ---
> /* v2 */
> - Amir Goldstein <amir73il@xxxxxxxxx>:
>   - Continue passing down the initial idmapping to xfs. Since xfs cannot
>     and is not planned to support idmapped superblocks don't mislead
>     readers in the code by passing down the filesystem idmapping. It
>     will be the initial idmapping always anyway.
>   - Include mnt_idmapping.h header into fs/namespace.c
> ---
>  fs/namespace.c       | 37 +++++++++++++++++++++++++------------
>  fs/open.c            |  7 ++++---
>  fs/posix_acl.c       |  8 ++++----
>  include/linux/fs.h   | 17 +++++++++--------
>  security/commoncap.c |  9 ++++-----
>  5 files changed, 46 insertions(+), 32 deletions(-)
> 
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 4994b816a74c..ccefede4ba1b 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -31,6 +31,7 @@
>  #include <uapi/linux/mount.h>
>  #include <linux/fs_context.h>
>  #include <linux/shmem_fs.h>
> +#include <linux/mnt_idmapping.h>
>  
>  #include "pnode.h"
>  #include "internal.h"
> @@ -561,7 +562,7 @@ static void free_vfsmnt(struct mount *mnt)
>  	struct user_namespace *mnt_userns;
>  
>  	mnt_userns = mnt_user_ns(&mnt->mnt);
> -	if (mnt_userns != &init_user_ns)
> +	if (!initial_mapping(mnt_userns))
>  		put_user_ns(mnt_userns);
>  	kfree_const(mnt->mnt_devname);
>  #ifdef CONFIG_SMP
> @@ -965,6 +966,7 @@ static struct mount *skip_mnt_tree(struct mount *p)
>  struct vfsmount *vfs_create_mount(struct fs_context *fc)
>  {
>  	struct mount *mnt;
> +	struct user_namespace *fs_userns;
>  
>  	if (!fc->root)
>  		return ERR_PTR(-EINVAL);
> @@ -982,6 +984,10 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc)
>  	mnt->mnt_mountpoint	= mnt->mnt.mnt_root;
>  	mnt->mnt_parent		= mnt;
>  
> +	fs_userns = mnt->mnt.mnt_sb->s_user_ns;
> +	if (!initial_mapping(fs_userns))
> +		mnt->mnt.mnt_userns = get_user_ns(fs_userns);
> +

Won't this get be leaked if mnt_userns is overwritten by
do_idmap_mount()?

>  	lock_mount_hash();
>  	list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
>  	unlock_mount_hash();
> @@ -1072,7 +1078,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
>  
>  	atomic_inc(&sb->s_active);
>  	mnt->mnt.mnt_userns = mnt_user_ns(&old->mnt);
> -	if (mnt->mnt.mnt_userns != &init_user_ns)
> +	if (!initial_mapping(mnt->mnt.mnt_userns))
>  		mnt->mnt.mnt_userns = get_user_ns(mnt->mnt.mnt_userns);
>  	mnt->mnt.mnt_sb = sb;
>  	mnt->mnt.mnt_root = dget(root);
> @@ -3927,10 +3933,18 @@ static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt)
>  static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
>  {
>  	struct vfsmount *m = &mnt->mnt;
> +	struct user_namespace *fs_userns = m->mnt_sb->s_user_ns;
>  
>  	if (!kattr->mnt_userns)
>  		return 0;
>  
> +	/*
> +	 * Creating an idmapped mount with the filesystem wide idmapping
> +	 * doesn't make sense so block that. We don't allow mushy semantics.
> +	 */
> +	if (kattr->mnt_userns == fs_userns)
> +		return -EINVAL;
> +
>  	/*
>  	 * Once a mount has been idmapped we don't allow it to change its
>  	 * mapping. It makes things simpler and callers can just create
> @@ -3943,12 +3957,8 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
>  	if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP))
>  		return -EINVAL;
>  
> -	/* Don't yet support filesystem mountable in user namespaces. */
> -	if (m->mnt_sb->s_user_ns != &init_user_ns)
> -		return -EINVAL;
> -
>  	/* We're not controlling the superblock. */
> -	if (!capable(CAP_SYS_ADMIN))
> +	if (!ns_capable(fs_userns, CAP_SYS_ADMIN))
>  		return -EPERM;
>  
>  	/* Mount has already been visible in the filesystem hierarchy. */
> @@ -4133,13 +4143,16 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
>  	}
>  
>  	/*
> -	 * The init_user_ns is used to indicate that a vfsmount is not idmapped.
> -	 * This is simpler than just having to treat NULL as unmapped. Users
> -	 * wanting to idmap a mount to init_user_ns can just use a namespace
> -	 * with an identity mapping.
> +	 * The initial idmapping cannot be used to create an idmapped
> +	 * mount. Attaching the initial idmapping doesn't make much sense
> +	 * as it is an identity mapping. A user can just create a dedicated
> +	 * identity mapping to achieve the same result. We also use the
> +	 * initial idmapping as an indicator of a mount that is not
> +	 * idmapped. It can simply be passed into helpers that are aware of
> +	 * idmapped mounts as a convenient shortcut.
>  	 */

This isn't completely accurate, is it? If sb->s_user_ns != init_user_ns,
an idmap to init_user_ns isn't an idendity mapping.

>  	mnt_userns = container_of(ns, struct user_namespace, ns);
> -	if (mnt_userns == &init_user_ns) {
> +	if (initial_mapping(mnt_userns)) {
>  		err = -EPERM;
>  		goto out_fput;
>  	}
> diff --git a/fs/open.c b/fs/open.c
> index 40a00e71865b..9ff2f621b760 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -641,7 +641,7 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode)
>  
>  int chown_common(const struct path *path, uid_t user, gid_t group)
>  {
> -	struct user_namespace *mnt_userns;
> +	struct user_namespace *mnt_userns, *fs_userns;
>  	struct inode *inode = path->dentry->d_inode;
>  	struct inode *delegated_inode = NULL;
>  	int error;
> @@ -653,8 +653,9 @@ int chown_common(const struct path *path, uid_t user, gid_t group)
>  	gid = make_kgid(current_user_ns(), group);
>  
>  	mnt_userns = mnt_user_ns(path->mnt);
> -	uid = mapped_kuid_user(mnt_userns, &init_user_ns, uid);
> -	gid = mapped_kgid_user(mnt_userns, &init_user_ns, gid);
> +	fs_userns = i_user_ns(inode);
> +	uid = mapped_kuid_user(mnt_userns, fs_userns, uid);
> +	gid = mapped_kgid_user(mnt_userns, fs_userns, gid);
>  
>  retry_deleg:
>  	newattrs.ia_valid =  ATTR_CTIME;
> diff --git a/fs/posix_acl.c b/fs/posix_acl.c
> index 4b5fb9a9b90f..80acb6885cf9 100644
> --- a/fs/posix_acl.c
> +++ b/fs/posix_acl.c
> @@ -376,8 +376,8 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode,
>                                  break;
>                          case ACL_USER:
>  				uid = mapped_kuid_fs(mnt_userns,
> -						      &init_user_ns,
> -						      pa->e_uid);
> +						     i_user_ns(inode),
> +						     pa->e_uid);
>  				if (uid_eq(uid, current_fsuid()))
>                                          goto mask;
>  				break;
> @@ -391,8 +391,8 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode,
>  				break;
>                          case ACL_GROUP:
>  				gid = mapped_kgid_fs(mnt_userns,
> -						      &init_user_ns,
> -						      pa->e_gid);
> +						     i_user_ns(inode),
> +						     pa->e_gid);
>  				if (in_group_p(gid)) {
>  					found = 1;
>  					if ((pa->e_perm & want) == want)
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index c7f72b78ab7e..9df6903634e8 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1641,7 +1641,7 @@ static inline void i_gid_write(struct inode *inode, gid_t gid)
>  static inline kuid_t i_uid_into_mnt(struct user_namespace *mnt_userns,
>  				    const struct inode *inode)
>  {
> -	return mapped_kuid_fs(mnt_userns, &init_user_ns, inode->i_uid);
> +	return mapped_kuid_fs(mnt_userns, i_user_ns(inode), inode->i_uid);
>  }
>  
>  /**
> @@ -1655,7 +1655,7 @@ static inline kuid_t i_uid_into_mnt(struct user_namespace *mnt_userns,
>  static inline kgid_t i_gid_into_mnt(struct user_namespace *mnt_userns,
>  				    const struct inode *inode)
>  {
> -	return mapped_kgid_fs(mnt_userns, &init_user_ns, inode->i_gid);
> +	return mapped_kgid_fs(mnt_userns, i_user_ns(inode), inode->i_gid);
>  }
>  
>  /**
> @@ -1669,7 +1669,7 @@ static inline kgid_t i_gid_into_mnt(struct user_namespace *mnt_userns,
>  static inline void inode_fsuid_set(struct inode *inode,
>  				   struct user_namespace *mnt_userns)
>  {
> -	inode->i_uid = mapped_fsuid(mnt_userns, &init_user_ns);
> +	inode->i_uid = mapped_fsuid(mnt_userns, i_user_ns(inode));
>  }
>  
>  /**
> @@ -1683,7 +1683,7 @@ static inline void inode_fsuid_set(struct inode *inode,
>  static inline void inode_fsgid_set(struct inode *inode,
>  				   struct user_namespace *mnt_userns)
>  {
> -	inode->i_gid = mapped_fsgid(mnt_userns, &init_user_ns);
> +	inode->i_gid = mapped_fsgid(mnt_userns, i_user_ns(inode));
>  }
>  
>  /**
> @@ -1704,10 +1704,10 @@ static inline bool fsuidgid_has_mapping(struct super_block *sb,
>  	kuid_t kuid;
>  	kgid_t kgid;
>  
> -	kuid = mapped_fsuid(mnt_userns, &init_user_ns);
> +	kuid = mapped_fsuid(mnt_userns, fs_userns);
>  	if (!uid_valid(kuid))
>  		return false;
> -	kgid = mapped_fsgid(mnt_userns, &init_user_ns);
> +	kgid = mapped_fsgid(mnt_userns, fs_userns);
>  	if (!gid_valid(kgid))
>  		return false;
>  	return kuid_has_mapping(fs_userns, kuid) &&
> @@ -2654,13 +2654,14 @@ static inline struct user_namespace *file_mnt_user_ns(struct file *file)
>   * is_idmapped_mnt - check whether a mount is mapped
>   * @mnt: the mount to check
>   *
> - * If @mnt has an idmapping attached to it @mnt is mapped.
> + * If @mnt has an idmapping attached different from the
> + * filesystem's idmapping then @mnt is mapped.
>   *
>   * Return: true if mount is mapped, false if not.
>   */
>  static inline bool is_idmapped_mnt(const struct vfsmount *mnt)
>  {
> -	return mnt_user_ns(mnt) != &init_user_ns;
> +	return mnt_user_ns(mnt) != mnt->mnt_sb->s_user_ns;
>  }
>  
>  extern long vfs_truncate(const struct path *, loff_t);
> diff --git a/security/commoncap.c b/security/commoncap.c
> index d288a62e2999..5fc8986c3c77 100644
> --- a/security/commoncap.c
> +++ b/security/commoncap.c
> @@ -419,7 +419,7 @@ int cap_inode_getsecurity(struct user_namespace *mnt_userns,
>  	kroot = make_kuid(fs_ns, root);
>  
>  	/* If this is an idmapped mount shift the kuid. */
> -	kroot = mapped_kuid_fs(mnt_userns, &init_user_ns, kroot);
> +	kroot = mapped_kuid_fs(mnt_userns, fs_ns, kroot);
>  
>  	/* If the root kuid maps to a valid uid in current ns, then return
>  	 * this as a nscap. */
> @@ -556,13 +556,12 @@ int cap_convert_nscap(struct user_namespace *mnt_userns, struct dentry *dentry,
>  		return -EINVAL;
>  	if (!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_SETFCAP))
>  		return -EPERM;
> -	if (size == XATTR_CAPS_SZ_2 && (mnt_userns == &init_user_ns))
> +	if (size == XATTR_CAPS_SZ_2 && (mnt_userns == fs_ns))
>  		if (ns_capable(inode->i_sb->s_user_ns, CAP_SETFCAP))
>  			/* user is privileged, just write the v2 */
>  			return size;
>  
> -	rootid = rootid_from_xattr(*ivalue, size, task_ns, mnt_userns,
> -				   &init_user_ns);
> +	rootid = rootid_from_xattr(*ivalue, size, task_ns, mnt_userns, fs_ns);
>  	if (!uid_valid(rootid))
>  		return -EINVAL;
>  
> @@ -703,7 +702,7 @@ int get_vfs_caps_from_disk(struct user_namespace *mnt_userns,
>  	/* Limit the caps to the mounter of the filesystem
>  	 * or the more limited uid specified in the xattr.
>  	 */
> -	rootkuid = mapped_kuid_fs(mnt_userns, &init_user_ns, rootkuid);
> +	rootkuid = mapped_kuid_fs(mnt_userns, fs_ns, rootkuid);
>  	if (!rootid_owns_currentns(rootkuid))
>  		return -ENODATA;
>  
> -- 
> 2.30.2
>