Re: Regression in 5.3 for some FS_USERNS_MOUNT (aka user-namespace-mountable) filesystems

ebiederm@xxxxxxxxxxxx (Eric W. Biederman) · Fri, 26 Jul 2019 19:46:18 -0500

Al Viro <viro@xxxxxxxxxxxxxxxxxx> writes:

> On Fri, Jul 26, 2019 at 03:47:02PM -0700, Linus Torvalds wrote:
>
>> Of course, then later on, commit 20284ab7427f ("switch mount_capable()
>> to fs_context") drops that argument entirely, and hardcodes the
>> decision to look at fc->global.
>> 
>> But that fc->global decision wasn't there originally, and is incorrect
>> since it breaks existing users.
>> 
>> What gets much more confusing about this is that the two different
>> users then moved around. The sget_userns() case got moved to
>> legacy_get_tree(), and then joined together in vfs_get_tree(), and
>> then split and moved out to do_new_mount() and vfs_fsconfig_locked().
>> 
>> And that "joined together into vfs_get_tree()" must be wrong, because
>> the two cases used two different namespace rules. The sget_userns()
>> case *did* have that "global" flag check, while the sget_fc() did not.
>> 
>> Messy. Al?
>
> Digging through that mess...  It's my fuckup, and we obviously need to
> restore the old behaviour, but I really hope to manage that with
> checks _not_ in superblock allocator ;-/

If someone had bothered to actually look at how I was proposing to clean
things up before the new mount api we would already have that.  Sigh.

You should be able to get away with something like this which moves the
checks earlier and makes things clearer.  My old patch against the pre
new mount api code.

I am running at undependable speed due to the new baby so it is probably
better for someone else to forward port this, but I will attempt it
otherwise.

Eric

From: "Eric W. Biederman" <ebiederm@xxxxxxxxxxxx>
Date: Wed, 21 Nov 2018 11:17:01 -0600
Subject: [PATCH] vfs: Replace FS_USERNS_MOUNT with file_system_type->permission

Permission checking of the user to see if the can mount an individual
filesystem using FS_USERNS_MOUNT and checks in sget is not very
comprehensible.  Further by pushing the logic down into sget the
attack surface on filesystems that don't support unprivilged mounts is
much larger than it should be.

Now that it is understood what the permission checks need to be refactor the
checks into a simple per filesystme permission check.  If no permission check is
implemented the default check becomes a simple capable(CAP_SYS_ADMIN).

The result is code that is much simpler to understand and much easier to maintain.

Signed-off-by: "Eric W. Biederman" <ebiederm@xxxxxxxxxxxx>
---
 fs/devpts/inode.c      |  2 +-
 fs/fuse/inode.c        |  3 ++-
 fs/namespace.c         | 15 +++++++++++++++
 fs/proc/root.c         |  8 +++++++-
 fs/ramfs/inode.c       |  2 +-
 fs/super.c             | 20 ++++++--------------
 fs/sysfs/mount.c       | 13 +++++++------
 include/linux/fs.h     |  4 +++-
 ipc/mqueue.c           |  8 +++++++-
 kernel/cgroup/cgroup.c | 16 ++++++++--------
 mm/shmem.c             |  4 ++--
 11 files changed, 59 insertions(+), 36 deletions(-)

diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index c53814539070..1418912efc7d 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -519,9 +519,9 @@ static void devpts_kill_sb(struct super_block *sb)
 
 static struct file_system_type devpts_fs_type = {
 	.name		= "devpts",
+	.permission	= userns_mount_permission,
 	.mount		= devpts_mount,
 	.kill_sb	= devpts_kill_sb,
-	.fs_flags	= FS_USERNS_MOUNT,
 };
 
 /*
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 0b94b23b02d4..e9f6aa9974f8 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1259,7 +1259,8 @@ static void fuse_kill_sb_anon(struct super_block *sb)
 static struct file_system_type fuse_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "fuse",
-	.fs_flags	= FS_HAS_SUBTYPE | FS_USERNS_MOUNT,
+	.fs_flags	= FS_HAS_SUBTYPE,
+	.permission	= userns_mount_permission,
 	.mount		= fuse_mount,
 	.kill_sb	= fuse_kill_sb_anon,
 };
diff --git a/fs/namespace.c b/fs/namespace.c
index 74f64294a410..44935dbdb162 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2448,6 +2448,16 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
 
 static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags);
 
+static int new_mount_permission(struct file_system_type *type)
+{
+	int err = 0;
+	if (type->permission)
+		err = type->permission();
+	else if (!capable(CAP_SYS_ADMIN))
+		err = -EPERM;
+	return err;
+}
+
 /*
  * create a new mount for userspace and request it to be added into the
  * namespace's tree
@@ -2466,6 +2476,11 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
 	if (!type)
 		return -ENODEV;
 
+	/* Verify the mounter has permission to mount the filesystem */
+	err = new_mount_permission(type);
+	if (err)
+		return err;
+
 	mnt = vfs_kern_mount(type, sb_flags, name, data);
 	if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
 	    !mnt->mnt_sb->s_subtype)
diff --git a/fs/proc/root.c b/fs/proc/root.c
index f4b1a9d2eca6..b870eacef952 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -86,6 +86,12 @@ int proc_remount(struct super_block *sb, int *flags, char *data)
 	return !proc_parse_options(data, pid);
 }
 
+static int proc_mount_permission(void)
+{
+	struct pid_namespace *ns = task_active_pid_ns(current);
+	return ns_capable(ns->user_ns, CAP_SYS_ADMIN) ? 0 : -EPERM;
+}
+
 static struct dentry *proc_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
@@ -116,9 +122,9 @@ static void proc_kill_sb(struct super_block *sb)
 
 static struct file_system_type proc_fs_type = {
 	.name		= "proc",
+	.permission	= proc_mount_permission,
 	.mount		= proc_mount,
 	.kill_sb	= proc_kill_sb,
-	.fs_flags	= FS_USERNS_MOUNT,
 };
 
 void __init proc_root_init(void)
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 11201b2d06b9..6710cbe5e087 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -261,9 +261,9 @@ static void ramfs_kill_sb(struct super_block *sb)
 
 static struct file_system_type ramfs_fs_type = {
 	.name		= "ramfs",
+	.permission	= userns_mount_permission,
 	.mount		= ramfs_mount,
 	.kill_sb	= ramfs_kill_sb,
-	.fs_flags	= FS_USERNS_MOUNT,
 };
 
 int __init init_ramfs_fs(void)
diff --git a/fs/super.c b/fs/super.c
index ca53a08497ed..0783794e6173 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -474,6 +474,12 @@ void generic_shutdown_super(struct super_block *sb)
 
 EXPORT_SYMBOL(generic_shutdown_super);
 
+int userns_mount_permission(void)
+{
+	return ns_capable(current_user_ns(), CAP_SYS_ADMIN) ? 0 : -EPERM;
+}
+EXPORT_SYMBOL(userns_mount_permission);
+
 /**
  *	sget_userns -	find or create a superblock
  *	@type:	filesystem type superblock should belong to
@@ -493,10 +499,6 @@ struct super_block *sget_userns(struct file_system_type *type,
 	struct super_block *old;
 	int err;
 
-	if (!(flags & (SB_KERNMOUNT|SB_SUBMOUNT)) &&
-	    !(type->fs_flags & FS_USERNS_MOUNT) &&
-	    !capable(CAP_SYS_ADMIN))
-		return ERR_PTR(-EPERM);
 retry:
 	spin_lock(&sb_lock);
 	if (test) {
@@ -563,10 +565,6 @@ struct super_block *sget(struct file_system_type *type,
 	if (flags & SB_SUBMOUNT)
 		user_ns = &init_user_ns;
 
-	/* Ensure the requestor has permissions over the target filesystem */
-	if (!(flags & (SB_KERNMOUNT|SB_SUBMOUNT)) && !ns_capable(user_ns, CAP_SYS_ADMIN))
-		return ERR_PTR(-EPERM);
-
 	return sget_userns(type, test, set, flags, user_ns, data);
 }
 
@@ -1059,12 +1057,6 @@ struct dentry *mount_ns(struct file_system_type *fs_type,
 {
 	struct super_block *sb;
 
-	/* Don't allow mounting unless the caller has CAP_SYS_ADMIN
-	 * over the namespace.
-	 */
-	if (!(flags & SB_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN))
-		return ERR_PTR(-EPERM);
-
 	sb = sget_userns(fs_type, ns_test_super, ns_set_super, flags,
 			 user_ns, ns);
 	if (IS_ERR(sb))
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 015a8d95952b..058ebd28c413 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -21,6 +21,12 @@
 static struct kernfs_root *sysfs_root;
 struct kernfs_node *sysfs_root_kn;
 
+static int sysfs_mount_permission(void)
+{
+	struct net *net = current->nsproxy->net_ns;
+	return ns_capable(net->user_ns, CAP_SYS_ADMIN) ? 0 : -EPERM;
+}
+
 static struct dentry *sysfs_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
@@ -28,11 +34,6 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
 	struct dentry *root;
 	bool new_sb = false;
 
-	if (!(flags & SB_KERNMOUNT)) {
-		if (!ns_capable(net->user_ns, CAP_SYS_ADMIN))
-			return ERR_PTR(-EPERM);
-	}
-
 	net = hold_net(net);
 	root = kernfs_mount_ns(fs_type, flags, sysfs_root,
 				SYSFS_MAGIC, &new_sb, net);
@@ -54,9 +55,9 @@ static void sysfs_kill_sb(struct super_block *sb)
 
 static struct file_system_type sysfs_fs_type = {
 	.name		= "sysfs",
+	.permission	= sysfs_mount_permission,
 	.mount		= sysfs_mount,
 	.kill_sb	= sysfs_kill_sb,
-	.fs_flags	= FS_USERNS_MOUNT,
 };
 
 int __init sysfs_init(void)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c95c0807471f..61dcd9b98118 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2165,8 +2165,8 @@ struct file_system_type {
 #define FS_REQUIRES_DEV		1 
 #define FS_BINARY_MOUNTDATA	2
 #define FS_HAS_SUBTYPE		4
-#define FS_USERNS_MOUNT		8	/* Can be mounted by userns root */
 #define FS_RENAME_DOES_D_MOVE	32768	/* FS will handle d_move() during rename() internally. */
+	int (*permission)(void);
 	struct dentry *(*mount) (struct file_system_type *, int,
 		       const char *, void *);
 	void (*kill_sb) (struct super_block *);
@@ -2186,6 +2186,8 @@ struct file_system_type {
 
 #define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME)
 
+int userns_mount_permission(void);
+
 extern struct dentry *mount_ns(struct file_system_type *fs_type,
 	int flags, void *data, void *ns, struct user_namespace *user_ns,
 	int (*fill_super)(struct super_block *, void *, int));
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index c595bed7bfcb..2c9533082110 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -343,6 +343,12 @@ static int mqueue_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 }
 
+static int mqueue_mount_permission(void)
+{
+	struct ipc_namespace *ns = current->nsproxy->ipc_ns;
+	return ns_capable(ns->user_ns, CAP_SYS_ADMIN) ? 0 : -EPERM;
+}
+
 static struct dentry *mqueue_mount(struct file_system_type *fs_type,
 			 int flags, const char *dev_name,
 			 void *data)
@@ -1524,9 +1530,9 @@ static const struct super_operations mqueue_super_ops = {
 
 static struct file_system_type mqueue_fs_type = {
 	.name = "mqueue",
+	.permission = mqueue_mount_permission,
 	.mount = mqueue_mount,
 	.kill_sb = kill_litter_super,
-	.fs_flags = FS_USERNS_MOUNT,
 };
 
 int mq_init_ns(struct ipc_namespace *ns)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 6aaf5dd5383b..82539ac4fa28 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2030,6 +2030,12 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
 	return dentry;
 }
 
+static int cgroup_mount_permission(void)
+{
+	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+	return ns_capable(ns->user_ns, CAP_SYS_ADMIN) ? 0 : -EPERM;
+}
+
 static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 			 int flags, const char *unused_dev_name,
 			 void *data)
@@ -2040,12 +2046,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 
 	get_cgroup_ns(ns);
 
-	/* Check if the caller has permission to mount. */
-	if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
-		put_cgroup_ns(ns);
-		return ERR_PTR(-EPERM);
-	}
-
 	/*
 	 * The first time anyone tries to mount a cgroup, enable the list
 	 * linking each css_set to its tasks and fix up all existing tasks.
@@ -2101,16 +2101,16 @@ static void cgroup_kill_sb(struct super_block *sb)
 
 struct file_system_type cgroup_fs_type = {
 	.name = "cgroup",
+	.permission = cgroup_mount_permission,
 	.mount = cgroup_mount,
 	.kill_sb = cgroup_kill_sb,
-	.fs_flags = FS_USERNS_MOUNT,
 };
 
 static struct file_system_type cgroup2_fs_type = {
 	.name = "cgroup2",
+	.permission = cgroup_mount_permission,
 	.mount = cgroup_mount,
 	.kill_sb = cgroup_kill_sb,
-	.fs_flags = FS_USERNS_MOUNT,
 };
 
 int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
diff --git a/mm/shmem.c b/mm/shmem.c
index ea26d7a0342d..02f3bdaf7dc2 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3654,9 +3654,9 @@ static struct dentry *shmem_mount(struct file_system_type *fs_type,
 static struct file_system_type shmem_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "tmpfs",
+	.permission	= userns_mount_permission,
 	.mount		= shmem_mount,
 	.kill_sb	= kill_litter_super,
-	.fs_flags	= FS_USERNS_MOUNT,
 };
 
 int __init shmem_init(void)
@@ -3799,9 +3799,9 @@ bool shmem_huge_enabled(struct vm_area_struct *vma)
 
 static struct file_system_type shmem_fs_type = {
 	.name		= "tmpfs",
+	.permission	= userns_mount_permission,
 	.mount		= ramfs_mount,
 	.kill_sb	= kill_litter_super,
-	.fs_flags	= FS_USERNS_MOUNT,
 };
 
 int __init shmem_init(void)
-- 
2.21.0.dirty