Make it possible for fsopen() to create a superblock in a specified
container, using the namespaces associated with that container to cover UID
translation, networking and filesystem content.  This involves adding a new
fsconfig command to specify the container.

For example:

	cfd = container_create("fred", CONTAINER_NEW_FS_NS);

	fsfd = fsopen("ext4", 0);
	fsconfig(fsfd, FSCONFIG_SET_CONTAINER, NULL, NULL, cfd);
	fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "/dev/sda3", 0);
	fsconfig(fsfd, FSCONFIG_SET_FLAG, "user_xattr", NULL, 0);
	fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
	mfd = fsmount(fsfd, 0, MOUNT_ATTR_RDONLY);
	move_mount(mfd, "", cfd, "/",
		   MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_CONTAINER_ROOT);

Signed-off-by: David Howells <dhowells@xxxxxxxxxx>
---
fs/fs_context.c | 19 +++++++++++++++ fs/fsopen.c | 54 +++++++++++++++++++++++++++++++++++++------- fs/namespace.c | 19 +++++++++++---- fs/proc/root.c | 11 +++++++-- include/linux/container.h | 1 + include/linux/fs_context.h | 3 ++ include/linux/pid.h | 5 +++- include/linux/proc_ns.h | 6 +++-- include/uapi/linux/mount.h | 1 + kernel/container.c | 4 +++ kernel/fork.c | 2 +- kernel/pid.c | 4 ++- 12 files changed, 108 insertions(+), 21 deletions(-) diff --git a/fs/fs_context.c b/fs/fs_context.c index a47ccd5a4a78..fc76ac02d618 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -20,6 +20,7 @@ #include <linux/slab.h> #include <linux/magic.h> #include <linux/security.h> +#include <linux/container.h> #include <linux/mnt_namespace.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> @@ -169,6 +170,21 @@ int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param) } EXPORT_SYMBOL(vfs_parse_fs_param); +/* + * Specify a container in which a superblock will exist. 
+ */ +void vfs_set_container(struct fs_context *fc, struct container *container) +{ + if (container) { + put_user_ns(fc->user_ns); + put_net(fc->net_ns); + + fc->container = get_container(container); + fc->user_ns = get_user_ns(container->cred->user_ns); + fc->net_ns = get_net(container->ns->net_ns); + } +} + /** * vfs_parse_fs_string - Convenience function to just parse a string. */ @@ -364,6 +380,8 @@ struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc) fc->source = NULL; fc->security = NULL; get_filesystem(fc->fs_type); + if (fc->container) + get_container(fc->container); get_net(fc->net_ns); get_user_ns(fc->user_ns); get_cred(fc->cred); @@ -510,6 +528,7 @@ void put_fs_context(struct fs_context *fc) put_net(fc->net_ns); put_user_ns(fc->user_ns); put_cred(fc->cred); + put_container(fc->container); kfree(fc->subtype); put_fc_log(fc); put_filesystem(fc->fs_type); diff --git a/fs/fsopen.c b/fs/fsopen.c index 3bb9c0c8cbcc..d0fe9e563ebb 100644 --- a/fs/fsopen.c +++ b/fs/fsopen.c @@ -17,11 +17,33 @@ #include <linux/security.h> #include <linux/anon_inodes.h> #include <linux/namei.h> +#include <linux/container.h> #include <linux/file.h> #include <uapi/linux/mount.h> #include "internal.h" #include "mount.h" +/* + * Configure the destination container on a filesystem context. This must be + * done before any other parameters are offered. Containers are presented as + * fds attached to such objects given by the auxiliary parameter. + * + * For example: + * + * fsconfig(fsfd, FSCONFIG_SET_CONTAINER, NULL, NULL, container_fd); + */ +static int fsconfig_set_container(struct fs_context *fc, struct fs_parameter *param) +{ + struct container *c; + + if (!is_container_file(param->file)) + return -EINVAL; + + c = param->file->private_data; + vfs_set_container(fc, c); + return 0; +} + /* * Allow the user to read back any error, warning or informational messages. 
*/ @@ -111,10 +133,6 @@ static int fscontext_alloc_log(struct fs_context *fc) /* * Open a filesystem by name so that it can be configured for mounting. - * - * We are allowed to specify a container in which the filesystem will be - * opened, thereby indicating which namespaces will be used (notably, which - * network namespace will be used for network filesystems). */ SYSCALL_DEFINE2(fsopen, const char __user *, _fs_name, unsigned int, flags) { @@ -143,7 +161,7 @@ SYSCALL_DEFINE2(fsopen, const char __user *, _fs_name, unsigned int, flags) if (IS_ERR(fc)) return PTR_ERR(fc); - fc->phase = FS_CONTEXT_CREATE_PARAMS; + fc->phase = FS_CONTEXT_CREATE_NS; ret = fscontext_alloc_log(fc); if (ret < 0) @@ -228,7 +246,8 @@ static int vfs_fsconfig_locked(struct fs_context *fc, int cmd, return ret; switch (cmd) { case FSCONFIG_CMD_CREATE: - if (fc->phase != FS_CONTEXT_CREATE_PARAMS) + if (fc->phase != FS_CONTEXT_CREATE_NS && + fc->phase != FS_CONTEXT_CREATE_PARAMS) return -EBUSY; fc->phase = FS_CONTEXT_CREATING; ret = vfs_get_tree(fc); @@ -259,9 +278,17 @@ static int vfs_fsconfig_locked(struct fs_context *fc, int cmd, break; vfs_clean_context(fc); return 0; + + case FSCONFIG_SET_CONTAINER: + if (fc->phase != FS_CONTEXT_CREATE_NS) + return -EBUSY; + return fsconfig_set_container(fc, param); + default: - if (fc->phase != FS_CONTEXT_CREATE_PARAMS && - fc->phase != FS_CONTEXT_RECONF_PARAMS) + if (fc->phase == FS_CONTEXT_CREATE_NS) + fc->phase = FS_CONTEXT_CREATE_PARAMS; + else if (fc->phase != FS_CONTEXT_CREATE_PARAMS && + fc->phase != FS_CONTEXT_RECONF_PARAMS) return -EBUSY; return vfs_parse_fs_param(fc, param); @@ -353,6 +380,10 @@ SYSCALL_DEFINE5(fsconfig, if (!_key || _value || aux < 0) return -EINVAL; break; + case FSCONFIG_SET_CONTAINER: + if (_key || _value || aux < 0) + return -EINVAL; + break; case FSCONFIG_CMD_CREATE: case FSCONFIG_CMD_RECONFIGURE: if (_key || _value || aux) @@ -438,6 +469,12 @@ SYSCALL_DEFINE5(fsconfig, if (!param.file) goto out_key; break; + case 
FSCONFIG_SET_CONTAINER: + ret = -EBADF; + param.file = fget(aux); + if (!param.file) + goto out_key; + break; default: break; } @@ -463,6 +500,7 @@ SYSCALL_DEFINE5(fsconfig, putname(param.name); break; case FSCONFIG_SET_FD: + case FSCONFIG_SET_CONTAINER: if (param.file) fput(param.file); break; diff --git a/fs/namespace.c b/fs/namespace.c index ea005f55ec4c..cc5d56f7ae29 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -781,9 +781,16 @@ static void put_mountpoint(struct mountpoint *mp) } } +static inline int __check_mnt(struct mount *mnt, struct mnt_namespace *mnt_ns) +{ + if (!mnt_ns) + mnt_ns = current->nsproxy->mnt_ns; + return mnt->mnt_ns == mnt_ns; +} + static inline int check_mnt(struct mount *mnt) { - return mnt->mnt_ns == current->nsproxy->mnt_ns; + return __check_mnt(mnt, NULL); } /* @@ -2696,7 +2703,8 @@ static int do_move_mount_old(struct path *path, const char *old_name) /* * add a mount into a namespace's mount tree */ -static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) +static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags, + struct mnt_namespace *mnt_ns) { struct mountpoint *mp; struct mount *parent; @@ -2710,7 +2718,7 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) parent = real_mount(path->mnt); err = -EINVAL; - if (unlikely(!check_mnt(parent))) { + if (unlikely(!__check_mnt(parent, mnt_ns))) { /* that's acceptable only for automounts done in private ns */ if (!(mnt_flags & MNT_SHRINKABLE)) goto unlock; @@ -2765,7 +2773,8 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, if (IS_ERR(mnt)) return PTR_ERR(mnt); - error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags); + error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags, + fc->container ? 
fc->container->ns->mnt_ns : NULL); if (error < 0) mntput(mnt); return error; @@ -2839,7 +2848,7 @@ int finish_automount(struct vfsmount *m, struct path *path) goto fail; } - err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE); + err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE, NULL); if (!err) return 0; fail: diff --git a/fs/proc/root.c b/fs/proc/root.c index 6927b29ece76..aa802006d855 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -18,6 +18,7 @@ #include <linux/sched/stat.h> #include <linux/module.h> #include <linux/bitops.h> +#include <linux/container.h> #include <linux/user_namespace.h> #include <linux/fs_context.h> #include <linux/mount.h> @@ -186,8 +187,12 @@ static int proc_init_fs_context(struct fs_context *fc) ctx = kzalloc(sizeof(struct proc_fs_context), GFP_KERNEL); if (!ctx) return -ENOMEM; + + if (fc->container) + ctx->pid_ns = get_pid_ns(fc->container->pid_ns); + else + ctx->pid_ns = get_pid_ns(task_active_pid_ns(current)); - ctx->pid_ns = get_pid_ns(task_active_pid_ns(current)); fc->fs_private = ctx; fc->ops = &proc_fs_context_ops; return 0; @@ -300,7 +305,7 @@ struct proc_dir_entry proc_root = { .name = "/proc", }; -int pid_ns_prepare_proc(struct pid_namespace *ns) +int pid_ns_prepare_proc(struct pid_namespace *ns, struct container *container) { struct proc_fs_context *ctx; struct fs_context *fc; @@ -315,6 +320,8 @@ int pid_ns_prepare_proc(struct pid_namespace *ns) fc->user_ns = get_user_ns(ns->user_ns); } + vfs_set_container(fc, container); + ctx = fc->fs_private; if (ctx->pid_ns != ns) { put_pid_ns(ctx->pid_ns); diff --git a/include/linux/container.h b/include/linux/container.h index 0a8918435097..087aa1885ef7 100644 --- a/include/linux/container.h +++ b/include/linux/container.h @@ -37,6 +37,7 @@ struct container { struct path root; /* The root of the container's fs namespace */ struct task_struct *init; /* The 'init' task for this container */ struct container *parent; /* Parent of this container. 
*/ + struct pid_namespace *pid_ns; /* The process ID namespace for this container */ void *security; /* LSM data */ struct list_head members; /* Member processes, guarded with ->lock */ struct list_head child_link; /* Link in parent->children */ diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index dc8c9fcba341..45486080eb84 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -40,6 +40,7 @@ enum fs_context_purpose { * Userspace usage phase for fsopen/fspick. */ enum fs_context_phase { + FS_CONTEXT_CREATE_NS, /* Set namespaces for sb creation */ FS_CONTEXT_CREATE_PARAMS, /* Loading params for sb creation */ FS_CONTEXT_CREATING, /* A superblock is being created */ FS_CONTEXT_AWAITING_MOUNT, /* Superblock created, awaiting fsmount() */ @@ -93,6 +94,7 @@ struct fs_context { struct file_system_type *fs_type; void *fs_private; /* The filesystem's context */ struct dentry *root; /* The root and superblock */ + struct container *container; /* The container in which the mount will exist */ struct user_namespace *user_ns; /* The user namespace for this mount */ struct net *net_ns; /* The network namespace for this mount */ const struct cred *cred; /* The mounter's credentials */ @@ -136,6 +138,7 @@ extern int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param) extern int vfs_parse_fs_string(struct fs_context *fc, const char *key, const char *value, size_t v_size); extern int generic_parse_monolithic(struct fs_context *fc, void *data); +extern void vfs_set_container(struct fs_context *fc, struct container *container); extern int vfs_get_tree(struct fs_context *fc); extern void put_fs_context(struct fs_context *fc); diff --git a/include/linux/pid.h b/include/linux/pid.h index 14a9a39da9c7..16dc152ceef1 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -73,6 +73,8 @@ static inline struct pid *get_pid(struct pid *pid) return pid; } +struct container; + extern void put_pid(struct pid *pid); extern struct 
task_struct *pid_task(struct pid *pid, enum pid_type); extern struct task_struct *get_pid_task(struct pid *pid, enum pid_type); @@ -111,7 +113,8 @@ extern struct pid *find_get_pid(int nr); extern struct pid *find_ge_pid(int nr, struct pid_namespace *); int next_pidmap(struct pid_namespace *pid_ns, unsigned int last); -extern struct pid *alloc_pid(struct pid_namespace *ns); +extern struct pid *alloc_pid(struct pid_namespace *ns, + struct container *container); extern void free_pid(struct pid *pid); extern void disable_pid_allocation(struct pid_namespace *ns); diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index d31cb6215905..dee0881eca5c 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -47,14 +47,16 @@ enum { #ifdef CONFIG_PROC_FS -extern int pid_ns_prepare_proc(struct pid_namespace *ns); +extern int pid_ns_prepare_proc(struct pid_namespace *ns, + struct container *container); extern void pid_ns_release_proc(struct pid_namespace *ns); extern int proc_alloc_inum(unsigned int *pino); extern void proc_free_inum(unsigned int inum); #else /* CONFIG_PROC_FS */ -static inline int pid_ns_prepare_proc(struct pid_namespace *ns) { return 0; } +static inline int pid_ns_prepare_proc(struct pid_namespace *ns, struct container *container) +{ return 0; } static inline void pid_ns_release_proc(struct pid_namespace *ns) {} static inline int proc_alloc_inum(unsigned int *inum) diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index 96a0240f23fe..f60bbe6f4099 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -97,6 +97,7 @@ enum fsconfig_command { FSCONFIG_SET_FD = 5, /* Set parameter, supplying an object by fd */ FSCONFIG_CMD_CREATE = 6, /* Invoke superblock creation */ FSCONFIG_CMD_RECONFIGURE = 7, /* Invoke superblock reconfiguration */ + FSCONFIG_SET_CONTAINER = 8, /* Set a container, supplied by fd */ }; /* diff --git a/kernel/container.c b/kernel/container.c index 1d2cb1c1e9b1..fd3b2a6849a1 100644 
--- a/kernel/container.c +++ b/kernel/container.c @@ -30,6 +30,7 @@ struct container init_container = { .cred = &init_cred, .ns = &init_nsproxy, .init = &init_task, + .pid_ns = &init_pid_ns, .members.next = &init_task.container_link, .members.prev = &init_task.container_link, .children = LIST_HEAD_INIT(init_container.children), @@ -51,6 +52,8 @@ void put_container(struct container *c) while (c && refcount_dec_and_test(&c->usage)) { BUG_ON(!list_empty(&c->members)); + if (c->pid_ns) + put_pid_ns(c->pid_ns); if (c->ns) put_nsproxy(c->ns); path_put(&c->root); @@ -391,6 +394,7 @@ static struct container *create_container(const char __user *name, unsigned int } c->ns = ns; + c->pid_ns = get_pid_ns(c->ns->pid_ns_for_children); c->root = fs->root; c->seq = fs->seq; fs->root.mnt = NULL; diff --git a/kernel/fork.c b/kernel/fork.c index 71401deb4434..09de5f35d312 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1958,7 +1958,7 @@ static __latent_entropy struct task_struct *copy_process( stackleak_task_init(p); if (pid != &init_struct_pid) { - pid = alloc_pid(p->nsproxy->pid_ns_for_children); + pid = alloc_pid(p->nsproxy->pid_ns_for_children, dest_container); if (IS_ERR(pid)) { retval = PTR_ERR(pid); goto bad_fork_cleanup_thread; diff --git a/kernel/pid.c b/kernel/pid.c index 20881598bdfa..6528a75e6c0d 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -156,7 +156,7 @@ void free_pid(struct pid *pid) call_rcu(&pid->rcu, delayed_put_pid); } -struct pid *alloc_pid(struct pid_namespace *ns) +struct pid *alloc_pid(struct pid_namespace *ns, struct container *container) { struct pid *pid; enum pid_type type; @@ -205,7 +205,7 @@ struct pid *alloc_pid(struct pid_namespace *ns) } if (unlikely(is_child_reaper(pid))) { - if (pid_ns_prepare_proc(ns)) + if (pid_ns_prepare_proc(ns, container)) goto out_free; }