Make it possible for fsopen() to mount into a specified container, using the namespaces associated with that container to cover UID translation, networking and filesystem content. This involves modifying the fsopen() syscall to use the reserved parameter: int mfd = fsopen(const char *fsname, int containerfd, int open_flags); where containerfd can be -1 to use the current process's namespaces (as before) or a file descriptor created by container_create() to mount into that container. For example: containerfd = container_create("fred", CONTAINER_NEW_FS_NS); mfd = fsopen("nfs4", containerfd, 0); write(mfd, "d warthog:/data", ...); write(mfd, "o fsc", ...); write(mfd, "o sync", ...); write(mfd, "o intr", ...); write(mfd, "o vers=4.2", ...); write(mfd, "o addr=192.168.1.1", ...); write(mfd, "o clientaddr=192.168.1.2", ...); fsmount(mfd, containerfd, "/mnt", AT_NO_FOLLOW, 0); Any upcalls the mount makes, say to access DNS services, will be made inside the container. Signed-off-by: David Howells <dhowells@xxxxxxxxxx> --- fs/fsopen.c | 33 ++++++++++++++++++++++++++------- fs/libfs.c | 3 ++- fs/namespace.c | 23 ++++++++++++++++------- fs/nfs/namespace.c | 2 +- fs/nfs/nfs4namespace.c | 4 ++-- fs/proc/root.c | 13 ++++++++++--- fs/sb_config.c | 29 ++++++++++++++++++++++------- include/linux/container.h | 1 + include/linux/mount.h | 2 +- include/linux/pid.h | 5 ++++- include/linux/proc_ns.h | 3 ++- include/linux/sb_config.h | 5 ++++- kernel/container.c | 4 ++++ kernel/fork.c | 2 +- kernel/pid.c | 4 ++-- 15 files changed, 98 insertions(+), 35 deletions(-) diff --git a/fs/fsopen.c b/fs/fsopen.c index cbede77158ba..65278b7f5a45 100644 --- a/fs/fsopen.c +++ b/fs/fsopen.c @@ -13,6 +13,8 @@ #include <linux/mount.h> #include <linux/slab.h> #include <linux/uaccess.h> +#include <linux/fs.h> +#include <linux/container.h> #include <linux/file.h> #include <linux/magic.h> #include <linux/syscalls.h> @@ -219,30 +221,44 @@ fs_initcall(init_fs_fs); * opened, thereby indicating which 
namespaces will be used (notably, which * network namespace will be used for network filesystems). */ -SYSCALL_DEFINE3(fsopen, const char __user *, _fs_name, int, reserved, +SYSCALL_DEFINE3(fsopen, const char __user *, _fs_name, int, containerfd, unsigned int, flags) { + struct container *container = NULL; struct sb_config *sc; struct file *file; const char *fs_name; int fd, ret; - if (flags & ~O_CLOEXEC || reserved != -1) + if (flags & ~O_CLOEXEC) return -EINVAL; fs_name = strndup_user(_fs_name, PAGE_SIZE); if (IS_ERR(fs_name)) return PTR_ERR(fs_name); - sc = vfs_new_sb_config(fs_name); + if (containerfd != -1) { + struct fd f = fdget(containerfd); + + ret = -EBADF; + if (!f.file) + goto err_fs_name; + ret = -EINVAL; + if (is_container_file(f.file)) { + container = get_container(f.file->private_data); + ret = 0; + } + fdput(f); + if (ret < 0) + goto err_fs_name; + } + + sc = vfs_new_sb_config(fs_name, container); kfree(fs_name); + put_container(container); if (IS_ERR(sc)) return PTR_ERR(sc); - ret = -ENOTSUPP; - if (!sc->ops) - goto err_sc; - file = create_fs_file(sc); if (IS_ERR(file)) { ret = PTR_ERR(file); @@ -264,4 +280,7 @@ SYSCALL_DEFINE3(fsopen, const char __user *, _fs_name, int, reserved, err_sc: put_sb_config(sc); return ret; +err_fs_name: + kfree(fs_name); + return ret; } diff --git a/fs/libfs.c b/fs/libfs.c index e8787adf0363..d59dae7a9bd0 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -583,7 +583,8 @@ int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *c if (unlikely(!*mount)) { spin_unlock(&pin_fs_lock); - sc = __vfs_new_sb_config(type, NULL, MS_KERNMOUNT, SB_CONFIG_FOR_NEW); + sc = __vfs_new_sb_config(type, NULL, NULL, MS_KERNMOUNT, + SB_CONFIG_FOR_NEW); if (IS_ERR(sc)) return PTR_ERR(sc); diff --git a/fs/namespace.c b/fs/namespace.c index 7e2d5fe5728b..9ca8b9f49f80 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -783,9 +783,16 @@ static void put_mountpoint(struct mountpoint *mp) } } +static inline int __check_mnt(struct 
mount *mnt, struct mnt_namespace *mnt_ns) +{ + if (!mnt_ns) + mnt_ns = current->nsproxy->mnt_ns; + return mnt->mnt_ns == mnt_ns; +} + static inline int check_mnt(struct mount *mnt) { - return mnt->mnt_ns == current->nsproxy->mnt_ns; + return __check_mnt(mnt, NULL); } /* @@ -2408,7 +2415,8 @@ static int do_move_mount(struct path *path, const char *old_name) /* * add a mount into a namespace's mount tree */ -static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) +static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags, + struct mnt_namespace *mnt_ns) { struct mountpoint *mp; struct mount *parent; @@ -2422,7 +2430,7 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) parent = real_mount(path->mnt); err = -EINVAL; - if (unlikely(!check_mnt(parent))) { + if (unlikely(!__check_mnt(parent, mnt_ns))) { /* that's acceptable only for automounts done in private ns */ if (!(mnt_flags & MNT_SHRINKABLE)) goto unlock; @@ -2471,7 +2479,8 @@ static int do_new_mount_sc(struct sb_config *sc, struct path *mountpoint, goto err_mnt; } - ret = do_add_mount(real_mount(mnt), mountpoint, mnt_flags); + ret = do_add_mount(real_mount(mnt), mountpoint, mnt_flags, + sc->container ? 
sc->container->ns->mnt_ns : NULL); if (ret < 0) { errorf("VFS: Failed to add mount"); goto err_mnt; @@ -2496,7 +2505,7 @@ static int do_new_mount(struct path *mountpoint, const char *fstype, int flags, if (!fstype) return -EINVAL; - sc = vfs_new_sb_config(fstype); + sc = vfs_new_sb_config(fstype, NULL); if (IS_ERR(sc)) { err = PTR_ERR(sc); goto err; @@ -2544,7 +2553,7 @@ int finish_automount(struct vfsmount *m, struct path *path) goto fail; } - err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE); + err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE, NULL); if (!err) return 0; fail: @@ -3175,7 +3184,7 @@ struct vfsmount *vfs_kern_mount(struct file_system_type *type, if (!type) return ERR_PTR(-EINVAL); - sc = __vfs_new_sb_config(type, NULL, flags, SB_CONFIG_FOR_NEW); + sc = __vfs_new_sb_config(type, NULL, NULL, flags, SB_CONFIG_FOR_NEW); if (IS_ERR(sc)) return ERR_CAST(sc); diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index e95e669e4db8..2dcb0c3b4cbb 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -239,7 +239,7 @@ struct vfsmount *nfs_do_submount(struct dentry *dentry, struct nfs_fh *fh, /* Open a new mount context, transferring parameters from the parent * superblock, including the network namespace. 
*/ - sc = __vfs_new_sb_config(&nfs_fs_type, dentry->d_sb, 0, + sc = __vfs_new_sb_config(&nfs_fs_type, dentry->d_sb, NULL, 0, SB_CONFIG_FOR_SUBMOUNT); if (IS_ERR(sc)) return ERR_CAST(sc); diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 60b711aa0618..5e49684faf79 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -346,8 +346,8 @@ static struct vfsmount *nfs_follow_referral(struct dentry *dentry, if (locations == NULL || locations->nlocations <= 0) goto out; - - sc = __vfs_new_sb_config(&nfs4_fs_type, dentry->d_sb, 0, + + sc = __vfs_new_sb_config(&nfs4_fs_type, dentry->d_sb, NULL, 0, SB_CONFIG_FOR_SUBMOUNT); if (IS_ERR(sc)) { mnt = ERR_CAST(sc); diff --git a/fs/proc/root.c b/fs/proc/root.c index 9878b62e874c..70e52b060873 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -17,6 +17,7 @@ #include <linux/sched/stat.h> #include <linux/module.h> #include <linux/bitops.h> +#include <linux/container.h> #include <linux/user_namespace.h> #include <linux/sb_config.h> #include <linux/pid_namespace.h> @@ -171,8 +172,14 @@ static const struct sb_config_operations proc_sb_config_ops = { static int proc_init_sb_config(struct sb_config *sc, struct super_block *src_sb) { struct proc_sb_config *cfg = container_of(sc, struct proc_sb_config, sc); + struct pid_namespace *pid_ns; - cfg->pid_ns = get_pid_ns(task_active_pid_ns(current)); + if (cfg->sc.container) + pid_ns = cfg->sc.container->pid_ns; + else + pid_ns = task_active_pid_ns(current); + + cfg->pid_ns = get_pid_ns(pid_ns); cfg->sc.ops = &proc_sb_config_ops; return 0; } @@ -292,14 +299,14 @@ struct proc_dir_entry proc_root = { .name = "/proc", }; -int pid_ns_prepare_proc(struct pid_namespace *ns) +int pid_ns_prepare_proc(struct pid_namespace *ns, struct container *container) { struct proc_sb_config *cfg; struct sb_config *sc; struct vfsmount *mnt; int ret; - sc = __vfs_new_sb_config(&proc_fs_type, NULL, 0, SB_CONFIG_FOR_NEW); + sc = __vfs_new_sb_config(&proc_fs_type, NULL, container, 0, 
SB_CONFIG_FOR_NEW); if (IS_ERR(sc)) return PTR_ERR(sc); diff --git a/fs/sb_config.c b/fs/sb_config.c index 4d9bfb982d41..c1ea2a98bd8d 100644 --- a/fs/sb_config.c +++ b/fs/sb_config.c @@ -19,6 +19,7 @@ #include <linux/magic.h> #include <linux/security.h> #include <linux/parser.h> +#include <linux/container.h> #include <linux/mnt_namespace.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> @@ -108,7 +109,7 @@ static int vfs_parse_ms_mount_option(struct sb_config *sc, char *data) /** * vfs_parse_mount_option - Add a single mount option to a superblock config - * @mc: The superblock configuration to modify + * @sc: The superblock configuration to modify * @p: The option to apply. * * A single mount option in string form is applied to the superblock @@ -148,7 +149,7 @@ EXPORT_SYMBOL(vfs_parse_mount_option); /** * generic_monolithic_mount_data - Parse key[=val][,key[=val]]* mount data - * @mc: The superblock configuration to fill in. + * @sc: The superblock configuration to fill in. * @data: The data to parse * * Parse a blob of data that's in key[=val][,key[=val]]* form. This can be @@ -181,6 +182,7 @@ EXPORT_SYMBOL(generic_monolithic_mount_data); * __vfs_new_sb_config - Create a superblock config. * @fs_type: The filesystem type. * @src_sb: A superblock from which this one derives (or NULL) + * @c: The container in which the mount will exist (or NULL) * @ms_flags: Superblock flags and op flags (such as MS_REMOUNT) * @purpose: The purpose that this configuration shall be used for. 
* @@ -191,6 +193,7 @@ EXPORT_SYMBOL(generic_monolithic_mount_data); */ struct sb_config *__vfs_new_sb_config(struct file_system_type *fs_type, struct super_block *src_sb, + struct container *c, unsigned int ms_flags, enum sb_config_purpose purpose) { @@ -210,10 +213,17 @@ struct sb_config *__vfs_new_sb_config(struct file_system_type *fs_type, sc->purpose = purpose; sc->ms_flags = ms_flags; sc->fs_type = get_filesystem(fs_type); - sc->net_ns = get_net(current->nsproxy->net_ns); - sc->user_ns = get_user_ns(current_user_ns()); sc->cred = get_current_cred(); + if (!c) { + sc->net_ns = get_net(current->nsproxy->net_ns); + sc->user_ns = get_user_ns(current_user_ns()); + } else { + sc->container = get_container(c); + sc->net_ns = get_net(c->ns->net_ns); + sc->user_ns = get_user_ns(c->cred->user_ns); + } + /* TODO: Make all filesystems support this unconditionally */ if (sc->fs_type->init_sb_config) { ret = sc->fs_type->init_sb_config(sc, src_sb); @@ -241,6 +251,7 @@ EXPORT_SYMBOL(__vfs_new_sb_config); /** * vfs_new_sb_config - Create a superblock config for a new mount. * @fs_name: The name of the filesystem + * @c: The container in which the mount will exist (or NULL) * * Open a filesystem and create a superblock config context for a new mount * that will hold the mount options, device name, security details, etc.. Note @@ -248,7 +259,8 @@ EXPORT_SYMBOL(__vfs_new_sb_config); * determine whether the filesystem actually supports the superblock context * itself. 
*/ -struct sb_config *vfs_new_sb_config(const char *fs_name) +struct sb_config *vfs_new_sb_config(const char *fs_name, + struct container *c) { struct file_system_type *fs_type; struct sb_config *sc; @@ -257,7 +269,7 @@ struct sb_config *vfs_new_sb_config(const char *fs_name) if (!fs_type) return ERR_PTR(-ENODEV); - sc = __vfs_new_sb_config(fs_type, NULL, 0, SB_CONFIG_FOR_NEW); + sc = __vfs_new_sb_config(fs_type, NULL, c, 0, SB_CONFIG_FOR_NEW); put_filesystem(fs_type); return sc; } @@ -275,7 +287,7 @@ struct sb_config *vfs_sb_reconfig(struct vfsmount *mnt, unsigned int ms_flags) { return __vfs_new_sb_config(mnt->mnt_sb->s_type, mnt->mnt_sb, - ms_flags, SB_CONFIG_FOR_REMOUNT); + NULL, ms_flags, SB_CONFIG_FOR_REMOUNT); } /** @@ -302,6 +314,8 @@ struct sb_config *vfs_dup_sb_config(struct sb_config *src_sc) sc->device = NULL; sc->security = NULL; get_filesystem(sc->fs_type); + if (sc->container) + get_container(sc->container); get_net(sc->net_ns); get_user_ns(sc->user_ns); get_cred(sc->cred); @@ -347,6 +361,7 @@ void put_sb_config(struct sb_config *sc) if (sc->cred) put_cred(sc->cred); kfree(sc->subtype); + put_container(sc->container); put_filesystem(sc->fs_type); kfree(sc->device); kfree(sc); diff --git a/include/linux/container.h b/include/linux/container.h index 084ea9982fe6..073674fab160 100644 --- a/include/linux/container.h +++ b/include/linux/container.h @@ -36,6 +36,7 @@ struct container { struct path root; /* The root of the container's fs namespace */ struct task_struct *init; /* The 'init' task for this container */ struct container *parent; /* Parent of this container. 
*/ + struct pid_namespace *pid_ns; /* The process ID namespace for this container */ void *security; /* LSM data */ struct list_head members; /* Member processes, guarded with ->lock */ struct list_head child_link; /* Link in parent->children */ diff --git a/include/linux/mount.h b/include/linux/mount.h index a5dca6abc4d5..265e9aa2ab0b 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -70,7 +70,7 @@ struct vfsmount { int mnt_flags; }; -struct file; /* forward dec */ + struct file; /* forward dec */ struct path; extern int mnt_want_write(struct vfsmount *mnt); diff --git a/include/linux/pid.h b/include/linux/pid.h index 4d179316e431..ac429dea2f84 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -79,6 +79,8 @@ static inline struct pid *get_pid(struct pid *pid) return pid; } +struct container; + extern void put_pid(struct pid *pid); extern struct task_struct *pid_task(struct pid *pid, enum pid_type); extern struct task_struct *get_pid_task(struct pid *pid, enum pid_type); @@ -117,7 +119,8 @@ extern struct pid *find_get_pid(int nr); extern struct pid *find_ge_pid(int nr, struct pid_namespace *); int next_pidmap(struct pid_namespace *pid_ns, unsigned int last); -extern struct pid *alloc_pid(struct pid_namespace *ns); +extern struct pid *alloc_pid(struct pid_namespace *ns, + struct container *container); extern void free_pid(struct pid *pid); extern void disable_pid_allocation(struct pid_namespace *ns); diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 58ab28d81fc2..52f0b2db5dda 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -46,7 +46,8 @@ enum { #ifdef CONFIG_PROC_FS -extern int pid_ns_prepare_proc(struct pid_namespace *ns); +extern int pid_ns_prepare_proc(struct pid_namespace *ns, + struct container *container); extern void pid_ns_release_proc(struct pid_namespace *ns); extern int proc_alloc_inum(unsigned int *pino); extern void proc_free_inum(unsigned int inum); diff --git 
a/include/linux/sb_config.h b/include/linux/sb_config.h index 144258d82fa1..8bc7ac70b11a 100644 --- a/include/linux/sb_config.h +++ b/include/linux/sb_config.h @@ -46,6 +46,7 @@ enum sb_config_purpose { struct sb_config { const struct sb_config_operations *ops; struct file_system_type *fs_type; + struct container *container; /* The container in which the mount will exist */ struct dentry *root; /* The root and superblock */ struct user_namespace *user_ns; /* The user namespace for this mount */ struct net *net_ns; /* The network namespace for this mount */ @@ -69,9 +70,11 @@ struct sb_config_operations { int (*get_tree)(struct sb_config *sc); }; -extern struct sb_config *vfs_new_sb_config(const char *fs_name); +extern struct sb_config *vfs_new_sb_config(const char *fs_name, + struct container *c); extern struct sb_config *__vfs_new_sb_config(struct file_system_type *fs_type, struct super_block *src_sb, + struct container *c, unsigned int ms_flags, enum sb_config_purpose purpose); extern struct sb_config *vfs_sb_reconfig(struct vfsmount *mnt, diff --git a/kernel/container.c b/kernel/container.c index d5849c07a76b..5ebbf548f01a 100644 --- a/kernel/container.c +++ b/kernel/container.c @@ -31,6 +31,7 @@ struct container init_container = { .cred = &init_cred, .ns = &init_nsproxy, .init = &init_task, + .pid_ns = &init_pid_ns, .members.next = &init_task.container_link, .members.prev = &init_task.container_link, .children = LIST_HEAD_INIT(init_container.children), @@ -52,6 +53,8 @@ void put_container(struct container *c) while (c && refcount_dec_and_test(&c->usage)) { BUG_ON(!list_empty(&c->members)); + if (c->pid_ns) + put_pid_ns(c->pid_ns); if (c->ns) put_nsproxy(c->ns); path_put(&c->root); @@ -491,6 +494,7 @@ static struct container *create_container(const char *name, unsigned int flags) } c->ns = ns; + c->pid_ns = get_pid_ns(c->ns->pid_ns_for_children); c->root = fs->root; c->seq = fs->seq; fs->root.mnt = NULL; diff --git a/kernel/fork.c b/kernel/fork.c index 
d185c13820d7..68cd7367fcd5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1764,7 +1764,7 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cleanup_io; if (pid != &init_struct_pid) { - pid = alloc_pid(p->nsproxy->pid_ns_for_children); + pid = alloc_pid(p->nsproxy->pid_ns_for_children, container); if (IS_ERR(pid)) { retval = PTR_ERR(pid); goto bad_fork_cleanup_thread; diff --git a/kernel/pid.c b/kernel/pid.c index fd1cde1e4576..adc65cdc2613 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -293,7 +293,7 @@ void free_pid(struct pid *pid) call_rcu(&pid->rcu, delayed_put_pid); } -struct pid *alloc_pid(struct pid_namespace *ns) +struct pid *alloc_pid(struct pid_namespace *ns, struct container *container) { struct pid *pid; enum pid_type type; @@ -321,7 +321,7 @@ struct pid *alloc_pid(struct pid_namespace *ns) } if (unlikely(is_child_reaper(pid))) { - if (pid_ns_prepare_proc(ns)) { + if (pid_ns_prepare_proc(ns, container)) { disable_pid_allocation(ns); goto out_free; }