Allow unprivileged processes to control subtrees of their associated cgroups, a necessary feature if a rootless container wishes to take advantage of cgroups for its own processes. As cgroups are hierarchical, having the ability to set limits in a subtree does not preclude the ability to modify the limits imposed by parent cgroups. In addition, in the default hierarchy a process must have write access to the cgroup.procs file of the common ancestor of the two (src and dest) cgroups in order to migrate a task between them. This makes this change safe against cgroup escape. There isn't a way to disable this at the moment. Signed-off-by: Aleksa Sarai <asarai@xxxxxxx> --- kernel/cgroup.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 120 insertions(+), 23 deletions(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f1c798b69561..f455488dc899 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -62,6 +62,7 @@ #include <linux/proc_ns.h> #include <linux/nsproxy.h> #include <linux/proc_ns.h> +#include <linux/time.h> #include <net/sock.h> /* @@ -5269,34 +5270,40 @@ out_destroy: return ERR_PTR(ret); } -static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, - umode_t mode) +/** + * cgroup_create_subtree - creates a new subtree of a cgroup + * @parent: the parent cgroup to create the subtree under + * @name: the name of the cgroup in kernfs + * @mode: the mode of the cgroup in kernfs + * + * Creates a new cgroup under the given @parent, with the given @name and @mode. + * The caller must hold cgroup_mutex, and must not be under active protection of + * kernfs. 
+ */ +static struct cgroup *cgroup_create_subtree(struct cgroup *parent, + const char *name, umode_t mode) { - struct cgroup *parent, *cgrp; + struct cgroup *child; struct kernfs_node *kn; - int ret; + int ret = 0; + + lockdep_assert_held(&cgroup_mutex); /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */ if (strchr(name, '\n')) - return -EINVAL; - - parent = cgroup_kn_lock_live(parent_kn, false); - if (!parent) - return -ENODEV; + return ERR_PTR(-EINVAL); - cgrp = cgroup_create(parent); - if (IS_ERR(cgrp)) { - ret = PTR_ERR(cgrp); - goto out_unlock; - } + child = cgroup_create(parent); + if (IS_ERR(child)) + return child; /* create the directory */ - kn = kernfs_create_dir(parent->kn, name, mode, cgrp); + kn = kernfs_create_dir(parent->kn, name, mode, child); if (IS_ERR(kn)) { ret = PTR_ERR(kn); goto out_destroy; } - cgrp->kn = kn; + child->kn = kn; /* * This extra ref will be put in cgroup_free_fn() and guarantees @@ -5308,22 +5315,51 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (ret) goto out_destroy; - ret = css_populate_dir(&cgrp->self); + ret = css_populate_dir(&child->self); if (ret) goto out_destroy; - ret = cgroup_apply_control_enable(cgrp); + ret = cgroup_apply_control_enable(child); if (ret) goto out_destroy; /* let's create and online css's */ kernfs_activate(kn); - ret = 0; - goto out_unlock; + return child; out_destroy: - cgroup_destroy_locked(cgrp); + cgroup_destroy_locked(child); + return ERR_PTR(ret); +} + +/* + * cgroup directories starting with this prefix are forbidden from being created + * from userspace. This prefix is used internally to make sure that there are no + * conflicts with userspace when creating cgroups inside copy_cgroup_ns(). 
+ */ +#define CGROUPNS_INTERNAL_PREFIX ".__cgroupns_subtree:" + +static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) +{ + struct cgroup *parent, *cgrp; + int ret = 0; + + if (strncmp(CGROUPNS_INTERNAL_PREFIX, name, + strlen(CGROUPNS_INTERNAL_PREFIX)) == 0) + return -EINVAL; + + parent = cgroup_kn_lock_live(parent_kn, false); + if (!parent) + return -ENODEV; + + cgrp = cgroup_create_subtree(parent, name, mode); + if (IS_ERR(cgrp)) { + ret = PTR_ERR(cgrp); + goto out_unlock; + } + out_unlock: cgroup_kn_unlock(parent_kn); return ret; @@ -6298,7 +6334,9 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, struct cgroup_namespace *old_ns) { struct cgroup_namespace *new_ns; + struct cgroup_root *root; struct css_set *cset; + char id[16], id_string[1+2*ARRAY_SIZE(id)] = {0}; BUG_ON(!old_ns); @@ -6311,12 +6349,71 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); + /* + * In order to make sure that the dirname we create is unique, we use a + * random id for all of the subtrees. The ID is the same to reduce + * confusion when reading /proc/<pid>/cgroup. + */ + get_random_bytes(id, ARRAY_SIZE(id)); + bin2hex(id_string, id, ARRAY_SIZE(id)); + + /* + * Create a new subtree in every cgroup the task is associated with. + * The cgroup is owned by the task uid and gid, to allow for management + * of subtrees in cgroup namespaces. This is safe because: + * + * 1. cgroups are hierarchical, so having the ability to set limits in + * a subtree does not preclude the ability to modify the limits + * imposed by parent cgroups. + * + * 2. cgroup_procs_write_permission() does checks to ensure that a + * task cannot move other tasks into its cgroup unless they are both + * running as the same user (or the task moving the process has + * CAP_SYS_ADMIN in the user namespace of the process being moved). 
+ * This means that a misbehaving process can't start messing around + * with other processes' cgroup associations. + * + * 3. On the default hierarchy, you cannot migrate a process to a + * non-descendant cgroup unless you have write access to the + * cgroup.procs file in the common ancestor of the two cgroups. This + * means that two cooperative processes in the default hierarchy + * can't move processes between their cgroups (if the admin + * disallows it). Unfortunately, this functionality doesn't exist in + * the other hierarchies (for backwards compatibility reasons). + * However, this requirement isn't as important as the previous two. + */ mutex_lock(&cgroup_mutex); - spin_lock_bh(&css_set_lock); + for_each_root(root) { + struct cgroup *parent, *child; + char namebuf[CGROUP_FILE_NAME_MAX]; + bool is_dfl = cgroup_on_dfl(&root->cgrp); + + spin_lock_bh(&css_set_lock); + parent = task_cgroup_from_root(current, root); + spin_unlock_bh(&css_set_lock); + + snprintf(namebuf, CGROUP_FILE_NAME_MAX, + CGROUPNS_INTERNAL_PREFIX "%s", id_string); + + /* This should not fail, since we're under &cgroup_mutex. */ + child = cgroup_create_subtree(parent, namebuf, 0755); + if (WARN_ON(IS_ERR(child))) + continue; + /* + * Move the task to the new cgroup, which is owned by the user. + * Should never fail, since we're under &cgroup_mutex here. + */ + rcu_read_lock(); + if (WARN_ON(cgroup_attach_task(child, current, is_dfl))) + cgroup_destroy_locked(child); + rcu_read_unlock(); + + } + + spin_lock_bh(&css_set_lock); cset = task_css_set(current); get_css_set(cset); - spin_unlock_bh(&css_set_lock); mutex_unlock(&cgroup_mutex); -- 2.8.2 -- To unsubscribe from this list: send the line "unsubscribe cgroups" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html