Controllers set to bypass mode in the parent's "cgroup.subtree_control" can now be optionally enabled by writing the controller name with the '+' prefix to "cgroup.controllers". Using the '#' prefix will reset it back to the bypass state. This capability increases the flexibility each controller has in shaping the effective cgroup hierarchy to best suit its needs. Signed-off-by: Waiman Long <longman@xxxxxxxxxx> --- Documentation/cgroup-v2.txt | 23 +++++++++- include/linux/cgroup-defs.h | 7 +++ kernel/cgroup/cgroup.c | 109 ++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 134 insertions(+), 5 deletions(-) diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index f17a74b..efb69c4 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -395,6 +395,18 @@ prefixed controller interface files from C and D. This means that the controller interface files - anything which doesn't start with "cgroup." are owned by the parent rather than the cgroup itself. +Once a controller is put into bypass mode in "cgroup.subtree_control", +the cgroup's children can optionally enable this controller by writing +the controller name with the '+' prefix into "cgroup.controllers". +In this case, the controller interface files are considered to be +owned by the child cgroup itself, not by its parent. Therefore, +setting the bypass mode in "cgroup.subtree_control" means delegating +the authority of enabling or disabling the controller interface files +to its children. Writing the controller name with the '#' prefix into +"cgroup.controllers" resets the state back to bypass mode. The state +of a controller cannot be changed if it is enabled or bypassed in its +"cgroup.subtree_control". + Cgroup Hierarchy ~~~~~~~~~~~~~~~~ @@ -859,11 +871,18 @@ All cgroup core files are prefixed with "cgroup." should be granted along with the containing directory. 
cgroup.controllers - A read-only space separated values file which exists on all + A read-write space separated values file which exists on all cgroups. It shows space separated list of all controllers available to - the cgroup. The controllers are not ordered. + the cgroup. Controller names with '#' prefix are in bypass + mode. The controllers are not ordered. + + When a controller is set into bypass mode in its parent's + "cgroup.subtree_control", its name prefixed with '+' or '#' + can be written to enable it or reset it back to bypass mode + respectively. Controllers not in bypass mode are not allowed + to be written. cgroup.subtree_control A read-write space separated values file which exists on all diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 3cac6d0..25c2ac8 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -308,6 +308,13 @@ struct cgroup { u16 old_subtree_ss_mask; u16 old_subtree_bypass; + /* + * The bitmask of subsystems that are set in its parent's + * ->subtree_bypass and explicitly enabled in this cgroup. 
+ */ + u16 enable_ss_mask; + u16 old_enable_ss_mask; + /* Private pointers for each registered subsystem */ struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1e7feae..358d8b3 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -420,7 +420,7 @@ static u16 cgroup_control(struct cgroup *cgrp, bool show_bypass) u16 root_ss_mask = cgrp->root->subsys_mask; if (parent) { - u16 ss_mask = parent->subtree_control; + u16 ss_mask = parent->subtree_control|cgrp->enable_ss_mask; if (show_bypass) ss_mask |= parent->subtree_bypass; @@ -443,7 +443,7 @@ static u16 cgroup_ss_mask(struct cgroup *cgrp, bool show_bypass) struct cgroup *parent = cgroup_parent(cgrp); if (parent) { - u16 ss_mask = parent->subtree_ss_mask; + u16 ss_mask = parent->subtree_ss_mask|cgrp->enable_ss_mask; if (show_bypass) @@ -2811,6 +2811,7 @@ static void cgroup_save_control(struct cgroup *cgrp) dsct->old_subtree_control = dsct->subtree_control; dsct->old_subtree_ss_mask = dsct->subtree_ss_mask; dsct->old_subtree_bypass = dsct->subtree_bypass; + dsct->old_enable_ss_mask = dsct->enable_ss_mask; } } @@ -2854,6 +2855,7 @@ static void cgroup_restore_control(struct cgroup *cgrp) dsct->subtree_control = dsct->old_subtree_control; dsct->subtree_ss_mask = dsct->old_subtree_ss_mask; dsct->subtree_bypass = dsct->old_subtree_bypass; + dsct->enable_ss_mask = dsct->old_enable_ss_mask; } } @@ -3124,7 +3126,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, cgroup_for_each_live_child(child, cgrp) - child_enable |= child->subtree_control|child->subtree_bypass; + child_enable |= child->subtree_control|child->subtree_bypass| + child->enable_ss_mask; /* * Cannot change the state of a controller if enabled in children. 
@@ -3157,6 +3160,105 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, return ret ?: nbytes; } +/* + * Change bypass status of controllers for a cgroup in the default hierarchy. + */ +static ssize_t cgroup_controllers_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + u16 enable = 0, bypass = 0; + struct cgroup *cgrp, *parent; + struct cgroup_subsys *ss; + char *tok; + int ssid, ret; + + /* + * Parse input - space separated list of subsystem names prefixed + * with either + or #. + */ + buf = strstrip(buf); + while ((tok = strsep(&buf, " "))) { + if (tok[0] == '\0') + continue; + do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) { + if (!cgroup_ssid_enabled(ssid) || + strcmp(tok + 1, ss->name)) + continue; + + if (*tok == '+') { + enable |= 1 << ssid; + bypass &= ~(1 << ssid); + } else if (*tok == '#') { + bypass |= 1 << ssid; + enable &= ~(1 << ssid); + } else { + return -EINVAL; + } + break; + } while_each_subsys_mask(); + if (ssid == CGROUP_SUBSYS_COUNT) + return -EINVAL; + } + + cgrp = cgroup_kn_lock_live(of->kn, true); + if (!cgrp) + return -ENODEV; + + /* + * Write to root cgroup's controllers file is not allowed. + */ + parent = cgroup_parent(cgrp); + if (!parent) { + ret = -EINVAL; + goto out_unlock; + } + + /* + * Only controllers set into bypass mode in the parent cgroup + * can be specified here. + */ + if (~parent->subtree_bypass & (enable|bypass)) { + ret = -ENOENT; + goto out_unlock; + } + + /* + * Mask off irrelevant bits. + */ + enable &= ~cgrp->enable_ss_mask; + bypass &= cgrp->enable_ss_mask; + + if (!(enable|bypass)) { + ret = 0; + goto out_unlock; + } + + /* + * We cannot change the bypass state of a controller that is enabled + * in subtree_control. 
+ */ + if ((cgrp->subtree_control|cgrp->subtree_bypass) & (enable|bypass)) { + ret = -EBUSY; + goto out_unlock; + } + + /* Save and update control masks and prepare csses */ + cgroup_save_control(cgrp); + + cgrp->enable_ss_mask |= enable; + cgrp->enable_ss_mask &= ~bypass; + + ret = cgroup_apply_control(cgrp); + cgroup_finalize_control(cgrp, ret); + kernfs_activate(cgrp->kn); + ret = 0; + +out_unlock: + cgroup_kn_unlock(of->kn); + return ret ?: nbytes; +} + static int cgroup_enable_threaded(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); @@ -4322,6 +4424,7 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of, { .name = "cgroup.controllers", .seq_show = cgroup_controllers_show, + .write = cgroup_controllers_write, }, { .name = "cgroup.subtree_control", -- 1.8.3.1 -- To unsubscribe from this list: send the line "unsubscribe cgroups" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html