Quoting Tejun Heo (tj@xxxxxxxxxx): > Currently, cgroup hierarchy support is a mess. cpu related subsystems > behave correctly - configuration, accounting and control on a parent > properly cover its children. blkio and freezer completely ignore > hierarchy and treat all cgroups as if they're directly under the root > cgroup. Others show yet different behaviors. > > These differing interpretations of cgroup hierarchy make using cgroup > confusing and it impossible to co-mount controllers into the same > hierarchy and obtain sane behavior. > > Eventually, we want full hierarchy support from all subsystems and > probably a unified hierarchy. Users using separate hierarchies > expecting completely different behaviors depending on the mounted > subsystem is deterimental to making any progress on this front. > > This patch adds cgroup_subsys.broken_hierarchy and sets it to %true > for controllers which are lacking in hierarchy support. The goal of > this patch is two-fold. > > * Move users away from using hierarchy on currently non-hierarchical > subsystems, so that implementing proper hierarchy support on those > doesn't surprise them. > > * Keep track of which controllers are broken how and nudge the > subsystems to implement proper hierarchy support. > > For now, start with a single warning message. We can whine louder > later on. > > v2: Fixed a typo spotted by Michal. Warning message updated. > > v3: Updated memcg part so that it doesn't generate warning in the > cases where .use_hierarchy=false doesn't make the behavior > different from root.use_hierarchy=true. Fixed a typo spotted by > Glauber. > > v4: Check ->broken_hierarchy after cgroup creation is complete so that > ->create() can affect the result per Michal. Dropped unnecessary > memcg root handling per Michal. > > Signed-off-by: Tejun Heo <tj@xxxxxxxxxx> > Cc: Michal Hocko <mhocko@xxxxxxx> > Cc: Li Zefan <lizefan@xxxxxxxxxx> > Cc: Glauber Costa <glommer@xxxxxxxxxxxxx> > Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx> > Cc: Paul Turner <pjt@xxxxxxxxxx> > Cc: Johannes Weiner <hannes@xxxxxxxxxxx> > Cc: Thomas Graf <tgraf@xxxxxxx> > Cc: Serge E. Hallyn <serue@xxxxxxxxxx> Acked-by: Serge E. Hallyn <serge.hallyn@xxxxxxxxxx> > Cc: Vivek Goyal <vgoyal@xxxxxxxxxx> > Cc: Paul Mackerras <paulus@xxxxxxxxx> > Cc: Ingo Molnar <mingo@xxxxxxxxxx> > Cc: Arnaldo Carvalho de Melo <acme@xxxxxxxxxxxxxxxxxx> > Cc: Neil Horman <nhorman@xxxxxxxxxxxxx> > Cc: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxxxxxxx> > --- > block/blk-cgroup.c | 8 ++++++++ > include/linux/cgroup.h | 15 +++++++++++++++ > kernel/cgroup.c | 12 +++++++++++- > kernel/cgroup_freezer.c | 8 ++++++++ > kernel/events/core.c | 7 +++++++ > mm/memcontrol.c | 7 +++++++ > net/core/netprio_cgroup.c | 12 +++++++++++- > net/sched/cls_cgroup.c | 9 +++++++++ > security/device_cgroup.c | 9 +++++++++ > 9 files changed, 85 insertions(+), 2 deletions(-) > > --- a/block/blk-cgroup.c > +++ b/block/blk-cgroup.c > @@ -737,6 +737,14 @@ struct cgroup_subsys blkio_subsys = { > .subsys_id = blkio_subsys_id, > .base_cftypes = blkcg_files, > .module = THIS_MODULE, > + > + /* > + * blkio subsystem is utterly broken in terms of hierarchy support. > + * It treats all cgroups equally regardless of where they're > + * located in the hierarchy - all cgroups are treated as if they're > + * right below the root. Fix it and remove the following. > + */ > + .broken_hierarchy = true, > }; > EXPORT_SYMBOL_GPL(blkio_subsys); > > --- a/include/linux/cgroup.h > +++ b/include/linux/cgroup.h > @@ -503,6 +503,21 @@ struct cgroup_subsys { > */ > bool __DEPRECATED_clear_css_refs; > > + /* > + * If %false, this subsystem is properly hierarchical - > + * configuration, resource accounting and restriction on a parent > + * cgroup cover those of its children. If %true, hierarchy support > + * is broken in some ways - some subsystems ignore hierarchy > + * completely while others are only implemented half-way. > + * > + * It's now disallowed to create nested cgroups if the subsystem is > + * broken and cgroup core will emit a warning message on such > + * cases. Eventually, all subsystems will be made properly > + * hierarchical and this will go away. > + */ > + bool broken_hierarchy; > + bool warned_broken_hierarchy; > + > #define MAX_CGROUP_TYPE_NAMELEN 32 > const char *name; > > --- a/kernel/cgroup.c > +++ b/kernel/cgroup.c > @@ -4075,8 +4075,9 @@ static long cgroup_create(struct cgroup > set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); > > for_each_subsys(root, ss) { > - struct cgroup_subsys_state *css = ss->create(cgrp); > + struct cgroup_subsys_state *css; > > + css = ss->create(cgrp); > if (IS_ERR(css)) { > err = PTR_ERR(css); > goto err_destroy; > @@ -4090,6 +4091,15 @@ static long cgroup_create(struct cgroup > /* At error, ->destroy() callback has to free assigned ID. */ > if (clone_children(parent) && ss->post_clone) > ss->post_clone(cgrp); > + > + if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && > + parent->parent) { > + pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", > + current->comm, current->pid, ss->name); > + if (!strcmp(ss->name, "memory")) > + pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); > + ss->warned_broken_hierarchy = true; > + } > } > > list_add(&cgrp->sibling, &cgrp->parent->children); > --- a/kernel/cgroup_freezer.c > +++ b/kernel/cgroup_freezer.c > @@ -373,4 +373,12 @@ struct cgroup_subsys freezer_subsys = { > .can_attach = freezer_can_attach, > .fork = freezer_fork, > .base_cftypes = files, > + > + /* > + * freezer subsys doesn't handle hierarchy at all. Frozen state > + * should be inherited through the hierarchy - if a parent is > + * frozen, all its children should be frozen. Fix it and remove > + * the following. > + */ > + .broken_hierarchy = true, > }; > --- a/kernel/events/core.c > +++ b/kernel/events/core.c > @@ -7285,5 +7285,12 @@ struct cgroup_subsys perf_subsys = { > .destroy = perf_cgroup_destroy, > .exit = perf_cgroup_exit, > .attach = perf_cgroup_attach, > + > + /* > + * perf_event cgroup doesn't handle nesting correctly. > + * ctx->nr_cgroups adjustments should be propagated through the > + * cgroup hierarchy. Fix it and remove the following. > + */ > + .broken_hierarchy = true, > }; > #endif /* CONFIG_CGROUP_PERF */ > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -4973,6 +4973,13 @@ mem_cgroup_create(struct cgroup *cont) > } else { > res_counter_init(&memcg->res, NULL); > res_counter_init(&memcg->memsw, NULL); > + /* > + * Deeper hierachy with use_hierarchy == false doesn't make > + * much sense so let cgroup subsystem know about this > + * unfortunate state in our controller. > + */ > + if (parent && parent != root_mem_cgroup) > + mem_cgroup_subsys.broken_hierarchy = true; > } > memcg->last_scanned_node = MAX_NUMNODES; > INIT_LIST_HEAD(&memcg->oom_notify); > --- a/net/core/netprio_cgroup.c > +++ b/net/core/netprio_cgroup.c > @@ -330,7 +330,17 @@ struct cgroup_subsys net_prio_subsys = { > .subsys_id = net_prio_subsys_id, > #endif > .base_cftypes = ss_files, > - .module = THIS_MODULE > + .module = THIS_MODULE, > + > + /* > + * net_prio has artificial limit on the number of cgroups and > + * disallows nesting making it impossible to co-mount it with other > + * hierarchical subsystems. Remove the artificially low PRIOIDX_SZ > + * limit and properly nest configuration such that children follow > + * their parents' configurations by default and are allowed to > + * override and remove the following. > + */ > + .broken_hierarchy = true, > }; > > static int netprio_device_event(struct notifier_block *unused, > --- a/net/sched/cls_cgroup.c > +++ b/net/sched/cls_cgroup.c > @@ -82,6 +82,15 @@ struct cgroup_subsys net_cls_subsys = { > #endif > .base_cftypes = ss_files, > .module = THIS_MODULE, > + > + /* > + * While net_cls cgroup has the rudimentary hierarchy support of > + * inheriting the parent's classid on cgroup creation, it doesn't > + * properly propagates config changes in ancestors to their > + * descendents. A child should follow the parent's configuration > + * but be allowed to override it. Fix it and remove the following. > + */ > + .broken_hierarchy = true, > }; > > struct cls_cgroup_head { > --- a/security/device_cgroup.c > +++ b/security/device_cgroup.c > @@ -457,6 +457,15 @@ struct cgroup_subsys devices_subsys = { > .destroy = devcgroup_destroy, > .subsys_id = devices_subsys_id, > .base_cftypes = dev_cgroup_files, > + > + /* > + * While devices cgroup has the rudimentary hierarchy support which > + * checks the parent's restriction, it doesn't properly propagates > + * config changes in ancestors to their descendents. A child > + * should only be allowed to add more restrictions to the parent's > + * configuration. Fix it and remove the following. > + */ > + .broken_hierarchy = true, > }; > > int __devcgroup_inode_permission(struct inode *inode, int mask) > _______________________________________________ > Containers mailing list > Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx > https://lists.linuxfoundation.org/mailman/listinfo/containers _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linuxfoundation.org/mailman/listinfo/containers