Paul Menage wrote: > [RFC] Support multiply-bindable cgroup subsystems > > This patch allows a cgroup subsystem to be marked as bindable on > multiple cgroup hierarchies independently, when declared in > cgroup_subsys.h via MULTI_SUBSYS() rather than SUBSYS(). > > The state for such subsystems cannot be accessed directly from a > task->cgroups (since there's no unique mapping for a task) but instead > must be accessed via a particular control group object. > > Multiply-bound subsystems are useful in cases where there's no direct > correspondence between the cgroup configuration and some property of > the kernel outside of the cgroups subsystem. So this would not be > applicable to e.g. the CFS cgroup, since there has to a unique mapping > from a task to its CFS run queue. > > As an example, the "debug" subsystem is marked multiply-bindable, > since it has no state outside the cgroups framework itself. > > Example usage: > > mount -t cgroup -o name=foo,debug,cpu cgroup /mnt1 > mount -t cgroup -o name=bar,debug,memory cgroup /mnt2 > > Open issues: > > - what should /proc/cgroup show for multiply-bindable subsystems? > Currently it shows just one entry for each hierarchy the subsystem > is bound to, which means the subsystem doesn't show up when unbound, > and is indistinguishable from a regular (singleton) subsystem when > singly-bound. > > - should the bind() callback be extended to support passing more > information to multiply-bindable subsystems? Or should we just scrap > the bind() callback since nothing appears to be using it. > I agree to scrap it. > - how to stop checkpatch.pl complaining about the way the > SUBSYS/MULTI_SUBSYS macros as used to define the cgroup_subsys_id > enum in cgroup.h? > I think we can ignore it. > Signed-off-by: Paul Menage <menage@xxxxxxxxxx> > > --- > > include/linux/cgroup.h | 37 ++++++++++- > include/linux/cgroup_subsys.h | 2 - > kernel/cgroup.c | 137 ++++++++++++++++++++++++++--------------- > 3 files changed, 123 insertions(+), 53 deletions(-) > > diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h > index adf6739..9b5999d 100644 > --- a/include/linux/cgroup.h > +++ b/include/linux/cgroup.h > @@ -39,10 +39,37 @@ extern int cgroupstats_build(struct cgroupstats *stats, > > extern struct file_operations proc_cgroup_operations; > > -/* Define the enumeration of all cgroup subsystems */ > -#define SUBSYS(_x) _x ## _subsys_id, > +/* Define the enumeration of all cgroup subsystems. There are two /* * xxx */ > + * kinds of subsystems: > + * > + * - regular (singleton) subsystems can be only mounted once; these > + * generally correspond to some system that has substantial state > + * outside of the cgroups framework, or where the state is being used > + * to control some external behaviour (e.g. CPU scheduler). Every > + * task has an associated state pointer (accessed efficiently through > + * task->cgroups) for each singleton subsystem. > + * > + * - multiply-bound subsystems may be mounted on zero or more > + * hierarchies. Since there's no unique mapping from a task to a > + * subsystem state pointer for multiply-bound subsystems, the state of > + * these subsystems cannot be accessed rapidly from a task > + * pointer. These correspond to subsystems where most or all of the > + * state is maintained within the subsystem itself, and/or the > + * subsystem is used for monitoring rather than control. > + */ > enum cgroup_subsys_id { > +#define SUBSYS(_x) _x ## _subsys_id, > +#define MULTI_SUBSYS(_x) > #include <linux/cgroup_subsys.h> > +#undef SUBSYS > +#undef MULTI_SUBSYS > + CGROUP_SINGLETON_SUBSYS_COUNT, > + CGROUP_DUMMY_ENUM_RESET = CGROUP_SINGLETON_SUBSYS_COUNT - 1, > +#define SUBSYS(_x) > +#define MULTI_SUBSYS(_x) _x ## _subsys_id, > +#include <linux/cgroup_subsys.h> > +#undef SUBSYS > +#undef MULTI_SUBSYS > CGROUP_SUBSYS_COUNT > }; > #undef SUBSYS > @@ -231,7 +258,7 @@ struct css_set { > * time). Multi-subsystems don't have an entry in here since > * there's no unique state for a given task. > */ > - struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; > + struct cgroup_subsys_state *subsys[CGROUP_SINGLETON_SUBSYS_COUNT]; > }; > > /* > @@ -419,8 +446,10 @@ struct cgroup_subsys { > }; > > #define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys; > +#define MULTI_SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys; > #include <linux/cgroup_subsys.h> > #undef SUBSYS > +#undef MULTI_SUBSYS > > static inline struct cgroup_subsys_state *cgroup_subsys_state( > struct cgroup *cgrp, int subsys_id) > @@ -431,6 +460,8 @@ static inline struct cgroup_subsys_state *cgroup_subsys_state( > static inline struct cgroup_subsys_state *task_subsys_state( > struct task_struct *task, int subsys_id) > { > + /* This check is optimized out for most callers */ > + BUG_ON(subsys_id >= CGROUP_SINGLETON_SUBSYS_COUNT); > return rcu_dereference(task->cgroups->subsys[subsys_id]); > } > > diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h > index 9c8d31b..f78605e 100644 > --- a/include/linux/cgroup_subsys.h > +++ b/include/linux/cgroup_subsys.h > @@ -14,7 +14,7 @@ SUBSYS(cpuset) > /* */ > > #ifdef CONFIG_CGROUP_DEBUG > -SUBSYS(debug) > +MULTI_SUBSYS(debug) > #endif > > /* */ > diff --git a/kernel/cgroup.c b/kernel/cgroup.c > index fcb8181..942a951 100644 > --- a/kernel/cgroup.c > +++ b/kernel/cgroup.c > @@ -52,10 +52,18 @@ > static DEFINE_MUTEX(cgroup_mutex); > > /* Generate an array of cgroup subsystem pointers */ > -#define SUBSYS(_x) &_x ## _subsys, > > static struct cgroup_subsys *subsys[] = { > +#define SUBSYS(_x) &_x ## _subsys, > +#define MULTI_SUBSYS(_x) > +#include <linux/cgroup_subsys.h> > +#undef SUBSYS > +#undef MULTI_SUBSYS > +#define SUBSYS(_x) > +#define MULTI_SUBSYS(_x) &_x ## _subsys, > #include <linux/cgroup_subsys.h> > +#undef SUBSYS > +#undef MULTI_SUBSYS > }; > > /* > @@ -265,7 +273,7 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) > int index; > unsigned long tmp = 0UL; > > - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) > + for (i = 0; i < CGROUP_SINGLETON_SUBSYS_COUNT; i++) > tmp += (unsigned long)css[i]; > tmp = (tmp >> 16) ^ tmp; > > @@ -435,7 +443,7 @@ static struct css_set *find_existing_css_set( > > /* Build the set of subsystem state objects that we want to > * see in the new css_set */ > - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { > + for (i = 0; i < CGROUP_SINGLETON_SUBSYS_COUNT; i++) { > if (root->subsys_bits & (1UL << i)) { > /* Subsystem is in this hierarchy. So we want > * the subsystem state from the new > @@ -529,7 +537,7 @@ static struct css_set *find_css_set( > struct css_set *oldcg, struct cgroup *cgrp) > { > struct css_set *res; > - struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; > + struct cgroup_subsys_state *template[CGROUP_SINGLETON_SUBSYS_COUNT]; > > struct list_head tmp_cg_links; > > @@ -853,6 +861,20 @@ static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp) > wake_up_all(&cgroup_rmdir_waitq); > } > > +static void init_cgroup_css(struct cgroup_subsys_state *css, > + struct cgroup_subsys *ss, > + struct cgroup *cgrp) > +{ > + css->cgroup = cgrp; > + atomic_set(&css->refcnt, 1); > + css->flags = 0; > + css->id = NULL; > + if (cgrp == &cgrp->root->top_cgroup) > + set_bit(CSS_ROOT, &css->flags); > + BUG_ON(cgrp->subsys[ss->subsys_id]); > + cgrp->subsys[ss->subsys_id] = css; > +} > + > static int rebind_subsystems(struct cgroupfs_root *root, > unsigned long final_bits) > { > @@ -863,7 +885,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, > removed_bits = root->subsys_bits & ~final_bits; > added_bits = final_bits & ~root->subsys_bits; > /* Check that any added subsystems are currently free */ > - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { > + for (i = 0; i < CGROUP_SINGLETON_SUBSYS_COUNT; i++) { > unsigned long bit = 1UL << i; > if (!(bit & added_bits)) > continue; > @@ -883,33 +905,57 @@ static int rebind_subsystems(struct cgroupfs_root *root, > /* Process each subsystem */ > for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { > struct cgroup_subsys *ss = subsys[i]; > + bool singleton = i < CGROUP_SINGLETON_SUBSYS_COUNT; > unsigned long bit = 1UL << i; > if (bit & added_bits) { > + struct cgroup_subsys_state *css; > /* We're binding this subsystem to this hierarchy */ > BUG_ON(cgrp->subsys[i]); > - BUG_ON(!dummytop->subsys[i]); > - BUG_ON(dummytop->subsys[i]->cgroup != dummytop); > - BUG_ON(!(rootnode.subsys_bits & bit)); > + if (singleton) { > + css = dummytop->subsys[i]; > + BUG_ON(!css); > + > + BUG_ON(css->cgroup != dummytop); > + BUG_ON(!(rootnode.subsys_bits & bit)); > + } else { > + BUG_ON(dummytop->subsys[i]); > + BUG_ON(rootnode.subsys_bits & bit); > + css = ss->create(ss, cgrp); > + if (IS_ERR(css)) > + return PTR_ERR(css); > + init_cgroup_css(css, ss, cgrp); > + } > mutex_lock(&ss->hierarchy_mutex); > - cgrp->subsys[i] = dummytop->subsys[i]; > - cgrp->subsys[i]->cgroup = cgrp; > + cgrp->subsys[i] = css; > + css->cgroup = cgrp; > if (ss->bind) > ss->bind(ss, cgrp); > - rootnode.subsys_bits &= ~bit; > + if (singleton) > + rootnode.subsys_bits &= ~bit; > root->subsys_bits |= bit; > mutex_unlock(&ss->hierarchy_mutex); > } else if (bit & removed_bits) { > + struct cgroup_subsys_state *css; > /* We're removing this subsystem */ > - BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); > - BUG_ON(cgrp->subsys[i]->cgroup != cgrp); > - BUG_ON(rootnode.subsys_bits & bit); > + css = cgrp->subsys[i]; > + BUG_ON(css->cgroup != cgrp); > + if (singleton) { > + BUG_ON(css != dummytop->subsys[i]); > + BUG_ON(rootnode.subsys_bits & bit); > + } > mutex_lock(&ss->hierarchy_mutex); > if (ss->bind) > ss->bind(ss, dummytop); > - dummytop->subsys[i]->cgroup = dummytop; > + if (singleton) { > + css->cgroup = dummytop; > + } else { > + /* Is this safe? */ > + ss->destroy(ss, cgrp); > + } > cgrp->subsys[i] = NULL; > root->subsys_bits &= ~bit; > - rootnode.subsys_bits |= bit; > + if (singleton) > + rootnode.subsys_bits |= bit; initially we can see all subsystems in /proc/cgroups, but: # mount -o cpu,debug xxx /mnt # remount -o cpu xxx /mnt # umount /mnt now no root including rootnode contains debug_subsys, so we won't see debug_subsys in /proc/cgroups. I think this inconsistency is wrong. > mutex_unlock(&ss->hierarchy_mutex); > } else if (bit & final_bits) { > /* Subsystem state should already exist */ > @@ -974,7 +1020,7 @@ static int parse_cgroupfs_options(char *data, > /* Add all non-disabled subsystems */ > int i; > opts->subsys_bits = 0; > - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { > + for (i = 0; i < CGROUP_SINGLETON_SUBSYS_COUNT; i++) { I don't see why only singleton subsystems are binded when mount with '-o all'. > struct cgroup_subsys *ss = subsys[i]; > if (!ss->disabled) > opts->subsys_bits |= 1ul << i; > @@ -1040,6 +1086,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) > struct cgroupfs_root *root = sb->s_fs_info; > struct cgroup *cgrp = &root->top_cgroup; > struct cgroup_sb_opts opts; > + unsigned long original_bits; > > mutex_lock(&cgrp->dentry->d_inode->i_mutex); > mutex_lock(&cgroup_mutex); > @@ -1061,9 +1108,13 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) > goto out_unlock; > } > > + original_bits = root->subsys_bits; > ret = rebind_subsystems(root, opts.subsys_bits); > - if (ret) > + if (ret) { > + int tmp = rebind_subsystems(root, original_bits); > + BUG_ON(tmp); > goto out_unlock; > + } > > /* (re)populate subsystem files */ > cgroup_populate_dir(cgrp); > @@ -1265,16 +1316,15 @@ static int cgroup_get_sb(struct file_system_type *fs_type, > } > > ret = rebind_subsystems(root, opts.subsys_bits); > - if (ret == -EBUSY) { > + if (ret) { > + int tmp = rebind_subsystems(root, 0); > + BUG_ON(tmp); > mutex_unlock(&cgroup_mutex); > mutex_unlock(&inode->i_mutex); > free_cg_links(&tmp_cg_links); > goto drop_new_super; > } > > - /* EBUSY should be the only error here */ > - BUG_ON(ret); > - > list_add(&root->root_list, &roots); > root_count++; > > @@ -2591,20 +2641,6 @@ static int cgroup_populate_dir(struct cgroup *cgrp) > return 0; > } > > -static void init_cgroup_css(struct cgroup_subsys_state *css, > - struct cgroup_subsys *ss, > - struct cgroup *cgrp) > -{ > - css->cgroup = cgrp; > - atomic_set(&css->refcnt, 1); > - css->flags = 0; > - css->id = NULL; > - if (cgrp == dummytop) > - set_bit(CSS_ROOT, &css->flags); > - BUG_ON(cgrp->subsys[ss->subsys_id]); > - cgrp->subsys[ss->subsys_id] = css; > -} > - > static void cgroup_lock_hierarchy(struct cgroupfs_root *root) > { > /* We need to take each hierarchy_mutex in a consistent order */ > @@ -2890,21 +2926,23 @@ again: > static void __init cgroup_init_subsys(struct cgroup_subsys *ss) > { > struct cgroup_subsys_state *css; > - > + bool singleton = ss->subsys_id < CGROUP_SINGLETON_SUBSYS_COUNT; > printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); > > - /* Create the top cgroup state for this subsystem */ > - css = ss->create(ss, dummytop); > - /* We don't handle early failures gracefully */ > - BUG_ON(IS_ERR(css)); > - init_cgroup_css(css, ss, dummytop); > - > - /* Update the init_css_set to contain a subsys > - * pointer to this state - since the subsystem is > - * newly registered, all tasks and hence the > - * init_css_set is in the subsystem's top cgroup. */ > - init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; > + if (singleton) { > + /* Create the top cgroup state for this subsystem */ > + css = ss->create(ss, dummytop); > + /* We don't handle early failures gracefully */ > + BUG_ON(IS_ERR(css)); > + init_cgroup_css(css, ss, dummytop); > > + /* Update the init_css_set to contain a subsys > + * pointer to this state - since the subsystem is > + * newly registered, all tasks and hence the > + * init_css_set is in the subsystem's top cgroup. */ > + init_css_set.subsys[ss->subsys_id] = css; > + rootnode.subsys_bits |= 1ULL << ss->subsys_id; > + } > need_forkexit_callback |= ss->fork || ss->exit; > > /* At system boot, before all subsystems have been > @@ -2916,7 +2954,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) > lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); > ss->active = 1; > tailing blank line > - rootnode.subsys_bits |= 1ULL << ss->subsys_id; > } > > /** > @@ -3285,6 +3322,8 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, > > /* We shouldn't be called by an unregistered subsystem */ > BUG_ON(!subsys->active); > + /* We can only support singleton subsystems */ > + BUG_ON(subsys->subsys_id >= CGROUP_SINGLETON_SUBSYS_COUNT); > > /* First figure out what hierarchy and cgroup we're dealing > * with, and pin them so we can drop cgroup_mutex */ > > _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linux-foundation.org/mailman/listinfo/containers