Cpuset v2 has some valuable attributes that are not present in v1 because of backward compatibility concern. One of that is the restoration of the original cpu and memory node mask after a hot removal and addition event sequence. This patch adds a new kernel command line option "cpuset_v2_mode=" to allow cpuset to have v2 behavior in a v1 cgroup when using a non-zero value. This enables users to optionally enable cpuset v2 behavior if they choose to. Signed-off-by: Waiman Long <longman@xxxxxxxxxx> --- Documentation/admin-guide/kernel-parameters.txt | 7 ++++ kernel/cgroup/cpuset.c | 46 ++++++++++++++++++------- 2 files changed, 40 insertions(+), 13 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 372cc66..7abca11 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -668,6 +668,13 @@ on every CPU online, such as boot, and resume from suspend. Default: 10000 + cpuset_v2_mode= [KNL] Enable cpuset v2 behavior in cpuset v1 cgroups. + In v2 mode, the cpus and mems can be restored to + their original values after a removal-addition + event sequence. + 0: default value, cpuset v1 keeps legacy behavior. + 1: cpuset v1 behaves like cpuset v2. + cpcihp_generic= [HW,PCI] Generic port I/O CompactPCI driver Format: <first_slot>,<last_slot>,<port>,<enum_bit>[,<debug>] diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 9ed6a05..7e56383 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -300,6 +300,17 @@ static inline int is_spread_slab(const struct cpuset *cs) static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); /* + * Flag to make cpuset v1 behaves like v2 so that cpus and mems can be + * restored back to their original values after an offline-online sequence. + */ +static bool cpuset_v2_mode __read_mostly; + +static inline bool is_in_v2_mode(void) +{ + return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || cpuset_v2_mode; +} + +/* * This is ugly, but preserves the userspace API for existing cpuset * users. If someone tries to mount the "cpuset" filesystem, we * silently switch it to mount "cgroup" instead @@ -489,8 +500,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) /* On legacy hiearchy, we must be a subset of our parent cpuset. */ ret = -EACCES; - if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - !is_cpuset_subset(trial, par)) + if (!is_in_v2_mode() && !is_cpuset_subset(trial, par)) goto out; /* @@ -903,8 +913,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some CPUs. */ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - cpumask_empty(new_cpus)) + if (is_in_v2_mode() && cpumask_empty(new_cpus)) cpumask_copy(new_cpus, parent->effective_cpus); /* Skip the whole subtree if the cpumask remains the same. */ @@ -921,7 +930,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) cpumask_copy(cp->effective_cpus, new_cpus); spin_unlock_irq(&callback_lock); - WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + WARN_ON(!is_in_v2_mode() && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); update_tasks_cpumask(cp); @@ -1157,8 +1166,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some MEMs. */ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - nodes_empty(*new_mems)) + if (is_in_v2_mode() && nodes_empty(*new_mems)) *new_mems = parent->effective_mems; /* Skip the whole subtree if the nodemask remains the same. */ @@ -1175,7 +1183,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) cp->effective_mems = *new_mems; spin_unlock_irq(&callback_lock); - WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + WARN_ON(!is_in_v2_mode() && !nodes_equal(cp->mems_allowed, cp->effective_mems)); update_tasks_nodemask(cp); @@ -1467,7 +1475,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) /* allow moving tasks into an empty cpuset if on default hierarchy */ ret = -ENOSPC; - if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + if (!is_in_v2_mode() && (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; @@ -1985,7 +1993,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpuset_inc(); spin_lock_irq(&callback_lock); - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { + if (is_in_v2_mode()) { cpumask_copy(cs->effective_cpus, parent->effective_cpus); cs->effective_mems = parent->effective_mems; } @@ -2062,7 +2070,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) mutex_lock(&cpuset_mutex); spin_lock_irq(&callback_lock); - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { + if (is_in_v2_mode()) { cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); top_cpuset.mems_allowed = node_possible_map; } else { @@ -2135,6 +2143,18 @@ int __init cpuset_init(void) return 0; } +static int __init set_v2_mode(char *str) +{ + ssize_t ret, val; + + if (!str) + return 0; + ret = kstrtol(str, 0, &val); + cpuset_v2_mode = !!val; + return ret; +} +__setup("cpuset_v2_mode=", set_v2_mode); + /* * If CPU and/or memory hotplug handlers, below, unplug any CPUs * or memory nodes, we need to walk over the cpuset hierarchy, @@ -2256,7 +2276,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs) cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); mems_updated = !nodes_equal(new_mems, cs->effective_mems); - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) + if (is_in_v2_mode()) hotplug_update_tasks(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); else @@ -2287,7 +2307,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) static cpumask_t new_cpus; static nodemask_t new_mems; bool cpus_updated, mems_updated; - bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys); + bool on_dfl = is_in_v2_mode(); mutex_lock(&cpuset_mutex); -- 1.8.3.1 -- To unsubscribe from this list: send the line "unsubscribe linux-doc" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html