Handling the error returned by update_parent_subparts_cpumask() in update_cpumasks_hier() is problematic as the states may become inconsistent. To avoid that and increase flexibility in handling other error cases, a new error state (-1) is added to the partition_root_state flag. This new error state is set internally and users cannot write this value to "cpuset.sched.partition". In this error state, the partition root becomes an erroneous one. It is no longer a real partition root, but the CS_CPU_EXCLUSIVE flag will still be set as it can be changed back to a real one if an appropriate change happens later on. Signed-off-by: Waiman Long <longman@xxxxxxxxxx> --- kernel/cgroup/cpuset.c | 146 ++++++++++++++++++++++++++++++++++------- 1 file changed, 123 insertions(+), 23 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index a648a2ae851c..a1f6fdb6be02 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -153,10 +153,19 @@ struct cpuset { * Partition root states: * * 0 - not a partition root + * * 1 - partition root + * + * -1 - erroneous partition root + * None of the cpus in cpus_allowed can be put into the parent's + * subparts_cpus. In this case, the cpuset is not a real partition + * root anymore. However, the CS_CPU_EXCLUSIVE bit will still be set + * and the cpuset can be restored back to a partition root if the + * parent cpuset can give more CPUs back to this child cpuset. */ #define PRS_DISABLED 0 #define PRS_ENABLED 1 +#define PRS_ERROR -1 /* * Temporary cpumasks for working with partitions that are passed among @@ -251,7 +260,7 @@ static inline int is_spread_slab(const struct cpuset *cs) static inline int is_partition_root(const struct cpuset *cs) { - return cs->partition_root_state; + return cs->partition_root_state > 0; } static struct cpuset top_cpuset = { @@ -1021,9 +1030,12 @@ enum subparts_cmd { * * For partcmd_update, if the optional newmask is specified, the cpu * list is to be changed from cpus_allowed to newmask. 
Otherwise, - * cpus_allowed is assumed to remain the same. The function will return - * 1 if changes to parent's subparts_cpus and effective_cpus happen or 0 - * otherwise. In case of error, an error code will be returned. + * cpus_allowed is assumed to remain the same. The cpuset should either + * be a partition root or an erroneous partition root. The partition root + * state may change if newmask is NULL and none of the requested CPUs can + * be granted by the parent. The function will return 1 if changes to + * parent's subparts_cpus and effective_cpus happen or 0 otherwise. + * Error code should only be returned when newmask is non-NULL. * * The partcmd_enable and partcmd_disable commands are used by * update_prstate(). The partcmd_update command is used by @@ -1048,6 +1060,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, struct cpuset *parent = parent_cs(cpuset); int adding; /* Moving cpus from effective_cpus to subparts_cpus */ int deleting; /* Moving cpus from subparts_cpus to effective_cpus */ + bool part_error = false; /* Partition error? */ /* * The parent must be a partition root. @@ -1114,13 +1127,48 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, * addmask = cpus_allowed & parent->effectiveb_cpus * * Note that parent's subparts_cpus may have been - * pre-shrunk in case the CPUs granted to the parent - * by the grandparent changes. So no deletion is needed. + * pre-shrunk in case there is a change in the cpu list. + * So no deletion is needed. */ adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed, parent->effective_cpus); - if (cpumask_equal(tmp->addmask, parent->effective_cpus)) - return -EINVAL; + part_error = cpumask_equal(tmp->addmask, + parent->effective_cpus); + } + + if (cmd == partcmd_update) { + int prev_prs = cpuset->partition_root_state; + + /* + * Check for possible transition between PRS_ENABLED + * and PRS_ERROR. 
+ */ + switch (cpuset->partition_root_state) { + case PRS_ENABLED: + if (part_error) + cpuset->partition_root_state = PRS_ERROR; + break; + case PRS_ERROR: + if (!part_error) + cpuset->partition_root_state = PRS_ENABLED; + break; + } + /* + * Set part_error if the cpuset was previously in the erroneous state. + */ + part_error = (prev_prs == PRS_ERROR); + } + + if (!part_error && (cpuset->partition_root_state == PRS_ERROR)) + return 0; /* Nothing needs to be done */ + + if (cpuset->partition_root_state == PRS_ERROR) { + /* + * Remove all its cpus from parent's subparts_cpus. + */ + adding = false; + deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed, + parent->subparts_cpus); } if (!adding && !deleting) @@ -1172,28 +1220,21 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, cs) { struct cpuset *parent = parent_cs(cp); - bool cs_empty; compute_effective_cpumask(tmp->new_cpus, cp, parent); - cs_empty = cpumask_empty(tmp->new_cpus); - - /* - * A partition root cannot have empty effective_cpus - */ - WARN_ON_ONCE(cs_empty && is_partition_root(cp)); /* * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some CPUs. */ - if (is_in_v2_mode() && cs_empty) + if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) cpumask_copy(tmp->new_cpus, parent->effective_cpus); /* * Skip the whole subtree if the cpumask remains the same * and has no partition root state. */ - if (!is_partition_root(cp) && + if (!cp->partition_root_state && cpumask_equal(tmp->new_cpus, cp->effective_cpus)) { pos_css = css_rightmost_descendant(pos_css); continue; @@ -1205,11 +1246,39 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) * update_tasks_cpumask() again for tasks in the parent * cpuset if the parent's subparts_cpus changes. 
*/ - if ((cp != cs) && cp->partition_root_state && - update_parent_subparts_cpumask(cp, partcmd_update, - NULL, tmp)) { - if (parent != &top_cpuset) - update_tasks_cpumask(parent); + if ((cp != cs) && cp->partition_root_state) { + switch (parent->partition_root_state) { + case PRS_DISABLED: + /* + * If the parent is neither a partition root nor an + * erroneous partition root, clear the partition root + * state and the CS_CPU_EXCLUSIVE flag. + */ + WARN_ON_ONCE(cp->partition_root_state + != PRS_ERROR); + cp->partition_root_state = 0; + spin_lock_irq(&callback_lock); + clear_bit(CS_CPU_EXCLUSIVE, &cp->flags); + spin_unlock_irq(&callback_lock); + break; + + case PRS_ENABLED: + if (update_parent_subparts_cpumask(cp, + partcmd_update, NULL, tmp)) + update_tasks_cpumask(parent); + break; + + case PRS_ERROR: + /* + * When the parent is erroneous, the child has to be erroneous too. + */ + cp->partition_root_state = PRS_ERROR; + if (cp->nr_subparts_cpus) { + cp->nr_subparts_cpus = 0; + cpumask_clear(cp->subparts_cpus); + } + break; + } } if (!css_tryget_online(&cp->css)) @@ -1219,13 +1288,33 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) spin_lock_irq(&callback_lock); cpumask_copy(cp->effective_cpus, tmp->new_cpus); - if (cp->nr_subparts_cpus) { + if (cp->nr_subparts_cpus && + (cp->partition_root_state != PRS_ENABLED)) { + cp->nr_subparts_cpus = 0; + cpumask_clear(cp->subparts_cpus); + } else if (cp->nr_subparts_cpus) { /* * Make sure that effective_cpus & subparts_cpus * are mutually exclusive. + * + * In the unlikely event that effective_cpus + * becomes empty, we clear cp->nr_subparts_cpus and + * let its child partition roots compete for + * CPUs again. 
*/ cpumask_andnot(cp->effective_cpus, cp->effective_cpus, cp->subparts_cpus); + if (cpumask_empty(cp->effective_cpus)) { + cpumask_copy(cp->effective_cpus, tmp->new_cpus); + cpumask_clear(cp->subparts_cpus); + cp->nr_subparts_cpus = 0; + } else if (!cpumask_subset(cp->subparts_cpus, + tmp->new_cpus)) { + cpumask_andnot(cp->subparts_cpus, + cp->subparts_cpus, tmp->new_cpus); + cp->nr_subparts_cpus + = cpumask_weight(cp->subparts_cpus); + } } spin_unlock_irq(&callback_lock); @@ -1733,6 +1822,17 @@ static int update_prstate(struct cpuset *cs, int val) } cs->partition_root_state = PRS_ENABLED; } else { + /* + * Turning off partition root will clear the + * CS_CPU_EXCLUSIVE bit. + */ + if (cs->partition_root_state == PRS_ERROR) { + cs->partition_root_state = 0; + update_flag(CS_CPU_EXCLUSIVE, cs, 0); + err = 0; + goto out; + } + err = update_parent_subparts_cpumask(cs, partcmd_disable, NULL, &tmp); if (err) -- 2.18.0