Re: [PATCH v10 0/8] cgroup/cpuset: Major cpu partition code restructuring

On 5/4/22 07:28, Michal Koutný wrote:
> Hello.
>
> On Tue, May 03, 2022 at 12:21:41PM -0400, Waiman Long <longman@xxxxxxxxxx> wrote:
>> v10:
>>   - Relax constraints for changes made to "cpuset.cpus"
>>     and "cpuset.cpus.partition" as suggested. Now almost all changes
>>     are allowed.
>
> I see there were also some other changes from v9 (like the first patches
> of the series).
> Any chance you have a public git repo with both versions for a
> convenient range-diff?

That is true. Both patches 1 and 2 are new, and the changes are pretty straightforward. Patch 1 of v9 has already been merged, but it contains a latent bug. Patch 4 of this series replaces patch 3 ("cgroup/cpuset: Refining features and constraints of a partition") of v9. The other patches are similar to their v9 versions, with some adjustments for the different code base.

I don't have a public repo. Attached is the file diff between v9 and v10 of cpuset.c, with some other unrelated cpuset patches included.
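
One central change visible in the attached diff is the new partition root state encoding: an invalid partition state is now stored as the negation of the corresponding valid state, so a single negation flips a partition between valid and invalid while preserving its type. Below is a minimal standalone sketch of that encoding. The constant names and helper logic are copied from the patch; the one-field struct is a simplification for illustration, not the real struct cpuset.

/*
 * Standalone sketch of the v10 partition-state encoding. Invalid
 * states are the negation of their valid counterparts, so validity
 * checks reduce to sign tests and invalidation to a negation.
 */
#include <stdbool.h>
#include <stdio.h>

#define PRS_MEMBER		 0	/* not a partition root */
#define PRS_ROOT		 1	/* partition root */
#define PRS_ISOLATED		 2	/* partition root without load balancing */
#define PRS_INVALID_ROOT	-1	/* invalid partition root */
#define PRS_INVALID_ISOLATED	-2	/* invalid isolated partition root */

struct cpuset {
	int partition_root_state;	/* simplified stand-in for the kernel struct */
};

static inline bool is_prs_invalid(int prs_state)
{
	return prs_state < 0;
}

static inline bool is_partition_valid(const struct cpuset *cs)
{
	return cs->partition_root_state > 0;
}

/* Invalidation keeps the requested type (root vs. isolated) recoverable. */
static inline void set_partition_invalid(struct cpuset *cs)
{
	if (is_partition_valid(cs))
		cs->partition_root_state = -cs->partition_root_state;
}

int main(void)
{
	struct cpuset cs = { .partition_root_state = PRS_ISOLATED };

	set_partition_invalid(&cs);
	/* Prints "state=-2 invalid=1": an invalid isolated partition. */
	printf("state=%d invalid=%d\n", cs.partition_root_state,
	       is_prs_invalid(cs.partition_root_state));
	return 0;
}

This is why update_parent_subparts_cpumask() below can switch a partition between valid and invalid with just "new_prs = -old_prs".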

Cheers,
Longman
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 8703a8452c33..90ee0e4d8d7e 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -71,7 +71,7 @@ DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
 
 /*
  * There could be abnormal cpuset configurations for cpu or memory
- * node binding, add this key to provide a quick low-cost judgement
+ * node binding, add this key to provide a quick low-cost judgment
  * of the situation.
  */
 DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);
@@ -93,16 +93,20 @@ enum prs_errcode {
 	PERR_INVCPUS,
 	PERR_INVPARENT,
 	PERR_NOTPART,
+	PERR_NOTEXCL,
 	PERR_NOCPUS,
 	PERR_HOTPLUG,
+	PERR_CPUSEMPTY,
 };
 
 static const char * const perr_strings[] = {
-	[PERR_INVCPUS]   = "Invalid change to cpuset.cpus",
+	[PERR_INVCPUS]   = "Invalid cpu list in cpuset.cpus",
 	[PERR_INVPARENT] = "Parent is an invalid partition root",
 	[PERR_NOTPART]   = "Parent is not a partition root",
+	[PERR_NOTEXCL]   = "Cpu list in cpuset.cpus not exclusive",
 	[PERR_NOCPUS]    = "Parent unable to distribute cpu downstream",
 	[PERR_HOTPLUG]   = "No cpu available due to hotplug",
+	[PERR_CPUSEMPTY] = "cpuset.cpus is empty",
 };
 
 struct cpuset {
@@ -198,23 +202,22 @@ struct cpuset {
 /*
  * Partition root states:
  *
- *   0 - not a partition root
- *
+ *   0 - member (not a partition root)
  *   1 - partition root
- *
  *   2 - partition root without load balancing (isolated)
- *
  *  -1 - invalid partition root
- *       None of the cpus in cpus_allowed can be put into the parent's
- *       subparts_cpus. In this case, the cpuset is not a real partition
- *       root anymore.  However, the CPU_EXCLUSIVE bit will still be set
- *       and the cpuset can be restored back to a partition root if the
- *       parent cpuset can give more CPUs back to this child cpuset.
+ *  -2 - invalid isolated partition root
  */
-#define PRS_DISABLED		0
-#define PRS_ENABLED		1
+#define PRS_MEMBER		0
+#define PRS_ROOT		1
 #define PRS_ISOLATED		2
-#define PRS_ERROR		-1
+#define PRS_INVALID_ROOT	-1
+#define PRS_INVALID_ISOLATED	-2
+
+static inline bool is_prs_invalid(int prs_state)
+{
+	return prs_state < 0;
+}
 
 /*
  * Temporary cpumasks for working with partitions that are passed among
@@ -294,30 +297,40 @@ static inline int is_spread_slab(const struct cpuset *cs)
 	return test_bit(CS_SPREAD_SLAB, &cs->flags);
 }
 
-static inline int is_partition_root(const struct cpuset *cs)
+static inline int is_partition_valid(const struct cpuset *cs)
 {
 	return cs->partition_root_state > 0;
 }
 
+static inline int is_partition_invalid(const struct cpuset *cs)
+{
+	return cs->partition_root_state < 0;
+}
+
+static inline void set_partition_invalid(struct cpuset *cs)
+{
+	if (is_partition_valid(cs))
+		cs->partition_root_state = -cs->partition_root_state;
+}
+
 /*
  * Send notification event of whenever partition_root_state changes.
  */
-static inline void notify_partition_change(struct cpuset *cs,
-					   int old_prs, int new_prs)
+static inline void notify_partition_change(struct cpuset *cs, int old_prs)
 {
-	if (old_prs == new_prs)
+	if (old_prs == cs->partition_root_state)
 		return;
 	cgroup_file_notify(&cs->partition_file);
 
 	/* Reset prs_err if not invalid */
-	if (new_prs != PRS_ERROR)
+	if (is_partition_valid(cs))
 		WRITE_ONCE(cs->prs_err, PERR_NONE);
 }
 
 static struct cpuset top_cpuset = {
 	.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
 		  (1 << CS_MEM_EXCLUSIVE)),
-	.partition_root_state = PRS_ENABLED,
+	.partition_root_state = PRS_ROOT,
 };
 
 /**
@@ -459,7 +472,7 @@ static inline bool partition_is_populated(struct cpuset *cs,
 	cpuset_for_each_child(child, css, cs) {
 		if (child == excluded_child)
 			continue;
-		if (is_partition_root(child))
+		if (is_partition_valid(child))
 			continue;
 		if (cgroup_is_populated(child->css.cgroup)) {
 			rcu_read_unlock();
@@ -656,6 +669,35 @@ static inline void free_cpuset(struct cpuset *cs)
 	kfree(cs);
 }
 
+/*
+ * validate_change_legacy() - Validate conditions specific to legacy (v1)
+ *                            behavior.
+ */
+static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial)
+{
+	struct cgroup_subsys_state *css;
+	struct cpuset *c, *par;
+	int ret;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	/* Each of our child cpusets must be a subset of us */
+	ret = -EBUSY;
+	cpuset_for_each_child(c, css, cur)
+		if (!is_cpuset_subset(c, trial))
+			goto out;
+
+	/* On legacy hierarchy, we must be a subset of our parent cpuset. */
+	ret = -EACCES;
+	par = parent_cs(cur);
+	if (par && !is_cpuset_subset(trial, par))
+		goto out;
+
+	ret = 0;
+out:
+	return ret;
+}
+
 /*
  * validate_change() - Used to validate that any proposed cpuset change
  *		       follows the structural rules for cpusets.
@@ -680,20 +722,21 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
 {
 	struct cgroup_subsys_state *css;
 	struct cpuset *c, *par;
-	int ret;
-
-	/* The checks don't apply to root cpuset */
-	if (cur == &top_cpuset)
-		return 0;
+	int ret = 0;
 
 	rcu_read_lock();
-	par = parent_cs(cur);
 
-	/* On legacy hierarchy, we must be a subset of our parent cpuset. */
-	ret = -EACCES;
-	if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
+	if (!is_in_v2_mode())
+		ret = validate_change_legacy(cur, trial);
+	if (ret)
+		goto out;
+
+	/* Remaining checks don't apply to root cpuset */
+	if (cur == &top_cpuset)
 		goto out;
 
+	par = parent_cs(cur);
+
 	/*
 	 * If either I or some sibling (!= me) is exclusive, we can't
 	 * overlap
@@ -869,7 +912,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
 			update_domain_attr_tree(dattr, &top_cpuset);
 		}
 		cpumask_and(doms[0], top_cpuset.effective_cpus,
-			    housekeeping_cpumask(HK_FLAG_DOMAIN));
+			    housekeeping_cpumask(HK_TYPE_DOMAIN));
 
 		goto done;
 	}
@@ -899,7 +942,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
 		if (!cpumask_empty(cp->cpus_allowed) &&
 		    !(is_sched_load_balance(cp) &&
 		      cpumask_intersects(cp->cpus_allowed,
-					 housekeeping_cpumask(HK_FLAG_DOMAIN))))
+					 housekeeping_cpumask(HK_TYPE_DOMAIN))))
 			continue;
 
 		if (root_load_balance &&
@@ -911,7 +954,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
 			csa[csn++] = cp;
 
 		/* skip @cp's subtree if not a partition root */
-		if (!is_partition_root(cp))
+		if (!is_partition_valid(cp))
 			pos_css = css_rightmost_descendant(pos_css);
 	}
 	rcu_read_unlock();
@@ -988,7 +1031,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
 
 			if (apn == b->pn) {
 				cpumask_or(dp, dp, b->effective_cpus);
-				cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
+				cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
 				if (dattr)
 					update_domain_attr_tree(dattr + nslot, b);
 
@@ -1117,7 +1160,7 @@ static void rebuild_sched_domains_locked(void)
 	if (top_cpuset.nr_subparts_cpus) {
 		rcu_read_lock();
 		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
-			if (!is_partition_root(cs)) {
+			if (!is_partition_valid(cs)) {
 				pos_css = css_rightmost_descendant(pos_css);
 				continue;
 			}
@@ -1164,6 +1207,14 @@ static void update_tasks_cpumask(struct cpuset *cs)
 	struct css_task_iter it;
 	struct task_struct *task;
 
+	/*
+	 * TODO: With cpuset partition that takes CPUs away from the top
+	 * cpuset, we may want to properly adjust the cpus_allowed mask of
+	 * tasks in the top cpuset as well.
+	 */
+	if (cs == &top_cpuset)
+		return;
+
 	css_task_iter_start(&cs->css, 0, &it);
 	while ((task = css_task_iter_next(&it)))
 		set_cpus_allowed_ptr(task, cs->effective_cpus);
@@ -1203,13 +1254,15 @@ enum subparts_cmd {
 	partcmd_update,		/* Update parent's subparts_cpus */
 };
 
+static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
+		       int turning_on);
 /**
  * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
  * @cpuset:  The cpuset that requests change in partition root state
  * @cmd:     Partition root state change command
  * @newmask: Optional new cpumask for partcmd_update
  * @tmp:     Temporary addmask and delmask
- * Return:   0, 1 or an error code
+ * Return:   0 or a partition root state error code
  *
  * For partcmd_enable, the cpuset is being transformed from a non-partition
  * root to a partition root. The cpus_allowed mask of the given cpuset will
@@ -1217,43 +1270,32 @@ enum subparts_cmd {
  * effective_cpus. The function will return 0 if all the CPUs listed in
  * cpus_allowed can be granted or an error code will be returned.
  *
- * For partcmd_disable, the cpuset is being transofrmed from a partition
+ * For partcmd_disable, the cpuset is being transformed from a partition
  * root back to a non-partition root. Any CPUs in cpus_allowed that are in
  * parent's subparts_cpus will be taken away from that cpumask and put back
- * into parent's effective_cpus. 0 should always be returned.
+ * into parent's effective_cpus. 0 will always be returned.
  *
- * For partcmd_update, if the optional newmask is specified, the cpu
- * list is to be changed from cpus_allowed to newmask. Otherwise,
- * cpus_allowed is assumed to remain the same. The cpuset should either
- * be a partition root or an invalid partition root. The partition root
- * state may change if newmask is NULL and none of the requested CPUs can
- * be granted by the parent. The function will return 1 if changes to
- * parent's subparts_cpus and effective_cpus happen or 0 otherwise.
- * Error code should only be returned when newmask is non-NULL.
+ * For partcmd_update, if the optional newmask is specified, the cpu list is
+ * to be changed from cpus_allowed to newmask. Otherwise, cpus_allowed is
+ * assumed to remain the same. The cpuset should either be a valid or invalid
+ * partition root. The partition root state may change from valid to invalid
+ * or vice versa. An error code will only be returned if transitioning from
+ * invalid to valid violates the exclusivity rule.
  *
  * The partcmd_enable and partcmd_disable commands are used by
  * update_prstate(). The partcmd_update command is used by
  * update_cpumasks_hier() with newmask NULL and update_cpumask() with
  * newmask set.
- *
- * The checking is more strict when enabling partition root than the
- * other two commands.
- *
- * Because of the implicit cpu exclusive nature of a partition root,
- * cpumask changes that violates the cpu exclusivity rule will not be
- * permitted when checked by validate_change(). The validate_change()
- * function will also prevent any changes to the cpu list if it is not
- * a superset of children's cpu lists.
  */
-static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
+static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
 					  struct cpumask *newmask,
 					  struct tmpmasks *tmp)
 {
-	struct cpuset *parent = parent_cs(cpuset);
+	struct cpuset *parent = parent_cs(cs);
 	int adding;	/* Moving cpus from effective_cpus to subparts_cpus */
 	int deleting;	/* Moving cpus from subparts_cpus to effective_cpus */
 	int old_prs, new_prs;
-	bool part_error = false;	/* Partition error? */
+	int part_error = PERR_NONE;	/* Partition error? */
 
 	percpu_rwsem_assert_held(&cpuset_rwsem);
 
@@ -1262,39 +1304,41 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
 	 * The new cpumask, if present, or the current cpus_allowed must
 	 * not be empty.
 	 */
-	if (!is_partition_root(parent) ||
-	   (newmask && cpumask_empty(newmask)) ||
-	   (!newmask && cpumask_empty(cpuset->cpus_allowed)))
-		return -EINVAL;
-
-	/*
-	 * Enabling partition root is not allowed if there are online children.
-	 */
-	if ((cmd == partcmd_enable) && css_has_online_children(&cpuset->css))
-		return -EBUSY;
+	if (!is_partition_valid(parent)) {
+		return is_partition_invalid(parent)
+		       ? PERR_INVPARENT : PERR_NOTPART;
+	}
+	if ((newmask && cpumask_empty(newmask)) ||
+	   (!newmask && cpumask_empty(cs->cpus_allowed)))
+		return PERR_CPUSEMPTY;
 
 	adding = deleting = false;
-	old_prs = new_prs = cpuset->partition_root_state;
+	old_prs = new_prs = cs->partition_root_state;
 	if (cmd == partcmd_enable) {
 		/*
 		 * Enabling partition root is not allowed if cpus_allowed
 		 * doesn't overlap parent's cpus_allowed.
 		 */
-		if (!cpumask_intersects(cpuset->cpus_allowed, parent->cpus_allowed))
-			return -EINVAL;
+		if (!cpumask_intersects(cs->cpus_allowed, parent->cpus_allowed))
+			return PERR_INVCPUS;
 
 		/*
 		 * A parent can be left with no CPU as long as there is no
 		 * task directly associated with the parent partition.
 		 */
-		if (partition_is_populated(parent, cpuset) &&
-		    cpumask_subset(parent->effective_cpus, cpuset->cpus_allowed))
-			return -EINVAL;
+		if (partition_is_populated(parent, cs) &&
+		   !cpumask_intersects(cs->cpus_allowed, parent->effective_cpus))
+			return PERR_NOCPUS;
 
-		cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
+		cpumask_copy(tmp->addmask, cs->cpus_allowed);
 		adding = true;
 	} else if (cmd == partcmd_disable) {
-		deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
+		/*
+		 * Need to remove cpus from parent's subparts_cpus for valid
+		 * partition root.
+		 */
+		deleting = !is_prs_invalid(old_prs) &&
+			   cpumask_and(tmp->delmask, cs->cpus_allowed,
 				       parent->subparts_cpus);
 	} else if (newmask) {
 		/*
@@ -1306,7 +1350,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
 		 * addmask = newmask & parent->cpus_allowed
 		 *		     & ~parent->subparts_cpus
 		 */
-		cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
+		cpumask_andnot(tmp->delmask, cs->cpus_allowed, newmask);
 		deleting = cpumask_and(tmp->delmask, tmp->delmask,
 				       parent->subparts_cpus);
 
@@ -1317,77 +1361,85 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
 		 * Make partition invalid if parent's effective_cpus could
 		 * become empty and there are tasks in the parent.
 		 */
-		part_error = partition_is_populated(parent, cpuset) &&
-			cpumask_subset(parent->effective_cpus, tmp->addmask) &&
-			!cpumask_intersects(tmp->delmask, cpu_active_mask);
-
-		if ((READ_ONCE(cpuset->prs_err) == PERR_NONE) && part_error)
-			WRITE_ONCE(cpuset->prs_err, PERR_INVCPUS);
+		if (adding && partition_is_populated(parent, cs) &&
+		    cpumask_subset(parent->effective_cpus, tmp->addmask) &&
+		    !cpumask_intersects(tmp->delmask, cpu_active_mask)) {
+			part_error = PERR_NOCPUS;
+			adding = false;
+			deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
+					       parent->subparts_cpus);
+		}
 	} else {
 		/*
 		 * partcmd_update w/o newmask:
 		 *
-		 * addmask = cpus_allowed & parent->effective_cpus
-		 *
-		 * This gets invoked either due to a hotplug event or
-		 * from update_cpumasks_hier() where we can't return an
-		 * error. This can cause a partition root to become invalid
-		 * in the case of a hotplug.
+		 * delmask = cpus_allowed & parent->subparts_cpus
+		 * addmask = cpus_allowed & parent->cpus_allowed
+		 *			  & ~parent->subparts_cpus
 		 *
+		 * This gets invoked either due to a hotplug event or from
+		 * update_cpumasks_hier(). This can cause the state of a
+		 * partition root to transition from valid to invalid or vice
+		 * versa. So we still need to compute the addmask and delmask.
+		 *
 		 * A partition error happens when:
 		 * 1) Cpuset is valid partition, but parent does not distribute
 		 *    out any CPUs.
 		 * 2) Parent has tasks and all its effective CPUs will have
 		 *    to be distributed out.
 		 */
-		adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
-				     parent->effective_cpus);
-		part_error = (is_partition_root(cpuset) &&
-			      !parent->nr_subparts_cpus) ||
-			     (cpumask_equal(parent->effective_cpus, tmp->addmask) &&
-			      partition_is_populated(parent, cpuset));
+		cpumask_and(tmp->addmask, cs->cpus_allowed,
+					  parent->cpus_allowed);
+		adding = cpumask_andnot(tmp->addmask, tmp->addmask,
+					parent->subparts_cpus);
+		if ((is_partition_valid(cs) && !parent->nr_subparts_cpus) ||
+		    (adding &&
+		     cpumask_subset(parent->effective_cpus, tmp->addmask) &&
+		     partition_is_populated(parent, cs))) {
+			part_error = PERR_NOCPUS;
+			adding = false;
+		}
 
-		if (is_partition_root(cpuset) && part_error)
-			WRITE_ONCE(cpuset->prs_err, PERR_NOCPUS);
+		if (part_error && is_partition_valid(cs) &&
+		    parent->nr_subparts_cpus)
+			deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
+					       parent->subparts_cpus);
 	}
+	if (part_error)
+		WRITE_ONCE(cs->prs_err, part_error);
 
 	if (cmd == partcmd_update) {
 		/*
-		 * Check for possible transition between PRS_ERROR and
-		 * PRS_ENABLED/PRS_ISOLATED.
+		 * Check for possible transition between valid and invalid
+		 * partition root.
 		 */
-		switch (cpuset->partition_root_state) {
-		case PRS_ENABLED:
+		switch (cs->partition_root_state) {
+		case PRS_ROOT:
 		case PRS_ISOLATED:
 			if (part_error)
-				new_prs = PRS_ERROR;
+				new_prs = -old_prs;
 			break;
-		case PRS_ERROR:
-			if (part_error)
-				break;
-			if (is_sched_load_balance(cpuset))
-				new_prs = PRS_ENABLED;
-			else
-				new_prs = PRS_ISOLATED;
+		case PRS_INVALID_ROOT:
+		case PRS_INVALID_ISOLATED:
+			if (!part_error)
+				new_prs = -old_prs;
 			break;
 		}
 	}
 
-	if ((old_prs == PRS_ERROR) && (new_prs == PRS_ERROR))
-		return 0;	/* Nothing need to be done */
-
-	if (new_prs == PRS_ERROR) {
-		/*
-		 * Remove all its cpus from parent's subparts_cpus.
-		 */
-		adding = false;
-		deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
-				       parent->subparts_cpus);
-	}
-
 	if (!adding && !deleting && (new_prs == old_prs))
 		return 0;
 
+	/*
+	 * Transitioning from invalid to valid (partcmd_update) may require
+	 * setting CS_CPU_EXCLUSIVE and clearing CS_SCHED_LOAD_BALANCE later.
+	 */
+	if ((old_prs != new_prs) && is_prs_invalid(old_prs)) {
+		if (!is_cpu_exclusive(cs) &&
+		    (update_flag(CS_CPU_EXCLUSIVE, cs, 1) < 0))
+			return PERR_NOTEXCL;
+	}
+
 	/*
 	 * Change the parent's subparts_cpus.
 	 * Newly added CPUs will be removed from effective_cpus and
@@ -1414,12 +1466,25 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
 	parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
 
 	if (old_prs != new_prs)
-		cpuset->partition_root_state = new_prs;
+		cs->partition_root_state = new_prs;
 
 	spin_unlock_irq(&callback_lock);
-	notify_partition_change(cpuset, old_prs, new_prs);
 
-	return cmd == partcmd_update;
+	if (adding || deleting)
+		update_tasks_cpumask(parent);
+
+	/*
+	 * Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary.
+	 * rebuild_sched_domains_locked() may be called.
+	 */
+	if ((old_prs != new_prs) && (cmd == partcmd_update)) {
+		if (old_prs == PRS_ISOLATED)
+			update_flag(CS_SCHED_LOAD_BALANCE, cs, 1);
+		else if (new_prs == PRS_ISOLATED)
+			update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+	}
+	notify_partition_change(cs, old_prs);
+	return 0;
 }
 
 /*
@@ -1457,7 +1522,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
 		 * out all its CPUs.
 		 */
 		if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
-			if (is_partition_root(cp) &&
+			if (is_partition_valid(cp) &&
 			    cpumask_equal(cp->cpus_allowed, cp->subparts_cpus))
 				goto update_parent_subparts;
 
@@ -1474,7 +1539,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
 
 		/*
 		 * Skip the whole subtree if the cpumask remains the same
-		 * with no partition root state and force flag not set.
+		 * and has no partition root state and force flag not set.
 		 */
 		if (!cp->partition_root_state && !force &&
 		    cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
@@ -1492,21 +1557,21 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
 		old_prs = new_prs = cp->partition_root_state;
 		if ((cp != cs) && old_prs) {
 			switch (parent->partition_root_state) {
-			case PRS_ENABLED:
+			case PRS_ROOT:
 			case PRS_ISOLATED:
 				update_parent = true;
 				break;
 
-			case PRS_DISABLED:
-			case PRS_ERROR:
+			default:
 				/*
 				 * When parent is not a partition root or is
 				 * invalid, child partition roots become
 				 * invalid too.
 				 */
-				new_prs = PRS_ERROR;
+				if (is_partition_valid(cp))
+					new_prs = -cp->partition_root_state;
 				WRITE_ONCE(cp->prs_err,
-					  (parent->partition_root_state == PRS_ERROR)
+					   is_partition_invalid(parent)
 					   ? PERR_INVPARENT : PERR_NOTPART);
 				break;
 			}
@@ -1517,25 +1582,25 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
 		rcu_read_unlock();
 
 		if (update_parent) {
-			if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
-				update_tasks_cpumask(parent);
+			update_parent_subparts_cpumask(cp, partcmd_update, NULL,
+						       tmp);
 			/*
-			 * The cpuset partition_root_state may be changed
-			 * to PRS_ERROR. Capture it.
+			 * The cpuset partition_root_state may become
+			 * invalid. Capture it.
 			 */
 			new_prs = cp->partition_root_state;
 		}
 
 		spin_lock_irq(&callback_lock);
 
-		if (cp->nr_subparts_cpus && (new_prs <= 0)) {
+		if (cp->nr_subparts_cpus && !is_partition_valid(cp)) {
 			/*
 			 * Put all active subparts_cpus back to effective_cpus.
 			 */
 			cpumask_or(tmp->new_cpus, tmp->new_cpus,
 				   cp->subparts_cpus);
 			cpumask_and(tmp->new_cpus, tmp->new_cpus,
-				    cpu_active_mask);
+				   cpu_active_mask);
 			cp->nr_subparts_cpus = 0;
 			cpumask_clear(cp->subparts_cpus);
 		}
@@ -1552,7 +1617,8 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
 
 		cp->partition_root_state = new_prs;
 		spin_unlock_irq(&callback_lock);
-		notify_partition_change(cp, old_prs, new_prs);
+
+		notify_partition_change(cp, old_prs);
 
 		WARN_ON(!is_in_v2_mode() &&
 			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
@@ -1568,7 +1634,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
 		if (!cpumask_empty(cp->cpus_allowed) &&
 		    is_sched_load_balance(cp) &&
 		   (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
-		    is_partition_root(cp)))
+		    is_partition_valid(cp)))
 			need_rebuild_sched_domains = true;
 
 		rcu_read_lock();
@@ -1592,10 +1658,15 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
 	struct cpuset *sibling;
 	struct cgroup_subsys_state *pos_css;
 
+	percpu_rwsem_assert_held(&cpuset_rwsem);
+
 	/*
 	 * Check all its siblings and call update_cpumasks_hier()
 	 * if their use_parent_ecpus flag is set in order for them
 	 * to use the right effective_cpus value.
+	 *
+	 * The update_cpumasks_hier() function may sleep. So we have to
+	 * release the RCU read lock before calling it.
 	 */
 	rcu_read_lock();
 	cpuset_for_each_child(sibling, pos_css, parent) {
@@ -1603,8 +1674,13 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
 			continue;
 		if (!sibling->use_parent_ecpus)
 			continue;
+		if (!css_tryget_online(&sibling->css))
+			continue;
 
+		rcu_read_unlock();
 		update_cpumasks_hier(sibling, tmp, false);
+		rcu_read_lock();
+		css_put(&sibling->css);
 	}
 	rcu_read_unlock();
 }
@@ -1662,27 +1738,35 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 #endif
 
 	if (cs->partition_root_state) {
-		/* Cpumask of a partition root cannot be empty */
-		if (cpumask_empty(trialcs->cpus_allowed))
-			return -EINVAL;
-		if (update_parent_subparts_cpumask(cs, partcmd_update,
-					trialcs->cpus_allowed, &tmp) < 0)
-			return -EINVAL;
+		update_parent_subparts_cpumask(cs, partcmd_update,
+					       trialcs->cpus_allowed, &tmp);
 	}
 
+	compute_effective_cpumask(trialcs->effective_cpus, trialcs,
+				  parent_cs(cs));
 	spin_lock_irq(&callback_lock);
 	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
 
 	/*
-	 * Make sure that subparts_cpus is a subset of cpus_allowed.
+	 * Make sure that subparts_cpus, if not empty, is a subset of
+	 * cpus_allowed. Clear subparts_cpus if there is an error or
+	 * empty effective cpus with tasks.
 	 */
 	if (cs->nr_subparts_cpus) {
-		cpumask_and(cs->subparts_cpus, cs->subparts_cpus,
-			    cs->cpus_allowed);
-		cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
+		if (cs->prs_err ||
+		   (partition_is_populated(cs, NULL) &&
+		    cpumask_subset(trialcs->effective_cpus, cs->subparts_cpus))) {
+			cs->nr_subparts_cpus = 0;
+			cpumask_clear(cs->subparts_cpus);
+		} else {
+			cpumask_and(cs->subparts_cpus, cs->subparts_cpus,
+				    cs->cpus_allowed);
+			cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
+		}
 	}
 	spin_unlock_irq(&callback_lock);
 
+	/* effective_cpus will be updated here */
 	update_cpumasks_hier(cs, &tmp, false);
 
 	if (cs->partition_root_state) {
@@ -2059,16 +2143,17 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 	return err;
 }
 
-/*
- * update_prstate - update partititon_root_state
- * cs: the cpuset to update
- * new_prs: new partition root state
+/**
+ * update_prstate - update partition_root_state
+ * @cs: the cpuset to update
+ * @new_prs: new partition root state
+ * Return: 0 if successful, != 0 if error
  *
  * Call with cpuset_rwsem held.
  */
 static int update_prstate(struct cpuset *cs, int new_prs)
 {
-	int err, old_prs = cs->partition_root_state;
+	int err = PERR_NONE, old_prs = cs->partition_root_state;
 	bool sched_domain_rebuilt = false;
 	struct cpuset *parent = parent_cs(cs);
 	struct tmpmasks tmpmask;
@@ -2077,28 +2162,33 @@ static int update_prstate(struct cpuset *cs, int new_prs)
 		return 0;
 
 	/*
-	 * Cannot force a partial or invalid partition root to a full
-	 * partition root.
+	 * For a previously invalid partition root, leave it at being
+	 * invalid if new_prs is not "member".
 	 */
-	if (new_prs && (old_prs == PRS_ERROR))
-		return -EINVAL;
+	if (new_prs && is_prs_invalid(old_prs)) {
+		cs->partition_root_state = -new_prs;
+		return 0;
+	}
 
 	if (alloc_cpumasks(NULL, &tmpmask))
 		return -ENOMEM;
 
-	err = -EINVAL;
 	if (!old_prs) {
 		/*
 		 * Turning on partition root requires setting the
 		 * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
-		 * cannot be NULL.
+		 * cannot be empty.
 		 */
-		if (cpumask_empty(cs->cpus_allowed))
+		if (cpumask_empty(cs->cpus_allowed)) {
+			err = PERR_CPUSEMPTY;
 			goto out;
+		}
 
 		err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
-		if (err)
+		if (err) {
+			err = PERR_NOTEXCL;
 			goto out;
+		}
 
 		err = update_parent_subparts_cpumask(cs, partcmd_enable,
 						     NULL, &tmpmask);
@@ -2120,16 +2210,16 @@ static int update_prstate(struct cpuset *cs, int new_prs)
 		 * A change in load balance state only, no change in cpumasks.
 		 */
 		update_flag(CS_SCHED_LOAD_BALANCE, cs, (new_prs != PRS_ISOLATED));
-		err = 0;
+		sched_domain_rebuilt = true;
 		goto out;	/* Sched domain is rebuilt in update_flag() */
 	} else {
 		/*
-		 * Switch back to member is always allowed even if it
+		 * Switching back to member is always allowed even if it
 		 * disables child partitions.
 		 */
-		err = 0;
 		update_parent_subparts_cpumask(cs, partcmd_disable, NULL,
 					       &tmpmask);
+
 		/*
 		 * If there are child partitions, they will all become invalid.
 		 */
@@ -2151,12 +2241,7 @@ static int update_prstate(struct cpuset *cs, int new_prs)
 		}
 	}
 
-	/*
-	 * Update cpumask of parent's tasks except when it is the top
-	 * cpuset as some system daemons cannot be mapped to other CPUs.
-	 */
-	if (parent != &top_cpuset)
-		update_tasks_cpumask(parent);
+	update_tasks_cpumask(parent);
 
 	if (parent->child_ecpus_count)
 		update_sibling_cpumasks(parent, cs, &tmpmask);
@@ -2164,20 +2249,24 @@ static int update_prstate(struct cpuset *cs, int new_prs)
 	if (!sched_domain_rebuilt)
 		rebuild_sched_domains_locked();
 out:
-	if (!err) {
-		spin_lock_irq(&callback_lock);
-		cs->partition_root_state = new_prs;
-		spin_unlock_irq(&callback_lock);
-		/*
-		 * Update child cpusets when disabling partition.
-		 */
-		if (new_prs == PRS_DISABLED && !list_empty(&cs->css.children))
-			update_cpumasks_hier(cs, &tmpmask, true);
-		notify_partition_change(cs, old_prs, new_prs);
-	}
+	/*
+	 * Make partition invalid if an error happens
+	 */
+	if (err)
+		new_prs = -new_prs;
+	spin_lock_irq(&callback_lock);
+	cs->partition_root_state = new_prs;
+	spin_unlock_irq(&callback_lock);
+	/*
+	 * Update child cpusets, if present.
+	 * Force update if switching back to member.
+	 */
+	if (!list_empty(&cs->css.children))
+		update_cpumasks_hier(cs, &tmpmask, !new_prs);
 
+	notify_partition_change(cs, old_prs);
 	free_cpumasks(NULL, &tmpmask);
-	return err;
+	return 0;
 }
 
 /*
@@ -2361,6 +2450,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 	cgroup_taskset_first(tset, &css);
 	cs = css_cs(css);
 
+	cpus_read_lock();
 	percpu_down_write(&cpuset_rwsem);
 
 	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
@@ -2414,6 +2504,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 		wake_up(&cpuset_attach_wq);
 
 	percpu_up_write(&cpuset_rwsem);
+	cpus_read_unlock();
 }
 
 /* The various types of files and directories in a cpuset file system */
@@ -2669,26 +2760,32 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
 static int sched_partition_show(struct seq_file *seq, void *v)
 {
 	struct cpuset *cs = css_cs(seq_css(seq));
-	const char *err, *type;
+	const char *err, *type = NULL;
 
 	switch (cs->partition_root_state) {
-	case PRS_ENABLED:
+	case PRS_ROOT:
 		seq_puts(seq, "root\n");
 		break;
 	case PRS_ISOLATED:
 		seq_puts(seq, "isolated\n");
 		break;
-	case PRS_DISABLED:
+	case PRS_MEMBER:
 		seq_puts(seq, "member\n");
 		break;
-	case PRS_ERROR:
-		type = is_sched_load_balance(cs) ? "root" : "isolated";
+	case PRS_INVALID_ROOT:
+		type = "root";
+		fallthrough;
+	case PRS_INVALID_ISOLATED:
+		if (!type)
+			type = "isolated";
 		err = perr_strings[READ_ONCE(cs->prs_err)];
 		if (err)
 			seq_printf(seq, "%s invalid (%s)\n", type, err);
 		else
 			seq_printf(seq, "%s invalid\n", type);
 		break;
+		seq_puts(seq, "isolated invalid\n");
+		break;
 	}
 	return 0;
 }
@@ -2706,9 +2803,9 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
 	 * Convert "root" to ENABLED, and convert "member" to DISABLED.
 	 */
 	if (!strcmp(buf, "root"))
-		val = PRS_ENABLED;
+		val = PRS_ROOT;
 	else if (!strcmp(buf, "member"))
-		val = PRS_DISABLED;
+		val = PRS_MEMBER;
 	else if (!strcmp(buf, "isolated"))
 		val = PRS_ISOLATED;
 	else
@@ -2960,7 +3057,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 	/*
 	 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
 	 * set.  This flag handling is implemented in cgroup core for
-	 * histrical reasons - the flag may be specified during mount.
+	 * historical reasons - the flag may be specified during mount.
 	 *
 	 * Currently, if any sibling cpusets have exclusive cpus or mem, we
 	 * refuse to clone the configuration - thereby refusing the task to
@@ -3009,7 +3106,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
 	cpus_read_lock();
 	percpu_down_write(&cpuset_rwsem);
 
-	if (is_partition_root(cs))
+	if (is_partition_valid(cs))
 		update_prstate(cs, 0);
 
 	if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
@@ -3157,7 +3254,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
 
 	/*
 	 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
-	 * as the tasks will be migratecd to an ancestor.
+	 * as the tasks will be migrated to an ancestor.
 	 */
 	if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
 		update_tasks_cpumask(cs);
@@ -3186,7 +3283,7 @@ hotplug_update_tasks(struct cpuset *cs,
 		     bool cpus_updated, bool mems_updated)
 {
 	/* A partition root is allowed to have empty effective cpus */
-	if (cpumask_empty(new_cpus) && !is_partition_root(cs))
+	if (cpumask_empty(new_cpus) && !is_partition_valid(cs))
 		cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
 	if (nodes_empty(*new_mems))
 		*new_mems = parent_cs(cs)->effective_mems;
@@ -3259,8 +3356,8 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
 	 * partitions, if present, by setting nr_subparts_cpus to 0 to
 	 * reclaim their cpus.
 	 */
-	if (is_partition_root(cs) && cpumask_empty(&new_cpus) &&
-	    cs->nr_subparts_cpus && partition_is_populated(cs, NULL)) {
+	if (cs->nr_subparts_cpus && is_partition_valid(cs) &&
+	    cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)) {
 		spin_lock_irq(&callback_lock);
 		cs->nr_subparts_cpus = 0;
 		cpumask_clear(cs->subparts_cpus);
@@ -3271,16 +3368,15 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
 	/*
 	 * Force the partition to become invalid if either one of
 	 * the following conditions hold:
-	 * 1) empty effective cpus with tasks in partition
-	 * 2) parent is invalid or doesn't grant any cpus to child partitions.
+	 * 1) empty effective cpus but not valid empty partition.
+	 * 2) parent is invalid or doesn't grant any cpus to child
+	 *    partitions.
 	 */
-	if (is_partition_root(cs) &&
-	   ((cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)) ||
-	    !parent->nr_subparts_cpus)) {
+	if (is_partition_valid(cs) && (!parent->nr_subparts_cpus ||
+	   (cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)))) {
 		int old_prs, parent_prs;
 
-		update_parent_subparts_cpumask(cs, partcmd_disable,
-					       NULL, tmp);
+		update_parent_subparts_cpumask(cs, partcmd_disable, NULL, tmp);
 		if (cs->nr_subparts_cpus) {
 			spin_lock_irq(&callback_lock);
 			cs->nr_subparts_cpus = 0;
@@ -3291,29 +3387,30 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
 
 		old_prs = cs->partition_root_state;
 		parent_prs = parent->partition_root_state;
-		if (old_prs != PRS_ERROR) {
+		if (is_partition_valid(cs)) {
 			spin_lock_irq(&callback_lock);
-			cs->partition_root_state = PRS_ERROR;
+			set_partition_invalid(cs);
 			spin_unlock_irq(&callback_lock);
-			if (parent_prs == PRS_ERROR)
+			if (is_prs_invalid(parent_prs))
 				WRITE_ONCE(cs->prs_err, PERR_INVPARENT);
 			else if (!parent_prs)
 				WRITE_ONCE(cs->prs_err, PERR_NOTPART);
 			else
 				WRITE_ONCE(cs->prs_err, PERR_HOTPLUG);
-			notify_partition_change(cs, old_prs, PRS_ERROR);
+			notify_partition_change(cs, old_prs);
 		}
 		cpuset_force_rebuild();
 	}
 
 	/*
-	 * On the other hand, an erroneous partition root may be transitioned
+	 * On the other hand, an invalid partition root may be transitioned
 	 * back to a regular one.
 	 */
-	else if (is_partition_root(parent) &&
-		(cs->partition_root_state == PRS_ERROR) &&
-		 update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
-		cpuset_force_rebuild();
+	else if (is_partition_valid(parent) && is_partition_invalid(cs)) {
+		update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp);
+		if (is_partition_valid(cs))
+			cpuset_force_rebuild();
+	}
 
 update_tasks:
 	cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
@@ -3619,8 +3716,8 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
 	return cs;
 }
 
-/**
- * cpuset_node_allowed - Can we allocate on a memory node?
+/*
+ * __cpuset_node_allowed - Can we allocate on a memory node?
  * @node: is this an allowed node?
  * @gfp_mask: memory allocation flags
  *
@@ -3662,7 +3759,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
 bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
 {
 	struct cpuset *cs;		/* current cpuset ancestors */
-	int allowed;			/* is allocation in zone z allowed? */
+	bool allowed;			/* is allocation in zone z allowed? */
 	unsigned long flags;
 
 	if (in_interrupt())
@@ -3791,8 +3888,8 @@ void cpuset_print_current_mems_allowed(void)
 
 int cpuset_memory_pressure_enabled __read_mostly;
 
-/**
- * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
+/*
+ * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
  *
  * Keep a running average of the rate of synchronous (direct)
  * page reclaim efforts initiated by tasks in each cpuset.
@@ -3807,7 +3904,7 @@ int cpuset_memory_pressure_enabled __read_mostly;
  * "memory_pressure".  Value displayed is an integer
  * representing the recent rate of entry into the synchronous
  * (direct) page reclaim by any task attached to the cpuset.
- **/
+ */
 
 void __cpuset_memory_pressure_bump(void)
 {
