Improve a few aspects of directed convergence. These problems have
become more visible with the improved (and thus more aggressive)
task-flipping code. We also have the new 'cpupid' sharing info, which
converges more precisely and thus highlights weaknesses in group
balancing more visibly:

 - We should only balance over buddy groups that are smaller than
   the other (not fully filled) buddy groups

 - Do not 'spread' buddy groups that fully fill a node

 - Do not 'spread' singular buddy groups

These bugs were prominently visible with certain preemption options
and timings with the previous code as well, so this is also a
regression fix.

Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx>
Cc: Rik van Riel <riel@xxxxxxxxxx>
Cc: Mel Gorman <mgorman@xxxxxxx>
Cc: Hugh Dickins <hughd@xxxxxxxxxx>
Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx>
---
 kernel/sched/fair.c | 101 ++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 71 insertions(+), 30 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fd49920..c393fba 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -926,6 +926,7 @@ static long calc_node_load(int node, bool use_higher)
 	long cpu_load_highfreq;
 	long cpu_load_lowfreq;
 	long cpu_load_curr;
+	long cpu_load_numa;
 	long min_cpu_load;
 	long max_cpu_load;
 	long node_load;
@@ -935,18 +936,22 @@ static long calc_node_load(int node, bool use_higher)
 	for_each_cpu(cpu, cpumask_of_node(node)) {
 		struct rq *rq = cpu_rq(cpu);
+		long cpu_load;
 
+		cpu_load_numa		= rq->numa_weight;
 		cpu_load_curr		= rq->load.weight;
 		cpu_load_lowfreq	= rq->cpu_load[NUMA_LOAD_IDX_LOWFREQ];
 		cpu_load_highfreq	= rq->cpu_load[NUMA_LOAD_IDX_HIGHFREQ];
 
-		min_cpu_load = min(min(cpu_load_curr, cpu_load_lowfreq), cpu_load_highfreq);
-		max_cpu_load = max(max(cpu_load_curr, cpu_load_lowfreq), cpu_load_highfreq);
+		min_cpu_load = min(min(min(cpu_load_numa, cpu_load_curr), cpu_load_lowfreq), cpu_load_highfreq);
+		max_cpu_load = max(max(max(cpu_load_numa, cpu_load_curr), cpu_load_lowfreq), cpu_load_highfreq);
 
 		if (use_higher)
-			node_load += max_cpu_load;
+			cpu_load = max_cpu_load;
 		else
-			node_load += min_cpu_load;
+			cpu_load = min_cpu_load;
+
+		node_load += cpu_load;
 	}
 
 	return node_load;
@@ -1087,6 +1092,7 @@ static int find_intranode_imbalance(int this_node, int this_cpu)
 	long cpu_load_lowfreq;
 	long this_cpu_load;
 	long cpu_load_curr;
+	long cpu_load_numa;
 	long min_cpu_load;
 	long cpu_load;
 	int min_cpu;
@@ -1102,14 +1108,15 @@ static int find_intranode_imbalance(int this_node, int this_cpu)
 	for_each_cpu(cpu, cpumask_of_node(this_node)) {
 		struct rq *rq = cpu_rq(cpu);
 
+		cpu_load_numa		= rq->numa_weight;
 		cpu_load_curr		= rq->load.weight;
 		cpu_load_lowfreq	= rq->cpu_load[NUMA_LOAD_IDX_LOWFREQ];
 		cpu_load_highfreq	= rq->cpu_load[NUMA_LOAD_IDX_HIGHFREQ];
 
-		if (cpu == this_cpu) {
-			this_cpu_load = min(min(cpu_load_curr, cpu_load_lowfreq), cpu_load_highfreq);
-		}
-		cpu_load = max(max(cpu_load_curr, cpu_load_lowfreq), cpu_load_highfreq);
+		if (cpu == this_cpu)
+			this_cpu_load = min(min(min(cpu_load_numa, cpu_load_curr), cpu_load_lowfreq), cpu_load_highfreq);
+
+		cpu_load = max(max(max(cpu_load_numa, cpu_load_curr), cpu_load_lowfreq), cpu_load_highfreq);
 
 		/* Find the idlest CPU: */
 		if (cpu_load < min_cpu_load) {
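The net effect of the two hunks above is that rq->numa_weight now
participates in both the pessimistic (min) and optimistic (max)
per-CPU load estimates. As a rough, self-contained illustration of
the clamping (the struct and helper names below are simplified
stand-ins for the rq fields, not the kernel's own API):

    #include <stdbool.h>

    /* Simplified stand-ins for the four rq load signals used above: */
    struct load_sample {
            long numa_weight;   /* rq->numa_weight                        */
            long curr;          /* rq->load.weight                        */
            long lowfreq;       /* rq->cpu_load[NUMA_LOAD_IDX_LOWFREQ]    */
            long highfreq;      /* rq->cpu_load[NUMA_LOAD_IDX_HIGHFREQ]   */
    };

    static long min2(long a, long b) { return a < b ? a : b; }
    static long max2(long a, long b) { return a > b ? a : b; }

    /*
     * use_higher selects the over-estimate (max of all four signals),
     * otherwise return the under-estimate (min of all four signals):
     */
    static long cpu_load_estimate(const struct load_sample *s, bool use_higher)
    {
            long lo = min2(min2(min2(s->numa_weight, s->curr), s->lowfreq), s->highfreq);
            long hi = max2(max2(max2(s->numa_weight, s->curr), s->lowfreq), s->highfreq);

            return use_higher ? hi : lo;
    }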
@@ -1128,16 +1135,18 @@ static int find_intranode_imbalance(int this_node, int this_cpu)
 /*
  * Search a node for the smallest-group task and return
  * it plus the size of the group it is in.
+ *
+ * TODO: can be done with a single pass.
  */
-static int buddy_group_size(int node, struct task_struct *p)
+static int buddy_group_size(int node, struct task_struct *p, bool *our_group_p)
 {
 	const cpumask_t *node_cpus_mask = cpumask_of_node(node);
 	cpumask_t cpus_to_check_mask;
-	bool our_group_found;
+	bool buddies_group_found;
 	int cpu1, cpu2;
 
 	cpumask_copy(&cpus_to_check_mask, node_cpus_mask);
-	our_group_found = false;
+	buddies_group_found = false;
 
 	if (WARN_ON_ONCE(cpumask_empty(&cpus_to_check_mask)))
 		return 0;
@@ -1156,7 +1165,12 @@ static int buddy_group_size(int node, struct task_struct *p)
 			group_size = 1;
 
 			if (tasks_buddies(group_head, p))
-				our_group_found = true;
+				buddies_group_found = true;
+
+			if (group_head == p) {
+				*our_group_p = true;
+				buddies_group_found = true;
+			}
 
 			/* Non-NUMA-shared tasks are 1-task groups: */
 			if (task_numa_shared(group_head) != 1)
@@ -1169,21 +1183,22 @@ static int buddy_group_size(int node, struct task_struct *p)
 				struct task_struct *p2 = rq2->curr;
 
 				WARN_ON_ONCE(rq1 == rq2);
+				if (p2 == p)
+					*our_group_p = true;
 				if (tasks_buddies(group_head, p2)) {
 					/* 'group_head' and 'rq2->curr' are in the same group: */
 					cpumask_clear_cpu(cpu2, &cpus_to_check_mask);
 					group_size++;
 					if (tasks_buddies(p2, p))
-						our_group_found = true;
+						buddies_group_found = true;
 				}
 			}
 next:
-
 			/*
 			 * If we just found our group and checked all
 			 * node local CPUs then return the result:
 			 */
-			if (our_group_found)
+			if (buddies_group_found)
 				return group_size;
 		}
 	} while (!cpumask_empty(&cpus_to_check_mask));
@@ -1261,8 +1276,15 @@ pick_non_numa_task:
 	return min_group_cpu;
 }
 
-static int find_max_node(struct task_struct *p, int *our_group_size)
+/*
+ * Find the node with the biggest buddy group of ours, but exclude
+ * our own local group on this node and also exclude fully filled
+ * nodes:
+ */
+static int
+find_max_node(struct task_struct *p, int *our_group_size_p, int *max_group_size_p, int full_size)
 {
+	bool our_group = false;
 	int max_group_size;
 	int group_size;
 	int max_node;
@@ -1272,9 +1294,12 @@ static int find_max_node(struct task_struct *p, int *our_group_size)
 	max_node = -1;
 
 	for_each_node(node) {
-		int full_size = cpumask_weight(cpumask_of_node(node));
 
-		group_size = buddy_group_size(node, p);
+		group_size = buddy_group_size(node, p, &our_group);
+		if (our_group) {
+			our_group = false;
+			*our_group_size_p = group_size;
+		}
 		if (group_size == full_size)
 			continue;
 
@@ -1284,7 +1309,7 @@ static int find_max_node(struct task_struct *p, int *our_group_size)
 		}
 	}
 
-	*our_group_size = max_group_size;
+	*max_group_size_p = max_group_size;
 
 	return max_node;
 }
@@ -1460,19 +1485,23 @@ static int find_max_node(struct task_struct *p, int *our_group_size)
  */
 static int improve_group_balance_compress(struct task_struct *p, int this_cpu, int this_node)
 {
-	int our_group_size = -1;
+	int full_size = cpumask_weight(cpumask_of_node(this_node));
+	int max_group_size = -1;
 	int min_group_size = -1;
+	int our_group_size = -1;
 	int max_node;
 	int min_cpu;
 
 	if (!sched_feat(NUMA_GROUP_LB_COMPRESS))
 		return -1;
 
-	max_node = find_max_node(p, &our_group_size);
+	max_node = find_max_node(p, &our_group_size, &max_group_size, full_size);
 	if (max_node == -1)
 		return -1;
 
+	if (our_group_size == -1)
+		return -1;
-
-	if (WARN_ON_ONCE(our_group_size == -1))
+	if (our_group_size == full_size || our_group_size > max_group_size)
 		return -1;
 
 	/* We are already in the right spot: */
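Taken together, the find_max_node() and improve_group_balance_compress()
changes above implement the first changelog rule: only compress toward
a node whose buddy group is at least as large as ours, and never touch
a group that already fills its node. A minimal sketch of the resulting
decision, assuming (hypothetically) that the three sizes have already
been computed by the caller:

    #include <stdbool.h>

    /*
     * our_group_size:  size of the group 'p' belongs to on this node
     *                  (-1 if it was never found)
     * max_group_size:  size of the biggest remote buddy group of ours
     * full_size:       number of CPUs in this node
     */
    static bool should_compress(int our_group_size, int max_group_size, int full_size)
    {
            /* Our group was not found at all: */
            if (our_group_size == -1)
                    return false;

            /* Our group already fills the node, leave it alone: */
            if (our_group_size == full_size)
                    return false;

            /* Only move towards groups at least as large as ours: */
            if (our_group_size > max_group_size)
                    return false;

            return true;
    }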
@@ -1517,6 +1546,7 @@ static int improve_group_balance_spread(struct task_struct *p, int this_cpu, int
 	long this_node_load = -1;
 	long delta_load_before;
 	long delta_load_after;
+	int group_count = 0;
 	int idlest_cpu = -1;
 	int cpu1, cpu2;
 
@@ -1585,6 +1615,7 @@ static int improve_group_balance_spread(struct task_struct *p, int this_cpu, int
 				min_group_size = group_size;
 			else
 				min_group_size = min(group_size, min_group_size);
+			group_count++;
 		}
 	} while (!cpumask_empty(&cpus_to_check_mask));
 
@@ -1594,13 +1625,23 @@ static int improve_group_balance_spread(struct task_struct *p, int this_cpu, int
 	 */
 	if (!found_our_group)
 		return -1;
+
 	if (!our_group_smallest)
 		return -1;
+
 	if (WARN_ON_ONCE(min_group_size == -1))
 		return -1;
+
 	if (WARN_ON_ONCE(our_group_size == -1))
 		return -1;
 
+	/* Since the current task is shared, this should not happen: */
+	if (WARN_ON_ONCE(group_count < 1))
+		return -1;
+
+	/* No point in moving if we are a single group: */
+	if (group_count <= 1)
+		return -1;
+
 	idlest_node = find_idlest_node(&idlest_cpu);
 	if (idlest_node == -1)
 		return -1;
@@ -1622,7 +1663,7 @@ static int improve_group_balance_spread(struct task_struct *p, int this_cpu, int
 	 */
 	delta_load_before = this_node_load - idlest_node_load;
 	delta_load_after = (this_node_load-this_group_load) - (idlest_node_load+this_group_load);
-
+
 	if (abs(delta_load_after)+SCHED_LOAD_SCALE > abs(delta_load_before))
 		return -1;
 
@@ -1806,7 +1847,7 @@ static int sched_update_ideal_cpu_shared(struct task_struct *p, int *flip_tasks)
 	this_node_capacity = calc_node_capacity(this_node);
 
 	this_node_overloaded = false;
-	if (this_node_load > this_node_capacity + 512)
+	if (this_node_load > this_node_capacity + SCHED_LOAD_SCALE/2)
 		this_node_overloaded = true;
 
 	/* If we'd stay within this node then stay put: */
@@ -1922,7 +1963,7 @@ static int sched_update_ideal_cpu_private(struct task_struct *p)
 	this_node_capacity = calc_node_capacity(this_node);
 
 	this_node_overloaded = false;
-	if (this_node_load > this_node_capacity + 512)
+	if (this_node_load > this_node_capacity + SCHED_LOAD_SCALE/2)
 		this_node_overloaded = true;
 
 	if (this_node == min_node)
@@ -1934,8 +1975,8 @@ static int sched_update_ideal_cpu_private(struct task_struct *p)
 
 	WARN_ON_ONCE(max_node_load < min_node_load);
 
-	/* Is the load difference at least 125% of one standard task load? */
-	if (this_node_load - min_node_load < 1536)
+	/* Is the load difference at least 150% of one standard task load? */
+	if (this_node_load - min_node_load < SCHED_LOAD_SCALE*3/2)
 		goto out_check_intranode;
 
 	/*
@@ -5476,7 +5517,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
 *
 * The adjacency matrix of the resulting graph is given by:
 *
-*             log_2 n
+*             log_2 n
 *   A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)   (6)
 *             k = 0
 *
@@ -5522,7 +5563,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
 *
 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
 *      rewrite all of this once again.]
- */
+ */
 
 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
@@ -6133,7 +6174,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
 		/*
 		 * !SD_OVERLAP domains can assume that child groups
 		 * span the current group.
-		 */
+		 */
 
 		group = child->groups;
 		do {
-- 
1.7.11.7
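For completeness, the spread-side checks added by this patch reduce to
a simple predicate: never spread a singular group, and only spread when
moving the group's load shrinks the inter-node imbalance by more than
one standard task's load. A user-space sketch (SCHED_LOAD_SCALE assumed
to be 1024, as elsewhere in this tree; all names here are hypothetical):

    #include <stdbool.h>
    #include <stdlib.h>

    #define SCHED_LOAD_SCALE	1024L

    static bool should_spread(int group_count, long this_node_load,
                              long idlest_node_load, long group_load)
    {
            long delta_before, delta_after;

            /* No point in moving if we are a single group: */
            if (group_count <= 1)
                    return false;

            delta_before = this_node_load - idlest_node_load;
            delta_after  = (this_node_load - group_load) -
                           (idlest_node_load + group_load);

            /* Require an improvement of at least one task's load: */
            return labs(delta_after) + SCHED_LOAD_SCALE <= labs(delta_before);
    }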