[PATCH 53/63] sched: numa: Decide whether to favour task or group weights based on swap candidate relationships

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Rik van Riel <riel@xxxxxxxxxx>

This patch separately considers task and group affinities when searching
for swap candidates during task NUMA placement. If tasks are not part of
a group or the same group then the task weights are considered.
Otherwise the group weights are compared.

Signed-off-by: Rik van Riel <riel@xxxxxxxxxx>
---
 kernel/sched/fair.c | 59 ++++++++++++++++++++++++++++++++---------------------
 1 file changed, 36 insertions(+), 23 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 58d1070..e7da6f2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1039,13 +1039,15 @@ static void task_numa_assign(struct task_numa_env *env,
  * into account that it might be best if task running on the dst_cpu should
  * be exchanged with the source task
  */
-static void task_numa_compare(struct task_numa_env *env, long imp)
+static void task_numa_compare(struct task_numa_env *env,
+			      long taskimp, long groupimp)
 {
 	struct rq *src_rq = cpu_rq(env->src_cpu);
 	struct rq *dst_rq = cpu_rq(env->dst_cpu);
 	struct task_struct *cur;
 	long dst_load, src_load;
 	long load;
+	long imp = (groupimp > 0) ? groupimp : taskimp;
 
 	rcu_read_lock();
 	cur = ACCESS_ONCE(dst_rq->curr);
@@ -1064,10 +1066,19 @@ static void task_numa_compare(struct task_numa_env *env, long imp)
 		if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
 			goto unlock;
 
-		imp += task_weight(cur, env->src_nid) +
-		       group_weight(cur, env->src_nid) -
-		       task_weight(cur, env->dst_nid) -
-		       group_weight(cur, env->dst_nid);
+		/*
+		 * If dst and source tasks are in the same NUMA group, or not
+		 * in any group then look only at task weights otherwise give
+		 * priority to the group weights.
+		 */
+		if (!cur->numa_group || !env->p->numa_group ||
+		    cur->numa_group == env->p->numa_group) {
+			imp = taskimp + task_weight(cur, env->src_nid) -
+			      task_weight(cur, env->dst_nid);
+		} else {
+			imp = groupimp + group_weight(cur, env->src_nid) -
+			       group_weight(cur, env->dst_nid);
+		}
 	}
 
 	if (imp < env->best_imp)
@@ -1117,7 +1128,8 @@ unlock:
 	rcu_read_unlock();
 }
 
-static void task_numa_find_cpu(struct task_numa_env *env, long imp)
+static void task_numa_find_cpu(struct task_numa_env *env,
+				long taskimp, long groupimp)
 {
 	int cpu;
 
@@ -1127,7 +1139,7 @@ static void task_numa_find_cpu(struct task_numa_env *env, long imp)
 			continue;
 
 		env->dst_cpu = cpu;
-		task_numa_compare(env, imp);
+		task_numa_compare(env, taskimp, groupimp);
 	}
 }
 
@@ -1146,9 +1158,9 @@ static int task_numa_migrate(struct task_struct *p)
 		.best_cpu = -1
 	};
 	struct sched_domain *sd;
-	unsigned long weight;
+	unsigned long taskweight, groupweight;
 	int nid, ret;
-	long imp;
+	long taskimp, groupimp;
 
 	/*
 	 * Pick the lowest SD_NUMA domain, as that would have the smallest
@@ -1163,15 +1175,17 @@ static int task_numa_migrate(struct task_struct *p)
 	env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
 	rcu_read_unlock();
 
-	weight = task_weight(p, env.src_nid) + group_weight(p, env.src_nid);
+	taskweight = task_weight(p, env.src_nid);
+	groupweight = group_weight(p, env.src_nid);
 	update_numa_stats(&env.src_stats, env.src_nid);
 	env.dst_nid = p->numa_preferred_nid;
-	imp = task_weight(p, env.dst_nid) + group_weight(p, env.dst_nid) - weight;
+	taskimp = task_weight(p, env.dst_nid) - taskweight;
+	groupimp = group_weight(p, env.dst_nid) - groupweight;
 	update_numa_stats(&env.dst_stats, env.dst_nid);
 
 	/* If the preferred nid has capacity, try to use it. */
 	if (env.dst_stats.has_capacity)
-		task_numa_find_cpu(&env, imp);
+		task_numa_find_cpu(&env, taskimp, groupimp);
 
 	/* No space available on the preferred nid. Look elsewhere. */
 	if (env.best_cpu == -1) {
@@ -1180,13 +1194,14 @@ static int task_numa_migrate(struct task_struct *p)
 				continue;
 
 			/* Only consider nodes where both task and groups benefit */
-			imp = task_weight(p, nid) + group_weight(p, nid) - weight;
-			if (imp < 0)
+			taskimp = task_weight(p, nid) - taskweight;
+			groupimp = group_weight(p, nid) - groupweight;
+			if (taskimp < 0 && groupimp < 0)
 				continue;
 
 			env.dst_nid = nid;
 			update_numa_stats(&env.dst_stats, env.dst_nid);
-			task_numa_find_cpu(&env, imp);
+			task_numa_find_cpu(&env, taskimp, groupimp);
 		}
 	}
 
@@ -4678,10 +4693,9 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
 	if (dst_nid == p->numa_preferred_nid)
 		return true;
 
-	/* After the task has settled, check if the new node is better. */
-	if (p->numa_migrate_seq >= sysctl_numa_balancing_settle_count &&
-			task_weight(p, dst_nid) + group_weight(p, dst_nid) >
-			task_weight(p, src_nid) + group_weight(p, src_nid))
+	/* If both task and group weight improve, this move is a winner. */
+	if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&
+	    group_weight(p, dst_nid) > group_weight(p, src_nid))
 		return true;
 
 	return false;
@@ -4708,10 +4722,9 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
 	if (src_nid == p->numa_preferred_nid)
 		return true;
 
-	/* After the task has settled, check if the new node is worse. */
-	if (p->numa_migrate_seq >= sysctl_numa_balancing_settle_count &&
-			task_weight(p, dst_nid) + group_weight(p, dst_nid) <
-			task_weight(p, src_nid) + group_weight(p, src_nid))
+	/* If either task or group weight get worse, don't do it. */
+	if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
+	    group_weight(p, dst_nid) < group_weight(p, src_nid))
 		return true;
 
 	return false;
-- 
1.8.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>




[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]