There were a few inconsistencies in how numa_allow_migration() was
used; in particular, it did not always take into account high-imbalance
scenarios, where affinity preferences are generally overridden.

To fix this, make the use of numa_allow_migration() more consistent and
also pass the load-balancing environment into the function, where it
can look at env->failed and env->sd->cache_nice_tries.

Also add a NUMA check to active load balancing (ALB).

Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx>
Cc: Rik van Riel <riel@xxxxxxxxxx>
Cc: Mel Gorman <mgorman@xxxxxxx>
Cc: Hugh Dickins <hughd@xxxxxxxxxx>
Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx>
---
[ A simplified, illustrative sketch of the new numa_allow_migration()
  policy is appended below the diff. ]

 kernel/sched/fair.c | 103 +++++++++++++++++++++++++++++-----------------------
 1 file changed, 57 insertions(+), 46 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c393fba..503ec29 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4792,6 +4792,39 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
 
 #endif
 
+#define LBF_ALL_PINNED	0x01
+#define LBF_NEED_BREAK	0x02
+#define LBF_SOME_PINNED	0x04
+
+struct lb_env {
+	struct sched_domain	*sd;
+
+	struct rq		*src_rq;
+	int			src_cpu;
+
+	int			dst_cpu;
+	struct rq		*dst_rq;
+
+	struct cpumask		*dst_grpmask;
+	int			new_dst_cpu;
+	enum cpu_idle_type	idle;
+	long			imbalance;
+	/* The set of CPUs under consideration for load-balancing */
+	struct cpumask		*cpus;
+
+	unsigned int		flags;
+	unsigned int		failed;
+	unsigned int		iteration;
+
+	unsigned int		loop;
+	unsigned int		loop_break;
+	unsigned int		loop_max;
+
+	struct rq *		(*find_busiest_queue)(struct lb_env *,
+						      struct sched_group *);
+};
+
+
 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 {
 	s64 this_load, load;
@@ -5011,30 +5044,35 @@ done:
 	return target;
 }
 
-static bool numa_allow_migration(struct task_struct *p, int prev_cpu, int new_cpu)
+static bool numa_allow_migration(struct task_struct *p, int prev_cpu, int new_cpu,
+				 struct lb_env *env)
 {
 #ifdef CONFIG_NUMA_BALANCING
+
 	if (sched_feat(NUMA_CONVERGE_MIGRATIONS)) {
 		/* Help in the direction of expected convergence: */
 		if (p->convergence_node >= 0 && (cpu_to_node(new_cpu) != p->convergence_node))
 			return false;
 
-		return true;
-	}
-
-	if (sched_feat(NUMA_BALANCE_ALL)) {
-		if (task_numa_shared(p) >= 0)
-			return false;
-
-		return true;
+		if (!env || env->failed <= env->sd->cache_nice_tries) {
+			if (task_numa_shared(p) >= 0 &&
+			    cpu_to_node(prev_cpu) != cpu_to_node(new_cpu))
+				return false;
+		}
 	}
 
 	if (sched_feat(NUMA_BALANCE_INTERNODE)) {
 		if (task_numa_shared(p) >= 0) {
-			if (cpu_to_node(prev_cpu) != cpu_to_node(new_cpu))
+			if (cpu_to_node(prev_cpu) != cpu_to_node(new_cpu))
 				return false;
 		}
 	}
+
+	if (sched_feat(NUMA_BALANCE_ALL)) {
+		if (task_numa_shared(p) >= 0)
+			return false;
+	}
+
 #endif
 	return true;
 }
@@ -5148,7 +5186,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
 		/* while loop will break here if sd == NULL */
 	}
 unlock:
-	if (!numa_allow_migration(p, prev0_cpu, new_cpu)) {
+	if (!numa_allow_migration(p, prev0_cpu, new_cpu, NULL)) {
 		if (cpumask_test_cpu(prev0_cpu, tsk_cpus_allowed(p)))
 			new_cpu = prev0_cpu;
 	}
@@ -5567,38 +5605,6 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
-#define LBF_ALL_PINNED	0x01
-#define LBF_NEED_BREAK	0x02
-#define LBF_SOME_PINNED	0x04
-
-struct lb_env {
-	struct sched_domain	*sd;
-
-	struct rq		*src_rq;
-	int			src_cpu;
-
-	int			dst_cpu;
-	struct rq		*dst_rq;
-
-	struct cpumask		*dst_grpmask;
-	int			new_dst_cpu;
-	enum cpu_idle_type	idle;
-	long			imbalance;
-	/* The set of CPUs under consideration for load-balancing */
-	struct cpumask		*cpus;
-
-	unsigned int		flags;
-	unsigned int		failed;
-	unsigned int		iteration;
-
-	unsigned int		loop;
-	unsigned int		loop_break;
-	unsigned int		loop_max;
-
-	struct rq *		(*find_busiest_queue)(struct lb_env *,
-						      struct sched_group *);
-};
-
 /*
  * move_task - move a task from one runqueue to another runqueue.
  * Both runqueues must be locked.
  */
@@ -5699,7 +5705,7 @@ static int can_migrate_task(struct task_struct *p, struct lb_env *env)
 
 	/* We do NUMA balancing elsewhere: */
 	if (env->failed <= env->sd->cache_nice_tries) {
-		if (!numa_allow_migration(p, env->src_rq->cpu, env->dst_cpu))
+		if (!numa_allow_migration(p, env->src_rq->cpu, env->dst_cpu, env))
 			return false;
 	}
 
@@ -5760,7 +5766,7 @@ static int move_one_task(struct lb_env *env)
 		if (!can_migrate_task(p, env))
 			continue;
 
-		if (!numa_allow_migration(p, env->src_rq->cpu, env->dst_cpu))
+		if (!numa_allow_migration(p, env->src_rq->cpu, env->dst_cpu, env))
 			continue;
 
 		move_task(p, env);
@@ -5823,7 +5829,7 @@ static int move_tasks(struct lb_env *env)
 		if (!can_migrate_task(p, env))
 			goto next;
 
-		if (!numa_allow_migration(p, env->src_rq->cpu, env->dst_cpu))
+		if (!numa_allow_migration(p, env->src_rq->cpu, env->dst_cpu, env))
 			goto next;
 
 		move_task(p, env);
@@ -6944,6 +6950,11 @@ more_balance:
 			goto out_pinned;
 		}
 
+		/* Is this active load-balancing NUMA-beneficial? */
+		if (!numa_allow_migration(busiest->curr, env.src_rq->cpu, env.dst_cpu, &env)) {
+			raw_spin_unlock_irqrestore(&busiest->lock, flags);
+			goto out;
+		}
 		/*
 		 * ->active_balance synchronizes accesses to
 		 * ->active_balance_work. Once set, it's cleared
-- 
1.7.11.7
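
[ Reading aid only, not part of the patch: a minimal, self-contained
  sketch of the migration policy the reworked numa_allow_migration()
  expresses on its NUMA_CONVERGE_MIGRATIONS path. The toy_task and
  toy_lb_env types below are made-up stand-ins for the real task_struct,
  lb_env and sched_feat() machinery; only the convergence check and the
  env->failed vs. cache_nice_tries override are modelled. ]

/*
 * Illustrative sketch only -- NOT the kernel implementation.
 * Shows the order of checks: convergence preference first, then a
 * node-affinity check that is honored only while the balancer has not
 * failed more than cache_nice_tries times.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_task {
	int convergence_node;	/* preferred node, -1 if none */
	int numa_shared;	/* >= 0 means NUMA-sensitive (shared) task */
};

struct toy_lb_env {
	unsigned int failed;		/* consecutive balance failures */
	unsigned int cache_nice_tries;	/* stand-in for sd->cache_nice_tries */
};

/* Hypothetical helper mirroring the patch's policy on the toy model: */
static bool toy_numa_allow_migration(const struct toy_task *p,
				     int prev_node, int new_node,
				     const struct toy_lb_env *env)
{
	/* Help in the direction of expected convergence: */
	if (p->convergence_node >= 0 && new_node != p->convergence_node)
		return false;

	/*
	 * Honor node affinity only while the balancer is not desperate:
	 * once env->failed exceeds cache_nice_tries, the imbalance wins
	 * and a cross-node migration is allowed anyway.
	 */
	if (!env || env->failed <= env->cache_nice_tries) {
		if (p->numa_shared >= 0 && prev_node != new_node)
			return false;
	}

	return true;
}

int main(void)
{
	struct toy_task p = { .convergence_node = -1, .numa_shared = 1 };
	struct toy_lb_env calm      = { .failed = 0, .cache_nice_tries = 1 };
	struct toy_lb_env desperate = { .failed = 3, .cache_nice_tries = 1 };

	/* Cross-node move of a shared task: refused while balanced enough... */
	printf("calm:      %d\n", toy_numa_allow_migration(&p, 0, 1, &calm));
	/* ...but allowed once load balancing has failed repeatedly. */
	printf("desperate: %d\n", toy_numa_allow_migration(&p, 0, 1, &desperate));
	return 0;
}

[ Compiled with a plain cc, the toy program prints 0 for the calm case
  and 1 for the desperate case, mirroring how a high failure count lets
  the balancer override the NUMA affinity preference. ]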