While selecting a task to pull across to another node, try to choose a
task that improves locality, i.e., prefer a task that has more affinity
to the destination node. To achieve this, walk the list of tasks in
multiple iterations; for now, use just two. In the first iteration, a
task is chosen only if moving it improves its node locality. In the
last iteration, fall back to the default behaviour: a task is chosen
irrespective of whether the move improves node locality (the behaviour
before this change). This iteration logic applies only to cross-node
migration and only when CONFIG_NUMA_BALANCING is enabled.

So if there are two tasks on a runqueue, both eligible to be migrated
to a runqueue belonging to a different node, this change tries to
choose the one whose locality the move improves.

Similar logic was first used in Peter Zijlstra's numa core.

Signed-off-by: Srikar Dronamraju <srikar@xxxxxxxxxxxxxxxxxx>
---
 kernel/sched/fair.c |   48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 48 insertions(+), 0 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3df7f76..8fcbf96 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3906,6 +3906,7 @@ struct lb_env {
 	unsigned int		loop;
 	unsigned int		loop_break;
 	unsigned int		loop_max;
+	unsigned int		iterations;
 };
 
 /*
@@ -4030,6 +4031,21 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	return 1;
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+static bool preferred_node(struct task_struct *p, struct lb_env *env)
+{
+	if (!(env->sd->flags & SD_NUMA))
+		return false;
+
+	return (can_numa_migrate_task(p, env->dst_rq, env->src_rq) == 1);
+}
+#else
+static bool preferred_node(struct task_struct *p, struct lb_env *env)
+{
+	return false;
+}
+#endif
+
 /*
  * move_one_task tries to move exactly one task from busiest to this_rq, as
  * part of active balancing operations within "domain".
@@ -4041,7 +4057,11 @@ static int move_one_task(struct lb_env *env)
 {
 	struct task_struct *p, *n;
 
+again:
 	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+		if (!env->iterations && !preferred_node(p, env))
+			continue;
+
 		if (throttled_lb_pair(task_group(p), env->src_rq->cpu,
 					env->dst_cpu))
 			continue;
@@ -4049,6 +4069,7 @@ static int move_one_task(struct lb_env *env)
 			continue;
 
 		move_task(p, env);
+
 		/*
 		 * Right now, this is only the second place move_task()
 		 * is called, so we can safely collect move_task()
@@ -4057,6 +4078,9 @@ static int move_one_task(struct lb_env *env)
 		schedstat_inc(env->sd, lb_gained[env->idle]);
 		return 1;
 	}
+	if (!env->iterations++)
+		goto again;
+
 	return 0;
 }
 
@@ -4096,6 +4120,9 @@ static int move_tasks(struct lb_env *env)
 			break;
 		}
 
+		if (!env->iterations && !preferred_node(p, env))
+			goto next;
+
 		if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
 			goto next;
 
@@ -5099,6 +5126,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		.idle		= idle,
 		.loop_break	= sched_nr_migrate_break,
 		.cpus		= cpus,
+		.iterations	= 1,
 	};
 
 	cpumask_copy(cpus, cpu_active_mask);
@@ -5130,6 +5158,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	ld_moved = 0;
 	lb_iterations = 1;
 	if (busiest->nr_running > 1) {
+#ifdef CONFIG_NUMA_BALANCING
+		if (sd->flags & SD_NUMA) {
+			if (cpu_to_node(env.dst_cpu) != cpu_to_node(env.src_cpu))
+				env.iterations = 0;
+		}
+#endif
 		/*
 		 * Attempt to move tasks. If find_busiest_group has found
 		 * an imbalance but busiest->nr_running <= 1, the group is
@@ -5160,6 +5194,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 			goto more_balance;
 		}
 
+		if (!ld_moved && !env.iterations++) {
+			env.loop = 0;
+			env.loop_break = sched_nr_migrate_break;
+			goto more_balance;
+		}
+
 		/*
 		 * some other cpu did the load balance for us.
 		 */
@@ -5407,8 +5447,16 @@ static int active_load_balance_cpu_stop(void *data)
 			.src_cpu	= busiest_rq->cpu,
 			.src_rq		= busiest_rq,
 			.idle		= CPU_IDLE,
+			.iterations	= 1,
 		};
 
+#ifdef CONFIG_NUMA_BALANCING
+		if (sd->flags & SD_NUMA) {
+			if (cpu_to_node(env.dst_cpu) != cpu_to_node(env.src_cpu))
+				env.iterations = 0;
+		}
+#endif
+
 		schedstat_inc(sd, alb_count);
 
 		if (move_one_task(&env))
-- 
1.7.1
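
For anyone who wants to try the two-pass idea outside the kernel:
preferred_node() above leans on can_numa_migrate_task(), which is not
part of this diff (presumably it is introduced earlier in the series
and returns 1 when moving p from src_rq to dst_rq would improve its
NUMA locality). The sketch below reproduces just the iteration logic
as a standalone C program; all names in it (fake_task, acceptable,
pick_task_for) are invented for illustration, and the preferred-node
comparison is a stand-in for that helper.

/*
 * Standalone sketch of the two-pass pick. All names here are
 * hypothetical; this is illustration, not kernel code.
 */
#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

struct fake_task {
	int pid;
	int preferred_node;	/* node where the task's memory mostly lives */
};

/*
 * Pass 0 mimics the NUMA-filtered walk: accept a task only if its
 * preferred node is the pull destination. Pass 1 mimics the fallback
 * walk: accept any task, i.e. the pre-patch behaviour.
 */
static bool acceptable(const struct fake_task *t, int dst_node, int pass)
{
	return pass > 0 || t->preferred_node == dst_node;
}

static const struct fake_task *pick_task_for(const struct fake_task *rq,
					     size_t nr, int dst_node)
{
	for (int pass = 0; pass < 2; pass++) {
		for (size_t i = 0; i < nr; i++) {
			if (acceptable(&rq[i], dst_node, pass))
				return &rq[i];
		}
	}
	return NULL;	/* empty runqueue */
}

int main(void)
{
	const struct fake_task rq[] = {
		{ .pid = 101, .preferred_node = 0 },
		{ .pid = 102, .preferred_node = 1 },
	};

	/*
	 * Pull toward node 1: pass 0 skips pid 101 and picks pid 102,
	 * even though 101 comes first in the list. Before this change,
	 * 101 would have been chosen.
	 */
	const struct fake_task *p = pick_task_for(rq, 2, 1);

	printf("picked pid %d\n", p ? p->pid : -1);
	return 0;
}

The second pass is what keeps the change safe as a fallback: if no
task on the source runqueue prefers the destination node, load
balancing still makes progress instead of bailing out, while the
common case exits in the first pass without sorting the task list.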