Commit-ID: bcdf5162b92dfc0999b0e0ecf25d778733cc4c4d Gitweb: http://git.kernel.org/tip/bcdf5162b92dfc0999b0e0ecf25d778733cc4c4d Author: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx> AuthorDate: Thu, 17 May 2012 15:07:31 +0200 Committer: Ingo Molnar <mingo@xxxxxxxxxx> CommitDate: Fri, 18 May 2012 09:48:59 +0200 sched/numa: Provide sysctl knob to disable numa scheduling and turn it off by default Provide a knob to make all this numa scheduling go away. Also provide a Kconfig entry to set the default for this new knob. Requested-by: Ingo Molnar <mingo@xxxxxxxxxx> Signed-off-by: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx> Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> Link: http://lkml.kernel.org/n/tip-lz8zudea6tqgbxduk9mcs7x3@xxxxxxxxxxxxxx Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx> --- include/linux/sched.h | 13 +++++++ init/Kconfig | 18 ++++++++++ kernel/sched/numa.c | 89 +++++++++++++++++++++++++++++++++++++++--------- kernel/sysctl.c | 11 ++++++ 4 files changed, 114 insertions(+), 17 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 024a5f9..4879103 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -90,6 +90,7 @@ struct sched_param { #include <linux/latencytop.h> #include <linux/cred.h> #include <linux/llist.h> +#include <linux/jump_label.h> #include <asm/processor.h> @@ -1584,9 +1585,14 @@ struct task_struct { /* Future-safe accessor for struct task_struct's cpus_allowed. 
*/ #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) +extern struct static_key sched_numa_disabled; + static inline int tsk_home_node(struct task_struct *p) { #ifdef CONFIG_NUMA + if (static_key_false(&sched_numa_disabled)) + return -1; + return p->node; #else return -1; @@ -2058,6 +2064,13 @@ static inline void sched_autogroup_exit(struct signal_struct *sig) { } extern unsigned int sysctl_sched_cfs_bandwidth_slice; #endif +#ifdef CONFIG_NUMA +extern int sysctl_sched_numa; +int sched_numa_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); +#endif + #ifdef CONFIG_RT_MUTEXES extern int rt_mutex_getprio(struct task_struct *p); extern void rt_mutex_setprio(struct task_struct *p, int prio); diff --git a/init/Kconfig b/init/Kconfig index e4e84f2..2f6bfc1 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -865,6 +865,24 @@ config SCHED_AUTOGROUP desktop applications. Task group autogeneration is currently based upon task session. +config SCHED_NUMA_DEFAULT + bool "Enable NUMA scheduling by default" + depends on NUMA + default n + help + This option selects the default enablement of a scheduler feature + that gives each process a home-node and allocates all its memory + from there and tries to schedule all the process tasks on that node + (or as near to it) while trying to maintain fairness. + + Without this feature memory is allocated on whatever node a task + happens to run on and the scheduler is free to migrate tasks around + at will -- which can result in significant cross-node memory + traffic. + + Regardless of this setting it can always be changed at runtime + by changing /proc/sys/kernel/sched_numa. 
+ config MM_OWNER def_bool NUMA diff --git a/kernel/sched/numa.c b/kernel/sched/numa.c index 7b74a15..b98338b 100644 --- a/kernel/sched/numa.c +++ b/kernel/sched/numa.c @@ -18,6 +18,9 @@ #include "sched.h" +struct static_key sched_numa_disabled = STATIC_KEY_INIT_FALSE; +static DEFINE_MUTEX(sched_numa_mutex); +int sysctl_sched_numa = IS_ENABLED(CONFIG_SCHED_NUMA_DEFAULT); static const int numa_balance_interval = 2 * HZ; /* 2 seconds */ @@ -137,7 +140,7 @@ bool account_numa_enqueue(struct task_struct *p) void account_numa_dequeue(struct task_struct *p) { - int home_node = tsk_home_node(p); + int home_node = p->node; /* ignore sched_numa_disabled */ struct numa_cpu_load *nl; struct rq *rq; @@ -444,7 +447,7 @@ void select_task_node(struct task_struct *p, struct mm_struct *mm, int sd_flags) { int node; - if (!sched_feat(NUMA_SELECT)) { + if (!sched_feat(NUMA_SELECT) || !sysctl_sched_numa) { p->node = -1; return; } @@ -766,13 +769,74 @@ static int numad_thread(void *data) return 0; } +static int numad_create(struct node_queue *nq) +{ + struct task_struct *numad; + + if (!sysctl_sched_numa) + return 0; + + numad = kthread_create_on_node(numad_thread, + nq, nq->node, "numad/%d", nq->node); + if (IS_ERR(numad)) + return PTR_ERR(numad); + + nq->numad = numad; + nq->next_schedule = jiffies + HZ; + + return 0; +} + +static void numad_destroy(struct node_queue *nq) +{ + kthread_stop(nq->numad); + nq->numad = NULL; +} + +int sched_numa_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int old, new, ret, node; + + mutex_lock(&sched_numa_mutex); + get_online_cpus(); + + old = sysctl_sched_numa; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + new = sysctl_sched_numa; + + if (old == new) + goto unlock; + + if (new) + static_key_slow_dec(&sched_numa_disabled); + else + static_key_slow_inc(&sched_numa_disabled); + + for_each_online_node(node) { + struct node_queue *nq = nq_of(node); + + if (new && !nq->numad) { + if 
(!numad_create(nq)) + wake_up_process(nq->numad); + } else if (!new && nq->numad) + numad_destroy(nq); + } + +unlock: + put_online_cpus(); + mutex_unlock(&sched_numa_mutex); + + return ret; +} + static int __cpuinit numa_hotplug(struct notifier_block *nb, unsigned long action, void *hcpu) { int cpu = (long)hcpu; int node = cpu_to_node(cpu); struct node_queue *nq = nq_of(node); - struct task_struct *numad; int err = 0; switch (action & ~CPU_TASKS_FROZEN) { @@ -780,19 +844,12 @@ numa_hotplug(struct notifier_block *nb, unsigned long action, void *hcpu) if (nq->numad) break; - numad = kthread_create_on_node(numad_thread, - nq, node, "numad/%d", node); - if (IS_ERR(numad)) { - err = PTR_ERR(numad); - break; - } - - nq->numad = numad; - nq->next_schedule = jiffies + HZ; // XXX sync-up? + err = numad_create(nq); break; case CPU_ONLINE: - wake_up_process(nq->numad); + if (nq->numad) + wake_up_process(nq->numad); break; case CPU_DEAD: @@ -801,10 +858,8 @@ numa_hotplug(struct notifier_block *nb, unsigned long action, void *hcpu) break; if (cpumask_any_and(cpu_online_mask, - cpumask_of_node(node)) >= nr_cpu_ids) { - kthread_stop(nq->numad); - nq->numad = NULL; - } + cpumask_of_node(node)) >= nr_cpu_ids) + numad_destroy(nq); break; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4ab1187..40ecba2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -373,6 +373,17 @@ static struct ctl_table kern_table[] = { .extra1 = &one, }, #endif +#ifdef CONFIG_NUMA + { + .procname = "sched_numa", + .data = &sysctl_sched_numa, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sched_numa_handler, + .extra1 = &zero, + .extra2 = &one, + }, +#endif #ifdef CONFIG_PROVE_LOCKING { .procname = "prove_locking", -- To unsubscribe from this list: send the line "unsubscribe linux-tip-commits" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html