On Thu, Sep 17, 2015 at 04:35:27PM +0200, Peter Zijlstra wrote:
> I'd be happy to fail a CPU down for user tasks where this is the last
> runnable CPU of.

A little like so. Completely untested.

---
Subject: sched: Refuse to unplug a CPU if this will violate user task affinity

It's bad policy to allow unplugging a CPU for which a user set an
explicit affinity, whether the task is bound strictly to this CPU or
this happens to be the last online CPU in its mask. Either case would
end up forcing the thread onto a random other CPU, violating the
sys_sched_setaffinity() constraint.

Disallow this by default; root might not be aware of all user
affinities, but can negotiate and change affinities for all tasks.
Provide a sysctl to go back to the old behaviour.

Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
 include/linux/sched/sysctl.h |  1 +
 kernel/sched/core.c          | 46 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/sysctl.c              |  9 +++++++++
 3 files changed, 56 insertions(+)

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index c9e4731cf10b..9444b549914b 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -39,6 +39,7 @@ extern unsigned int sysctl_sched_latency;
 extern unsigned int sysctl_sched_min_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
 extern unsigned int sysctl_sched_child_runs_first;
+extern unsigned int sysctl_sched_strict_affinity;
 
 enum sched_tunable_scaling {
 	SCHED_TUNABLESCALING_NONE,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6ab415aa15c4..457c8b912fc6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -284,6 +284,11 @@ __read_mostly int scheduler_running;
  */
 int sysctl_sched_rt_runtime = 950000;
 
+/*
+ * Disallow CPU unplug if that would leave a task without any runnable CPU.
+ */
+unsigned int sysctl_sched_strict_affinity = 1;
+
 /* cpus with isolated domains */
 cpumask_var_t cpu_isolated_map;
 
@@ -5430,6 +5435,42 @@ static void set_rq_offline(struct rq *rq)
 }
 
 /*
+ * Test if there's a user task for which @cpu is the last runnable CPU.
+ */
+static bool migration_possible(int cpu)
+{
+	struct task_struct *g, *p;
+	bool ret = true;
+	int next;
+
+	read_lock(&tasklist_lock);
+	for_each_process_thread(g, p) {
+		/* if it's running elsewhere, this cannot be its last CPU */
+		if (task_cpu(p) != cpu)
+			continue;
+
+		/* we only care about user state */
+		if (p->flags & PF_KTHREAD)
+			continue;
+
+		next = -1;
+again:
+		next = cpumask_next_and(next, tsk_cpus_allowed(p), cpu_active_mask);
+		if (next >= nr_cpu_ids) {
+			printk(KERN_WARNING "task %s-%d refused unplug of CPU %d\n",
+			       p->comm, p->pid, cpu);
+			ret = false;
+			break;
+		}
+		if (next == cpu)
+			goto again;
+	}
+	read_unlock(&tasklist_lock);
+
+	return ret;
+}
+
+/*
  * migration_call - callback that gets triggered when a CPU is added.
  * Here we can start up the necessary migration thread for the new CPU.
 */
@@ -5440,6 +5481,11 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	unsigned long flags;
 	struct rq *rq = cpu_rq(cpu);
 
+	if (action == CPU_DOWN_PREPARE && sysctl_sched_strict_affinity) {
+		if (!migration_possible(cpu))
+			return notifier_from_errno(-EBUSY);
+	}
+
 	switch (action & ~CPU_TASKS_FROZEN) {
 
 	case CPU_UP_PREPARE:
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e69201d8094e..9d0edcc73cc3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -283,6 +283,15 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+#ifdef CONFIG_SMP
+	{
+		.procname	= "sched_strict_affinity",
+		.data		= &sysctl_sched_strict_affinity,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif /* CONFIG_SMP */
 #ifdef CONFIG_SCHED_DEBUG
 	{
 		.procname	= "sched_min_granularity_ns",
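
For anyone wanting to poke at this: below is a rough userspace sketch
(not part of the patch) that should exercise both modes. It assumes the
patch above is applied, uses the standard sysfs hotplug and procfs
sysctl files, and needs to run as root on a CPU that is otherwise safe
to offline; the write_str() helper is made up for the example.

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* write @val to @path, returning 0 or -errno */
static int write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);
	ssize_t ret;

	if (fd < 0)
		return -errno;
	ret = write(fd, val, strlen(val));
	close(fd);
	return ret < 0 ? -errno : 0;
}

int main(int argc, char **argv)
{
	int cpu = argc > 1 ? atoi(argv[1]) : 1;
	cpu_set_t set;
	char path[64];
	int err;

	/* bind this task to @cpu only; @cpu is now its last runnable CPU */
	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	if (sched_setaffinity(0, sizeof(set), &set)) {
		perror("sched_setaffinity");
		return 1;
	}

	snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/online", cpu);

	/* strict mode (default 1): the unplug should fail with -EBUSY */
	err = write_str(path, "0");
	printf("strict unplug of CPU %d: %s\n", cpu, err == -EBUSY ?
	       "refused (EBUSY)" : err ? strerror(-err) : "unexpectedly succeeded");

	/* relax the new check and retry; this should now succeed */
	if (write_str("/proc/sys/kernel/sched_strict_affinity", "0"))
		perror("sched_strict_affinity");
	err = write_str(path, "0");
	printf("relaxed unplug of CPU %d: %s\n", cpu,
	       err ? strerror(-err) : "succeeded");

	/* put things back */
	write_str(path, "1");
	write_str("/proc/sys/kernel/sched_strict_affinity", "1");
	return 0;
}

When the refusal triggers, the "task %s-%d refused unplug of CPU %d"
warning from migration_possible() should also show up in dmesg.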