On Mon, Oct 09, 2017 at 06:57:46PM +0530, Prateek Sood wrote:
> On 09/07/2017 11:21 PM, Peter Zijlstra wrote:
> > But if you invert these locks, the need for cpuset_hotplug_workfn() goes
> > away, at least for the CPU part, and we can make it synchronous again.
> > Yay!!
>
> The callback making a call to cpuset_hotplug_workfn() in the hotplug path is:
>
>	[CPUHP_AP_ACTIVE] = {
>		.name			= "sched:active",
>		.startup.single		= sched_cpu_activate,
>		.teardown.single	= sched_cpu_deactivate,
>	},
>
> If we make cpuset_hotplug_workfn() synchronous, a deadlock might happen:
>
> _cpu_down()
>   cpus_write_lock()			// held
>     cpuhp_kick_ap_work()
>       cpuhp_kick_ap()
>         __cpuhp_kick_ap()
>           wake_up_process()		// cpuhp_thread_fun
>             wait_for_ap_thread()	// wait for completion from cpuhp_thread_fun()
>
> cpuhp_thread_fun()
>   cpuhp_invoke_callback()
>     sched_cpu_deactivate()
>       cpuset_cpu_inactive()
>         cpuset_update_active_cpus()
>           cpuset_hotplug_work()
>             rebuild_sched_domains()
>               cpus_read_lock()		// waiting, as acquired in _cpu_down()

Well, duh, don't use rebuild_sched_domains() 'obviously' :-) Use
rebuild_sched_domains_cpuslocked() instead and it works just fine.

After applying your patch, the below boots and survives a hotplug.

---
 include/linux/cpuset.h |    6 ------
 kernel/cgroup/cpuset.c |   30 +++++++++---------------------
 kernel/power/process.c |    2 --
 kernel/sched/core.c    |    1 -
 4 files changed, 9 insertions(+), 30 deletions(-)

--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -51,9 +51,7 @@ static inline void cpuset_dec(void)
 
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
-extern void cpuset_force_rebuild(void);
 extern void cpuset_update_active_cpus(void);
-extern void cpuset_wait_for_hotplug(void);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
 extern void cpuset_cpus_allowed_fallback(struct task_struct *p);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
@@ -166,15 +164,11 @@ static inline bool cpusets_enabled(void)
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
 
-static inline void cpuset_force_rebuild(void) { }
-
 static inline void cpuset_update_active_cpus(void)
 {
 	partition_sched_domains(1, NULL, NULL);
 }
 
-static inline void cpuset_wait_for_hotplug(void) { }
-
 static inline void cpuset_cpus_allowed(struct task_struct *p,
 				       struct cpumask *mask)
 {
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -833,7 +833,12 @@ static void rebuild_sched_domains_cpuslo
 	cpumask_var_t *doms;
 	int ndoms;
 
+	/*
+	 * When called during hotplug, this lock is held by the calling
+	 * thread, not cpuhp_thread_fun :/
+	 *
 	lockdep_assert_cpus_held();
+	 */
 	lockdep_assert_held(&cpuset_mutex);
 
 	/*
@@ -2281,13 +2286,6 @@ static void cpuset_hotplug_update_tasks(
 	mutex_unlock(&cpuset_mutex);
 }
 
-static bool force_rebuild;
-
-void cpuset_force_rebuild(void)
-{
-	force_rebuild = true;
-}
-
 /**
  * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
  *
@@ -2362,25 +2360,15 @@ static void cpuset_hotplug_workfn(struct
 	}
 
 	/* rebuild sched domains if cpus_allowed has changed */
-	if (cpus_updated || force_rebuild) {
-		force_rebuild = false;
+	if (cpus_updated)
 		rebuild_sched_domains();
-	}
 }
 
 void cpuset_update_active_cpus(void)
 {
-	/*
-	 * We're inside cpu hotplug critical region which usually nests
-	 * inside cgroup synchronization. Bounce actual hotplug processing
-	 * to a work item to avoid reverse locking order.
-	 */
-	schedule_work(&cpuset_hotplug_work);
-}
-
-void cpuset_wait_for_hotplug(void)
-{
-	flush_work(&cpuset_hotplug_work);
+	mutex_lock(&cpuset_mutex);
+	rebuild_sched_domains_cpuslocked();
+	mutex_unlock(&cpuset_mutex);
 }
 
 /*
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -203,8 +203,6 @@ void thaw_processes(void)
 	__usermodehelper_set_disable_depth(UMH_FREEZING);
 	thaw_workqueues();
 
-	cpuset_wait_for_hotplug();
-
 	read_lock(&tasklist_lock);
 	for_each_process_thread(g, p) {
 		/* No other threads should have PF_SUSPEND_TASK set */
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5598,7 +5598,6 @@ static void cpuset_cpu_active(void)
 		 * restore the original sched domains by considering the
 		 * cpuset configurations.
 		 */
-		cpuset_force_rebuild();
 	}
 	cpuset_update_active_cpus();
 }

--
To unsubscribe from this list: send the line "unsubscribe cgroups" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html
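
For readers outside the thread: the convention the fix leans on is that
rebuild_sched_domains() takes the CPU hotplug lock itself, while
rebuild_sched_domains_cpuslocked() only asserts that its caller already
holds it, making it the right call from paths (like the hotplug callbacks
above) where the lock is already taken. Below is a minimal, hypothetical
userspace C sketch of that convention, with a pthread mutex standing in
for the hotplug lock; every name in it (hotplug_lock, rebuild_domains,
rebuild_domains_locked, hotplug_callback) is invented for illustration
and none of it is kernel code.

/*
 * Hypothetical sketch of the *_locked convention. A pthread mutex
 * stands in for the CPU hotplug lock; not kernel code.
 */
#include <assert.h>
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t hotplug_lock = PTHREAD_MUTEX_INITIALIZER;

/* Core work; the caller must already hold hotplug_lock. */
static void rebuild_domains_locked(void)
{
	/* Crude stand-in for lockdep_assert_cpus_held(): trylock must fail. */
	assert(pthread_mutex_trylock(&hotplug_lock) == EBUSY);
	printf("rebuilding domains under the lock\n");
}

/* Entry point for callers that do NOT hold the lock: take it, then work. */
static void rebuild_domains(void)
{
	pthread_mutex_lock(&hotplug_lock);
	rebuild_domains_locked();
	pthread_mutex_unlock(&hotplug_lock);
}

/*
 * Hotplug-style path: the lock is already held here, so it must use the
 * *_locked variant. Calling rebuild_domains() instead would try to take
 * a lock this thread already owns, the same shape as cpus_read_lock()
 * under cpus_write_lock() in the chain quoted above.
 */
static void hotplug_callback(void)
{
	pthread_mutex_lock(&hotplug_lock);	/* like cpus_write_lock() */
	rebuild_domains_locked();
	pthread_mutex_unlock(&hotplug_lock);
}

int main(void)
{
	rebuild_domains();	/* unlocked caller */
	hotplug_callback();	/* locked caller */
	return 0;
}

In the patch above, cpuset_update_active_cpus() runs from the hotplug
path with the lock already held by the calling thread, which is why it
now calls rebuild_sched_domains_cpuslocked() rather than
rebuild_sched_domains().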