On 10/30/2017 12:46 PM, Prateek Sood wrote:
> Remove a circular dependency deadlock in a scenario where hotplug of a CPU is
> being done while there is an update to cgroup and cpuset triggered from
> userspace.
>
> Process A => kthreadd => Process B => Process C => Process A
>
> Process A
> cpu_subsys_offline();
>   cpu_down();
>     _cpu_down();
>       percpu_down_write(&cpu_hotplug_lock); //held
>       cpuhp_invoke_callback();
>         workqueue_offline_cpu();
>           queue_work_on(); // unbind_work on system_highpri_wq
>             __queue_work();
>               insert_work();
>                 wake_up_worker();
>           flush_work();
>             wait_for_completion();
>
> worker_thread();
>   manage_workers();
>     create_worker();
>       kthread_create_on_node();
>         wake_up_process(kthreadd_task);
>
> kthreadd
> kthreadd();
>   kernel_thread();
>     do_fork();
>       copy_process();
>         percpu_down_read(&cgroup_threadgroup_rwsem);
>           __rwsem_down_read_failed_common(); //waiting
>
> Process B
> kernfs_fop_write();
>   cgroup_file_write();
>     cgroup_procs_write();
>       percpu_down_write(&cgroup_threadgroup_rwsem); //held
>       cgroup_attach_task();
>         cgroup_migrate();
>           cgroup_migrate_execute();
>             cpuset_can_attach();
>               mutex_lock(&cpuset_mutex); //waiting
>
> Process C
> kernfs_fop_write();
>   cgroup_file_write();
>     cpuset_write_resmask();
>       mutex_lock(&cpuset_mutex); //held
>       update_cpumask();
>         update_cpumasks_hier();
>           rebuild_sched_domains_locked();
>             get_online_cpus();
>               percpu_down_read(&cpu_hotplug_lock); //waiting
>
> Eliminate the deadlock by reversing the locking order of cpuset_mutex and
> cpu_hotplug_lock. After inverting the locking sequence of cpu_hotplug_lock
> and cpuset_mutex, the work done by cpuset_hotplug_workfn() can be
> done synchronously from the context doing cpu hotplug. For memory hotplug
> it still gets queued as a work item.
>
> Signed-off-by: Prateek Sood <prsood@xxxxxxxxxxxxxx>
> ---
>  include/linux/cpuset.h |  6 ----
>  kernel/cgroup/cpuset.c | 94 +++++++++++++++++++++++++++-----------------------
>  kernel/power/process.c |  2 --
>  kernel/sched/core.c    |  1 -
>  4 files changed, 50 insertions(+), 53 deletions(-)
>
> diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
> index a1e6a33..e74655d 100644
> --- a/include/linux/cpuset.h
> +++ b/include/linux/cpuset.h
> @@ -51,9 +51,7 @@ static inline void cpuset_dec(void)
>
>  extern int cpuset_init(void);
>  extern void cpuset_init_smp(void);
> -extern void cpuset_force_rebuild(void);
>  extern void cpuset_update_active_cpus(void);
> -extern void cpuset_wait_for_hotplug(void);
>  extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
>  extern void cpuset_cpus_allowed_fallback(struct task_struct *p);
>  extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
> @@ -166,15 +164,11 @@ static inline void set_mems_allowed(nodemask_t nodemask)
>  static inline int cpuset_init(void) { return 0; }
>  static inline void cpuset_init_smp(void) {}
>
> -static inline void cpuset_force_rebuild(void) { }
> -
>  static inline void cpuset_update_active_cpus(void)
>  {
>          partition_sched_domains(1, NULL, NULL);
>  }
>
> -static inline void cpuset_wait_for_hotplug(void) { }
> -
>  static inline void cpuset_cpus_allowed(struct task_struct *p,
>                                         struct cpumask *mask)
>  {
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index 4657e29..ec44aaa 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -817,6 +817,18 @@ static int generate_sched_domains(cpumask_var_t **domains,
>          return ndoms;
>  }
>
> +static void cpuset_sched_change_begin(void)
> +{
> +        cpus_read_lock();
> +        mutex_lock(&cpuset_mutex);
> +}
> +
> +static void cpuset_sched_change_end(void)
> +{
> +        mutex_unlock(&cpuset_mutex);
> +        cpus_read_unlock();
> +}
> +
>  /*
>   * Rebuild scheduler domains.
>   *
> @@ -826,16 +838,14 @@ static int generate_sched_domains(cpumask_var_t **domains,
>   * 'cpus' is removed, then call this routine to rebuild the
>   * scheduler's dynamic sched domains.
>   *
> - * Call with cpuset_mutex held. Takes get_online_cpus().
>   */
> -static void rebuild_sched_domains_locked(void)
> +static void rebuild_sched_domains_cpuslocked(void)
>  {
>          struct sched_domain_attr *attr;
>          cpumask_var_t *doms;
>          int ndoms;
>
>          lockdep_assert_held(&cpuset_mutex);
> -        get_online_cpus();
>
>          /*
>           * We have raced with CPU hotplug. Don't do anything to avoid
> @@ -843,27 +853,25 @@ static void rebuild_sched_domains_locked(void)
>           * Anyways, hotplug work item will rebuild sched domains.
>           */
>          if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
> -                goto out;
> +                return;
>
>          /* Generate domain masks and attrs */
>          ndoms = generate_sched_domains(&doms, &attr);
>
>          /* Have scheduler rebuild the domains */
>          partition_sched_domains(ndoms, doms, attr);
> -out:
> -        put_online_cpus();
>  }
>  #else /* !CONFIG_SMP */
> -static void rebuild_sched_domains_locked(void)
> +static void rebuild_sched_domains_cpuslocked(void)
>  {
>  }
>  #endif /* CONFIG_SMP */
>
>  void rebuild_sched_domains(void)
>  {
> -        mutex_lock(&cpuset_mutex);
> -        rebuild_sched_domains_locked();
> -        mutex_unlock(&cpuset_mutex);
> +        cpuset_sched_change_begin();
> +        rebuild_sched_domains_cpuslocked();
> +        cpuset_sched_change_end();
>  }
>
>  /**
> @@ -949,7 +957,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
>          rcu_read_unlock();
>
>          if (need_rebuild_sched_domains)
> -                rebuild_sched_domains_locked();
> +                rebuild_sched_domains_cpuslocked();
>  }
>
>  /**
> @@ -1281,7 +1289,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
>                  cs->relax_domain_level = val;
>                  if (!cpumask_empty(cs->cpus_allowed) &&
>                      is_sched_load_balance(cs))
> -                        rebuild_sched_domains_locked();
> +                        rebuild_sched_domains_cpuslocked();
>          }
>
>          return 0;
> @@ -1314,7 +1322,6 @@ static void update_tasks_flags(struct cpuset *cs)
>   *
>   * Call with cpuset_mutex held.
>   */
> -
>  static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
>                         int turning_on)
>  {
> @@ -1347,7 +1354,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
>          spin_unlock_irq(&callback_lock);
>
>          if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
> -                rebuild_sched_domains_locked();
> +                rebuild_sched_domains_cpuslocked();
>
>          if (spread_flag_changed)
>                  update_tasks_flags(cs);
> @@ -1615,7 +1622,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
>          cpuset_filetype_t type = cft->private;
>          int retval = 0;
>
> -        mutex_lock(&cpuset_mutex);
> +        cpuset_sched_change_begin();
>          if (!is_cpuset_online(cs)) {
>                  retval = -ENODEV;
>                  goto out_unlock;
> @@ -1651,7 +1658,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
>                  break;
>          }
>  out_unlock:
> -        mutex_unlock(&cpuset_mutex);
> +        cpuset_sched_change_end();
>          return retval;
>  }
>
> @@ -1662,7 +1669,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
>          cpuset_filetype_t type = cft->private;
>          int retval = -ENODEV;
>
> -        mutex_lock(&cpuset_mutex);
> +        cpuset_sched_change_begin();
>          if (!is_cpuset_online(cs))
>                  goto out_unlock;
>
> @@ -1675,7 +1682,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
>                  break;
>          }
>  out_unlock:
> -        mutex_unlock(&cpuset_mutex);
> +        cpuset_sched_change_end();
>          return retval;
>  }
>
> @@ -1714,7 +1721,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
>          kernfs_break_active_protection(of->kn);
>          flush_work(&cpuset_hotplug_work);
>
> -        mutex_lock(&cpuset_mutex);
> +        cpuset_sched_change_begin();
>          if (!is_cpuset_online(cs))
>                  goto out_unlock;
>
> @@ -1738,7 +1745,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
>
>          free_trial_cpuset(trialcs);
>  out_unlock:
> -        mutex_unlock(&cpuset_mutex);
> +        cpuset_sched_change_end();
>          kernfs_unbreak_active_protection(of->kn);
>          css_put(&cs->css);
>          flush_workqueue(cpuset_migrate_mm_wq);
> @@ -2039,14 +2046,14 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
>  /*
>   * If the cpuset being removed has its flag 'sched_load_balance'
>   * enabled, then simulate turning sched_load_balance off, which
> - * will call rebuild_sched_domains_locked().
> + * will call rebuild_sched_domains_cpuslocked().
>   */
>
>  static void cpuset_css_offline(struct cgroup_subsys_state *css)
>  {
>          struct cpuset *cs = css_cs(css);
>
> -        mutex_lock(&cpuset_mutex);
> +        cpuset_sched_change_begin();
>
>          if (is_sched_load_balance(cs))
>                  update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
> @@ -2054,7 +2061,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
>          cpuset_dec();
>          clear_bit(CS_ONLINE, &cs->flags);
>
> -        mutex_unlock(&cpuset_mutex);
> +        cpuset_sched_change_end();
>  }
>
>  static void cpuset_css_free(struct cgroup_subsys_state *css)
> @@ -2275,15 +2282,8 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs)
>          mutex_unlock(&cpuset_mutex);
>  }
>
> -static bool force_rebuild;
> -
> -void cpuset_force_rebuild(void)
> -{
> -        force_rebuild = true;
> -}
> -
>  /**
> - * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
> + * cpuset_hotplug - handle CPU/memory hotunplug for a cpuset
>   *
>   * This function is called after either CPU or memory configuration has
>   * changed and updates cpuset accordingly. The top_cpuset is always
> @@ -2298,7 +2298,7 @@ void cpuset_force_rebuild(void)
>   * Note that CPU offlining during suspend is ignored.  We don't modify
>   * cpusets across suspend/resume cycles at all.
>   */
> -static void cpuset_hotplug_workfn(struct work_struct *work)
> +static void cpuset_hotplug(bool use_cpu_hp_lock)
>  {
>          static cpumask_t new_cpus;
>          static nodemask_t new_mems;
> @@ -2356,25 +2356,31 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
>          }
>
>          /* rebuild sched domains if cpus_allowed has changed */
> -        if (cpus_updated || force_rebuild) {
> -                force_rebuild = false;
> -                rebuild_sched_domains();
> +        if (cpus_updated) {
> +                if (use_cpu_hp_lock)
> +                        rebuild_sched_domains();
> +                else {
> +                        /* Acquiring cpu_hotplug_lock is not required.
> +                         * When cpuset_hotplug() is called in hotplug path,
> +                         * cpu_hotplug_lock is held by the hotplug context
> +                         * which is waiting for cpuhp_thread_fun to indicate
> +                         * completion of callback.
> +                         */
> +                        mutex_lock(&cpuset_mutex);
> +                        rebuild_sched_domains_cpuslocked();
> +                        mutex_unlock(&cpuset_mutex);
> +                }
>          }
>  }
>
> -void cpuset_update_active_cpus(void)
> +static void cpuset_hotplug_workfn(struct work_struct *work)
>  {
> -        /*
> -         * We're inside cpu hotplug critical region which usually nests
> -         * inside cgroup synchronization. Bounce actual hotplug processing
> -         * to a work item to avoid reverse locking order.
> -         */
> -        schedule_work(&cpuset_hotplug_work);
> +        cpuset_hotplug(true);
>  }
>
> -void cpuset_wait_for_hotplug(void)
> +void cpuset_update_active_cpus(void)
>  {
> -        flush_work(&cpuset_hotplug_work);
> +        cpuset_hotplug(false);
>  }
>
>  /*
> diff --git a/kernel/power/process.c b/kernel/power/process.c
> index 50f25cb..28772b405 100644
> --- a/kernel/power/process.c
> +++ b/kernel/power/process.c
> @@ -203,8 +203,6 @@ void thaw_processes(void)
>          __usermodehelper_set_disable_depth(UMH_FREEZING);
>          thaw_workqueues();
>
> -        cpuset_wait_for_hotplug();
> -
>          read_lock(&tasklist_lock);
>          for_each_process_thread(g, p) {
>                  /* No other threads should have PF_SUSPEND_TASK set */
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index d17c5da..25b8717 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -5590,7 +5590,6 @@ static void cpuset_cpu_active(void)
>                   * restore the original sched domains by considering the
>                   * cpuset configurations.
>                   */
> -                cpuset_force_rebuild();
>          }
>          cpuset_update_active_cpus();
>  }

Any improvement/suggestion for this patch?

-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center,
Inc., is a member of Code Aurora Forum, a Linux Foundation Collaborative
Project
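
For quick reference, the locking order the patch ends up enforcing can be read
off the new helpers in the kernel/cgroup/cpuset.c hunks above; a condensed
sketch of those helpers (nothing beyond what the diff already adds):

        /*
         * cpu_hotplug_lock (read side) is always taken before cpuset_mutex,
         * so the cpuset_mutex -> cpu_hotplug_lock edge taken by Process C in
         * the scenario above disappears and the
         * A -> kthreadd -> B -> C -> A cycle can no longer close.
         */
        static void cpuset_sched_change_begin(void)
        {
                cpus_read_lock();               /* percpu_down_read(&cpu_hotplug_lock) */
                mutex_lock(&cpuset_mutex);      /* always nests inside cpu_hotplug_lock */
        }

        static void cpuset_sched_change_end(void)
        {
                mutex_unlock(&cpuset_mutex);    /* release in reverse order */
                cpus_read_unlock();
        }

In the CPU-hotplug path itself, cpu_hotplug_lock is already held by the caller,
which is why cpuset_hotplug(false) takes only cpuset_mutex before calling
rebuild_sched_domains_cpuslocked().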