On Fri, Jun 28, 2019 at 10:06:16AM +0200, Juri Lelli wrote: > cpuset_rwsem is going to be acquired from sched_setscheduler() with a > following patch. There are however paths (e.g., spawn_ksoftirqd) in > which sched_scheduler() is eventually called while holding hotplug lock; > this creates a dependecy between hotplug lock (to be always acquired > first) and cpuset_rwsem (to be always acquired after hotplug lock). > > Fix paths which currently take the two locks in the wrong order (after > a following patch is applied). > Signed-off-by: Juri Lelli <juri.lelli@xxxxxxxxxx> This all reminds me of this: https://lkml.kernel.org/r/1510755615-25906-1-git-send-email-prsood@xxxxxxxxxxxxxx Which sadly got reverted again. If we do this now (I've always been a proponent), then we can make that rebuild synchronous again, which should also help here IIRC. > --- > include/linux/cpuset.h | 8 ++++---- > kernel/cgroup/cpuset.c | 22 +++++++++++++++++----- > 2 files changed, 21 insertions(+), 9 deletions(-) > > diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h > index 934633a05d20..7f1478c26a33 100644 > --- a/include/linux/cpuset.h > +++ b/include/linux/cpuset.h > @@ -40,14 +40,14 @@ static inline bool cpusets_enabled(void) > > static inline void cpuset_inc(void) > { > - static_branch_inc(&cpusets_pre_enable_key); > - static_branch_inc(&cpusets_enabled_key); > + static_branch_inc_cpuslocked(&cpusets_pre_enable_key); > + static_branch_inc_cpuslocked(&cpusets_enabled_key); > } > > static inline void cpuset_dec(void) > { > - static_branch_dec(&cpusets_enabled_key); > - static_branch_dec(&cpusets_pre_enable_key); > + static_branch_dec_cpuslocked(&cpusets_enabled_key); > + static_branch_dec_cpuslocked(&cpusets_pre_enable_key); > } > > extern int cpuset_init(void); > diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c > index a7c0c8d8f132..d92b351f89e3 100644 > --- a/kernel/cgroup/cpuset.c > +++ b/kernel/cgroup/cpuset.c > @@ -1026,8 +1026,8 @@ static void rebuild_sched_domains_locked(void) > cpumask_var_t *doms; > int ndoms; > > + lockdep_assert_cpus_held(); > percpu_rwsem_assert_held(&cpuset_rwsem); > - get_online_cpus(); > > /* > * We have raced with CPU hotplug. Don't do anything to avoid > @@ -1036,19 +1036,17 @@ static void rebuild_sched_domains_locked(void) > */ > if (!top_cpuset.nr_subparts_cpus && > !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) > - goto out; > + return; > > if (top_cpuset.nr_subparts_cpus && > !cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask)) > - goto out; > + return; > > /* Generate domain masks and attrs */ > ndoms = generate_sched_domains(&doms, &attr); > > /* Have scheduler rebuild the domains */ > partition_and_rebuild_sched_domains(ndoms, doms, attr); > -out: > - put_online_cpus(); > } > #else /* !CONFIG_SMP */ > static void rebuild_sched_domains_locked(void) > @@ -1058,9 +1056,11 @@ static void rebuild_sched_domains_locked(void) > > void rebuild_sched_domains(void) > { > + get_online_cpus(); > percpu_down_write(&cpuset_rwsem); > rebuild_sched_domains_locked(); > percpu_up_write(&cpuset_rwsem); > + put_online_cpus(); > } > > /** > @@ -2298,6 +2298,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, > cpuset_filetype_t type = cft->private; > int retval = 0; > > + get_online_cpus(); > percpu_down_write(&cpuset_rwsem); > if (!is_cpuset_online(cs)) { > retval = -ENODEV; > @@ -2335,6 +2336,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, > } > out_unlock: > percpu_up_write(&cpuset_rwsem); > + put_online_cpus(); > return retval; > } > > @@ -2345,6 +2347,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, > cpuset_filetype_t type = cft->private; > int retval = -ENODEV; > > + get_online_cpus(); > percpu_down_write(&cpuset_rwsem); > if (!is_cpuset_online(cs)) > goto out_unlock; > @@ -2359,6 +2362,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, > } > out_unlock: > percpu_up_write(&cpuset_rwsem); > + put_online_cpus(); > return retval; > } > > @@ -2397,6 +2401,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, > kernfs_break_active_protection(of->kn); > flush_work(&cpuset_hotplug_work); > > + get_online_cpus(); > percpu_down_write(&cpuset_rwsem); > if (!is_cpuset_online(cs)) > goto out_unlock; > @@ -2422,6 +2427,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, > free_cpuset(trialcs); > out_unlock: > percpu_up_write(&cpuset_rwsem); > + put_online_cpus(); > kernfs_unbreak_active_protection(of->kn); > css_put(&cs->css); > flush_workqueue(cpuset_migrate_mm_wq); > @@ -2552,6 +2558,7 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, > return -EINVAL; > > css_get(&cs->css); > + get_online_cpus(); > percpu_down_write(&cpuset_rwsem); > if (!is_cpuset_online(cs)) > goto out_unlock; > @@ -2559,6 +2566,7 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, > retval = update_prstate(cs, val); > out_unlock: > percpu_up_write(&cpuset_rwsem); > + put_online_cpus(); > css_put(&cs->css); > return retval ?: nbytes; > } > @@ -2764,6 +2772,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) > if (!parent) > return 0; > > + get_online_cpus(); > percpu_down_write(&cpuset_rwsem); > > set_bit(CS_ONLINE, &cs->flags); > @@ -2816,6 +2825,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) > spin_unlock_irq(&callback_lock); > out_unlock: > percpu_up_write(&cpuset_rwsem); > + put_online_cpus(); > return 0; > } > > @@ -2834,6 +2844,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) > { > struct cpuset *cs = css_cs(css); > > + get_online_cpus(); > percpu_down_write(&cpuset_rwsem); > > if (is_partition_root(cs)) > @@ -2854,6 +2865,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) > clear_bit(CS_ONLINE, &cs->flags); > > percpu_up_write(&cpuset_rwsem); > + put_online_cpus(); > } > > static void cpuset_css_free(struct cgroup_subsys_state *css) > -- > 2.17.2 >