This is a note to let you know that I've just added the patch titled sched/cpuset: Bring back cpuset_mutex to the 5.10-stable tree which can be found at: http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary The filename of the patch is: sched-cpuset-bring-back-cpuset_mutex.patch and it can be found in the queue-5.10 subdirectory. If you, or anyone else, feels it should not be added to the stable tree, please let <stable@xxxxxxxxxxxxxxx> know about it. >From stable-owner@xxxxxxxxxxxxxxx Sun Aug 20 17:25:55 2023 From: Qais Yousef <qyousef@xxxxxxxxxxx> Date: Sun, 20 Aug 2023 16:21:40 +0100 Subject: sched/cpuset: Bring back cpuset_mutex To: stable@xxxxxxxxxxxxxxx Cc: Juri Lelli <juri.lelli@xxxxxxxxxx>, Waiman Long <longman@xxxxxxxxxx>, Tejun Heo <tj@xxxxxxxxxx>, Dietmar Eggemann <dietmar.eggemann@xxxxxxx>, Peter Zijlstra <peterz@xxxxxxxxxxxxx>, Vincent Guittot <vincent.guittot@xxxxxxxxxx>, Ingo Molnar <mingo@xxxxxxxxxx>, Hao Luo <haoluo@xxxxxxxxxx>, John Stultz <jstultz@xxxxxxxxxx>, cgroups@xxxxxxxxxxxxxxx, linux-kernel@xxxxxxxxxxxxxxx, Qais Yousef <qyousef@xxxxxxxxxxx> Message-ID: <20230820152144.517461-3-qyousef@xxxxxxxxxxx> From: Juri Lelli <juri.lelli@xxxxxxxxxx> commit 111cd11bbc54850f24191c52ff217da88a5e639b upstream. Turns out percpu_cpuset_rwsem - commit 1243dc518c9d ("cgroup/cpuset: Convert cpuset_mutex to percpu_rwsem") - wasn't such a brilliant idea, as it has been reported to cause slowdowns in workloads that need to change cpuset configuration frequently and it is also not implementing priority inheritance (which causes troubles with realtime workloads). Convert percpu_cpuset_rwsem back to regular cpuset_mutex. Also grab it only for SCHED_DEADLINE tasks (other policies don't care about stable cpusets anyway). Signed-off-by: Juri Lelli <juri.lelli@xxxxxxxxxx> Reviewed-by: Waiman Long <longman@xxxxxxxxxx> Signed-off-by: Tejun Heo <tj@xxxxxxxxxx> [ Fix conflict in kernel/cgroup/cpuset.c due to pulling new functions or comment that don't exist on 5.10 or the usage of different cpu hotplug lock whenever replacing the rwsem with mutex. Remove BUG_ON() for rwsem that doesn't exist on mainline. ] Signed-off-by: Qais Yousef (Google) <qyousef@xxxxxxxxxxx> Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx> --- include/linux/cpuset.h | 8 ++--- kernel/cgroup/cpuset.c | 78 +++++++++++++++++++++++-------------------------- kernel/sched/core.c | 22 +++++++++---- 3 files changed, 57 insertions(+), 51 deletions(-) --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -55,8 +55,8 @@ extern void cpuset_init_smp(void); extern void cpuset_force_rebuild(void); extern void cpuset_update_active_cpus(void); extern void cpuset_wait_for_hotplug(void); -extern void cpuset_read_lock(void); -extern void cpuset_read_unlock(void); +extern void cpuset_lock(void); +extern void cpuset_unlock(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern void cpuset_cpus_allowed_fallback(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); @@ -178,8 +178,8 @@ static inline void cpuset_update_active_ static inline void cpuset_wait_for_hotplug(void) { } -static inline void cpuset_read_lock(void) { } -static inline void cpuset_read_unlock(void) { } +static inline void cpuset_lock(void) { } +static inline void cpuset_unlock(void) { } static inline void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask) --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -334,16 +334,16 @@ static struct cpuset top_cpuset = { * guidelines for accessing subsystem state in kernel/cgroup.c */ -DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem); +static DEFINE_MUTEX(cpuset_mutex); -void cpuset_read_lock(void) +void cpuset_lock(void) { - percpu_down_read(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); } -void cpuset_read_unlock(void) +void cpuset_unlock(void) { - percpu_up_read(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); } static DEFINE_SPINLOCK(callback_lock); @@ -930,7 +930,7 @@ static void dl_rebuild_rd_accounting(voi struct cpuset *cs = NULL; struct cgroup_subsys_state *pos_css; - percpu_rwsem_assert_held(&cpuset_rwsem); + lockdep_assert_held(&cpuset_mutex); lockdep_assert_cpus_held(); lockdep_assert_held(&sched_domains_mutex); @@ -991,7 +991,7 @@ static void rebuild_sched_domains_locked int ndoms; lockdep_assert_cpus_held(); - percpu_rwsem_assert_held(&cpuset_rwsem); + lockdep_assert_held(&cpuset_mutex); /* * If we have raced with CPU hotplug, return early to avoid @@ -1042,9 +1042,9 @@ static void rebuild_sched_domains_locked void rebuild_sched_domains(void) { get_online_cpus(); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); rebuild_sched_domains_locked(); - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); put_online_cpus(); } @@ -1160,7 +1160,7 @@ static int update_parent_subparts_cpumas int new_prs; bool part_error = false; /* Partition error? */ - percpu_rwsem_assert_held(&cpuset_rwsem); + lockdep_assert_held(&cpuset_mutex); /* * The parent must be a partition root. @@ -1490,7 +1490,7 @@ static void update_sibling_cpumasks(stru struct cpuset *sibling; struct cgroup_subsys_state *pos_css; - percpu_rwsem_assert_held(&cpuset_rwsem); + lockdep_assert_held(&cpuset_mutex); /* * Check all its siblings and call update_cpumasks_hier() @@ -2157,7 +2157,7 @@ static int cpuset_can_attach(struct cgro cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); cs = css_cs(css); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); /* allow moving tasks into an empty cpuset if on default hierarchy */ ret = -ENOSPC; @@ -2181,7 +2181,7 @@ static int cpuset_can_attach(struct cgro cs->attach_in_progress++; ret = 0; out_unlock: - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); return ret; } @@ -2193,11 +2193,11 @@ static void cpuset_cancel_attach(struct cgroup_taskset_first(tset, &css); cs = css_cs(css); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); cs->attach_in_progress--; if (!cs->attach_in_progress) wake_up(&cpuset_attach_wq); - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); } /* @@ -2221,7 +2221,7 @@ static void cpuset_attach(struct cgroup_ cs = css_cs(css); lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */ - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); /* prepare for attach */ if (cs == &top_cpuset) @@ -2275,7 +2275,7 @@ static void cpuset_attach(struct cgroup_ if (!cs->attach_in_progress) wake_up(&cpuset_attach_wq); - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); } /* The various types of files and directories in a cpuset file system */ @@ -2307,7 +2307,7 @@ static int cpuset_write_u64(struct cgrou int retval = 0; get_online_cpus(); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) { retval = -ENODEV; goto out_unlock; @@ -2343,7 +2343,7 @@ static int cpuset_write_u64(struct cgrou break; } out_unlock: - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); put_online_cpus(); return retval; } @@ -2356,7 +2356,7 @@ static int cpuset_write_s64(struct cgrou int retval = -ENODEV; get_online_cpus(); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; @@ -2369,7 +2369,7 @@ static int cpuset_write_s64(struct cgrou break; } out_unlock: - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); put_online_cpus(); return retval; } @@ -2410,7 +2410,7 @@ static ssize_t cpuset_write_resmask(stru flush_work(&cpuset_hotplug_work); get_online_cpus(); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; @@ -2434,7 +2434,7 @@ static ssize_t cpuset_write_resmask(stru free_cpuset(trialcs); out_unlock: - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); put_online_cpus(); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); @@ -2567,13 +2567,13 @@ static ssize_t sched_partition_write(str css_get(&cs->css); get_online_cpus(); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; retval = update_prstate(cs, val); out_unlock: - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); put_online_cpus(); css_put(&cs->css); return retval ?: nbytes; @@ -2781,7 +2781,7 @@ static int cpuset_css_online(struct cgro return 0; get_online_cpus(); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); set_bit(CS_ONLINE, &cs->flags); if (is_spread_page(parent)) @@ -2832,7 +2832,7 @@ static int cpuset_css_online(struct cgro cpumask_copy(cs->effective_cpus, parent->cpus_allowed); spin_unlock_irq(&callback_lock); out_unlock: - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); put_online_cpus(); return 0; } @@ -2853,7 +2853,7 @@ static void cpuset_css_offline(struct cg struct cpuset *cs = css_cs(css); get_online_cpus(); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); if (is_partition_root(cs)) update_prstate(cs, 0); @@ -2872,7 +2872,7 @@ static void cpuset_css_offline(struct cg cpuset_dec(); clear_bit(CS_ONLINE, &cs->flags); - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); put_online_cpus(); } @@ -2885,7 +2885,7 @@ static void cpuset_css_free(struct cgrou static void cpuset_bind(struct cgroup_subsys_state *root_css) { - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); spin_lock_irq(&callback_lock); if (is_in_v2_mode()) { @@ -2898,7 +2898,7 @@ static void cpuset_bind(struct cgroup_su } spin_unlock_irq(&callback_lock); - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); } /* @@ -2940,8 +2940,6 @@ struct cgroup_subsys cpuset_cgrp_subsys int __init cpuset_init(void) { - BUG_ON(percpu_init_rwsem(&cpuset_rwsem)); - BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL)); @@ -3013,7 +3011,7 @@ hotplug_update_tasks_legacy(struct cpuse is_empty = cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed); - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); /* * Move tasks to the nearest ancestor with execution resources, @@ -3023,7 +3021,7 @@ hotplug_update_tasks_legacy(struct cpuse if (is_empty) remove_tasks_in_empty_cpuset(cs); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); } static void @@ -3073,14 +3071,14 @@ static void cpuset_hotplug_update_tasks( retry: wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); /* * We have raced with task attaching. We wait until attaching * is finished, so we won't attach a task to an empty cpuset. */ if (cs->attach_in_progress) { - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); goto retry; } @@ -3152,7 +3150,7 @@ update_tasks: hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); } /** @@ -3182,7 +3180,7 @@ static void cpuset_hotplug_workfn(struct if (on_dfl && !alloc_cpumasks(NULL, &tmp)) ptmp = &tmp; - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); /* fetch the available cpus/mems and find out which changed how */ cpumask_copy(&new_cpus, cpu_active_mask); @@ -3239,7 +3237,7 @@ static void cpuset_hotplug_workfn(struct update_tasks_nodemask(&top_cpuset); } - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); /* if cpus or mems changed, we need to propagate to descendants */ if (cpus_updated || mems_updated) { --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5221,6 +5221,7 @@ static int __sched_setscheduler(struct t int reset_on_fork; int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; struct rq *rq; + bool cpuset_locked = false; /* The pi code expects interrupts enabled */ BUG_ON(pi && in_interrupt()); @@ -5318,8 +5319,14 @@ recheck: return retval; } - if (pi) - cpuset_read_lock(); + /* + * SCHED_DEADLINE bandwidth accounting relies on stable cpusets + * information. + */ + if (dl_policy(policy) || dl_policy(p->policy)) { + cpuset_locked = true; + cpuset_lock(); + } /* * Make sure no PI-waiters arrive (or leave) while we are @@ -5395,8 +5402,8 @@ change: if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; task_rq_unlock(rq, p, &rf); - if (pi) - cpuset_read_unlock(); + if (cpuset_locked) + cpuset_unlock(); goto recheck; } @@ -5462,7 +5469,8 @@ change: task_rq_unlock(rq, p, &rf); if (pi) { - cpuset_read_unlock(); + if (cpuset_locked) + cpuset_unlock(); rt_mutex_adjust_pi(p); } @@ -5474,8 +5482,8 @@ change: unlock: task_rq_unlock(rq, p, &rf); - if (pi) - cpuset_read_unlock(); + if (cpuset_locked) + cpuset_unlock(); return retval; } Patches currently in stable-queue which might be from stable-owner@xxxxxxxxxxxxxxx are queue-5.10/cgroup-cpuset-free-dl-bw-in-case-can_attach-fails.patch queue-5.10/sched-cpuset-keep-track-of-sched_deadline-task-in-cpusets.patch queue-5.10/cgroup-cpuset-rename-functions-dealing-with-deadline-accounting.patch queue-5.10/cgroup-cpuset-iterate-only-if-deadline-tasks-are-present.patch queue-5.10/sched-cpuset-bring-back-cpuset_mutex.patch queue-5.10/sched-deadline-create-dl-bw-alloc-free-check-overflow-interface.patch