>From 1b07d36b074acb8a97c8bb5c0f1604960763578e Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@xxxxxxxxxx> Date: Tue, 19 Oct 2021 10:12:27 -1000 Generalize threadgroup stabilization through threadgroup_rwsem so that it can be used outside cgroup. * A new config option CONFIG_THREADGROUP_RWSEM which is selected by CONFIG_CGROUPS enables threadgroup_rwsem. * The declarations are moved to linux/sched/threadgroup_rwsem.h and the rwsem is now defined in kernel/sched/core.c. * cgroup_mutex nests outside threadgroup_rwsem. During fork, cgroup_css_set_fork() which is called from cgroup_can_fork() was acquiring both. However, generalizing threadgroup_rwsem means that it needs to be acquired and released in the outer copy_process(). To maintain the locking order, break out cgroup_mutex acquisition into a separate function cgroup_prep_fork() which is called from copy_process() before acquiring threadgroup_rwsem. No functional changes. Signed-off-by: Tejun Heo <tj@xxxxxxxxxx> Cc: Christian Brauner <christian.brauner@xxxxxxxxxx> --- fs/exec.c | 1 + include/linux/cgroup-defs.h | 33 ------------------ include/linux/cgroup.h | 11 +++--- include/linux/sched/threadgroup_rwsem.h | 46 +++++++++++++++++++++++++ init/Kconfig | 4 +++ kernel/cgroup/cgroup-v1.c | 1 + kernel/cgroup/cgroup.c | 38 +++++++++++++------- kernel/fork.c | 10 +++++- kernel/sched/core.c | 4 +++ kernel/sched/sched.h | 1 + kernel/signal.c | 1 + 11 files changed, 98 insertions(+), 52 deletions(-) create mode 100644 include/linux/sched/threadgroup_rwsem.h diff --git a/fs/exec.c b/fs/exec.c index caedd06a6d472..b18abc76e1ce0 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -39,6 +39,7 @@ #include <linux/sched/signal.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/task.h> +#include <linux/sched/threadgroup_rwsem.h> #include <linux/pagemap.h> #include <linux/perf_event.h> #include <linux/highmem.h> diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 1a77731e33096..b7e89b0c17057 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -16,7 +16,6 @@ #include <linux/rcupdate.h> #include <linux/refcount.h> #include <linux/percpu-refcount.h> -#include <linux/percpu-rwsem.h> #include <linux/u64_stats_sync.h> #include <linux/workqueue.h> #include <linux/bpf-cgroup.h> @@ -708,42 +707,10 @@ struct cgroup_subsys { unsigned int depends_on; }; -extern struct percpu_rw_semaphore threadgroup_rwsem; - -/** - * threadgroup_change_begin - threadgroup exclusion for cgroups - * @tsk: target task - * - * Allows cgroup operations to synchronize against threadgroup changes - * using a percpu_rw_semaphore. - */ -static inline void threadgroup_change_begin(struct task_struct *tsk) -{ - percpu_down_read(&threadgroup_rwsem); -} - -/** - * threadgroup_change_end - threadgroup exclusion for cgroups - * @tsk: target task - * - * Counterpart of threadgroup_change_begin(). - */ -static inline void threadgroup_change_end(struct task_struct *tsk) -{ - percpu_up_read(&threadgroup_rwsem); -} - #else /* CONFIG_CGROUPS */ #define CGROUP_SUBSYS_COUNT 0 -static inline void threadgroup_change_begin(struct task_struct *tsk) -{ - might_sleep(); -} - -static inline void threadgroup_change_end(struct task_struct *tsk) {} - #endif /* CONFIG_CGROUPS */ #ifdef CONFIG_SOCK_CGROUP_DATA diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 75c151413fda8..aa3df6361105f 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -121,12 +121,10 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk); void cgroup_fork(struct task_struct *p); -extern int cgroup_can_fork(struct task_struct *p, - struct kernel_clone_args *kargs); -extern void cgroup_cancel_fork(struct task_struct *p, - struct kernel_clone_args *kargs); -extern void cgroup_post_fork(struct task_struct *p, - struct kernel_clone_args *kargs); +void cgroup_prep_fork(struct kernel_clone_args *kargs); +int cgroup_can_fork(struct task_struct *p, struct kernel_clone_args *kargs); +void cgroup_cancel_fork(struct task_struct *p, struct kernel_clone_args *kargs); +void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs); void cgroup_exit(struct task_struct *p); void cgroup_release(struct task_struct *p); void cgroup_free(struct task_struct *p); @@ -713,6 +711,7 @@ static inline int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { return -EINVAL; } static inline void cgroup_fork(struct task_struct *p) {} +static inline void cgroup_prep_fork(struct kernel_clone_args *kargs) { } static inline int cgroup_can_fork(struct task_struct *p, struct kernel_clone_args *kargs) { return 0; } static inline void cgroup_cancel_fork(struct task_struct *p, diff --git a/include/linux/sched/threadgroup_rwsem.h b/include/linux/sched/threadgroup_rwsem.h new file mode 100644 index 0000000000000..31ab72703724b --- /dev/null +++ b/include/linux/sched/threadgroup_rwsem.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_THREADGROUP_RWSEM_H +#define _LINUX_SCHED_THREADGROUP_RWSEM_H + +#ifdef CONFIG_THREADGROUP_RWSEM +/* including before task_struct definition causes dependency loop */ +#include <linux/percpu-rwsem.h> + +extern struct percpu_rw_semaphore threadgroup_rwsem; + +/** + * threadgroup_change_begin - mark the beginning of changes to a threadgroup + * @tsk: task causing the changes + * + * All operations which modify a threadgroup - a new thread joining the group, + * death of a member thread (the assertion of PF_EXITING) and exec(2) + * dethreading the process and replacing the leader - read-locks + * threadgroup_rwsem so that write-locking stabilizes thread groups. + */ +static inline void threadgroup_change_begin(struct task_struct *tsk) +{ + percpu_down_read(&threadgroup_rwsem); +} + +/** + * threadgroup_change_end - mark the end of changes to a threadgroup + * @tsk: task causing the changes + * + * See threadgroup_change_begin(). + */ +static inline void threadgroup_change_end(struct task_struct *tsk) +{ + percpu_up_read(&threadgroup_rwsem); +} +#else +static inline void threadgroup_change_begin(struct task_struct *tsk) +{ + might_sleep(); +} + +static inline void threadgroup_change_end(struct task_struct *tsk) +{ +} +#endif + +#endif diff --git a/init/Kconfig b/init/Kconfig index 11f8a845f259d..3a3699ccff3ce 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -917,8 +917,12 @@ config NUMA_BALANCING_DEFAULT_ENABLED If set, automatic NUMA balancing will be enabled if running on a NUMA machine. +config THREADGROUP_RWSEM + bool + menuconfig CGROUPS bool "Control Group support" + select THREADGROUP_RWSEM select KERNFS help This option adds support for grouping sets of processes together, for diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 03808e7deb2ea..9c747e258ae7c 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -8,6 +8,7 @@ #include <linux/mm.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> +#include <linux/sched/threadgroup_rwsem.h> #include <linux/magic.h> #include <linux/slab.h> #include <linux/vmalloc.h> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 2fd01c901b1ae..937888386210a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -42,6 +42,7 @@ #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/sched/task.h> +#include <linux/sched/threadgroup_rwsem.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/percpu-rwsem.h> @@ -109,8 +110,6 @@ static DEFINE_SPINLOCK(cgroup_idr_lock); */ static DEFINE_SPINLOCK(cgroup_file_kn_lock); -DEFINE_PERCPU_RWSEM(threadgroup_rwsem); - #define cgroup_assert_mutex_or_rcu_locked() \ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&cgroup_mutex), \ @@ -6050,7 +6049,6 @@ static struct cgroup *cgroup_get_from_file(struct file *f) * to the target cgroup. */ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) - __acquires(&cgroup_mutex) __acquires(&threadgroup_rwsem) { int ret; struct cgroup *dst_cgrp = NULL; @@ -6058,11 +6056,6 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) struct super_block *sb; struct file *f; - if (kargs->flags & CLONE_INTO_CGROUP) - mutex_lock(&cgroup_mutex); - - threadgroup_change_begin(current); - spin_lock_irq(&css_set_lock); cset = task_css_set(current); get_css_set(cset); @@ -6118,7 +6111,6 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) return ret; err: - threadgroup_change_end(current); mutex_unlock(&cgroup_mutex); if (f) fput(f); @@ -6138,10 +6130,8 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) * CLONE_INTO_CGROUP was requested. */ static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs) - __releases(&threadgroup_rwsem) __releases(&cgroup_mutex) + __releases(&cgroup_mutex) { - threadgroup_change_end(current); - if (kargs->flags & CLONE_INTO_CGROUP) { struct cgroup *cgrp = kargs->cgrp; struct css_set *cset = kargs->cset; @@ -6160,9 +6150,26 @@ static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs) } } +/** + * cgroup_prep_fork - called during fork before threadgroup_rwsem is acquired + * @kargs: the arguments passed to create the child process + * + * CLONE_INTO_CGROUP requires cgroup_mutex as we're migrating while forking. + * However, cgroup_mutex must nest outside threadgroup_rwsem which is + * read-locked before cgroup_can_fork(). Break out cgroup_mutex locking to this + * function to follow the locking order. + */ +void cgroup_prep_fork(struct kernel_clone_args *kargs) + __acquires(&cgroup_mutex) +{ + if (kargs->flags & CLONE_INTO_CGROUP) + mutex_lock(&cgroup_mutex); +} + /** * cgroup_can_fork - called on a new task before the process is exposed * @child: the child process + * @kargs: the arguments passed to create the child process * * This prepares a new css_set for the child process which the child will * be attached to in cgroup_post_fork(). @@ -6175,6 +6182,13 @@ int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs) struct cgroup_subsys *ss; int i, j, ret; + /* + * cgroup_mutex should have been acquired by cgroup_prep_fork() if + * CLONE_INTO_CGROUP + */ + if (kargs->flags & CLONE_INTO_CGROUP) + lockdep_assert_held(&cgroup_mutex); + ret = cgroup_css_set_fork(kargs); if (ret) return ret; diff --git a/kernel/fork.c b/kernel/fork.c index 38681ad44c76b..34fb9db59148b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -23,6 +23,7 @@ #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> +#include <linux/sched/threadgroup_rwsem.h> #include <linux/seq_file.h> #include <linux/rtmutex.h> #include <linux/init.h> @@ -2285,6 +2286,10 @@ static __latent_entropy struct task_struct *copy_process( p->kretprobe_instances.first = NULL; #endif + cgroup_prep_fork(args); + + threadgroup_change_begin(current); + /* * Ensure that the cgroup subsystem policies allow the new process to be * forked. It should be noted that the new process's css_set can be changed @@ -2293,7 +2298,7 @@ static __latent_entropy struct task_struct *copy_process( */ retval = cgroup_can_fork(p, args); if (retval) - goto bad_fork_put_pidfd; + goto bad_fork_threadgroup_change_end; /* * From this point on we must avoid any synchronous user-space @@ -2407,6 +2412,7 @@ static __latent_entropy struct task_struct *copy_process( proc_fork_connector(p); sched_post_fork(p); cgroup_post_fork(p, args); + threadgroup_change_end(current); perf_event_fork(p); trace_task_newtask(p, clone_flags); @@ -2421,6 +2427,8 @@ static __latent_entropy struct task_struct *copy_process( spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); cgroup_cancel_fork(p, args); +bad_fork_threadgroup_change_end: + threadgroup_change_end(current); bad_fork_put_pidfd: if (clone_flags & CLONE_PIDFD) { fput(pidfile); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1bba4128a3e68..bee6bf6d9659d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -84,6 +84,10 @@ unsigned int sysctl_sched_rt_period = 1000000; __read_mostly int scheduler_running; +#ifdef CONFIG_THREADGROUP_RWSEM +DEFINE_PERCPU_RWSEM(threadgroup_rwsem); +#endif + #ifdef CONFIG_SCHED_CORE DEFINE_STATIC_KEY_FALSE(__sched_core_enabled); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3d3e5793e1172..135e4265fd259 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -28,6 +28,7 @@ #include <linux/sched/sysctl.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> +#include <linux/sched/threadgroup_rwsem.h> #include <linux/sched/topology.h> #include <linux/sched/user.h> #include <linux/sched/wake_q.h> diff --git a/kernel/signal.c b/kernel/signal.c index f01b249369ce2..d46e63266faf4 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -20,6 +20,7 @@ #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> +#include <linux/sched/threadgroup_rwsem.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/proc_fs.h> -- 2.33.1