On Wed, Feb 07, 2024 at 07:08:46PM +0800, Zqiang wrote:
> From: Paul E. McKenney <paulmck@xxxxxxxxxx>
>
> commit bc31e6cb27a9334140ff2f0a209d59b08bc0bc8c upstream.
>
> Holding a mutex across synchronize_rcu_tasks() and acquiring
> that same mutex in code called from do_exit() after its call to
> exit_tasks_rcu_start() but before its call to exit_tasks_rcu_stop()
> results in deadlock. This is by design, because tasks that are far
> enough into do_exit() are no longer present on the tasks list, making
> it a bit difficult for RCU Tasks to find them, let alone wait on them
> to do a voluntary context switch. However, such deadlocks are becoming
> more frequent. In addition, lockdep currently does not detect such
> deadlocks and they can be difficult to reproduce.
>
> In addition, if a task voluntarily context switches during that time
> (for example, if it blocks acquiring a mutex), then this task is in an
> RCU Tasks quiescent state. And with some adjustments, RCU Tasks could
> just as well take advantage of that fact.
>
> This commit therefore eliminates these deadlocks by replacing the
> SRCU-based wait for do_exit() completion with per-CPU lists of tasks
> currently exiting. A given task will be on one of these per-CPU lists for
> the same period of time that this task would previously have been in the
> previous SRCU read-side critical section. These lists enable RCU Tasks
> to find the tasks that have already been removed from the tasks list,
> but that must nevertheless be waited upon.
>
> The RCU Tasks grace period gathers any of these do_exit() tasks that it
> must wait on, and adds them to the list of holdouts. Per-CPU locking
> and get_task_struct() are used to synchronize addition to and removal
> from these lists.
>
> Link: https://lore.kernel.org/all/20240118021842.290665-1-chenzhongjin@xxxxxxxxxx/
>
> Reported-by: Chen Zhongjin <chenzhongjin@xxxxxxxxxx>
> Signed-off-by: Paul E. McKenney <paulmck@xxxxxxxxxx>
> Signed-off-by: Zqiang <qiang.zhang1211@xxxxxxxxx>

This looks good to me, though it would be good to get other eyes on it.

Thank you for putting this together!!!
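For anyone who has not run into this failure mode before, the changelog's
scenario looks roughly like the sketch below. This is only an illustration
of the pre-patch SRCU scheme; my_mutex, updater(), and exiting_task_path()
are made up for the example and are not part of the patch.

#include <linux/mutex.h>
#include <linux/rcupdate.h>

static DEFINE_MUTEX(my_mutex);

/* Updater: holds my_mutex across an RCU Tasks grace period. */
static void updater(void)
{
	mutex_lock(&my_mutex);
	/*
	 * Pre-patch, this also waits for every SRCU read-side critical
	 * section opened by exit_tasks_rcu_start() to be closed by the
	 * corresponding exit_tasks_rcu_finish().
	 */
	synchronize_rcu_tasks();
	mutex_unlock(&my_mutex);
}

/*
 * Called from code invoked by do_exit() after exit_tasks_rcu_start()
 * but before exit_tasks_rcu_finish().
 */
static void exiting_task_path(void)
{
	/*
	 * Blocks on the mutex held by updater(), so this task never
	 * reaches exit_tasks_rcu_finish(), so synchronize_rcu_tasks()
	 * never returns, so my_mutex is never released: deadlock, and
	 * one that lockdep does not currently flag.
	 */
	mutex_lock(&my_mutex);
	mutex_unlock(&my_mutex);
}

The same scenario under the new list-based scheme is sketched after the
patch.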
							Thanx, Paul

> ---
>  include/linux/sched.h |  1 +
>  init/init_task.c      |  1 +
>  kernel/fork.c         |  1 +
>  kernel/rcu/update.c   | 65 ++++++++++++++++++++++++++++++-------------
>  4 files changed, 49 insertions(+), 19 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index fd4899236037..0b555d8e9d5e 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -679,6 +679,7 @@ struct task_struct {
>  	u8				rcu_tasks_idx;
>  	int				rcu_tasks_idle_cpu;
>  	struct list_head		rcu_tasks_holdout_list;
> +	struct list_head		rcu_tasks_exit_list;
>  #endif /* #ifdef CONFIG_TASKS_RCU */
>
>  	struct sched_info		sched_info;
> diff --git a/init/init_task.c b/init/init_task.c
> index 994ffe018120..f741cbfd891c 100644
> --- a/init/init_task.c
> +++ b/init/init_task.c
> @@ -139,6 +139,7 @@ struct task_struct init_task
>  	.rcu_tasks_holdout = false,
>  	.rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list),
>  	.rcu_tasks_idle_cpu = -1,
> +	.rcu_tasks_exit_list = LIST_HEAD_INIT(init_task.rcu_tasks_exit_list),
>  #endif
>  #ifdef CONFIG_CPUSETS
>  	.mems_allowed_seq = SEQCNT_ZERO(init_task.mems_allowed_seq),
> diff --git a/kernel/fork.c b/kernel/fork.c
> index b65871600507..d416d16df62f 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1626,6 +1626,7 @@ static inline void rcu_copy_process(struct task_struct *p)
>  	p->rcu_tasks_holdout = false;
>  	INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
>  	p->rcu_tasks_idle_cpu = -1;
> +	INIT_LIST_HEAD(&p->rcu_tasks_exit_list);
>  #endif /* #ifdef CONFIG_TASKS_RCU */
>  }
>
> diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
> index 81688a133552..5227cb5c1bea 100644
> --- a/kernel/rcu/update.c
> +++ b/kernel/rcu/update.c
> @@ -527,7 +527,8 @@ static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq);
>  static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock);
>
>  /* Track exiting tasks in order to allow them to be waited for. */
> -DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);
> +static LIST_HEAD(rtp_exit_list);
> +static DEFINE_RAW_SPINLOCK(rtp_exit_list_lock);
>
>  /* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */
>  #define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10)
> @@ -661,6 +662,17 @@ static void check_holdout_task(struct task_struct *t,
>  	sched_show_task(t);
>  }
>
> +static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop)
> +{
> +	if (t != current && READ_ONCE(t->on_rq) &&
> +	    !is_idle_task(t)) {
> +		get_task_struct(t);
> +		t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw);
> +		WRITE_ONCE(t->rcu_tasks_holdout, true);
> +		list_add(&t->rcu_tasks_holdout_list, hop);
> +	}
> +}
> +
>  /* RCU-tasks kthread that detects grace periods and invokes callbacks. */
>  static int __noreturn rcu_tasks_kthread(void *arg)
>  {
> @@ -726,14 +738,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
>  		 */
>  		rcu_read_lock();
>  		for_each_process_thread(g, t) {
> -			if (t != current && READ_ONCE(t->on_rq) &&
> -			    !is_idle_task(t)) {
> -				get_task_struct(t);
> -				t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw);
> -				WRITE_ONCE(t->rcu_tasks_holdout, true);
> -				list_add(&t->rcu_tasks_holdout_list,
> -					 &rcu_tasks_holdouts);
> -			}
> +			rcu_tasks_pertask(t, &rcu_tasks_holdouts);
>  		}
>  		rcu_read_unlock();
>
> @@ -744,8 +749,12 @@ static int __noreturn rcu_tasks_kthread(void *arg)
>  		 * where they have disabled preemption, allowing the
>  		 * later synchronize_sched() to finish the job.
>  		 */
> -		synchronize_srcu(&tasks_rcu_exit_srcu);
> -
> +		raw_spin_lock_irqsave(&rtp_exit_list_lock, flags);
> +		list_for_each_entry(t, &rtp_exit_list, rcu_tasks_exit_list) {
> +			if (list_empty(&t->rcu_tasks_holdout_list))
> +				rcu_tasks_pertask(t, &rcu_tasks_holdouts);
> +		}
> +		raw_spin_unlock_irqrestore(&rtp_exit_list_lock, flags);
>  		/*
>  		 * Each pass through the following loop scans the list
>  		 * of holdout tasks, removing any that are no longer
> @@ -802,8 +811,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
>  		 *
>  		 * In addition, this synchronize_sched() waits for exiting
>  		 * tasks to complete their final preempt_disable() region
> -		 * of execution, cleaning up after the synchronize_srcu()
> -		 * above.
> +		 * of execution.
>  		 */
>  		synchronize_sched();
>
> @@ -834,20 +842,39 @@ static int __init rcu_spawn_tasks_kthread(void)
>  }
>  core_initcall(rcu_spawn_tasks_kthread);
>
> -/* Do the srcu_read_lock() for the above synchronize_srcu(). */
> +/*
> + * Protect against tasklist scan blind spot while the task is exiting and
> + * may be removed from the tasklist. Do this by adding the task to yet
> + * another list.
> + */
>  void exit_tasks_rcu_start(void)
>  {
> +	unsigned long flags;
> +	struct task_struct *t = current;
> +
> +	WARN_ON_ONCE(!list_empty(&t->rcu_tasks_exit_list));
> +	get_task_struct(t);
>  	preempt_disable();
> -	current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu);
> +	raw_spin_lock_irqsave(&rtp_exit_list_lock, flags);
> +	list_add(&t->rcu_tasks_exit_list, &rtp_exit_list);
> +	raw_spin_unlock_irqrestore(&rtp_exit_list_lock, flags);
>  	preempt_enable();
>  }
>
> -/* Do the srcu_read_unlock() for the above synchronize_srcu(). */
> +/*
> + * Remove the task from the "yet another list" because do_exit() is now
> + * non-preemptible, allowing synchronize_rcu() to wait beyond this point.
> + */
>  void exit_tasks_rcu_finish(void)
>  {
> -	preempt_disable();
> -	__srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx);
> -	preempt_enable();
> +	unsigned long flags;
> +	struct task_struct *t = current;
> +
> +	WARN_ON_ONCE(list_empty(&t->rcu_tasks_exit_list));
> +	raw_spin_lock_irqsave(&rtp_exit_list_lock, flags);
> +	list_del_init(&t->rcu_tasks_exit_list);
> +	raw_spin_unlock_irqrestore(&rtp_exit_list_lock, flags);
> +	put_task_struct(t);
>  }
>
>  #endif /* #ifdef CONFIG_TASKS_RCU */
> --
> 2.17.1
>
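To spell out why the above scenario stops deadlocking once the SRCU reader
is replaced by the exit list: a task that blocks on the mutex during
do_exit() has voluntarily context switched, so it is in an RCU Tasks
quiescent state, and rcu_tasks_pertask() skips it because it is no longer
on_rq. Roughly (again only a sketch, reusing the hypothetical my_mutex from
above; note also that this backport collapses the changelog's per-CPU lists
into the single rtp_exit_list protected by rtp_exit_list_lock):

   Thread A                            Exiting task (in do_exit())
   --------                            ---------------------------
   mutex_lock(&my_mutex);
   synchronize_rcu_tasks();            exit_tasks_rcu_start();
     // GP kthread scans the             // added to rtp_exit_list
     // tasklist and rtp_exit_list     mutex_lock(&my_mutex);
     // the exiting task is sleeping     // sleeps: a voluntary context
     // on the mutex, so !t->on_rq       // switch, hence a Tasks RCU
     // and rcu_tasks_pertask()          // quiescent state
     // does not make it a holdout
     // => the grace period completes
   mutex_unlock(&my_mutex);            // my_mutex now available
                                       exit_tasks_rcu_finish();
                                         // removed from rtp_exit_list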