On Wed, Feb 07, 2024 at 07:09:51PM +0800, Zqiang wrote:
> From: Paul E. McKenney <paulmck@xxxxxxxxxx>
>
> commit bc31e6cb27a9334140ff2f0a209d59b08bc0bc8c upstream.
>
> Holding a mutex across synchronize_rcu_tasks() and acquiring
> that same mutex in code called from do_exit() after its call to
> exit_tasks_rcu_start() but before its call to exit_tasks_rcu_stop()
> results in deadlock. This is by design, because tasks that are far
> enough into do_exit() are no longer present on the tasks list, making
> it a bit difficult for RCU Tasks to find them, let alone wait on them
> to do a voluntary context switch. However, such deadlocks are becoming
> more frequent. In addition, lockdep currently does not detect such
> deadlocks and they can be difficult to reproduce.
>
> In addition, if a task voluntarily context switches during that time
> (for example, if it blocks acquiring a mutex), then this task is in an
> RCU Tasks quiescent state. And with some adjustments, RCU Tasks could
> just as well take advantage of that fact.
>
> This commit therefore eliminates these deadlocks by replacing the
> SRCU-based wait for do_exit() completion with per-CPU lists of tasks
> currently exiting. A given task will be on one of these per-CPU lists for
> the same period of time that this task would previously have been in the
> previous SRCU read-side critical section. These lists enable RCU Tasks
> to find the tasks that have already been removed from the tasks list,
> but that must nevertheless be waited upon.
>
> The RCU Tasks grace period gathers any of these do_exit() tasks that it
> must wait on, and adds them to the list of holdouts. Per-CPU locking
> and get_task_struct() are used to synchronize addition to and removal
> from these lists.
>
> Link: https://lore.kernel.org/all/20240118021842.290665-1-chenzhongjin@xxxxxxxxxx/
>
> Reported-by: Chen Zhongjin <chenzhongjin@xxxxxxxxxx>
> Signed-off-by: Paul E. McKenney <paulmck@xxxxxxxxxx>
> Signed-off-by: Zqiang <qiang.zhang1211@xxxxxxxxx>

And this one also looks good, though it would also be good to get
other eyes on it.

And again, thank you for putting this together!
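For anyone else reviewing this, here is a minimal, hypothetical sketch of
the deadlock pattern the commit message describes. The names demo_mutex,
demo_updater(), and demo_exit_hook() are made up for illustration and do
not come from any in-tree code:

#include <linux/mutex.h>
#include <linux/rcupdate.h>

static DEFINE_MUTEX(demo_mutex);	/* hypothetical lock shared by both paths */

/* Path A: an updater holds demo_mutex across synchronize_rcu_tasks(). */
static void demo_updater(void)
{
	mutex_lock(&demo_mutex);
	synchronize_rcu_tasks();	/* must wait on every task, including exiting ones */
	mutex_unlock(&demo_mutex);
}

/*
 * Path B: code reached from do_exit() after exit_tasks_rcu_start() but
 * before exit_tasks_rcu_stop() acquires that same mutex.
 */
static void demo_exit_hook(void)
{
	mutex_lock(&demo_mutex);	/* blocks behind demo_updater()... */
	/* ... */
	mutex_unlock(&demo_mutex);
}

With the old scheme, rcu_tasks_postscan() invokes
synchronize_srcu(&tasks_rcu_exit_srcu), which cannot return until the
exiting task reaches exit_tasks_rcu_stop(); that task is blocked on
demo_mutex, whose holder is in turn blocked in synchronize_rcu_tasks(),
hence the deadlock. With this patch, the exiting task is instead found
on the exit list and treated as an ordinary holdout, and its blocking
on the mutex is a voluntary context switch, that is, a Tasks RCU
quiescent state.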
							Thanx, Paul

> ---
>  include/linux/sched.h |  1 +
>  init/init_task.c      |  1 +
>  kernel/fork.c         |  1 +
>  kernel/rcu/tasks.h    | 54 ++++++++++++++++++++++++++++---------------
>  4 files changed, 39 insertions(+), 18 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index aa015416c569..80499f7ab39a 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -740,6 +740,7 @@ struct task_struct {
>  	u8 rcu_tasks_idx;
>  	int rcu_tasks_idle_cpu;
>  	struct list_head rcu_tasks_holdout_list;
> +	struct list_head rcu_tasks_exit_list;
>  #endif /* #ifdef CONFIG_TASKS_RCU */
>
>  #ifdef CONFIG_TASKS_TRACE_RCU
> diff --git a/init/init_task.c b/init/init_task.c
> index 5fa18ed59d33..59454d6e2c2a 100644
> --- a/init/init_task.c
> +++ b/init/init_task.c
> @@ -151,6 +151,7 @@ struct task_struct init_task
>  	.rcu_tasks_holdout = false,
>  	.rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list),
>  	.rcu_tasks_idle_cpu = -1,
> +	.rcu_tasks_exit_list = LIST_HEAD_INIT(init_task.rcu_tasks_exit_list),
>  #endif
>  #ifdef CONFIG_TASKS_TRACE_RCU
>  	.trc_reader_nesting = 0,
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 633b0af1d1a7..86803165aa00 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1699,6 +1699,7 @@ static inline void rcu_copy_process(struct task_struct *p)
>  	p->rcu_tasks_holdout = false;
>  	INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
>  	p->rcu_tasks_idle_cpu = -1;
> +	INIT_LIST_HEAD(&p->rcu_tasks_exit_list);
>  #endif /* #ifdef CONFIG_TASKS_RCU */
>  #ifdef CONFIG_TASKS_TRACE_RCU
>  	p->trc_reader_nesting = 0;
> diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
> index c5624ab0580c..901cd7bc78ed 100644
> --- a/kernel/rcu/tasks.h
> +++ b/kernel/rcu/tasks.h
> @@ -81,9 +81,6 @@ static struct rcu_tasks rt_name = \
>  	.kname = #rt_name, \
>  }
>
> -/* Track exiting tasks in order to allow them to be waited for. */
> -DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);
> -
>  /* Avoid IPIing CPUs early in the grace period. */
>  #define RCU_TASK_IPI_DELAY (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) ? HZ / 2 : 0)
>  static int rcu_task_ipi_delay __read_mostly = RCU_TASK_IPI_DELAY;
> @@ -383,6 +380,9 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
>  // rates from multiple CPUs. If this is required, per-CPU callback lists
>  // will be needed.
>
> +static LIST_HEAD(rtp_exit_list);
> +static DEFINE_RAW_SPINLOCK(rtp_exit_list_lock);
> +
>  /* Pre-grace-period preparation. */
>  static void rcu_tasks_pregp_step(void)
>  {
> @@ -416,15 +416,18 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop)
>  /* Processing between scanning taskslist and draining the holdout list. */
>  static void rcu_tasks_postscan(struct list_head *hop)
>  {
> +	unsigned long flags;
> +	struct task_struct *t;
> +
>  	/*
>  	 * Exiting tasks may escape the tasklist scan. Those are vulnerable
>  	 * until their final schedule() with TASK_DEAD state. To cope with
>  	 * this, divide the fragile exit path part in two intersecting
>  	 * read side critical sections:
>  	 *
> -	 * 1) An _SRCU_ read side starting before calling exit_notify(),
> -	 *    which may remove the task from the tasklist, and ending after
> -	 *    the final preempt_disable() call in do_exit().
> +	 * 1) A task_struct list addition before calling exit_notify(),
> +	 *    which may remove the task from the tasklist, with the
> +	 *    removal after the final preempt_disable() call in do_exit().
>  	 *
>  	 * 2) An _RCU_ read side starting with the final preempt_disable()
>  	 *    call in do_exit() and ending with the final call to schedule()
> @@ -433,7 +436,12 @@ static void rcu_tasks_postscan(struct list_head *hop)
>  	 * This handles the part 1). And postgp will handle part 2) with a
>  	 * call to synchronize_rcu().
>  	 */
> -	synchronize_srcu(&tasks_rcu_exit_srcu);
> +	raw_spin_lock_irqsave(&rtp_exit_list_lock, flags);
> +	list_for_each_entry(t, &rtp_exit_list, rcu_tasks_exit_list) {
> +		if (list_empty(&t->rcu_tasks_holdout_list))
> +			rcu_tasks_pertask(t, hop);
> +	}
> +	raw_spin_unlock_irqrestore(&rtp_exit_list_lock, flags);
>  }
>
>  /* See if tasks are still holding out, complain if so. */
> @@ -498,7 +506,6 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp)
>  	 *
>  	 * In addition, this synchronize_rcu() waits for exiting tasks
>  	 * to complete their final preempt_disable() region of execution,
> -	 * cleaning up after synchronize_srcu(&tasks_rcu_exit_srcu),
>  	 * enforcing the whole region before tasklist removal until
>  	 * the final schedule() with TASK_DEAD state to be an RCU TASKS
>  	 * read side critical section.
> @@ -591,25 +598,36 @@ static void show_rcu_tasks_classic_gp_kthread(void)
>  #endif /* #ifndef CONFIG_TINY_RCU */
>
>  /*
> - * Contribute to protect against tasklist scan blind spot while the
> - * task is exiting and may be removed from the tasklist. See
> - * corresponding synchronize_srcu() for further details.
> + * Protect against tasklist scan blind spot while the task is exiting and
> + * may be removed from the tasklist. Do this by adding the task to yet
> + * another list.
>   */
> -void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu)
> +void exit_tasks_rcu_start(void)
>  {
> -	current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu);
> +	unsigned long flags;
> +	struct task_struct *t = current;
> +
> +	WARN_ON_ONCE(!list_empty(&t->rcu_tasks_exit_list));
> +	get_task_struct(t);
> +	raw_spin_lock_irqsave(&rtp_exit_list_lock, flags);
> +	list_add(&t->rcu_tasks_exit_list, &rtp_exit_list);
> +	raw_spin_unlock_irqrestore(&rtp_exit_list_lock, flags);
>  }
>
>  /*
> - * Contribute to protect against tasklist scan blind spot while the
> - * task is exiting and may be removed from the tasklist. See
> - * corresponding synchronize_srcu() for further details.
> + * Remove the task from the "yet another list" because do_exit() is now
> + * non-preemptible, allowing synchronize_rcu() to wait beyond this point.
>   */
> -void exit_tasks_rcu_stop(void) __releases(&tasks_rcu_exit_srcu)
> +void exit_tasks_rcu_stop(void)
>  {
> +	unsigned long flags;
>  	struct task_struct *t = current;
>
> -	__srcu_read_unlock(&tasks_rcu_exit_srcu, t->rcu_tasks_idx);
> +	WARN_ON_ONCE(list_empty(&t->rcu_tasks_exit_list));
> +	raw_spin_lock_irqsave(&rtp_exit_list_lock, flags);
> +	list_del_init(&t->rcu_tasks_exit_list);
> +	raw_spin_unlock_irqrestore(&rtp_exit_list_lock, flags);
> +	put_task_struct(t);
>  }
>
>  /*
> --
> 2.17.1
>
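As context for the "two intersecting read side critical sections"
comment above, the hooks changed here are driven from do_exit().
Roughly, and heavily abridged from memory rather than taken from this
patch, the ordering is:

/* Abridged sketch of the relevant ordering in kernel/exit.c:do_exit(). */
void __noreturn do_exit(long code)
{
	struct task_struct *tsk = current;

	/* ... */
	exit_tasks_rcu_start();		/* with this patch: add tsk to rtp_exit_list */
	exit_notify(tsk, group_dead);	/* may remove tsk from the tasklist */
	/* ... */
	preempt_disable();		/* start of the RCU read side covered by postgp */
	exit_rcu();
	exit_tasks_rcu_finish();	/* wraps exit_tasks_rcu_stop(): delete tsk from the list */
	/* ... */
	do_task_dead();			/* final schedule() with TASK_DEAD */
}

The list membership (part 1) covers the window in which the task can
disappear from the tasklist, and the preempt_disable() region (part 2)
is what the synchronize_rcu() in rcu_tasks_postgp() waits out.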