On Wed, Oct 09, 2024 at 11:05:08AM -0700, Paul E. McKenney wrote:
> Currently, once an RCU CPU stall warning decides to dump the stalling
> CPUs' stacks, the rcu_dump_cpu_stacks() function persists until it
> has gone through the full list. Unfortunately, if the stalled grace
> period ends midway through, this function will be dumping stacks of
> innocent-bystander CPUs that happen to be blocking not the old grace
> period, but instead the new one. This can cause serious confusion.
>
> This commit therefore stops dumping stacks if and when the stalled grace
> period ends.
>
> Signed-off-by: Paul E. McKenney <paulmck@xxxxxxxxxx>
> ---
>  kernel/rcu/tree_stall.h | 17 +++++++++++------
>  1 file changed, 11 insertions(+), 6 deletions(-)
>
> diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
> index d7cdd535e50b1..9d79133377ff6 100644
> --- a/kernel/rcu/tree_stall.h
> +++ b/kernel/rcu/tree_stall.h
> @@ -335,13 +335,17 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
>   * that don't support NMI-based stack dumps. The NMI-triggered stack
>   * traces are more accurate because they are printed by the target CPU.
>   */
> -static void rcu_dump_cpu_stacks(void)
> +static void rcu_dump_cpu_stacks(unsigned long gp_seq)
>  {
>  	int cpu;
>  	unsigned long flags;
>  	struct rcu_node *rnp;
>
>  	rcu_for_each_leaf_node(rnp) {
> +		if (gp_seq != rcu_state.gp_seq) {
> +			pr_err("INFO: Stall ended during stack backtracing.\n");
> +			return;
> +		}

Small nit: this also needs data_race(), like you did in the next patch?
Although you did delete this code in the next patch.

thanks,

 - Joel

>  		printk_deferred_enter();
>  		raw_spin_lock_irqsave_rcu_node(rnp, flags);
>  		for_each_leaf_node_possible_cpu(rnp, cpu)
> @@ -608,7 +612,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
>  		       (long)rcu_seq_current(&rcu_state.gp_seq), totqlen,
>  		       data_race(rcu_state.n_online_cpus)); // Diagnostic read
>  	if (ndetected) {
> -		rcu_dump_cpu_stacks();
> +		rcu_dump_cpu_stacks(gp_seq);
>
>  		/* Complain about tasks blocking the grace period. */
>  		rcu_for_each_leaf_node(rnp)
> @@ -640,7 +644,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
>  		rcu_force_quiescent_state(); /* Kick them all. */
>  }
>
> -static void print_cpu_stall(unsigned long gps)
> +static void print_cpu_stall(unsigned long gp_seq, unsigned long gps)
>  {
>  	int cpu;
>  	unsigned long flags;
> @@ -677,7 +681,7 @@ static void print_cpu_stall(unsigned long gps)
>  	rcu_check_gp_kthread_expired_fqs_timer();
>  	rcu_check_gp_kthread_starvation();
>
> -	rcu_dump_cpu_stacks();
> +	rcu_dump_cpu_stacks(gp_seq);
>
>  	raw_spin_lock_irqsave_rcu_node(rnp, flags);
>  	/* Rewrite if needed in case of slow consoles. */
> @@ -759,7 +763,8 @@ static void check_cpu_stall(struct rcu_data *rdp)
>  	gs2 = READ_ONCE(rcu_state.gp_seq);
>  	if (gs1 != gs2 ||
>  	    ULONG_CMP_LT(j, js) ||
> -	    ULONG_CMP_GE(gps, js))
> +	    ULONG_CMP_GE(gps, js) ||
> +	    !rcu_seq_state(gs2))
>  		return; /* No stall or GP completed since entering function. */
>  	rnp = rdp->mynode;
>  	jn = jiffies + ULONG_MAX / 2;
> @@ -780,7 +785,7 @@ static void check_cpu_stall(struct rcu_data *rdp)
>  		pr_err("INFO: %s detected stall, but suppressed full report due to a stuck CSD-lock.\n", rcu_state.name);
>  	} else if (self_detected) {
>  		/* We haven't checked in, so go dump stack. */
> -		print_cpu_stall(gps);
> +		print_cpu_stall(gs2, gps);
>  	} else {
>  		/* They had a few time units to dump stack, so complain. */
>  		print_other_cpu_stall(gs2, gps);
> --
> 2.40.1
>
>
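
For illustration, a minimal sketch of the data_race()-annotated check Joel
is suggesting, assuming the check stays at the top of the
rcu_for_each_leaf_node() loop as in this patch (the follow-up patch reworks
this code, so the actual version may differ):

	rcu_for_each_leaf_node(rnp) {
		/*
		 * Annotate the racy read of rcu_state.gp_seq: data_race()
		 * tells KCSAN the race is intentional, and READ_ONCE()
		 * keeps the compiler from tearing or refetching the load.
		 * A stale value is harmless here; it only means a few more
		 * stacks get dumped before the early exit triggers.
		 */
		if (gp_seq != data_race(READ_ONCE(rcu_state.gp_seq))) {
			pr_err("INFO: Stall ended during stack backtracing.\n");
			return;
		}
		printk_deferred_enter();
		/* ... remainder of the loop as in the patch ... */
	}

Whether to combine data_race() with READ_ONCE() or use data_race() alone is
a judgment call; either way marks the access as an intentional diagnostic
race.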