On Thu, Oct 15, 2020 at 12:39:54AM +0200, Peter Zijlstra wrote:
> On Wed, Oct 14, 2020 at 03:11:52PM -0700, Paul E. McKenney wrote:
> > On Wed, Oct 14, 2020 at 11:53:19PM +0200, Peter Zijlstra wrote:
> > > On Wed, Oct 14, 2020 at 11:34:05AM -0700, Paul E. McKenney wrote:
> > > > commit 7deaa04b02298001426730ed0e6214ac20d1a1c1
> > > > Author: Paul E. McKenney <paulmck@xxxxxxxxxx>
> > > > Date:   Tue Oct 13 12:39:23 2020 -0700
> > > > 
> > > >     rcu: Prevent lockdep-RCU splats on lock acquisition/release
> > > >     
> > > >     The rcu_cpu_starting() and rcu_report_dead() functions transition the
> > > >     current CPU between online and offline state from an RCU perspective.
> > > >     Unfortunately, this means that the rcu_cpu_starting() function's lock
> > > >     acquisition and the rcu_report_dead() function's lock releases happen
> > > >     while the CPU is offline from an RCU perspective, which can result in
> > > >     lockdep-RCU splats about using RCU from an offline CPU.  In reality,
> > > >     aside from the splats, both transitions are safe because a new grace
> > > >     period cannot start until these functions release their locks.
> > > 
> > > But we call the trace_* crud before we acquire the lock. Are you sure
> > > that's a false-positive?
> > 
> > You lost me on this one.
> > 
> > I am assuming that you are talking about rcu_cpu_starting(), because
> > that is the one where RCU is not initially watching, that is, the
> > case where tracing before the lock acquisition would be a problem.
> > You cannot be talking about rcu_cpu_starting() itself, because it does
> > not do any tracing before acquiring the lock.  But if you are talking
> > about the caller of rcu_cpu_starting(), then that caller should put the
> > rcu_cpu_starting() before the tracing.  But that would be the other
> > patch earlier in this thread that was proposing moving the call to
> > rcu_cpu_starting() much earlier in CPU bringup.
> > 
> > So what am I missing here?
> 
> rcu_cpu_starting();
>   raw_spin_lock_irqsave();
>     local_irq_save();
>     preempt_disable();
>     spin_acquire()
>       lock_acquire()
>         trace_lock_acquire() <--- *whoopsie-doodle*
>         /* uses RCU for tracing */
>     arch_spin_lock_flags() <--- the actual spinlock

Gah!  Idiot here left out the most important part, so good catch!!!
Much easier this way than finding out about it the hard way...

I should have asked myself harder questions earlier today about moving
the counter from the rcu_node structure to the rcu_data structure.

Perhaps something like the following untested patch on top of the
earlier patch?
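(Illustrative aside, not part of the patch that follows: the per-node
->ofl_seq is intended to behave like an ordinary odd/even sequence count,
so a hypothetical helper asking "is a CPU-hotplug transition in flight on
this rcu_node?" would look roughly like the sketch below.)

        /*
         * Sketch only, not in the patch: an odd ->ofl_seq value means that
         * rcu_cpu_starting() or rcu_report_dead() is between its two
         * increments, so RCU-lockdep must treat the CPU as online even
         * though ->qsmaskinitnext may not yet (or may no longer) say so.
         */
        static bool rnp_ofl_in_flight(struct rcu_node *rnp)
        {
                return READ_ONCE(rnp->ofl_seq) & 0x1;
        }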
							Thanx, Paul

------------------------------------------------------------------------

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 286dc0a..8b5215e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1159,8 +1159,8 @@ bool rcu_lockdep_current_cpu_online(void)
 	preempt_disable_notrace();
 	rdp = this_cpu_ptr(&rcu_data);
 	rnp = rdp->mynode;
-	seq = READ_ONCE(rdp->ofl_seq) & ~0x1;
-	if (rdp->grpmask & rcu_rnp_online_cpus(rnp) || seq != READ_ONCE(rdp->ofl_seq))
+	seq = READ_ONCE(rnp->ofl_seq) & ~0x1;
+	if (rdp->grpmask & rcu_rnp_online_cpus(rnp) || seq != READ_ONCE(rnp->ofl_seq))
 		ret = true;
 	preempt_enable_notrace();
 	return ret;
@@ -1982,6 +1982,7 @@ static void rcu_gp_fqs_loop(void)
 static void rcu_gp_cleanup(void)
 {
 	int cpu;
+	unsigned long firstseq;
 	bool needgp = false;
 	unsigned long gp_duration;
 	unsigned long new_gp_seq;
@@ -2019,6 +2020,12 @@ static void rcu_gp_cleanup(void)
 	new_gp_seq = rcu_state.gp_seq;
 	rcu_seq_end(&new_gp_seq);
 	rcu_for_each_node_breadth_first(rnp) {
+		smp_mb(); // Pair with barriers used when updating ->ofl_seq to odd values.
+		firstseq = READ_ONCE(rnp->ofl_seq);
+		if (firstseq & 0x1)
+			while (firstseq == smp_load_acquire(&rnp->ofl_seq))
+				schedule_timeout_idle(1); // Can't wake unless RCU is watching.
+		smp_mb(); // Pair with barriers used when updating ->ofl_seq to even values.
 		raw_spin_lock_irq_rcu_node(rnp);
 		if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
 			dump_blkd_tasks(rnp, 10);
@@ -4067,8 +4074,9 @@ void rcu_cpu_starting(unsigned int cpu)
 
 	rnp = rdp->mynode;
 	mask = rdp->grpmask;
-	WRITE_ONCE(rdp->ofl_seq, rdp->ofl_seq + 1);
-	WARN_ON_ONCE(!(rdp->ofl_seq & 0x1));
+	WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
+	WARN_ON_ONCE(!(rnp->ofl_seq & 0x1));
+	smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
 	raw_spin_lock_irqsave_rcu_node(rnp, flags);
 	WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask);
 	newcpu = !(rnp->expmaskinitnext & mask);
@@ -4088,8 +4096,9 @@ void rcu_cpu_starting(unsigned int cpu)
 	} else {
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 	}
-	WRITE_ONCE(rdp->ofl_seq, rdp->ofl_seq + 1);
-	WARN_ON_ONCE(rdp->ofl_seq & 0x1);
+	smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
+	WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
+	WARN_ON_ONCE(rnp->ofl_seq & 0x1);
 	smp_mb(); /* Ensure RCU read-side usage follows above initialization. */
 }
 
@@ -4117,8 +4126,9 @@ void rcu_report_dead(unsigned int cpu)
 
 	/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
 	mask = rdp->grpmask;
-	WRITE_ONCE(rdp->ofl_seq, rdp->ofl_seq + 1);
-	WARN_ON_ONCE(!(rdp->ofl_seq & 0x1));
+	WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
+	WARN_ON_ONCE(!(rnp->ofl_seq & 0x1));
+	smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
 	raw_spin_lock(&rcu_state.ofl_lock);
 	raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
 	rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
@@ -4131,8 +4141,9 @@ void rcu_report_dead(unsigned int cpu)
 	WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 	raw_spin_unlock(&rcu_state.ofl_lock);
-	WRITE_ONCE(rdp->ofl_seq, rdp->ofl_seq + 1);
-	WARN_ON_ONCE(rdp->ofl_seq & 0x1);
+	smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
+	WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
+	WARN_ON_ONCE(rnp->ofl_seq & 0x1);
 	rdp->cpu_started = false;
 }
 
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index bf0198d..7708ed1 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -56,6 +56,7 @@ struct rcu_node {
 					/*  Initialized from ->qsmaskinitnext at the */
 					/*  beginning of each grace period. */
 	unsigned long qsmaskinitnext;
+	unsigned long ofl_seq;		/* CPU-hotplug operation sequence count. */
 					/* Online CPUs for next grace period. */
 	unsigned long expmask;		/* CPUs or groups that need to check in */
 					/*  to allow the current expedited GP */
@@ -250,7 +251,6 @@ struct rcu_data {
 	unsigned long rcu_onl_gp_seq;	/* ->gp_seq at last online. */
 	short rcu_onl_gp_flags;		/* ->gp_flags at last online. */
 	unsigned long last_fqs_resched;	/* Time of last rcu_resched(). */
-	unsigned long ofl_seq;		/* CPU-hotplug operation sequence count. */
 
 	int cpu;
 };
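
For reference, the write side of the handshake in rcu_cpu_starting() and
rcu_report_dead() then boils down to the following pattern (again just an
illustrative condensation of the hunks above; the helper name is made up
and does not exist in the patch):

        /* Illustrative condensation of the ->ofl_seq write-side pattern above. */
        static void rnp_ofl_transition(struct rcu_node *rnp)
        {
                WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1); /* Now odd: transition in flight. */
                WARN_ON_ONCE(!(rnp->ofl_seq & 0x1));
                smp_mb(); /* Pair with rcu_gp_cleanup()'s first ->ofl_seq barrier. */

                /* ... adjust ->qsmaskinitnext under the rcu_node lock ... */

                smp_mb(); /* Pair with rcu_gp_cleanup()'s second ->ofl_seq barrier. */
                WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1); /* Now even: transition complete. */
                WARN_ON_ONCE(rnp->ofl_seq & 0x1);
        }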