Re: [PATCH 02/11] rcu: Kill rnp->ofl_seq and use only rcu_state.ofl_lock for exclusion

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi,

On 12/9/2021 8:39 PM, David Woodhouse wrote:
From: David Woodhouse <dwmw@xxxxxxxxxxxx>

If we allow architectures to bring APs online in parallel, then we end
up requiring rcu_cpu_starting() to be reentrant. But currently, the
manipulation of rnp->ofl_seq is not thread-safe.

However, rnp->ofl_seq is also fairly much pointless anyway since both
rcu_cpu_starting() and rcu_report_dead() hold rcu_state.ofl_lock for
fairly much the whole time that rnp->ofl_seq is set to an odd number
to indicate that an operation is in progress.

So drop rnp->ofl_seq completely, and use only rcu_state.ofl_lock.

This has a couple of minor complexities: lockdep will complain when we
take rcu_state.ofl_lock, and currently accepts the 'excuse' of having
an odd value in rnp->ofl_seq. So switch it to an arch_spinlock_t to
avoid that false positive complaint. Since we're killing rnp->ofl_seq
of course that 'excuse' has to be changed too, so make it check for
arch_spin_is_locked(rcu_state.ofl_lock).

There's no arch_spin_lock_irqsave() so we have to manually save and
restore local interrupts around the locking.

At Paul's request, make rcu_gp_init not just wait but *exclude* any
CPU online/offline activity, which was fairly much true already by
virtue of it holding rcu_state.ofl_lock.

Signed-off-by: David Woodhouse <dwmw@xxxxxxxxxxxx>
---
  kernel/rcu/tree.c | 64 +++++++++++++++++++++++------------------------
  kernel/rcu/tree.h |  4 +--
  2 files changed, 32 insertions(+), 36 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index ef8d36f580fc..a1bb0b1229ed 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -91,7 +91,7 @@ static struct rcu_state rcu_state = {
  	.abbr = RCU_ABBR,
  	.exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex),
  	.exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex),
-	.ofl_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.ofl_lock),
+	.ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED,
  };
/* Dump rcu_node combining tree at boot to verify correct setup. */
@@ -1168,7 +1168,15 @@ bool rcu_lockdep_current_cpu_online(void)
  	preempt_disable_notrace();
  	rdp = this_cpu_ptr(&rcu_data);
  	rnp = rdp->mynode;
-	if (rdp->grpmask & rcu_rnp_online_cpus(rnp) || READ_ONCE(rnp->ofl_seq) & 0x1)
+	/*
+	 * Strictly, we care here about the case where the current CPU is
+	 * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask
+	 * not being up to date. So arch_spin_is_locked() might have a
+	 * false positive if it's held by some *other* CPU, but that's
+	 * OK because that just means a false *negative* on the warning.
+	 */
+	if (rdp->grpmask & rcu_rnp_online_cpus(rnp) ||
+	    arch_spin_is_locked(&rcu_state.ofl_lock))
  		ret = true;
  	preempt_enable_notrace();
  	return ret;
@@ -1731,7 +1739,6 @@ static void rcu_strict_gp_boundary(void *unused)
   */
  static noinline_for_stack bool rcu_gp_init(void)
  {
-	unsigned long firstseq;
  	unsigned long flags;
  	unsigned long oldmask;
  	unsigned long mask;
@@ -1774,22 +1781,17 @@ static noinline_for_stack bool rcu_gp_init(void)
  	 * of RCU's Requirements documentation.
  	 */
  	WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF);
+	/* Exclude CPU hotplug operations. */
  	rcu_for_each_leaf_node(rnp) {
-		// Wait for CPU-hotplug operations that might have
-		// started before this grace period did.
-		smp_mb(); // Pair with barriers used when updating ->ofl_seq to odd values.
-		firstseq = READ_ONCE(rnp->ofl_seq);
-		if (firstseq & 0x1)
-			while (firstseq == READ_ONCE(rnp->ofl_seq))
-				schedule_timeout_idle(1);  // Can't wake unless RCU is watching.
-		smp_mb(); // Pair with barriers used when updating ->ofl_seq to even values.
-		raw_spin_lock(&rcu_state.ofl_lock);
-		raw_spin_lock_irq_rcu_node(rnp);
+		local_irq_save(flags);
+		arch_spin_lock(&rcu_state.ofl_lock);
+		raw_spin_lock_rcu_node(rnp);
  		if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
  		    !rnp->wait_blkd_tasks) {
  			/* Nothing to do on this leaf rcu_node structure. */
-			raw_spin_unlock_irq_rcu_node(rnp);
-			raw_spin_unlock(&rcu_state.ofl_lock);
+			raw_spin_unlock_rcu_node(rnp);
+			arch_spin_unlock(&rcu_state.ofl_lock);
+			local_irq_restore(flags);
  			continue;
  		}
@@ -1824,8 +1826,9 @@ static noinline_for_stack bool rcu_gp_init(void)
  				rcu_cleanup_dead_rnp(rnp);
  		}
- raw_spin_unlock_irq_rcu_node(rnp);
-		raw_spin_unlock(&rcu_state.ofl_lock);
+		raw_spin_unlock_rcu_node(rnp);
+		arch_spin_unlock(&rcu_state.ofl_lock);
+		local_irq_restore(flags);
  	}
  	rcu_gp_slow(gp_preinit_delay); /* Races with CPU hotplug. */
@@ -4233,7 +4236,7 @@ int rcutree_offline_cpu(unsigned int cpu)
   */
  void rcu_cpu_starting(unsigned int cpu)
  {
-	unsigned long flags;
+	unsigned long flags, seq_flags;
  	unsigned long mask;
  	struct rcu_data *rdp;
  	struct rcu_node *rnp;
@@ -4246,11 +4249,11 @@ void rcu_cpu_starting(unsigned int cpu)
rnp = rdp->mynode;
  	mask = rdp->grpmask;
-	WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
-	WARN_ON_ONCE(!(rnp->ofl_seq & 0x1));
+	local_irq_save(seq_flags);
+	arch_spin_lock(&rcu_state.ofl_lock);
  	rcu_dynticks_eqs_online();
  	smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().

Can we drop this smp_mb(),as arch_spin_lock(&rcu_state.ofl_lock) provides the ordering now?

-	raw_spin_lock_irqsave_rcu_node(rnp, flags);
+	raw_spin_lock_rcu_node(rnp);
  	WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask);
  	newcpu = !(rnp->expmaskinitnext & mask);
  	rnp->expmaskinitnext |= mask;
@@ -4269,9 +4272,8 @@ void rcu_cpu_starting(unsigned int cpu)
  	} else {
  		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);

'flags' is uninitialized now?



Thanks
Neeraj

  	}
-	smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
-	WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
-	WARN_ON_ONCE(rnp->ofl_seq & 0x1);
+	arch_spin_unlock(&rcu_state.ofl_lock);
+	local_irq_restore(seq_flags);
  	smp_mb(); /* Ensure RCU read-side usage follows above initialization. */
  }
@@ -4285,7 +4287,7 @@ void rcu_cpu_starting(unsigned int cpu)
   */
  void rcu_report_dead(unsigned int cpu)
  {
-	unsigned long flags;
+	unsigned long flags, seq_flags;
  	unsigned long mask;
  	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
  	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
@@ -4299,10 +4301,8 @@ void rcu_report_dead(unsigned int cpu)
/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
  	mask = rdp->grpmask;
-	WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
-	WARN_ON_ONCE(!(rnp->ofl_seq & 0x1));
-	smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
-	raw_spin_lock(&rcu_state.ofl_lock);
+	local_irq_save(seq_flags);
+	arch_spin_lock(&rcu_state.ofl_lock);
  	raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
  	rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
  	rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags);
@@ -4313,10 +4313,8 @@ void rcu_report_dead(unsigned int cpu)
  	}
  	WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
  	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-	raw_spin_unlock(&rcu_state.ofl_lock);
-	smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
-	WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
-	WARN_ON_ONCE(rnp->ofl_seq & 0x1);
+	arch_spin_unlock(&rcu_state.ofl_lock);
+	local_irq_restore(seq_flags);
rdp->cpu_started = false;
  }
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 305cf6aeb408..aff4cc9303fb 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -56,8 +56,6 @@ struct rcu_node {
  				/*  Initialized from ->qsmaskinitnext at the */
  				/*  beginning of each grace period. */
  	unsigned long qsmaskinitnext;
-	unsigned long ofl_seq;	/* CPU-hotplug operation sequence count. */
-				/* Online CPUs for next grace period. */
  	unsigned long expmask;	/* CPUs or groups that need to check in */
  				/*  to allow the current expedited GP */
  				/*  to complete. */
@@ -358,7 +356,7 @@ struct rcu_state {
  	const char *name;			/* Name of structure. */
  	char abbr;				/* Abbreviated name. */
- raw_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp;
+	arch_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp;
  						/* Synchronize offline with */
  						/*  GP pre-initialization. */
  };




[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux