[PATCH v4 1/2] rcu: Use READ_ONCE() and WRITE_ONCE() for shared variable rcu_gp_ctr

Junchang Wang <junchangwang@xxxxxxxxx> · Wed, 1 May 2019 22:22:24 +0800

Replace racy plain accesses with READ_ONCE()/WRITE_ONCE(). And then update
the tex file accordingly. A new scheme, which directly extracts snippet
from rcu.[hc], is applied.

Signed-off-by: Junchang Wang <junchangwang@xxxxxxxxx>
Reviewed-by: Akira Yokosawa <akiyks@xxxxxxxxx>
---
 CodeSamples/defer/rcu.c    | 34 +++++++++-------
 CodeSamples/defer/rcu.h    | 21 ++++++----
 appendix/toyrcu/toyrcu.tex | 97 ++++++++++++++++++++--------------------------
 3 files changed, 74 insertions(+), 78 deletions(-)

diff --git a/CodeSamples/defer/rcu.c b/CodeSamples/defer/rcu.c
index 4b868cc..483b5fa 100644
--- a/CodeSamples/defer/rcu.c
+++ b/CodeSamples/defer/rcu.c
@@ -21,45 +21,51 @@
 #include "../api.h"
 #include "rcu.h"
 
-void synchronize_rcu(void)
+//\begin{snippet}[labelbase=ln:defer:rcu:synchronize,gobbleblank=yes,commandchars=\%\@\$]
+void synchronize_rcu(void)			//\lnlbl{syn:b}
 {
 	int t;
-
+							//\fcvblank
 	/* Memory barrier ensures mutation seen before grace period. */
 
-	smp_mb();
+	smp_mb();				//\lnlbl{syn:mb1}
 
 	/* Only one synchronize_rcu() at a time. */
 
-	spin_lock(&rcu_gp_lock);
+	spin_lock(&rcu_gp_lock);		//\lnlbl{syn:spinlock}
 
 	/* Advance to a new grace-period number, enforce ordering. */
 
-	rcu_gp_ctr += 2;
-	smp_mb();
+	WRITE_ONCE(rcu_gp_ctr, rcu_gp_ctr + 2); //\lnlbl{syn:increasegp}
+	smp_mb();				//\lnlbl{syn:mb2}
 
 	/*
 	 * Wait until all threads are either out of their RCU read-side
 	 * critical sections or are aware of the new grace period.
 	 */
 
-	for_each_thread(t) {
-		while ((per_thread(rcu_reader_gp, t) & 0x1) &&
-		       ((per_thread(rcu_reader_gp, t) - rcu_gp_ctr) < 0)) {
+	for_each_thread(t) {			//\lnlbl{syn:scan:b}
+		while ((per_thread(rcu_reader_gp, t) & 0x1) &&//\lnlbl{syn:even}
+		       ((per_thread(rcu_reader_gp, t) -	//\lnlbl{syn:gt1}
+		         rcu_gp_ctr) < 0)) {		//\lnlbl{syn:gt2}
+#ifndef FCV_SNIPPET
 			/*@@@ poll(NULL, 0, 10); */
 			barrier();
+#else
+			poll(NULL, 0, 10);	//\lnlbl{syn:poll}
+#endif
 		}
-	}
+	}					//\lnlbl{syn:scan:e}
 
 	/* Let other synchronize_rcu() instances move ahead. */
 
-	spin_unlock(&rcu_gp_lock);
+	spin_unlock(&rcu_gp_lock);		//\lnlbl{syn:spinunlock}
 
 	/* Ensure that any subsequent free()s happen -after- above checks. */
 
-	smp_mb();
-}
-
+	smp_mb();				//\lnlbl{syn:mb3}
+}						//\lnlbl{syn:e}
+//\end{snippet}
 #ifdef TEST
 #include "rcutorture.h"
 #endif /* #ifdef TEST */
diff --git a/CodeSamples/defer/rcu.h b/CodeSamples/defer/rcu.h
index f493f8b..3641c33 100644
--- a/CodeSamples/defer/rcu.h
+++ b/CodeSamples/defer/rcu.h
@@ -31,7 +31,8 @@ static inline void rcu_init(void)
 	init_per_thread(rcu_reader_gp_snap, 0);
 }
 
-static inline void rcu_read_lock(void)
+//\begin{snippet}[labelbase=ln:defer:rcu:read_lock_unlock,gobbleblank=yes,commandchars=\%\@\$]
+static inline void rcu_read_lock(void)			//\lnlbl{lock:b}
 {
 	/*
 	 * Copy the current GP counter to this thread's counter, setting
@@ -41,11 +42,12 @@ static inline void rcu_read_lock(void)
 	 * periodic per-thread processing.)
 	 */
 
-	__get_thread_var(rcu_reader_gp) = rcu_gp_ctr + 1;
-	smp_mb();
-}
-
-static inline void rcu_read_unlock(void)
+	__get_thread_var(rcu_reader_gp) =		//\lnlbl{lock:gp1}
+		READ_ONCE(rcu_gp_ctr) + 1;		//\lnlbl{lock:gp2}
+	smp_mb();					//\lnlbl{lock:mb}
+}							//\lnlbl{lock:e}
+								//\fcvblank
+static inline void rcu_read_unlock(void)		//\lnlbl{unlock:b}
 {
 	/*
 	 * Copy the current GP counter to this thread's counter, but
@@ -54,8 +56,11 @@ static inline void rcu_read_unlock(void)
 	 * the memory barrier requires periodic per-thread processing.)
 	 */
 
-	smp_mb();
-	__get_thread_var(rcu_reader_gp) = rcu_gp_ctr;
+	smp_mb();					//\lnlbl{unlock:mb}
+	__get_thread_var(rcu_reader_gp) =		//\lnlbl{unlock:gp1}
+		READ_ONCE(rcu_gp_ctr);			//\lnlbl{unlock:gp2}
 }
+								//\fcvblank
+//\end{snippet}
 
 extern void synchronize_rcu(void);
diff --git a/appendix/toyrcu/toyrcu.tex b/appendix/toyrcu/toyrcu.tex
index c0a45a8..7762925 100644
--- a/appendix/toyrcu/toyrcu.tex
+++ b/appendix/toyrcu/toyrcu.tex
@@ -1218,44 +1218,8 @@ thread-local accesses to one, as is done in the next section.
 \end{listing}
 
 \begin{listing}[tbp]
-{ \scriptsize
-\begin{verbbox}
- 1 static void rcu_read_lock(void)
- 2 {
- 3   __get_thread_var(rcu_reader_gp) =
- 4     ACCESS_ONCE(rcu_gp_ctr) + 1;
- 5   smp_mb();
- 6 }
- 7 
- 8 static void rcu_read_unlock(void)
- 9 {
-10   smp_mb();
-11   __get_thread_var(rcu_reader_gp) =
-12     ACCESS_ONCE(rcu_gp_ctr);
-13 }
-14 
-15 void synchronize_rcu(void)
-16 {
-17   int t;
-18 
-19   smp_mb();
-20   spin_lock(&rcu_gp_lock);
-21   ACCESS_ONCE(rcu_gp_ctr) += 2;
-22   smp_mb();
-23   for_each_thread(t) {
-24     while ((per_thread(rcu_reader_gp, t) & 0x1) &&
-25             ((per_thread(rcu_reader_gp, t) -
-26               ACCESS_ONCE(rcu_gp_ctr)) < 0)) {
-27       poll(NULL, 0, 10);
-28     }
-29   }
-30   spin_unlock(&rcu_gp_lock);
-31   smp_mb();
-32 }
-\end{verbbox}
-}
-\centering
-\theverbbox
+\input{CodeSamples/defer/rcu@read_lock_unlock.fcv}\vspace*{-11pt}\fvset{firstnumber=last}
+\input{CodeSamples/defer/rcu@xxxxxxxxxxxxxxx}\fvset{firstnumber=auto}
 \caption{Free-Running Counter Using RCU}
 \label{lst:app:toyrcu:Free-Running Counter Using RCU}
 \end{listing}
@@ -1267,25 +1231,34 @@ that takes on only even-numbered values, with data shown in
 Listing~\ref{lst:app:toyrcu:Data for Free-Running Counter Using RCU}.
 The resulting \co{rcu_read_lock()} implementation is extremely
 straightforward.
-Lines~3 and~4 simply add one to the global free-running \co{rcu_gp_ctr}
+\begin{lineref}[ln:defer:rcu:read_lock_unlock:lock]
+Lines~\lnref{gp1} and~\lnref{gp2} simply
+add one to the global free-running \co{rcu_gp_ctr}
 variable and stores the resulting odd-numbered value into the
 \co{rcu_reader_gp} per-thread variable.
-Line~5 executes a memory barrier to prevent the content of the
+Line~\lnref{mb} executes a memory barrier to prevent the content of the
 subsequent RCU read-side critical section from ``leaking out''.
+\end{lineref}
 
+\begin{lineref}[ln:defer:rcu:read_lock_unlock:unlock]
 The \co{rcu_read_unlock()} implementation is similar.
-Line~10 executes a memory barrier, again to prevent the prior RCU
+Line~\lnref{mb} executes a memory barrier, again to prevent the prior RCU
 read-side critical section from ``leaking out''.
-Lines~11 and~12 then copy the \co{rcu_gp_ctr} global variable to the
+Lines~\lnref{gp1} and~\lnref{gp2} then
+copy the \co{rcu_gp_ctr} global variable to the
 \co{rcu_reader_gp} per-thread variable, leaving this per-thread
 variable with an even-numbered value so that a concurrent instance
 of \co{synchronize_rcu()} will know to ignore it.
+\end{lineref}
 
 \QuickQuiz{}
+	\begin{lineref}[ln:defer:rcu:read_lock_unlock:unlock]
 	If any even value is sufficient to tell \co{synchronize_rcu()}
-	to ignore a given task, why don't lines~10 and~11 of
+	to ignore a given task, why don't lines~\lnref{gp1}
+	and~\lnref{gp2} of
 	Listing~\ref{lst:app:toyrcu:Free-Running Counter Using RCU}
 	simply assign zero to \co{rcu_reader_gp}?
+	\end{lineref}
 \QuickQuizAnswer{
 	Assigning zero (or any other even-numbered constant)
 	would in fact work, but assigning the value of
@@ -1294,39 +1267,46 @@ of \co{synchronize_rcu()} will know to ignore it.
 	thread last exited an RCU read-side critical section.
 } \QuickQuizEnd
 
+\begin{lineref}[ln:defer:rcu:synchronize:syn]
 Thus, \co{synchronize_rcu()} could wait for all of the per-thread
 \co{rcu_reader_gp} variables to take on even-numbered values.
 However, it is possible to do much better than that because
 \co{synchronize_rcu()} need only wait on \emph{pre-existing}
 RCU read-side critical sections.
-Line~19 executes a memory barrier to prevent prior manipulations
+Line~\lnref{mb1} executes a memory barrier to prevent prior manipulations
 of RCU-protected data structures from being reordered (by either
-the CPU or the compiler) to follow the increment on line~21.
-Line~20 acquires the \co{rcu_gp_lock} (and line~30 releases it)
+the CPU or the compiler) to follow the increment on
+line~\lnref{increasegp}.
+Line~\lnref{spinlock} acquires the \co{rcu_gp_lock}
+(and line~\lnref{spinunlock} releases it)
 in order to prevent multiple
 \co{synchronize_rcu()} instances from running concurrently.
-Line~21 then increments the global \co{rcu_gp_ctr} variable by
+Line~\lnref{increasegp} then increments the global \co{rcu_gp_ctr} variable by
 two, so that all pre-existing RCU read-side critical sections will
 have corresponding per-thread \co{rcu_reader_gp} variables with
 values less than that of \co{rcu_gp_ctr}, modulo the machine's
 word size.
 Recall also that threads with even-numbered values of \co{rcu_reader_gp}
 are not in an RCU read-side critical section,
-so that lines~23-29 scan the \co{rcu_reader_gp} values until they
-all are either even (line~24) or are greater than the global
-\co{rcu_gp_ctr} (lines~25-26).
-Line~27 blocks for a short period of time to wait for a
+so that lines~\lnref{scan:b}-\lnref{scan:e}
+scan the \co{rcu_reader_gp} values until they
+all are either even (line~\lnref{even}) or are greater than the global
+\co{rcu_gp_ctr} (lines~\lnref{gt1}-\lnref{gt2}).
+Line~\lnref{poll} blocks for a short period of time to wait for a
 pre-existing RCU read-side critical section, but this can be replaced with
 a spin-loop if grace-period latency is of the essence.
-Finally, the memory barrier at line~31 ensures that any subsequent
+Finally, the memory barrier at line~\lnref{mb3} ensures that any subsequent
 destruction will not be reordered into the preceding loop.
+\end{lineref}
 
 \QuickQuiz{}
-	Why are the memory barriers on lines~19 and~31 of
+	\begin{lineref}[ln:defer:rcu:synchronize:syn]
+	Why are the memory barriers on lines~\lnref{mb1} and~\lnref{mb3} of
 	Listing~\ref{lst:app:toyrcu:Free-Running Counter Using RCU}
 	needed?
 	Aren't the memory barriers inherent in the locking
-	primitives on lines~20 and~30 sufficient?
+	primitives on lines~\lnref{spinlock} and~\lnref{spinunlock} sufficient?
+	\end{lineref}
 \QuickQuizAnswer{
 	These memory barriers are required because the locking
 	primitives are only guaranteed to confine the critical
@@ -1355,11 +1335,12 @@ such CPUs.
 	This work is left as an exercise for the reader.
 } \QuickQuizEnd
 
+\begin{lineref}[ln:defer:rcu:read_lock_unlock:lock]
 This implementation suffers from some serious shortcomings in
 addition to the high update-side overhead noted earlier.
 First, it is no longer permissible to nest RCU read-side critical
 sections, a topic that is taken up in the next section.
-Second, if a reader is preempted at line~3 of
+Second, if a reader is preempted at line~\lnref{gp1} of
 Listing~\ref{lst:app:toyrcu:Free-Running Counter Using RCU} after fetching from
 \co{rcu_gp_ctr} but before storing to \co{rcu_reader_gp},
 and if the \co{rcu_gp_ctr} counter then runs through more than half
@@ -1368,15 +1349,19 @@ will ignore the subsequent RCU read-side critical section.
 Third and finally, this implementation requires that the enclosing software
 environment be able to enumerate threads and maintain per-thread
 variables.
+\end{lineref}
 
 \QuickQuiz{}
+	\begin{lineref}[ln:defer:rcu:read_lock_unlock:lock]
 	Is the possibility of readers being preempted in
-	lines~3-4 of Listing~\ref{lst:app:toyrcu:Free-Running Counter Using RCU}
+	lines~\lnref{gp1}-\lnref{gp2} of
+	Listing~\ref{lst:app:toyrcu:Free-Running Counter Using RCU}
 	a real problem, in other words, is there a real sequence
 	of events that could lead to failure?
 	If not, why not?
 	If so, what is the sequence of events, and how can the
 	failure be addressed?
+	\end{lineref}
 \QuickQuizAnswer{
 	It is a real problem, there is a sequence of events leading to
 	failure, and there are a number of possible ways of
-- 
2.7.4