Replace racy plain accesses with READ_ONCE()/WRITE_ONCE(). And then update the tex file accordingly. A new scheme, which directly extracts snippet from rcu_qs.[hc], is applied. Signed-off-by: Junchang Wang <junchangwang@xxxxxxxxx> Reviewed-by: Akira Yokosawa <akiyks@xxxxxxxxx> --- CodeSamples/defer/rcu_qs.c | 31 ++++++++----- CodeSamples/defer/rcu_qs.h | 42 +++++++++-------- appendix/toyrcu/toyrcu.tex | 109 +++++++++++++++------------------------------ 3 files changed, 78 insertions(+), 104 deletions(-) diff --git a/CodeSamples/defer/rcu_qs.c b/CodeSamples/defer/rcu_qs.c index 27e45f7..6263798 100644 --- a/CodeSamples/defer/rcu_qs.c +++ b/CodeSamples/defer/rcu_qs.c @@ -27,44 +27,51 @@ #include "../api.h" #include "rcu_qs.h" -void synchronize_rcu(void) +//\begin{snippet}[labelbase=ln:defer:rcu_qs:synchronize,gobbleblank=yes,commandchars=\%\@\$] +void synchronize_rcu(void) //\lnlbl{syn:b} { int t; - + //\fcvblank /* Memory barrier ensures mutation seen before grace period. */ - smp_mb(); + smp_mb(); //\lnlbl{syn:mb1} /* Only one synchronize_rcu() at a time. */ - spin_lock(&rcu_gp_lock); + spin_lock(&rcu_gp_lock); //\lnlbl{syn:spinlock} /* Advance to a new grace-period number, enforce ordering. */ - rcu_gp_ctr += 2; - smp_mb(); + WRITE_ONCE(rcu_gp_ctr, rcu_gp_ctr + 2); //\lnlbl{syn:increasegp} + smp_mb(); //\lnlbl{syn:mb2} /* * Wait until all threads are either out of their RCU read-side * critical sections or are aware of the new grace period. */ - for_each_thread(t) { - while (rcu_gp_ongoing(t) && - ((per_thread(rcu_reader_qs_gp, t) - rcu_gp_ctr) < 0)) { + for_each_thread(t) { //\lnlbl{syn:scan:b} + while (rcu_gp_ongoing(t) && //\lnlbl{syn:ongoing} + ((per_thread(rcu_reader_qs_gp, t)//\lnlbl{syn:gp1} + - rcu_gp_ctr) < 0)) { //\lnlbl{syn:gp2} +#ifndef FCV_SNIPPET /* @@@ poll(NULL, 0, 10); */ barrier(); +#else + poll(NULL, 0, 10); //\lnlbl{syn:poll} +#endif } - } + } //\lnlbl{syn:scan:e} /* Let other synchronize_rcu() instances move ahead. */ - spin_unlock(&rcu_gp_lock); + spin_unlock(&rcu_gp_lock); //\lnlbl{syn:spinunlock} /* Ensure that any subsequent free()s happen -after- above checks. */ - smp_mb(); + smp_mb(); //\lnlbl{syn:mb3} } +//\end{snippet} #ifdef TEST #include "rcutorture.h" diff --git a/CodeSamples/defer/rcu_qs.h b/CodeSamples/defer/rcu_qs.h index 3bb45a4..26c3fb5 100644 --- a/CodeSamples/defer/rcu_qs.h +++ b/CodeSamples/defer/rcu_qs.h @@ -48,31 +48,35 @@ static void rcu_init(void) init_per_thread(rcu_reader_qs_gp, rcu_gp_ctr); } -static void rcu_read_lock(void) +//\begin{snippet}[labelbase=ln:defer:rcu_qs:read_lock_unlock,gobbleblank=yes,commandchars=\%\@\$] +static void rcu_read_lock(void) //\lnlbl{lock:b} { } - + //\fcvblank static void rcu_read_unlock(void) { -} - -static void rcu_quiescent_state(void) +} //\lnlbl{unlock:e} + //\fcvblank +static void rcu_quiescent_state(void) //\lnlbl{qs:b} { - smp_mb(); - __get_thread_var(rcu_reader_qs_gp) = ACCESS_ONCE(rcu_gp_ctr) + 1; - smp_mb(); -} - -static void rcu_thread_offline(void) + smp_mb(); //\lnlbl{qs:mb1} + __get_thread_var(rcu_reader_qs_gp) = //\lnlbl{qs:gp1} + READ_ONCE(rcu_gp_ctr) + 1; //\lnlbl{qs:gp2} + smp_mb(); //\lnlbl{qs:mb2} +} //\lnlbl{qs:e} + //\fcvblank +static void rcu_thread_offline(void) //\lnlbl{offline:b} { - smp_mb(); - __get_thread_var(rcu_reader_qs_gp) = ACCESS_ONCE(rcu_gp_ctr); - smp_mb(); -} - -static void rcu_thread_online(void) + smp_mb(); //\lnlbl{offline:mb1} + __get_thread_var(rcu_reader_qs_gp) = //\lnlbl{offline:gp1} + READ_ONCE(rcu_gp_ctr); //\lnlbl{offline:gp2} + smp_mb(); //\lnlbl{offline:mb2} +} //\lnlbl{offline:e} + //\fcvblank +static void rcu_thread_online(void) //\lnlbl{online:b} { - rcu_quiescent_state(); -} + rcu_quiescent_state(); //\lnlbl{online:qs} +} //\lnlbl{online:e} +//\end{snippet} extern void synchronize_rcu(void); diff --git a/appendix/toyrcu/toyrcu.tex b/appendix/toyrcu/toyrcu.tex index 7762925..ded754d 100644 --- a/appendix/toyrcu/toyrcu.tex +++ b/appendix/toyrcu/toyrcu.tex @@ -1675,50 +1675,19 @@ overhead. \end{listing} \begin{listing}[tbp] -{ \scriptsize -\begin{verbbox} - 1 static void rcu_read_lock(void) - 2 { - 3 } - 4 - 5 static void rcu_read_unlock(void) - 6 { - 7 } - 8 - 9 rcu_quiescent_state(void) - 10 { - 11 smp_mb(); - 12 __get_thread_var(rcu_reader_qs_gp) = - 13 ACCESS_ONCE(rcu_gp_ctr) + 1; - 14 smp_mb(); - 15 } - 16 - 17 static void rcu_thread_offline(void) - 18 { - 19 smp_mb(); - 20 __get_thread_var(rcu_reader_qs_gp) = - 21 ACCESS_ONCE(rcu_gp_ctr); - 22 smp_mb(); - 23 } - 24 - 25 static void rcu_thread_online(void) - 26 { - 27 rcu_quiescent_state(); - 28 } -\end{verbbox} -} -\centering -\theverbbox +\input{CodeSamples/defer/rcu_qs@read_lock_unlock.fcv} \caption{Quiescent-State-Based RCU Read Side} \label{lst:app:toyrcu:Quiescent-State-Based RCU Read Side} \end{listing} +\begin{lineref}[ln:defer:rcu_qs:read_lock_unlock] Listing~\ref{lst:app:toyrcu:Quiescent-State-Based RCU Read Side} (\path{rcu_qs.h}) shows the read-side primitives used to construct a user-level implementation of RCU based on quiescent states, with the data shown in Listing~\ref{lst:app:toyrcu:Data for Quiescent-State-Based RCU}. -As can be seen from lines~1-7 in the listing, the \co{rcu_read_lock()} +As can be seen from lines~\lnref{lock:b}-\lnref{unlock:e} in the listing, +the \co{rcu_read_lock()} and \co{rcu_read_unlock()} primitives do nothing, and can in fact be expected to be inlined and optimized away, as they are in server builds of the Linux kernel. @@ -1726,11 +1695,14 @@ This is due to the fact that quiescent-state-based RCU implementations \emph{approximate} the extents of RCU read-side critical sections using the aforementioned quiescent states. Each of these quiescent states contains a call to -\co{rcu_quiescent_state()}, which is shown from lines~9-15 in the listing. +\co{rcu_quiescent_state()}, which is shown from +lines~\lnref{qs:b}-\lnref{qs:e} in the listing. Threads entering extended quiescent states (for example, when blocking) -may instead call \co{rcu_thread_offline()} (lines~17-23) when entering +may instead call \co{rcu_thread_offline()} +(lines~\lnref{offline:b}-\lnref{offline:e}) when entering an extended quiescent state and then call -\co{rcu_thread_online()} (lines~25-28) when leaving it. +\co{rcu_thread_online()} +(lines~\lnref{online:b}-\lnref{online:e}) when leaving it. As such, \co{rcu_thread_online()} is analogous to \co{rcu_read_lock()} and \co{rcu_thread_offline()} is analogous to \co{rcu_read_unlock()}. In addition, \co{rcu_quiescent_state()} can be thought of as a @@ -1743,13 +1715,16 @@ In addition, \co{rcu_quiescent_state()} can be thought of as a performance optimizations.} It is illegal to invoke \co{rcu_quiescent_state()}, \co{rcu_thread_offline()}, or \co{rcu_thread_online()} from an RCU read-side critical section. +\end{lineref} -In \co{rcu_quiescent_state()}, line~11 executes a memory barrier +\begin{lineref}[ln:defer:rcu_qs:read_lock_unlock:qs] +In \co{rcu_quiescent_state()}, line~\lnref{mb1} executes a memory barrier to prevent any code prior to the quiescent state (including possible RCU read-side critical sections) from being reordered into the quiescent state. -Lines~12-13 pick up a copy of the global \co{rcu_gp_ctr}, using -\co{ACCESS_ONCE()} to ensure that the compiler does not employ any +Lines~\lnref{gp1}-\lnref{gp2} pick up +a copy of the global \co{rcu_gp_ctr}, using +\co{READ_ONCE()} to ensure that the compiler does not employ any optimizations that would result in \co{rcu_gp_ctr} being fetched more than once, and then adds one to the value fetched and stores it into @@ -1758,15 +1733,19 @@ instance of \co{synchronize_rcu()} will see an odd-numbered value, thus becoming aware that a new RCU read-side critical section has started. Instances of \co{synchronize_rcu()} that are waiting on older RCU read-side critical sections will thus know to ignore this new one. -Finally, line~14 executes a memory barrier, which prevents subsequent +Finally, line~\lnref{mb2} executes a memory barrier, which prevents subsequent code (including a possible RCU read-side critical section) from being -re-ordered with the lines~12-13. +re-ordered with the lines~\lnref{gp1}-\lnref{gp2}. +\end{lineref} \QuickQuiz{} - Doesn't the additional memory barrier shown on line~14 of + \begin{lineref}[ln:defer:rcu_qs:read_lock_unlock:qs] + Doesn't the additional memory barrier shown on line~\lnref{mb2} of Listing~\ref{lst:app:toyrcu:Quiescent-State-Based RCU Read Side} greatly increase the overhead of \co{rcu_quiescent_state}? + \end{lineref} \QuickQuizAnswer{ + \begin{lineref}[ln:defer:rcu_qs:read_lock_unlock:qs] Indeed it does! An application using this implementation of RCU should therefore invoke \co{rcu_quiescent_state} sparingly, instead using @@ -1774,9 +1753,11 @@ re-ordered with the lines~12-13. time. However, this memory barrier is absolutely required so that - other threads will see the store on lines~12-13 before any + other threads will see the store on + lines~\lnref{gp1}-\lnref{gp2} before any subsequent RCU read-side critical sections executed by the caller. + \end{lineref} } \QuickQuizEnd Some applications might use RCU only occasionally, but use it very heavily @@ -1796,17 +1777,22 @@ Any concurrent instances of \co{synchronize_rcu()} will thus know to ignore this thread. \QuickQuiz{} - Why are the two memory barriers on lines~19 and 22 of + \begin{lineref}[ln:defer:rcu_qs:read_lock_unlock:qs] + Why are the two memory barriers on lines~\lnref{mb1} and \lnref{mb2} of Listing~\ref{lst:app:toyrcu:Quiescent-State-Based RCU Read Side} needed? + \end{lineref} \QuickQuizAnswer{ - The memory barrier on line~19 prevents any RCU read-side + \begin{lineref}[ln:defer:rcu_qs:read_lock_unlock:qs] + The memory barrier on line~\lnref{mb1} prevents any RCU read-side critical sections that might precede the call to \co{rcu_thread_offline()} won't be reordered by either - the compiler or the CPU to follow the assignment on lines~20-21. - The memory barrier on line~22 is, strictly speaking, unnecessary, + the compiler or the CPU to follow the assignment on + lines~\lnref{gp1}-\lnref{gp2}. + The memory barrier on line~\lnref{mb2} is, strictly speaking, unnecessary, as it is illegal to have any RCU read-side critical sections following the call to \co{rcu_thread_offline()}. + \end{lineref} } \QuickQuizEnd The \co{rcu_thread_online()} function simply invokes @@ -1814,30 +1800,7 @@ The \co{rcu_thread_online()} function simply invokes quiescent state. \begin{listing}[tbp] -{ \scriptsize -\begin{verbbox} - 1 void synchronize_rcu(void) - 2 { - 3 int t; - 4 - 5 smp_mb(); - 6 spin_lock(&rcu_gp_lock); - 7 rcu_gp_ctr += 2; - 8 smp_mb(); - 9 for_each_thread(t) { - 10 while (rcu_gp_ongoing(t) && - 11 ((per_thread(rcu_reader_qs_gp, t) - - 12 rcu_gp_ctr) < 0)) { - 13 poll(NULL, 0, 10); - 14 } - 15 } - 16 spin_unlock(&rcu_gp_lock); - 17 smp_mb(); - 18 } -\end{verbbox} -} -\centering -\theverbbox +\input{CodeSamples/defer/rcu_qs@xxxxxxxxxxxxxxx} \caption{RCU Update Side Using Quiescent States} \label{lst:app:toyrcu:RCU Update Side Using Quiescent States} \end{listing} -- 2.7.4