[PATCH bpf-next v2 17/26] rqspinlock: Hardcode cond_acquire loops to asm-generic implementation

Kumar Kartikeya Dwivedi <memxor@xxxxxxxxx> · Thu, 6 Feb 2025 02:54:25 -0800

Currently, for rqspinlock usage, the implementations of
smp_cond_load_acquire (and thus, atomic_cond_read_acquire) are
susceptible to stalls on arm64, because they do not guarantee that the
conditional expression will be repeatedly evaluated if the address being
loaded from is not written to by other CPUs. When support for
event-streams (which unblock stuck WFE-based loops every ~100us) is
absent, we may end up being stuck forever.

This causes a problem for us, as we need to repeatedly invoke
RES_CHECK_TIMEOUT in the spin loop to break out when the timeout
expires.

Hardcode the implementation to the asm-generic version in rqspinlock.c
until support for smp_cond_load_acquire_timewait [0] lands upstream.

  [0]: https://lore.kernel.org/lkml/20250203214911.898276-1-ankur.a.arora@xxxxxxxxxx

Cc: Ankur Arora <ankur.a.arora@xxxxxxxxxx>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@xxxxxxxxx>
---
 kernel/locking/rqspinlock.c | 41 ++++++++++++++++++++++++++++++++++---
 1 file changed, 38 insertions(+), 3 deletions(-)

diff --git a/kernel/locking/rqspinlock.c b/kernel/locking/rqspinlock.c
index 49b4f3c75a3e..b4cceeecf29c 100644
--- a/kernel/locking/rqspinlock.c
+++ b/kernel/locking/rqspinlock.c
@@ -325,6 +325,41 @@ int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock, u64 timeout)
  */
 static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[_Q_MAX_NODES]);
 
+/*
+ * Hardcode smp_cond_load_acquire and atomic_cond_read_acquire to copies of the
+ * asm-generic loop: READ_ONCE() into VAL, test cond_expr (which may use VAL),
+ * cpu_relax(), repeat. In rqspinlock code, our conditional expression involves
+ * checking the value _and_ additionally a timeout. However, on arm64, the
+ * WFE-based implementation may never re-evaluate it if no stores occur to the
+ * lock word being waited on. As such, we may be stuck forever if event-stream
+ * based unblocking is not available on the platform for WFE spin loops
+ * (arch_timer_evtstrm_available).
+ *
+ * Drop this workaround once smp_cond_load_acquire_timewait [0] lands.
+ *
+ * [0]: https://lore.kernel.org/lkml/20250203214911.898276-1-ankur.a.arora@xxxxxxxxxx
+ */
+#define res_smp_cond_load_relaxed(ptr, cond_expr) ({		\
+	typeof(ptr) __PTR = (ptr);				\
+	__unqual_scalar_typeof(*ptr) VAL;			\
+	for (;;) {						\
+		VAL = READ_ONCE(*__PTR);			\
+		if (cond_expr)					\
+			break;					\
+		cpu_relax();					\
+	}							\
+	(typeof(*ptr))VAL;					\
+})
+
+#define res_smp_cond_load_acquire(ptr, cond_expr) ({		\
+	__unqual_scalar_typeof(*ptr) _val;			\
+	_val = res_smp_cond_load_relaxed(ptr, cond_expr);	\
+	smp_acquire__after_ctrl_dep();				\
+	(typeof(*ptr))_val;					\
+})
+
+#define res_atomic_cond_read_acquire(v, c) res_smp_cond_load_acquire(&(v)->counter, (c))
+
 /**
  * resilient_queued_spin_lock_slowpath - acquire the queued spinlock
  * @lock: Pointer to queued spinlock structure
@@ -419,7 +454,7 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val,
 	 */
 	if (val & _Q_LOCKED_MASK) {
 		RES_RESET_TIMEOUT(ts);
-		smp_cond_load_acquire(&lock->locked, !VAL || RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_MASK));
+		res_smp_cond_load_acquire(&lock->locked, !VAL || RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_MASK));
 	}
 
 	if (ret) {
@@ -568,8 +603,8 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val,
 	 * does not imply a full barrier.
 	 */
 	RES_RESET_TIMEOUT(ts);
-	val = atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) ||
-				       RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK));
+	val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) ||
+					   RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK));
 
 waitq_timeout:
 	if (ret) {
-- 
2.43.5