On Sat, Jan 18, 2025 at 8:22 AM Kumar Kartikeya Dwivedi <memxor@xxxxxxxxx> wrote: > > Implement queued spin lock algorithm as BPF program for lock words > living in BPF arena. > > The algorithm is copied from kernel/locking/qspinlock.c and adapted for > BPF use. > > We first implement abstract helpers for portable atomics and > acquire/release load instructions, by relying on X86_64 presence to > elide expensive barriers and rely on implementation details of the JIT, > and fall back to slow but correct implementations elsewhere. When > support for acquire/release load/stores lands, we can improve this > state. > > Then, the qspinlock algorithm is adapted to remove dependence on > multi-word atomics due to lack of support in BPF ISA. For instance, > xchg_tail cannot use 16-bit xchg, and needs to be a implemented as a > 32-bit try_cmpxchg loop. > > Loops which are seemingly infinite from verifier PoV are annotated with > cond_break. > > No feedback is given when loops containing cond_break break due to > stalling, the arena will basically be corrupt if a deadlock is > triggered. This can be changed in the future with a better cancellation > primitive for stuck programs, or integrating resilient spin lock > support. > > Only 1024 NR_CPUs are supported. > > Signed-off-by: Kumar Kartikeya Dwivedi <memxor@xxxxxxxxx> > --- > .../selftests/bpf/bpf_arena_qspinlock.h | 441 ++++++++++++++++++ > tools/testing/selftests/bpf/bpf_atomic.h | 121 +++++ > 2 files changed, 562 insertions(+) > create mode 100644 tools/testing/selftests/bpf/bpf_arena_qspinlock.h > create mode 100644 tools/testing/selftests/bpf/bpf_atomic.h > > diff --git a/tools/testing/selftests/bpf/bpf_arena_qspinlock.h b/tools/testing/selftests/bpf/bpf_arena_qspinlock.h > new file mode 100644 > index 000000000000..cf8c5b1eced9 > --- /dev/null > +++ b/tools/testing/selftests/bpf/bpf_arena_qspinlock.h > @@ -0,0 +1,441 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ > +#ifndef BPF_ARENA_QSPINLOCK_H > + > +#include <vmlinux.h> > +#include <bpf/bpf_helpers.h> > +#include "bpf_atomic.h" > + > +#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST) > + > +#ifndef __arena > +#define __arena __attribute__((address_space(1))) > +#endif > + > +extern unsigned long CONFIG_NR_CPUS __kconfig; > + > +struct arena_mcs_spinlock { > + struct arena_mcs_spinlock __arena *next; > + int locked; > + int count; > +}; > + > +struct arena_qnode { > + struct arena_mcs_spinlock mcs; > +}; > + > +#define _Q_MAX_NODES 4 > +#define _Q_PENDING_LOOPS 1 > + > +/* > + * Bitfields in the atomic value: > + * > + * 0- 7: locked byte > + * 8: pending > + * 9-15: not used > + * 16-17: tail index > + * 18-31: tail cpu (+1) > + */ > +#define _Q_MAX_CPUS 1024 > + > +#define _Q_SET_MASK(type) (((1U << _Q_ ## type ## _BITS) - 1)\ > + << _Q_ ## type ## _OFFSET) > +#define _Q_LOCKED_OFFSET 0 > +#define _Q_LOCKED_BITS 8 > +#define _Q_LOCKED_MASK _Q_SET_MASK(LOCKED) > + > +#define _Q_PENDING_OFFSET (_Q_LOCKED_OFFSET + _Q_LOCKED_BITS) > +#define _Q_PENDING_BITS 8 > +#define _Q_PENDING_MASK _Q_SET_MASK(PENDING) > + > +#define _Q_TAIL_IDX_OFFSET (_Q_PENDING_OFFSET + _Q_PENDING_BITS) > +#define _Q_TAIL_IDX_BITS 2 > +#define _Q_TAIL_IDX_MASK _Q_SET_MASK(TAIL_IDX) > + > +#define _Q_TAIL_CPU_OFFSET (_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS) > +#define _Q_TAIL_CPU_BITS (32 - _Q_TAIL_CPU_OFFSET) > +#define _Q_TAIL_CPU_MASK _Q_SET_MASK(TAIL_CPU) > + > +#define _Q_TAIL_OFFSET _Q_TAIL_IDX_OFFSET > +#define _Q_TAIL_MASK (_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK) > + > +#define _Q_LOCKED_VAL (1U << _Q_LOCKED_OFFSET) > +#define _Q_PENDING_VAL (1U << _Q_PENDING_OFFSET) > + > +#define __pure __attribute__((pure)) > +#define likely(x) __builtin_expect(!!(x), 1) > +#define unlikely(x) __builtin_expect(!!(x), 0) > + > +static struct arena_qnode __arena qnodes[_Q_MAX_CPUS][_Q_MAX_NODES]; > + > +static inline __pure u32 encode_tail(int cpu, int idx) Pls remove __pure. The compiler is smart enough without this aid. > +{ > + u32 tail; > + > + tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; > + tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ > + > + return tail; > +} > + > +static inline __pure struct arena_mcs_spinlock __arena * > +decode_tail(u32 tail, struct arena_qnode (__arena *qnodes)[_Q_MAX_CPUS][_Q_MAX_NODES]) > +{ > + int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; > + int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; > + struct arena_qnode __arena (*qnode)[_Q_MAX_NODES] = qnodes[cpu]; > + > + return &qnode[idx]->mcs; > +} > + > +static inline __pure > +struct arena_mcs_spinlock __arena *grab_mcs_node(struct arena_mcs_spinlock __arena *base, int idx) > +{ > + return &((struct arena_qnode __arena *)base + idx)->mcs; > +} > + > +#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) > + > +/** > + * xchg_tail - Put in the new queue tail code word & retrieve previous one > + * @lock : Pointer to queued spinlock structure > + * @tail : The new queue tail code word > + * Return: The previous queue tail code word > + * > + * xchg(lock, tail) > + * > + * p,*,* -> n,*,* ; prev = xchg(lock, node) > + */ > +static __always_inline u32 xchg_tail(struct qspinlock __arena *lock, u32 tail) > +{ > + u32 old, new; > + > + old = atomic_read(&lock->val); > + do { > + new = (old & _Q_LOCKED_PENDING_MASK) | tail; > + /* > + * We can use relaxed semantics since the caller ensures that > + * the MCS node is properly initialized before updating the > + * tail. 
> + */ > + cond_break; > + } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); > + > + return old; > +} > + > +/** > + * clear_pending - clear the pending bit. > + * @lock: Pointer to queued spinlock structure > + * > + * *,1,* -> *,0,* > + */ > +static __always_inline void clear_pending(struct qspinlock __arena *lock) > +{ > + WRITE_ONCE(lock->pending, 0); > +} > + > +/** > + * clear_pending_set_locked - take ownership and clear the pending bit. > + * @lock: Pointer to queued spinlock structure > + * > + * *,1,0 -> *,0,1 > + * > + * Lock stealing is not allowed if this function is used. > + */ > +static __always_inline void clear_pending_set_locked(struct qspinlock __arena *lock) > +{ > + WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); > +} > + > +/** > + * set_locked - Set the lock bit and own the lock > + * @lock: Pointer to queued spinlock structure > + * > + * *,*,0 -> *,0,1 > + */ > +static __always_inline void set_locked(struct qspinlock __arena *lock) > +{ > + WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); > +} > + > +static __always_inline > +u32 queued_fetch_set_pending_acquire(struct qspinlock __arena *lock) > +{ > + u32 old, new; > + > + old = atomic_read(&lock->val); > + do { > + new = old | _Q_PENDING_VAL; > + cond_break; > + } while (!atomic_try_cmpxchg_acquire(&lock->val, &old, new)); > + > + return old; > +} > + > +/** > + * queued_spin_trylock - try to acquire the queued spinlock > + * @lock : Pointer to queued spinlock structure > + * Return: 1 if lock acquired, 0 if failed > + */ > +static __always_inline int queued_spin_trylock(struct qspinlock __arena *lock) > +{ > + int val = atomic_read(&lock->val); > + > + if (unlikely(val)) > + return 0; > + > + return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)); > +} > + > +static void queued_spin_lock_slowpath(struct qspinlock __arena *lock, u32 val); > + > +#define EOPNOTSUPP 95 > + > +/** > + * queued_spin_lock - acquire a queued spinlock > + * @lock: Pointer to queued spinlock structure > + */ > +static __always_inline int queued_spin_lock(struct qspinlock __arena *lock) > +{ > + int val = 0; > + > + if (CONFIG_NR_CPUS > 1024) > + return -EOPNOTSUPP; > + > + if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) > + return 0; > + > + queued_spin_lock_slowpath(lock, val); > + return 0; Since bpf prog has to do: if (queued_spin_lock(&lock))... Let's do return queued_spin_lock_slowpath(...); right away. > +} > + > +/** > + * queued_spin_unlock - release a queued spinlock > + * @lock : Pointer to queued spinlock structure > + */ > +static __always_inline void queued_spin_unlock(struct qspinlock __arena *lock) > +{ > + /* > + * unlock() needs release semantics: > + */ > + smp_store_release(&lock->locked, 0); > +} > + > +static void queued_spin_lock_slowpath(struct qspinlock __arena *lock, u32 val) > +{ > + struct arena_mcs_spinlock __arena *prev, *next, *node0, *node; > + u32 old, tail; > + int idx; > + > + /* > + * Wait for in-progress pending->locked hand-overs with a bounded > + * number of spins so that we guarantee forward progress. > + * > + * 0,1,0 -> 0,0,1 > + */ > + if (val == _Q_PENDING_VAL) { > + int cnt = _Q_PENDING_LOOPS; > + val = atomic_cond_read_relaxed(&lock->val, > + (VAL != _Q_PENDING_VAL) || !cnt--); > + } > + > + /* > + * If we observe any contention; queue. 
> + */ > + if (val & ~_Q_LOCKED_MASK) > + goto queue; > + > + /* > + * trylock || pending > + * > + * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock > + */ > + val = queued_fetch_set_pending_acquire(lock); > + > + /* > + * If we observe contention, there is a concurrent locker. > + * > + * Undo and queue; our setting of PENDING might have made the > + * n,0,0 -> 0,0,0 transition fail and it will now be waiting > + * on @next to become !NULL. > + */ > + if (unlikely(val & ~_Q_LOCKED_MASK)) { > + > + /* Undo PENDING if we set it. */ > + if (!(val & _Q_PENDING_MASK)) > + clear_pending(lock); > + > + goto queue; > + } > + > + /* > + * We're pending, wait for the owner to go away. > + * > + * 0,1,1 -> *,1,0 > + * > + * this wait loop must be a load-acquire such that we match the > + * store-release that clears the locked bit and create lock > + * sequentiality; this is because not all > + * clear_pending_set_locked() implementations imply full > + * barriers. > + */ > + if (val & _Q_LOCKED_MASK) > + smp_cond_load_acquire(&lock->locked, !VAL); > + > + /* > + * take ownership and clear the pending bit. > + * > + * 0,1,0 -> 0,0,1 > + */ > + clear_pending_set_locked(lock); > + return; > + > + /* > + * End of pending bit optimistic spinning and beginning of MCS > + * queuing. > + */ > +queue: > + node0 = &(qnodes[bpf_get_smp_processor_id()])[0].mcs; > + idx = node0->count++; > + tail = encode_tail(bpf_get_smp_processor_id(), idx); > + > + /* > + * 4 nodes are allocated based on the assumption that there will > + * not be nested NMIs taking spinlocks. That may not be true in > + * some architectures even though the chance of needing more than > + * 4 nodes will still be extremely unlikely. When that happens, > + * we fall back to spinning on the lock directly without using > + * any MCS node. This is not the most elegant solution, but is > + * simple enough. > + */ > + if (unlikely(idx >= _Q_MAX_NODES)) { > + while (!queued_spin_trylock(lock)) { > + cpu_relax(); > + cond_break; > + } > + goto release; > + } > + > + node = grab_mcs_node(node0, idx); > + > + /* > + * Ensure that we increment the head node->count before initialising > + * the actual node. If the compiler is kind enough to reorder these > + * stores, then an IRQ could overwrite our assignments. > + */ > + barrier(); > + > + node->locked = 0; > + node->next = NULL; > + > + /* > + * We touched a (possibly) cold cacheline in the per-cpu queue node; > + * attempt the trylock once more in the hope someone let go while we > + * weren't watching. > + */ > + if (queued_spin_trylock(lock)) > + goto release; > + > + /* > + * Ensure that the initialisation of @node is complete before we > + * publish the updated tail via xchg_tail() and potentially link > + * @node into the waitqueue via WRITE_ONCE(prev->next, node) below. > + */ > + smp_wmb(); > + > + /* > + * Publish the updated tail. > + * We have already touched the queueing cacheline; don't bother with > + * pending stuff. > + * > + * p,*,* -> n,*,* > + */ > + old = xchg_tail(lock, tail); > + next = NULL; > + > + /* > + * if there was a previous node; link it and wait until reaching the > + * head of the waitqueue. > + */ > + if (old & _Q_TAIL_MASK) { > + prev = decode_tail(old, &qnodes); > + > + /* Link @node into the waitqueue. */ > + WRITE_ONCE(prev->next, node); > + > + arch_mcs_spin_lock_contended(&node->locked); > + > + /* > + * While waiting for the MCS lock, the next pointer may have > + * been set by another lock waiter. 
We cannot prefetch here > + * due to lack of equivalent instruction in BPF ISA. > + */ > + next = READ_ONCE(node->next); > + } > + > + /* > + * we're at the head of the waitqueue, wait for the owner & pending to > + * go away. > + * > + * *,x,y -> *,0,0 > + * > + * this wait loop must use a load-acquire such that we match the > + * store-release that clears the locked bit and create lock > + * sequentiality; this is because the set_locked() function below > + * does not imply a full barrier. > + */ > + val = atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK)); cond_break in xchg loops is not a concern, but this one can spin for long time. Have you tried asm ("may_goto out"); out: return -ETIMEDOUT; as suggested earlier? I think it should work already and will give a clear signal instead of silent corruption. > + > + /* > + * claim the lock: > + * > + * n,0,0 -> 0,0,1 : lock, uncontended > + * *,*,0 -> *,*,1 : lock, contended > + * > + * If the queue head is the only one in the queue (lock value == tail) > + * and nobody is pending, clear the tail code and grab the lock. > + * Otherwise, we only need to grab the lock. > + */ > + > + /* > + * In the PV case we might already have _Q_LOCKED_VAL set, because > + * of lock stealing; therefore we must also allow: > + * > + * n,0,1 -> 0,0,1 > + * > + * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the > + * above wait condition, therefore any concurrent setting of > + * PENDING will make the uncontended transition fail. > + */ > + if ((val & _Q_TAIL_MASK) == tail) { > + if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL)) > + goto release; /* No contention */ > + } > + > + /* > + * Either somebody is queued behind us or _Q_PENDING_VAL got set > + * which will then detect the remaining tail and queue behind us > + * ensuring we'll see a @next. > + */ > + set_locked(lock); > + > + /* > + * contended path; wait for next if not observed yet, release. > + */ > + if (!next) > + next = smp_cond_load_relaxed(&node->next, (VAL)); same here. Doing may_goto out here is necessary. Otherwise it's too much of a footgun. > + arch_mcs_spin_unlock_contended(&next->locked); > + > +release:; > + /* > + * release the node > + */ > + /* TODO(kkd): Is replacing __this_cpu_dec with this ok? */ > + node0->count--; > +} > + > +#endif > + > +#endif > diff --git a/tools/testing/selftests/bpf/bpf_atomic.h b/tools/testing/selftests/bpf/bpf_atomic.h > new file mode 100644 > index 000000000000..d9a8b9cd27b4 > --- /dev/null > +++ b/tools/testing/selftests/bpf/bpf_atomic.h > @@ -0,0 +1,121 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ > + > +#include <vmlinux.h> > +#include <bpf/bpf_helpers.h> > +#include "bpf_experimental.h" > + > +extern bool CONFIG_X86_64 __kconfig __weak; > + > +#define __scalar_type_to_expr_cases(type) \ > + unsigned type : (unsigned type)0, signed type : (signed type)0 > + > +#define __unqual_typeof(x) \ > + typeof(_Generic((x), \ > + char: (char)0, \ > + __scalar_type_to_expr_cases(char), \ > + __scalar_type_to_expr_cases(short), \ > + __scalar_type_to_expr_cases(int), \ > + __scalar_type_to_expr_cases(long), \ > + __scalar_type_to_expr_cases(long long), \ > + default: (void *)0)) I doubt all this magic makes any difference for generated code. Could you try without it? 
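For the record: as far as I can tell this _Generic dance only strips
qualifiers (the kernel needs that in __unqual_scalar_typeof() so a
READ_ONCE() of a const/volatile scalar can land in a plain local), and
none of the lock or qnode fields here are qualified to begin with.
Standalone illustration of what the macro does, using the
__unqual_typeof() definition from this hunk (not something to add to
the patch, just to show the effect with clang):

static void unqual_typeof_demo(void)
{
	const volatile int x = 1;

	typeof(x) a = x;		/* a is const volatile int */
	__unqual_typeof(x) b = x;	/* b is plain int */

	b = 2;				/* fine */
	/* a = 2; */			/* would not compile: a is const */
	(void)a;
}

If dropping it doesn't change the generated code, plain typeof() would
be easier to read.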
> +#define cpu_relax() ({}) > + > +#define READ_ONCE(x) (*(volatile typeof(x) *)&(x)) > + > +#define WRITE_ONCE(x, val) ((*(volatile typeof(x) *)&(x)) = (val)) > + > +#define cmpxchg(p, old, new) __sync_val_compare_and_swap((p), old, new) > + > +#define try_cmpxchg(p, pold, new) \ > + ({ \ > + __unqual_typeof(*(p)) __old = cmpxchg(p, *(pold), new); \ > + *(pold) = __old; \ > + *(pold) == __old; \ > + }) This doesn't look right. It will always succeed ? Incorrect copy paste of __raw_try_cmpxchg ? Patch 2 test probably needs to be stronger. pw-bot: cr > + > +#define try_cmpxchg_relaxed(p, pold, new) try_cmpxchg(p, pold, new) > + > +#define try_cmpxchg_acquire(p, pold, new) try_cmpxchg(p, pold, new) > + > +#define smp_mb() \ > + ({ \ > + unsigned long __val; \ > + __sync_fetch_and_add(&__val, 0); \ > + }) > + > +#define smp_rmb() \ > + ({ \ > + if (!CONFIG_X86_64) \ > + smp_mb(); \ > + else \ > + barrier(); \ > + }) > + > +#define smp_wmb() \ > + ({ \ > + if (!CONFIG_X86_64) \ > + smp_mb(); \ > + else \ > + barrier(); \ > + }) > + > +/* Control dependency provides LOAD->STORE, provide LOAD->LOAD */ > +#define smp_acquire__after_ctrl_dep() ({ smp_rmb(); }) > + > +#define smp_load_acquire(p) \ > + ({ \ > + __unqual_typeof(*(p)) __v = READ_ONCE(*(p)); \ > + if (!CONFIG_X86_64) \ > + smp_mb(); \ > + barrier(); \ > + __v; \ > + }) > + > +#define smp_store_release(p, val) \ > + ({ \ > + if (!CONFIG_X86_64) \ > + smp_mb(); \ > + barrier(); \ > + WRITE_ONCE(*(p), val); \ > + }) > + > +#define smp_cond_load_relaxed(p, cond_expr) \ > + ({ \ > + typeof(p) __ptr = (p); \ > + __unqual_typeof(*(p)) VAL; \ > + for (;;) { \ > + VAL = (__unqual_typeof(*(p)))READ_ONCE(*__ptr); \ > + if (cond_expr) \ > + break; \ > + cpu_relax(); \ > + cond_break; \ > + } \ > + (typeof(*(p)))VAL; \ > + }) > + > +#define smp_cond_load_acquire(p, cond_expr) \ > + ({ \ > + __unqual_typeof(*p) \ > + __val = smp_cond_load_relaxed(p, cond_expr); \ > + smp_acquire__after_ctrl_dep(); \ > + (typeof(*(p)))__val; \ > + }) > + > +#define atomic_read(p) READ_ONCE((p)->counter) > + > +#define atomic_cond_read_relaxed(p, cond_expr) \ > + smp_cond_load_relaxed(&(p)->counter, cond_expr) > + > +#define atomic_cond_read_acquire(p, cond_expr) \ > + smp_cond_load_acquire(&(p)->counter, cond_expr) > + > +#define atomic_try_cmpxchg_relaxed(p, pold, new) \ > + try_cmpxchg_relaxed(&(p)->counter, pold, new) > + > +#define atomic_try_cmpxchg_acquire(p, pold, new) \ > + try_cmpxchg_acquire(&(p)->counter, pold, new) > + > +#define arch_mcs_spin_lock_contended(l) smp_cond_load_acquire(l, VAL) > +#define arch_mcs_spin_unlock_contended(l) smp_store_release((l), 1) > -- > 2.43.5 >
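
To spell out the try_cmpxchg() comment above: *(pold) is overwritten
with the cmpxchg() result before the comparison, so the macro always
returns true and the xchg_tail() / queued_fetch_set_pending_acquire()
loops stop after one iteration even when the cmpxchg lost the race.
Roughly what I'd expect instead, modeled on the kernel's
__raw_try_cmpxchg() (untested sketch):

#define try_cmpxchg(p, pold, new)					\
	({								\
		__unqual_typeof(*(p)) __exp = *(pold);			\
		__unqual_typeof(*(p)) __old = cmpxchg(p, __exp, new);	\
		bool __ret = (__old == __exp);				\
		if (!__ret)						\
			*(pold) = __old;				\
		__ret;							\
	})

i.e. return whether the swap happened and update *pold with the
observed value only on failure, matching the atomic_try_cmpxchg()
semantics the callers expect.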
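
Also, to make the may_goto suggestion concrete, something along these
lines is what I have in mind (untested sketch, reusing the may_goto asm
that cond_break is already built on when __BPF_FEATURE_MAY_GOTO is
available; cond_break_label() doesn't exist yet, it's just a name I
made up here):

#define cond_break_label(label)				\
	({ __label__ l_break, l_continue;		\
	asm volatile goto("may_goto %l[l_break]"	\
		:::: l_break);				\
	goto l_continue;				\
	l_break: goto label;				\
	l_continue:;					\
	})

Then queued_spin_lock_slowpath() returns int and the flagged wait loops
become something like:

	for (;;) {
		val = atomic_read(&lock->val);
		if (!(val & _Q_LOCKED_PENDING_MASK))
			break;
		cpu_relax();
		cond_break_label(out);
	}
	smp_acquire__after_ctrl_dep();
	...
out:
	return -ETIMEDOUT;

with queued_spin_lock() propagating the error, so a stuck waiter gets a
clear -ETIMEDOUT instead of silently corrupting the queue.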