> > +	select ARCH_WEAK_RELEASE_ACQUIRE if ARCH_USE_QUEUED_SPINLOCKS
> 
> Why do we need this?  Also, we presumably would prefer not to have it
> when we end up using ticket spinlocks when combo spinlocks is selected.
> Is there no way to avoid it?

Probably not what you had in mind, but we should be able to drop the
full barriers in the ticket-lock implementation, deferring/confining
them to RCU code; this way no separate treatment would be needed for
either lock:

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index c9ff8081efc1a..d37afe3bb20cb 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -79,7 +79,7 @@ config RISCV
 	select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
 	select ARCH_WANTS_NO_INSTR
 	select ARCH_WANTS_THP_SWAP if HAVE_ARCH_TRANSPARENT_HUGEPAGE
-	select ARCH_WEAK_RELEASE_ACQUIRE if ARCH_USE_QUEUED_SPINLOCKS
+	select ARCH_WEAK_RELEASE_ACQUIRE
 	select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU
 	select BUILDTIME_TABLE_SORT if MMU
 	select CLINT_TIMER if RISCV_M_MODE
diff --git a/include/asm-generic/ticket_spinlock.h b/include/asm-generic/ticket_spinlock.h
index 325779970d8a0..47522640e5e39 100644
--- a/include/asm-generic/ticket_spinlock.h
+++ b/include/asm-generic/ticket_spinlock.h
@@ -13,10 +13,8 @@
  * about this. If your architecture cannot do this you might be better off with
  * a test-and-set.
  *
- * It further assumes atomic_*_release() + atomic_*_acquire() is RCpc and hence
- * uses atomic_fetch_add() which is RCsc to create an RCsc hot path, along with
- * a full fence after the spin to upgrade the otherwise-RCpc
- * atomic_cond_read_acquire().
+ * It further assumes atomic_*_release() + atomic_*_acquire() is RCtso, where
+ * regular code only expects atomic_t to be RCpc.
  *
  * The implementation uses smp_cond_load_acquire() to spin, so if the
  * architecture has WFE like instructions to sleep instead of poll for word
@@ -32,22 +30,13 @@
 
 static __always_inline void ticket_spin_lock(arch_spinlock_t *lock)
 {
-	u32 val = atomic_fetch_add(1<<16, &lock->val);
+	u32 val = atomic_fetch_add_acquire(1<<16, &lock->val);
 	u16 ticket = val >> 16;
 
 	if (ticket == (u16)val)
 		return;
 
-	/*
-	 * atomic_cond_read_acquire() is RCpc, but rather than defining a
-	 * custom cond_read_rcsc() here we just emit a full fence. We only
-	 * need the prior reads before subsequent writes ordering from
-	 * smb_mb(), but as atomic_cond_read_acquire() just emits reads and we
-	 * have no outstanding writes due to the atomic_fetch_add() the extra
-	 * orderings are free.
-	 */
 	atomic_cond_read_acquire(&lock->val, ticket == (u16)VAL);
-	smp_mb();
 }
 
 static __always_inline bool ticket_spin_trylock(arch_spinlock_t *lock)
@@ -57,7 +46,7 @@ static __always_inline bool ticket_spin_trylock(arch_spinlock_t *lock)
 	if ((old >> 16) != (old & 0xffff))
 		return false;
 
-	return atomic_try_cmpxchg(&lock->val, &old, old + (1<<16)); /* SC, for RCsc */
+	return atomic_try_cmpxchg_acquire(&lock->val, &old, old + (1<<16));
 }
 
 static __always_inline void ticket_spin_unlock(arch_spinlock_t *lock)

https://lore.kernel.org/lkml/ZlnyKclZOQdrJTtU@andrea/ provides
additional context.  But that's presumably enough ;-)

How do the above changes look in your tests?  Other suggestions?

  Andrea
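
P.S.  For reference, "deferring/confining them to RCU code" above means
relying on the smp_mb__after_unlock_lock() hook that is keyed off this
Kconfig option; a minimal sketch of the idea (modulo version drift,
cf. kernel/rcu/tree.h):

  #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE
  #define smp_mb__after_unlock_lock()	smp_mb()  /* Full ordering for lock. */
  #else /* #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */
  #define smp_mb__after_unlock_lock()	do { } while (0)
  #endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */

IOW, with ARCH_WEAK_RELEASE_ACQUIRE selected unconditionally, RCU emits
the full barrier after its own lock acquisitions where an UNLOCK+LOCK
pair must act as smp_mb(), so neither the ticket lock nor the queued
lock has to pay for it on every acquisition.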