On Tue, Nov 24, 2020 at 04:00:14PM +0100, Arnd Bergmann wrote:
> On Tue, Nov 24, 2020 at 3:39 PM Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
> > On Tue, Nov 24, 2020 at 01:43:54PM +0000, guoren@xxxxxxxxxx wrote:
> > > diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
> > >
> > > +	if (align) {						\
> > > +	__asm__ __volatile__ (					\
> > > +		"0: lr.w %0, 0(%z4)\n"				\
> > > +		" move %1, %0\n"				\
> > > +		" slli %1, %1, 16\n"				\
> > > +		" srli %1, %1, 16\n"				\
> > > +		" move %2, %z3\n"				\
> > > +		" slli %2, %2, 16\n"				\
> > > +		" or %1, %2, %1\n"				\
> > > +		" sc.w %2, %1, 0(%z4)\n"			\
> > > +		" bnez %2, 0b\n"				\
> > > +		" srli %0, %0, 16\n"				\
> > > +		: "=&r" (__ret), "=&r" (tmp), "=&r" (__rc)	\
> > > +		: "rJ" (__new), "rJ"(addr)			\
> > > +		: "memory");					\
> >
> > I'm pretty sure there's a handful of implementations like this out
> > there... if only we could share.
>
> Isn't this effectively the same as the "_Q_PENDING_BITS != 8"
> version of xchg_tail()?

Pretty much I suppose.

> If nothing else needs xchg() on a 16-bit value, maybe
> changing the #ifdef in the qspinlock code is enough.

Like the below? I think the native LL/SC variant is slightly better
than the cmpxchg loop. The problem is that cmpxchg() is itself an ll/sc
loop and then we loop around that, instead of only having the one ll/sc
loop.

> Only around half the architectures actually implement 8-bit
> and 16-bit cmpxchg() and xchg(), it might even be worth trying
> to eventually change the interface to not do it at all, but
> instead have explicit cmpxchg8() and cmpxchg16() helpers
> for the few files that do use them.

Yeah, many RISCs don't have sub-word atomics. Not sure how many other
users there are. qspinlock certainly is the most popular I suppose.

---
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index cbff6ba53d56..7049fb2af0b2 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -163,26 +163,6 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
 	WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
 }
 
-/*
- * xchg_tail - Put in the new queue tail code word & retrieve previous one
- * @lock : Pointer to queued spinlock structure
- * @tail : The new queue tail code word
- * Return: The previous queue tail code word
- *
- * xchg(lock, tail), which heads an address dependency
- *
- * p,*,* -> n,*,* ; prev = xchg(lock, node)
- */
-static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
-{
-	/*
-	 * We can use relaxed semantics since the caller ensures that the
-	 * MCS node is properly initialized before updating the tail.
-	 */
-	return (u32)xchg_relaxed(&lock->tail,
-				 tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
-}
-
 #else /* _Q_PENDING_BITS == 8 */
 
 /**
@@ -207,6 +187,32 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
 	atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
 }
 
+#endif /* _Q_PENDING_BITS == 8 */
+
+#if _Q_PENDING_BITS == 8 && ARCH_HAS_XCHG16
+
+/*
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail), which heads an address dependency
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+	/*
+	 * We can use relaxed semantics since the caller ensures that the
+	 * MCS node is properly initialized before updating the tail.
+	 */
+	return (u32)xchg_relaxed(&lock->tail,
+				 tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+}
+
+#else /* !(_Q_PENDING_BITS == 8 && ARCH_HAS_XCHG16) */
+
 /**
  * xchg_tail - Put in the new queue tail code word & retrieve previous one
  * @lock : Pointer to queued spinlock structure
@@ -236,7 +242,8 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
 	}
 	return old;
 }
-#endif /* _Q_PENDING_BITS == 8 */
+
+#endif /* _Q_PENDING_BITS == 8 && ARCH_HAS_XCHG16 */
 
 /**
  * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending
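
For completeness, the cmpxchg-based emulation that an architecture
without a native 16-bit xchg ends up with looks roughly like the sketch
below (the helper name and the explicit shift handling are made up here
for illustration, they are not from the patch):

static inline u16 xchg16_emulated(u32 *p, u16 new, unsigned int shift)
{
	u32 old, tmp, prev;

	old = READ_ONCE(*p);
	for (;;) {
		/* splice the new halfword into its containing 32-bit word */
		tmp = (old & ~((u32)0xffff << shift)) | ((u32)new << shift);
		/*
		 * On an LL/SC architecture this cmpxchg() expands to an
		 * lr.w/sc.w retry loop of its own; if it fails because the
		 * word changed under us, the outer loop retries everything.
		 */
		prev = cmpxchg_relaxed(p, old, tmp);
		if (prev == old)
			break;
		old = prev;
	}
	return (u16)(old >> shift);
}

The quoted lr.w/sc.w asm at the top does all of that in a single ll/sc
loop, which is why the native variant should come out slightly ahead.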