On Wed, Nov 25, 2020 at 03:16:45PM +0100, Peter Zijlstra wrote:
> @@ -207,6 +187,32 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
>  	atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
>  }
>
> +#endif /* _Q_PENDING_BITS == 8 */
> +
> +#if _Q_PENDING_BITS == 8 && ARCH_HAS_XCHG16
> +
> +/*
> + * xchg_tail - Put in the new queue tail code word & retrieve previous one
> + * @lock : Pointer to queued spinlock structure
> + * @tail : The new queue tail code word
> + * Return: The previous queue tail code word
> + *
> + * xchg(lock, tail), which heads an address dependency
> + *
> + * p,*,* -> n,*,* ; prev = xchg(lock, node)
> + */
> +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
> +{
> +	/*
> +	 * We can use relaxed semantics since the caller ensures that the
> +	 * MCS node is properly initialized before updating the tail.
> +	 */
> +	return (u32)xchg_relaxed(&lock->tail,
> +				 tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
> +}
> +
> +#else /* !(_Q_PENDING_BITS == 8 && ARCH_HAS_XCHG16) */

Why can't architectures just implement this with a 32-bit xchg instruction
if they don't have one that operates on 16 bits? Sure, they'll store more
data, but it's atomic so you shouldn't be able to tell... (ignoring parisc
crazy).

Also, I'm surprised qspinlock benefits riscv. On arm64, there's nothing in
it over tickets for <= 16 CPUs.

Will