On Wed, Jan 27, 2021 at 09:36:27PM +0100, Alexander A Sverdlin wrote: > From: Alexander Sverdlin <alexander.sverdlin@xxxxxxxxx> > > It makes no sense to fold smp_mb__before_llsc()/smp_llsc_mb() again and > again, leave only one barrier pair in the outer function. > > This removes one SYNCW from __xchg_small() and brings around 10% > performance improvement in a tight spinlock loop with 6 threads on a 6 core > Octeon. > > Signed-off-by: Alexander Sverdlin <alexander.sverdlin@xxxxxxxxx> > --- > arch/mips/kernel/cmpxchg.c | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/arch/mips/kernel/cmpxchg.c b/arch/mips/kernel/cmpxchg.c > index 89107de..122e85f 100644 > --- a/arch/mips/kernel/cmpxchg.c > +++ b/arch/mips/kernel/cmpxchg.c > @@ -41,7 +41,7 @@ unsigned long __xchg_small(volatile void *ptr, unsigned long val, unsigned int s > do { > old32 = load32; > new32 = (load32 & ~mask) | (val << shift); > - load32 = cmpxchg(ptr32, old32, new32); > + load32 = cmpxchg_local(ptr32, old32, new32); > } while (load32 != old32); > > return (load32 & mask) >> shift; > @@ -97,7 +97,7 @@ unsigned long __cmpxchg_small(volatile void *ptr, unsigned long old, > */ > old32 = (load32 & ~mask) | (old << shift); > new32 = (load32 & ~mask) | (new << shift); > - load32 = cmpxchg(ptr32, old32, new32); > + load32 = cmpxchg_local(ptr32, old32, new32); > if (load32 == old32) > return old; > } This is wrong — please use cmpxchg_relaxed(), which you introduced earlier in this series. cmpxchg_local() need not be cross-CPU atomic at all: it happens to be cross-CPU atomic on MIPS only as an accident of the current implementation, so relying on it here would be fragile. cmpxchg_relaxed() gives you exactly what you want — a fully atomic cmpxchg without the implied memory barriers.