On Thu, Jun 23, 2022 at 12:46 PM Huacai Chen <chenhuacai@xxxxxxxxxxx> wrote:
>
> LoongArch natively supports only 32-bit/64-bit xchg/cmpxchg, but percpu
> operations and qspinlock need 8-bit/16-bit xchg/cmpxchg. On NUMA systems,
> the performance of subword xchg/cmpxchg emulation is better than the
> generic implementation (especially for qspinlock; data will be shown in
> the next patch). This patch (of 2) adds subword xchg/cmpxchg emulation
> with ll/sc and enables its usage for percpu operations.
The xchg/cmpxchg operations are designed for multi-processor data-sharing
problems. The percpu data won't be shared with other harts, and disabling
irq/preemption is enough. Do you have the same issue with f97fc810798c
("arm64: percpu: Implement this_cpu operations")?

>
> Signed-off-by: Rui Wang <wangrui@xxxxxxxxxxx>
> Signed-off-by: Huacai Chen <chenhuacai@xxxxxxxxxxx>
> ---
>  arch/loongarch/include/asm/cmpxchg.h | 98 +++++++++++++++++++++++++++-
>  arch/loongarch/include/asm/percpu.h  |  8 +++
>  2 files changed, 105 insertions(+), 1 deletion(-)
>
> diff --git a/arch/loongarch/include/asm/cmpxchg.h b/arch/loongarch/include/asm/cmpxchg.h
> index 75b3a4478652..967e64bf2c37 100644
> --- a/arch/loongarch/include/asm/cmpxchg.h
> +++ b/arch/loongarch/include/asm/cmpxchg.h
> @@ -5,8 +5,9 @@
>  #ifndef __ASM_CMPXCHG_H
>  #define __ASM_CMPXCHG_H
>
> -#include <asm/barrier.h>
> +#include <linux/bits.h>
>  #include <linux/build_bug.h>
> +#include <asm/barrier.h>
>
>  #define __xchg_asm(amswap_db, m, val)           \
>  ({                                              \
> @@ -21,10 +22,53 @@
>          __ret;                                  \
>  })
>
> +static inline unsigned int __xchg_small(volatile void *ptr, unsigned int val,
> +                                        unsigned int size)
> +{
> +        unsigned int shift;
> +        u32 old32, mask, temp;
> +        volatile u32 *ptr32;
> +
> +        /* Mask value to the correct size. */
> +        mask = GENMASK((size * BITS_PER_BYTE) - 1, 0);
> +        val &= mask;
> +
> +        /*
> +         * Calculate a shift & mask that correspond to the value we wish to
> +         * exchange within the naturally aligned 4 byte integer that
> +         * includes it.
> +         */
> +        shift = (unsigned long)ptr & 0x3;
> +        shift *= BITS_PER_BYTE;
> +        mask <<= shift;
> +
> +        /*
> +         * Calculate a pointer to the naturally aligned 4 byte integer that
> +         * includes our byte of interest, and load its value.
> +         */
> +        ptr32 = (volatile u32 *)((unsigned long)ptr & ~0x3);
> +
> +        asm volatile (
> +        "1:     ll.w    %0, %3          \n"
> +        "       andn    %1, %0, %z4     \n"
> +        "       or      %1, %1, %z5     \n"
> +        "       sc.w    %1, %2          \n"
> +        "       beqz    %1, 1b          \n"
> +        : "=&r" (old32), "=&r" (temp), "=" GCC_OFF_SMALL_ASM() (*ptr32)
> +        : GCC_OFF_SMALL_ASM() (*ptr32), "Jr" (mask), "Jr" (val << shift)
> +        : "memory");
> +
> +        return (old32 & mask) >> shift;
> +}
> +
>  static inline unsigned long __xchg(volatile void *ptr, unsigned long x,
>                                     int size)
>  {
>          switch (size) {
> +        case 1:
> +        case 2:
> +                return __xchg_small(ptr, x, size);
> +
>          case 4:
>                  return __xchg_asm("amswap_db.w", (volatile u32 *)ptr, (u32)x);
>
> @@ -67,10 +111,62 @@ static inline unsigned long __xchg(volatile void *ptr, unsigned long x,
>          __ret;                                  \
>  })
>
> +static inline unsigned int __cmpxchg_small(volatile void *ptr, unsigned int old,
> +                                           unsigned int new, unsigned int size)
> +{
> +        unsigned int shift;
> +        u32 old32, mask, temp;
> +        volatile u32 *ptr32;
> +
> +        /* Mask inputs to the correct size. */
> +        mask = GENMASK((size * BITS_PER_BYTE) - 1, 0);
> +        old &= mask;
> +        new &= mask;
> +
> +        /*
> +         * Calculate a shift & mask that correspond to the value we wish to
> +         * compare & exchange within the naturally aligned 4 byte integer
> +         * that includes it.
> +         */
> +        shift = (unsigned long)ptr & 0x3;
> +        shift *= BITS_PER_BYTE;
> +        old <<= shift;
> +        new <<= shift;
> +        mask <<= shift;
> +
> +        /*
> +         * Calculate a pointer to the naturally aligned 4 byte integer that
> +         * includes our byte of interest, and load its value.
> +         */
> +        ptr32 = (volatile u32 *)((unsigned long)ptr & ~0x3);
> +
> +        asm volatile (
> +        "1:     ll.w    %0, %3          \n"
> +        "       and     %1, %0, %z4     \n"
> +        "       bne     %1, %z5, 2f     \n"
> +        "       andn    %1, %0, %z4     \n"
> +        "       or      %1, %1, %z6     \n"
> +        "       sc.w    %1, %2          \n"
> +        "       beqz    %1, 1b          \n"
> +        "       b       3f              \n"
> +        "2:                             \n"
> +        __WEAK_LLSC_MB
> +        "3:                             \n"
> +        : "=&r" (old32), "=&r" (temp), "=" GCC_OFF_SMALL_ASM() (*ptr32)
> +        : GCC_OFF_SMALL_ASM() (*ptr32), "Jr" (mask), "Jr" (old), "Jr" (new)
> +        : "memory");
> +
> +        return (old32 & mask) >> shift;
> +}
> +
>  static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
>                                        unsigned long new, unsigned int size)
>  {
>          switch (size) {
> +        case 1:
> +        case 2:
> +                return __cmpxchg_small(ptr, old, new, size);
> +
>          case 4:
>                  return __cmpxchg_asm("ll.w", "sc.w", (volatile u32 *)ptr,
>                                       (u32)old, new);
> diff --git a/arch/loongarch/include/asm/percpu.h b/arch/loongarch/include/asm/percpu.h
> index e6569f18c6dd..0bd6b0110198 100644
> --- a/arch/loongarch/include/asm/percpu.h
> +++ b/arch/loongarch/include/asm/percpu.h
> @@ -123,6 +123,10 @@ static inline unsigned long __percpu_xchg(void *ptr, unsigned long val,
>                                            int size)
>  {
>          switch (size) {
> +        case 1:
> +        case 2:
> +                return __xchg_small((volatile void *)ptr, val, size);
> +
>          case 4:
>                  return __xchg_asm("amswap.w", (volatile u32 *)ptr, (u32)val);
The percpu operations are local, and we shouldn't combine them with the
normal xchg/cmpxchg (they have different semantics: one is for local data,
the other for shared data). Please implement your own percpu ops here to
fix the irq disable/enable performance issue.

>
> @@ -204,9 +208,13 @@ do {                                            \
>  #define this_cpu_write_4(pcp, val) _percpu_write(pcp, val)
>  #define this_cpu_write_8(pcp, val) _percpu_write(pcp, val)
>
> +#define this_cpu_xchg_1(pcp, val) _percpu_xchg(pcp, val)
> +#define this_cpu_xchg_2(pcp, val) _percpu_xchg(pcp, val)
>  #define this_cpu_xchg_4(pcp, val) _percpu_xchg(pcp, val)
>  #define this_cpu_xchg_8(pcp, val) _percpu_xchg(pcp, val)
>
> +#define this_cpu_cmpxchg_1(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
> +#define this_cpu_cmpxchg_2(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
>  #define this_cpu_cmpxchg_4(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
>  #define this_cpu_cmpxchg_8(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
>
> --
> 2.27.0
>

--
Best Regards
Guo Ren

ML: https://lore.kernel.org/linux-csky/
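For reference, the irq-protected style of per-CPU op suggested in the review
can be sketched along the lines of the generic this_cpu_* fallbacks in
include/asm-generic/percpu.h. This is only a rough illustration, not part of
the patch; the macro name _percpu_cmpxchg_irq is invented here:

/*
 * Sketch only: a per-CPU cmpxchg that relies on interrupt masking instead
 * of an ll/sc loop.  Per-CPU data is never written by other CPUs, so
 * disabling interrupts is enough to make the read-modify-write atomic
 * with respect to everything that can run on the local CPU.
 */
#define _percpu_cmpxchg_irq(pcp, oval, nval)                            \
({                                                                      \
        typeof(pcp) __ret;                                              \
        typeof(pcp) *__p;                                               \
        unsigned long __flags;                                          \
                                                                        \
        local_irq_save(__flags);                                        \
        __p = raw_cpu_ptr(&(pcp));      /* this CPU's copy */           \
        __ret = *__p;                                                   \
        if (__ret == (oval))                                            \
                *__p = (nval);          /* plain local store */         \
        local_irq_restore(__flags);                                     \
        __ret;                                                          \
})

Because no other CPU ever touches the per-CPU variable, this avoids the
ll/sc retry loop and its memory-ordering cost entirely, which is the point
of keeping the percpu ops separate from the shared-memory xchg/cmpxchg in
the quoted patch.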