Hi, Ren, On Thu, Jun 23, 2022 at 2:49 PM Guo Ren <guoren@xxxxxxxxxx> wrote: > > On Thu, Jun 23, 2022 at 12:46 PM Huacai Chen <chenhuacai@xxxxxxxxxxx> wrote: > > > > LoongArch only support 32-bit/64-bit xchg/cmpxchg in native. But percpu > > operation and qspinlock need 8-bit/16-bit xchg/cmpxchg. On NUMA system, > > the performance of subword xchg/cmpxchg emulation is better than the > > generic implementation (especially for qspinlock, data will be shown in > > the next patch). This patch (of 2) adds subword xchg/cmpxchg emulation > > with ll/sc and enable its usage for percpu operations. > The xchg/cmpxchg are designed for multi-processor data-sharing issues. > The percpu data won't share with other harts and disabling > irq/preemption is enough. Do you have the same issue with f97fc810798c > ("arm64: percpu: Implement this_cpu operations")? Yes, very similar, our csr operations are even slower than atomic operations. :( > > > > > > Signed-off-by: Rui Wang <wangrui@xxxxxxxxxxx> > > Signed-off-by: Huacai Chen <chenhuacai@xxxxxxxxxxx> > > --- > > arch/loongarch/include/asm/cmpxchg.h | 98 +++++++++++++++++++++++++++- > > arch/loongarch/include/asm/percpu.h | 8 +++ > > 2 files changed, 105 insertions(+), 1 deletion(-) > > > > diff --git a/arch/loongarch/include/asm/cmpxchg.h b/arch/loongarch/include/asm/cmpxchg.h > > index 75b3a4478652..967e64bf2c37 100644 > > --- a/arch/loongarch/include/asm/cmpxchg.h > > +++ b/arch/loongarch/include/asm/cmpxchg.h > > @@ -5,8 +5,9 @@ > > #ifndef __ASM_CMPXCHG_H > > #define __ASM_CMPXCHG_H > > > > -#include <asm/barrier.h> > > +#include <linux/bits.h> > > #include <linux/build_bug.h> > > +#include <asm/barrier.h> > > > > #define __xchg_asm(amswap_db, m, val) \ > > ({ \ > > @@ -21,10 +22,53 @@ > > __ret; \ > > }) > > > > +static inline unsigned int __xchg_small(volatile void *ptr, unsigned int val, > > + unsigned int size) > > +{ > > + unsigned int shift; > > + u32 old32, mask, temp; > > + volatile u32 *ptr32; > > + > > + /* Mask value to the correct size. */ > > + mask = GENMASK((size * BITS_PER_BYTE) - 1, 0); > > + val &= mask; > > + > > + /* > > + * Calculate a shift & mask that correspond to the value we wish to > > + * exchange within the naturally aligned 4 byte integerthat includes > > + * it. > > + */ > > + shift = (unsigned long)ptr & 0x3; > > + shift *= BITS_PER_BYTE; > > + mask <<= shift; > > + > > + /* > > + * Calculate a pointer to the naturally aligned 4 byte integer that > > + * includes our byte of interest, and load its value. > > + */ > > + ptr32 = (volatile u32 *)((unsigned long)ptr & ~0x3); > > + > > + asm volatile ( > > + "1: ll.w %0, %3 \n" > > + " andn %1, %0, %z4 \n" > > + " or %1, %1, %z5 \n" > > + " sc.w %1, %2 \n" > > + " beqz %1, 1b \n" > > + : "=&r" (old32), "=&r" (temp), "=" GCC_OFF_SMALL_ASM() (*ptr32) > > + : GCC_OFF_SMALL_ASM() (*ptr32), "Jr" (mask), "Jr" (val << shift) > > + : "memory"); > > + > > + return (old32 & mask) >> shift; > > +} > > + > > static inline unsigned long __xchg(volatile void *ptr, unsigned long x, > > int size) > > { > > switch (size) { > > + case 1: > > + case 2: > > + return __xchg_small(ptr, x, size); > > + > > case 4: > > return __xchg_asm("amswap_db.w", (volatile u32 *)ptr, (u32)x); > > > > @@ -67,10 +111,62 @@ static inline unsigned long __xchg(volatile void *ptr, unsigned long x, > > __ret; \ > > }) > > > > +static inline unsigned int __cmpxchg_small(volatile void *ptr, unsigned int old, > > + unsigned int new, unsigned int size) > > +{ > > + unsigned int shift; > > + u32 old32, mask, temp; > > + volatile u32 *ptr32; > > + > > + /* Mask inputs to the correct size. */ > > + mask = GENMASK((size * BITS_PER_BYTE) - 1, 0); > > + old &= mask; > > + new &= mask; > > + > > + /* > > + * Calculate a shift & mask that correspond to the value we wish to > > + * compare & exchange within the naturally aligned 4 byte integer > > + * that includes it. > > + */ > > + shift = (unsigned long)ptr & 0x3; > > + shift *= BITS_PER_BYTE; > > + old <<= shift; > > + new <<= shift; > > + mask <<= shift; > > + > > + /* > > + * Calculate a pointer to the naturally aligned 4 byte integer that > > + * includes our byte of interest, and load its value. > > + */ > > + ptr32 = (volatile u32 *)((unsigned long)ptr & ~0x3); > > + > > + asm volatile ( > > + "1: ll.w %0, %3 \n" > > + " and %1, %0, %z4 \n" > > + " bne %1, %z5, 2f \n" > > + " andn %1, %0, %z4 \n" > > + " or %1, %1, %z6 \n" > > + " sc.w %1, %2 \n" > > + " beqz %1, 1b \n" > > + " b 3f \n" > > + "2: \n" > > + __WEAK_LLSC_MB > > + "3: \n" > > + : "=&r" (old32), "=&r" (temp), "=" GCC_OFF_SMALL_ASM() (*ptr32) > > + : GCC_OFF_SMALL_ASM() (*ptr32), "Jr" (mask), "Jr" (old), "Jr" (new) > > + : "memory"); > > + > > + return (old32 & mask) >> shift; > > +} > > + > > static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, > > unsigned long new, unsigned int size) > > { > > switch (size) { > > + case 1: > > + case 2: > > + return __cmpxchg_small(ptr, old, new, size); > > + > > case 4: > > return __cmpxchg_asm("ll.w", "sc.w", (volatile u32 *)ptr, > > (u32)old, new); > > diff --git a/arch/loongarch/include/asm/percpu.h b/arch/loongarch/include/asm/percpu.h > > index e6569f18c6dd..0bd6b0110198 100644 > > --- a/arch/loongarch/include/asm/percpu.h > > +++ b/arch/loongarch/include/asm/percpu.h > > @@ -123,6 +123,10 @@ static inline unsigned long __percpu_xchg(void *ptr, unsigned long val, > > int size) > > { > > switch (size) { > > + case 1: > > + case 2: > > + return __xchg_small((volatile void *)ptr, val, size); > > + > > case 4: > > return __xchg_asm("amswap.w", (volatile u32 *)ptr, (u32)val); > The percpu operations are local and we shouldn't combine them with the > normal xchg/cmpxchg (They have different semantics one for local, one > for share.), please implement your own percpu ops here to fix the irq > disable/enable performance issue. Yes, percpu operations are local and atomic operations are for sharing. But we are using atomic ops to implement percpu ops (for performance), so just implementing common emulated operations and using it for both percpu ops and qspinlock is just OK? Huacai > > > > > @@ -204,9 +208,13 @@ do { \ > > #define this_cpu_write_4(pcp, val) _percpu_write(pcp, val) > > #define this_cpu_write_8(pcp, val) _percpu_write(pcp, val) > > > > +#define this_cpu_xchg_1(pcp, val) _percpu_xchg(pcp, val) > > +#define this_cpu_xchg_2(pcp, val) _percpu_xchg(pcp, val) > > #define this_cpu_xchg_4(pcp, val) _percpu_xchg(pcp, val) > > #define this_cpu_xchg_8(pcp, val) _percpu_xchg(pcp, val) > > > > +#define this_cpu_cmpxchg_1(ptr, o, n) _protect_cmpxchg_local(ptr, o, n) > > +#define this_cpu_cmpxchg_2(ptr, o, n) _protect_cmpxchg_local(ptr, o, n) > > #define this_cpu_cmpxchg_4(ptr, o, n) _protect_cmpxchg_local(ptr, o, n) > > #define this_cpu_cmpxchg_8(ptr, o, n) _protect_cmpxchg_local(ptr, o, n) > > > > -- > > 2.27.0 > > > > > -- > Best Regards > Guo Ren > > ML: https://lore.kernel.org/linux-csky/