On Thu, Jun 23, 2022 at 4:04 PM Huacai Chen <chenhuacai@xxxxxxxxxx> wrote:
>
> Hi, Ren,
>
> On Thu, Jun 23, 2022 at 2:49 PM Guo Ren <guoren@xxxxxxxxxx> wrote:
> >
> > On Thu, Jun 23, 2022 at 12:46 PM Huacai Chen <chenhuacai@xxxxxxxxxxx> wrote:
> > >
> > > LoongArch natively supports only 32-bit/64-bit xchg/cmpxchg, but percpu
> > > operations and qspinlock need 8-bit/16-bit xchg/cmpxchg. On NUMA systems,
> > > the performance of subword xchg/cmpxchg emulation is better than the
> > > generic implementation (especially for qspinlock; data will be shown in
> > > the next patch). This patch (of 2) adds subword xchg/cmpxchg emulation
> > > with ll/sc and enables its usage for percpu operations.
> > The xchg/cmpxchg operations are designed for multi-processor data sharing.
> > Percpu data isn't shared with other harts, and disabling irq/preemption
> > is enough. Do you have the same issue as f97fc810798c ("arm64: percpu:
> > Implement this_cpu operations")?
> Yes, very similar, our CSR operations are even slower than atomic operations. :(
> >
> > >
> > >
> > > Signed-off-by: Rui Wang <wangrui@xxxxxxxxxxx>
> > > Signed-off-by: Huacai Chen <chenhuacai@xxxxxxxxxxx>
> > > ---
> > >  arch/loongarch/include/asm/cmpxchg.h | 98 +++++++++++++++++++++++++++-
> > >  arch/loongarch/include/asm/percpu.h  |  8 +++
> > >  2 files changed, 105 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/arch/loongarch/include/asm/cmpxchg.h b/arch/loongarch/include/asm/cmpxchg.h
> > > index 75b3a4478652..967e64bf2c37 100644
> > > --- a/arch/loongarch/include/asm/cmpxchg.h
> > > +++ b/arch/loongarch/include/asm/cmpxchg.h
> > > @@ -5,8 +5,9 @@
> > >  #ifndef __ASM_CMPXCHG_H
> > >  #define __ASM_CMPXCHG_H
> > >
> > > -#include <asm/barrier.h>
> > > +#include <linux/bits.h>
> > >  #include <linux/build_bug.h>
> > > +#include <asm/barrier.h>
> > >
> > >  #define __xchg_asm(amswap_db, m, val)           \
> > >  ({                                              \
> > > @@ -21,10 +22,53 @@
> > >          __ret;                                  \
> > >  })
> > >
> > > +static inline unsigned int __xchg_small(volatile void *ptr, unsigned int val,
> > > +                                        unsigned int size)
> > > +{
> > > +        unsigned int shift;
> > > +        u32 old32, mask, temp;
> > > +        volatile u32 *ptr32;
> > > +
> > > +        /* Mask value to the correct size. */
> > > +        mask = GENMASK((size * BITS_PER_BYTE) - 1, 0);
> > > +        val &= mask;
> > > +
> > > +        /*
> > > +         * Calculate a shift & mask that correspond to the value we wish to
> > > +         * exchange within the naturally aligned 4 byte integer that includes
> > > +         * it.
> > > +         */
> > > +        shift = (unsigned long)ptr & 0x3;
> > > +        shift *= BITS_PER_BYTE;
> > > +        mask <<= shift;
> > > +
> > > +        /*
> > > +         * Calculate a pointer to the naturally aligned 4 byte integer that
> > > +         * includes our byte of interest, and load its value.
> > > +         */
> > > +        ptr32 = (volatile u32 *)((unsigned long)ptr & ~0x3);
> > > +
> > > +        asm volatile (
> > > +        "1:     ll.w            %0, %3          \n"
> > > +        "       andn            %1, %0, %z4     \n"
> > > +        "       or              %1, %1, %z5     \n"
> > > +        "       sc.w            %1, %2          \n"
> > > +        "       beqz            %1, 1b          \n"
> > > +        : "=&r" (old32), "=&r" (temp), "=" GCC_OFF_SMALL_ASM() (*ptr32)
> > > +        : GCC_OFF_SMALL_ASM() (*ptr32), "Jr" (mask), "Jr" (val << shift)
> > > +        : "memory");
> > > +
> > > +        return (old32 & mask) >> shift;
> > > +}
> > > +
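(Side note, not part of the patch: for anyone less familiar with the trick,
the same subword emulation can be sketched in portable C on top of a plain
32-bit compare-and-swap. This is only an illustrative sketch using GCC
__atomic builtins instead of LoongArch ll.w/sc.w; the function name is
hypothetical and it assumes a little-endian layout as on LoongArch:

#include <stdbool.h>
#include <stdint.h>

/*
 * Illustrative sketch only: emulate an 8-/16-bit exchange with a 32-bit
 * compare-and-swap on the naturally aligned word that contains the value.
 */
static unsigned int xchg_small_sketch(volatile void *ptr, unsigned int val,
                                      unsigned int size)
{
        unsigned int shift = ((uintptr_t)ptr & 0x3) * 8;
        uint32_t mask = ((1u << (size * 8)) - 1) << shift;  /* size is 1 or 2 */
        uint32_t *ptr32 = (uint32_t *)((uintptr_t)ptr & ~(uintptr_t)0x3);
        uint32_t old32, new32;

        val = (val << shift) & mask;

        old32 = __atomic_load_n(ptr32, __ATOMIC_RELAXED);
        do {
                /* Replace only the bytes covered by the mask. */
                new32 = (old32 & ~mask) | val;
        } while (!__atomic_compare_exchange_n(ptr32, &old32, new32, false,
                                              __ATOMIC_SEQ_CST, __ATOMIC_RELAXED));

        return (old32 & mask) >> shift;
}

The ll.w/sc.w retry loop in the patch plays the same role as the
compare-and-swap loop here.)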
> > >  static inline unsigned long __xchg(volatile void *ptr, unsigned long x,
> > >                                     int size)
> > >  {
> > >          switch (size) {
> > > +        case 1:
> > > +        case 2:
> > > +                return __xchg_small(ptr, x, size);
> > > +
> > >          case 4:
> > >                  return __xchg_asm("amswap_db.w", (volatile u32 *)ptr, (u32)x);
> > >
> > > @@ -67,10 +111,62 @@ static inline unsigned long __xchg(volatile void *ptr, unsigned long x,
> > >          __ret;                                  \
> > >  })
> > >
> > > +static inline unsigned int __cmpxchg_small(volatile void *ptr, unsigned int old,
> > > +                                           unsigned int new, unsigned int size)
> > > +{
> > > +        unsigned int shift;
> > > +        u32 old32, mask, temp;
> > > +        volatile u32 *ptr32;
> > > +
> > > +        /* Mask inputs to the correct size. */
> > > +        mask = GENMASK((size * BITS_PER_BYTE) - 1, 0);
> > > +        old &= mask;
> > > +        new &= mask;
> > > +
> > > +        /*
> > > +         * Calculate a shift & mask that correspond to the value we wish to
> > > +         * compare & exchange within the naturally aligned 4 byte integer
> > > +         * that includes it.
> > > +         */
> > > +        shift = (unsigned long)ptr & 0x3;
> > > +        shift *= BITS_PER_BYTE;
> > > +        old <<= shift;
> > > +        new <<= shift;
> > > +        mask <<= shift;
> > > +
> > > +        /*
> > > +         * Calculate a pointer to the naturally aligned 4 byte integer that
> > > +         * includes our byte of interest, and load its value.
> > > +         */
> > > +        ptr32 = (volatile u32 *)((unsigned long)ptr & ~0x3);
> > > +
> > > +        asm volatile (
> > > +        "1:     ll.w            %0, %3          \n"
> > > +        "       and             %1, %0, %z4     \n"
> > > +        "       bne             %1, %z5, 2f     \n"
> > > +        "       andn            %1, %0, %z4     \n"
> > > +        "       or              %1, %1, %z6     \n"
> > > +        "       sc.w            %1, %2          \n"
> > > +        "       beqz            %1, 1b          \n"
> > > +        "       b               3f              \n"
> > > +        "2:                                     \n"
> > > +        __WEAK_LLSC_MB
> > > +        "3:                                     \n"
> > > +        : "=&r" (old32), "=&r" (temp), "=" GCC_OFF_SMALL_ASM() (*ptr32)
> > > +        : GCC_OFF_SMALL_ASM() (*ptr32), "Jr" (mask), "Jr" (old), "Jr" (new)
> > > +        : "memory");
> > > +
> > > +        return (old32 & mask) >> shift;
> > > +}
> > > +
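(As a companion to the xchg sketch above, the subword cmpxchg can be
illustrated the same way; again this is only a portable C sketch with GCC
__atomic builtins and a hypothetical function name, not the patch itself,
and it does not model the __WEAK_LLSC_MB the patch emits on the early-exit
path:

#include <stdbool.h>
#include <stdint.h>

/*
 * Illustrative sketch only: 8-/16-bit cmpxchg emulated with a 32-bit
 * compare-and-swap on the containing aligned word, mirroring
 * __cmpxchg_small above.
 */
static unsigned int cmpxchg_small_sketch(volatile void *ptr, unsigned int old,
                                         unsigned int new, unsigned int size)
{
        unsigned int shift = ((uintptr_t)ptr & 0x3) * 8;
        uint32_t mask = ((1u << (size * 8)) - 1) << shift;  /* size is 1 or 2 */
        uint32_t *ptr32 = (uint32_t *)((uintptr_t)ptr & ~(uintptr_t)0x3);
        uint32_t old32, new32;

        old = (old << shift) & mask;
        new = (new << shift) & mask;

        old32 = __atomic_load_n(ptr32, __ATOMIC_RELAXED);
        do {
                /* Subword no longer matches 'old': give up, like the "bne ... 2f" path. */
                if ((old32 & mask) != old)
                        break;
                new32 = (old32 & ~mask) | new;
        } while (!__atomic_compare_exchange_n(ptr32, &old32, new32, false,
                                              __ATOMIC_SEQ_CST, __ATOMIC_RELAXED));

        return (old32 & mask) >> shift;
}

Either way, the observed subword value is returned, which callers compare
against 'old' to detect success.)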
> > >  static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
> > >                                        unsigned long new, unsigned int size)
> > >  {
> > >          switch (size) {
> > > +        case 1:
> > > +        case 2:
> > > +                return __cmpxchg_small(ptr, old, new, size);
> > > +
> > >          case 4:
> > >                  return __cmpxchg_asm("ll.w", "sc.w", (volatile u32 *)ptr,
> > >                                       (u32)old, new);
> > > diff --git a/arch/loongarch/include/asm/percpu.h b/arch/loongarch/include/asm/percpu.h
> > > index e6569f18c6dd..0bd6b0110198 100644
> > > --- a/arch/loongarch/include/asm/percpu.h
> > > +++ b/arch/loongarch/include/asm/percpu.h
> > > @@ -123,6 +123,10 @@ static inline unsigned long __percpu_xchg(void *ptr, unsigned long val,
> > >                                            int size)
> > >  {
> > >          switch (size) {
> > > +        case 1:
> > > +        case 2:
> > > +                return __xchg_small((volatile void *)ptr, val, size);
> > > +
> > >          case 4:
> > >                  return __xchg_asm("amswap.w", (volatile u32 *)ptr, (u32)val);
> > The percpu operations are local, and we shouldn't combine them with the
> > normal xchg/cmpxchg (they have different semantics: one is for local data,
> > the other for shared data). Please implement your own percpu ops here to
> > fix the irq disable/enable performance issue.
> Yes, percpu operations are local and atomic operations are for
> sharing. But we are using atomic ops to implement percpu ops (for
> performance), so is it OK to just implement common emulated operations
> and use them for both percpu ops and qspinlock?
No, separating them would be fine. The qspinlock only needs
xchg16_relaxed, so just leave that in LoongArch's cmpxchg.h. That would
make it easier for Arnd to clean up xchg/cmpxchg.

>
> Huacai
> >
> > >
> > > @@ -204,9 +208,13 @@ do {                                            \
> > >  #define this_cpu_write_4(pcp, val) _percpu_write(pcp, val)
> > >  #define this_cpu_write_8(pcp, val) _percpu_write(pcp, val)
> > >
> > > +#define this_cpu_xchg_1(pcp, val) _percpu_xchg(pcp, val)
> > > +#define this_cpu_xchg_2(pcp, val) _percpu_xchg(pcp, val)
> > >  #define this_cpu_xchg_4(pcp, val) _percpu_xchg(pcp, val)
> > >  #define this_cpu_xchg_8(pcp, val) _percpu_xchg(pcp, val)
> > >
> > > +#define this_cpu_cmpxchg_1(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
> > > +#define this_cpu_cmpxchg_2(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
> > >  #define this_cpu_cmpxchg_4(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
> > >  #define this_cpu_cmpxchg_8(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
> > >
> > > --
> > > 2.27.0
> > >
> > >
> >
> > --
> > Best Regards
> > Guo Ren
> >
> > ML: https://lore.kernel.org/linux-csky/

--
Best Regards
Guo Ren

ML: https://lore.kernel.org/linux-csky/