On Fri, Jun 17, 2022 at 10:55 PM Huacai Chen <chenhuacai@xxxxxxxxxxx> wrote:
>
> On NUMA systems, the performance of qspinlock is better than that of the
> generic spinlock. Below are the UnixBench test results on an 8-node
> (4 cores per node, 32 cores in total) machine.
>
> A. With generic spinlock:
>
> System Benchmarks Index Values               BASELINE       RESULT    INDEX
> Dhrystone 2 using register variables         116700.0  449574022.5  38523.9
> Double-Precision Whetstone                       55.0      85190.4  15489.2
> Execl Throughput                                 43.0      14696.2   3417.7
> File Copy 1024 bufsize 2000 maxblocks          3960.0     143157.8    361.5
> File Copy 256 bufsize 500 maxblocks            1655.0      37631.8    227.4
> File Copy 4096 bufsize 8000 maxblocks          5800.0     444814.2    766.9
> Pipe Throughput                               12440.0    5047490.7   4057.5
> Pipe-based Context Switching                   4000.0    2021545.7   5053.9
> Process Creation                                126.0      23829.8   1891.3
> Shell Scripts (1 concurrent)                     42.4      33756.7   7961.5
> Shell Scripts (8 concurrent)                      6.0       4062.9   6771.5
> System Call Overhead                          15000.0    2479748.6   1653.2
>                                                                    ========
> System Benchmarks Index Score                                        2955.6
>
> B. With qspinlock:
>
> System Benchmarks Index Values               BASELINE       RESULT    INDEX
> Dhrystone 2 using register variables         116700.0  449467876.9  38514.8
> Double-Precision Whetstone                       55.0      85174.6  15486.3
> Execl Throughput                                 43.0      14769.1   3434.7
> File Copy 1024 bufsize 2000 maxblocks          3960.0     146150.5    369.1
> File Copy 256 bufsize 500 maxblocks            1655.0      37496.8    226.6
> File Copy 4096 bufsize 8000 maxblocks          5800.0     447527.0    771.6
> Pipe Throughput                               12440.0    5175989.2   4160.8
> Pipe-based Context Switching                   4000.0    2207747.8   5519.4
> Process Creation                                126.0      25125.5   1994.1
> Shell Scripts (1 concurrent)                     42.4      33461.2   7891.8
> Shell Scripts (8 concurrent)                      6.0       4024.7   6707.8
> System Call Overhead                          15000.0    2917278.6   1944.9
>                                                                    ========
> System Benchmarks Index Score                                        3040.1
>
> Signed-off-by: Huacai Chen <chenhuacai@xxxxxxxxxxx>
> ---
>  arch/loongarch/Kconfig                      |   1 +
>  arch/loongarch/include/asm/Kbuild           |   5 +-
>  arch/loongarch/include/asm/cmpxchg.h        |  14 +++
>  arch/loongarch/include/asm/percpu.h         |   8 ++
>  arch/loongarch/include/asm/spinlock.h       |  12 +++
>  arch/loongarch/include/asm/spinlock_types.h |  11 ++
>  arch/loongarch/kernel/Makefile              |   2 +-
>  arch/loongarch/kernel/cmpxchg.c             | 105 ++++++++++++++++++++
>  8 files changed, 154 insertions(+), 4 deletions(-)
>  create mode 100644 arch/loongarch/include/asm/spinlock.h
>  create mode 100644 arch/loongarch/include/asm/spinlock_types.h
>  create mode 100644 arch/loongarch/kernel/cmpxchg.c
>
> diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
> index 1920d52653b4..1ec220df751d 100644
> --- a/arch/loongarch/Kconfig
> +++ b/arch/loongarch/Kconfig
> @@ -46,6 +46,7 @@ config LOONGARCH
>       select ARCH_USE_BUILTIN_BSWAP
>       select ARCH_USE_CMPXCHG_LOCKREF
>       select ARCH_USE_QUEUED_RWLOCKS
> +     select ARCH_USE_QUEUED_SPINLOCKS
>       select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
>       select ARCH_WANTS_NO_INSTR
>       select BUILDTIME_TABLE_SORT
> diff --git a/arch/loongarch/include/asm/Kbuild b/arch/loongarch/include/asm/Kbuild
> index 83bc0681e72b..a0eed6076c79 100644
> --- a/arch/loongarch/include/asm/Kbuild
> +++ b/arch/loongarch/include/asm/Kbuild
> @@ -1,12 +1,11 @@
>  # SPDX-License-Identifier: GPL-2.0
>  generic-y += dma-contiguous.h
>  generic-y += export.h
> +generic-y += mcs_spinlock.h
>  generic-y += parport.h
>  generic-y += early_ioremap.h
>  generic-y += qrwlock.h
> -generic-y += qrwlock_types.h
> -generic-y += spinlock.h
> -generic-y += spinlock_types.h
> +generic-y += qspinlock.h
>  generic-y += rwsem.h
>  generic-y += segment.h
>  generic-y += user.h
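For context on why the diff below also adds 8-bit and 16-bit xchg()/cmpxchg()
support: the generic queued spinlock packs the whole lock into a single 32-bit
word and, in the usual NR_CPUS < 16K configuration, publishes the MCS queue
tail with a 16-bit exchange. The following user-space sketch only illustrates
that layout and the tail exchange; the struct and function names are invented
here and the little-endian field order is an assumption, so treat it as a
model rather than the kernel's code.

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for the asm-generic qspinlock word on a
 * little-endian machine with NR_CPUS < 16K (both assumptions):
 * byte 0 = locked, byte 1 = pending, bytes 2-3 = MCS queue tail.
 */
struct sketch_qspinlock {
        union {
                uint32_t val;
                struct {
                        uint8_t  locked;
                        uint8_t  pending;
                        uint16_t tail;
                };
        };
};

/* The slow path publishes a new MCS tail with a 16-bit exchange; this is
 * the operation that makes the architecture provide (or emulate) sub-word
 * xchg, hence __xchg_small() in this patch. The GCC builtin stands in for
 * the kernel's xchg_relaxed().
 */
static uint16_t sketch_xchg_tail(struct sketch_qspinlock *lock, uint16_t tail)
{
        return __atomic_exchange_n(&lock->tail, tail, __ATOMIC_RELAXED);
}

int main(void)
{
        struct sketch_qspinlock lock = { .val = 0 };
        uint16_t prev = sketch_xchg_tail(&lock, 0x0042);

        printf("previous tail=%u, new tail=%u\n", prev, lock.tail);
        return 0;
}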
> diff --git a/arch/loongarch/include/asm/cmpxchg.h b/arch/loongarch/include/asm/cmpxchg.h
> index 75b3a4478652..afcd05be010e 100644
> --- a/arch/loongarch/include/asm/cmpxchg.h
> +++ b/arch/loongarch/include/asm/cmpxchg.h
> @@ -21,10 +21,17 @@
>       __ret;                                                          \
>  })
>
> +extern unsigned long __xchg_small(volatile void *ptr, unsigned long x,
> +                                 unsigned int size);
> +
>  static inline unsigned long __xchg(volatile void *ptr, unsigned long x,
>                                    int size)
>  {
>       switch (size) {
> +     case 1:
> +     case 2:
> +             return __xchg_small(ptr, x, size);
> +
>       case 4:
>               return __xchg_asm("amswap_db.w", (volatile u32 *)ptr, (u32)x);
>
> @@ -67,10 +74,17 @@ static inline unsigned long __xchg(volatile void *ptr, unsigned long x,
>       __ret;                                                          \
>  })
>
> +extern unsigned long __cmpxchg_small(volatile void *ptr, unsigned long old,
> +                                    unsigned long new, unsigned int size);
> +
>  static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
>                                       unsigned long new, unsigned int size)
>  {
>       switch (size) {
> +     case 1:
> +     case 2:
> +             return __cmpxchg_small(ptr, old, new, size);
> +
>       case 4:
>               return __cmpxchg_asm("ll.w", "sc.w", (volatile u32 *)ptr,
>                                    (u32)old, new);
> diff --git a/arch/loongarch/include/asm/percpu.h b/arch/loongarch/include/asm/percpu.h
> index e6569f18c6dd..0bd6b0110198 100644
> --- a/arch/loongarch/include/asm/percpu.h
> +++ b/arch/loongarch/include/asm/percpu.h
> @@ -123,6 +123,10 @@ static inline unsigned long __percpu_xchg(void *ptr, unsigned long val,
>                                               int size)
>  {
>       switch (size) {
> +     case 1:
> +     case 2:
> +             return __xchg_small((volatile void *)ptr, val, size);
> +
>       case 4:
>               return __xchg_asm("amswap.w", (volatile u32 *)ptr, (u32)val);
>
> @@ -204,9 +208,13 @@ do {                                                                 \
>  #define this_cpu_write_4(pcp, val) _percpu_write(pcp, val)
>  #define this_cpu_write_8(pcp, val) _percpu_write(pcp, val)
>
> +#define this_cpu_xchg_1(pcp, val) _percpu_xchg(pcp, val)
> +#define this_cpu_xchg_2(pcp, val) _percpu_xchg(pcp, val)
>  #define this_cpu_xchg_4(pcp, val) _percpu_xchg(pcp, val)
>  #define this_cpu_xchg_8(pcp, val) _percpu_xchg(pcp, val)
>
> +#define this_cpu_cmpxchg_1(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
> +#define this_cpu_cmpxchg_2(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
>  #define this_cpu_cmpxchg_4(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
>  #define this_cpu_cmpxchg_8(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
>
> diff --git a/arch/loongarch/include/asm/spinlock.h b/arch/loongarch/include/asm/spinlock.h
> new file mode 100644
> index 000000000000..7cb3476999be
> --- /dev/null
> +++ b/arch/loongarch/include/asm/spinlock.h
> @@ -0,0 +1,12 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
> + */
> +#ifndef _ASM_SPINLOCK_H
> +#define _ASM_SPINLOCK_H
> +
> +#include <asm/processor.h>
> +#include <asm/qspinlock.h>
> +#include <asm/qrwlock.h>
> +
> +#endif /* _ASM_SPINLOCK_H */
> diff --git a/arch/loongarch/include/asm/spinlock_types.h b/arch/loongarch/include/asm/spinlock_types.h
> new file mode 100644
> index 000000000000..7458d036c161
> --- /dev/null
> +++ b/arch/loongarch/include/asm/spinlock_types.h
> @@ -0,0 +1,11 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
> + */
> +#ifndef _ASM_SPINLOCK_TYPES_H
> +#define _ASM_SPINLOCK_TYPES_H
> +
> +#include <asm-generic/qspinlock_types.h>
> +#include <asm-generic/qrwlock_types.h>
> +
> +#endif
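Since __xchg_small()/__cmpxchg_small() do all the real work for the new 1- and
2-byte cases above, a portable model of the trick may help readers who have
not seen the MIPS original: the sub-word value is exchanged by
read-modify-writing the naturally aligned 32-bit word that contains it. This
is only a sketch built on compiler builtins (the new kernel/cmpxchg.c further
down does the same thing with an ll.w/sc.w loop), and the little-endian shift
math is assumed, as on LoongArch.

#include <stdint.h>

/* Portable model of a byte/halfword exchange: splice the new value into the
 * containing, naturally aligned 32-bit word and retry until the update wins.
 */
static unsigned long model_xchg_small(volatile void *ptr, unsigned long val,
                                      unsigned int size)
{
        uint32_t mask = (size == 1) ? 0xffu : 0xffffu;
        unsigned int shift = ((uintptr_t)ptr & 0x3) * 8;
        volatile uint32_t *ptr32 =
                (volatile uint32_t *)((uintptr_t)ptr & ~(uintptr_t)0x3);
        uint32_t old32 = __atomic_load_n(ptr32, __ATOMIC_RELAXED);
        uint32_t new32;

        val &= mask;
        do {
                /* Keep the neighbouring bytes, replace only our sub-word. */
                new32 = (old32 & ~(mask << shift)) | ((uint32_t)val << shift);
        } while (!__atomic_compare_exchange_n(ptr32, &old32, new32, 0,
                                              __ATOMIC_SEQ_CST, __ATOMIC_RELAXED));

        return (old32 >> shift) & mask;
}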
> diff --git a/arch/loongarch/kernel/Makefile b/arch/loongarch/kernel/Makefile
> index 940de9173542..07930921f7b5 100644
> --- a/arch/loongarch/kernel/Makefile
> +++ b/arch/loongarch/kernel/Makefile
> @@ -5,7 +5,7 @@
>
>  extra-y         := head.o vmlinux.lds
>
> -obj-y           += cpu-probe.o cacheinfo.o env.o setup.o entry.o genex.o \
> +obj-y           += cpu-probe.o cacheinfo.o cmpxchg.o env.o setup.o entry.o genex.o \
>                     traps.o irq.o idle.o process.o dma.o mem.o io.o reset.o switch.o \
>                     elf.o syscall.o signal.o time.o topology.o inst.o ptrace.o vdso.o
>
> diff --git a/arch/loongarch/kernel/cmpxchg.c b/arch/loongarch/kernel/cmpxchg.c
> new file mode 100644
> index 000000000000..4c83471c4e47
> --- /dev/null
> +++ b/arch/loongarch/kernel/cmpxchg.c
> @@ -0,0 +1,105 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Author: Huacai Chen <chenhuacai@xxxxxxxxxxx>
> + * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
> + *
> + * Derived from MIPS:
> + * Copyright (C) 2017 Imagination Technologies
> + * Author: Paul Burton <paul.burton@xxxxxxxx>
> + */
> +
> +#include <linux/bug.h>
> +#include <asm/barrier.h>
> +#include <asm/cmpxchg.h>
> +
> +unsigned long __xchg_small(volatile void *ptr, unsigned long val, unsigned int size)
> +{
> +     u32 old32, mask, temp;
> +     volatile u32 *ptr32;
> +     unsigned int shift;
> +
> +     /* Check that ptr is naturally aligned */
> +     WARN_ON((unsigned long)ptr & (size - 1));
> +
> +     /* Mask value to the correct size. */
> +     mask = GENMASK((size * BITS_PER_BYTE) - 1, 0);
> +     val &= mask;
> +
> +     /*
> +      * Calculate a shift & mask that correspond to the value we wish to
> +      * exchange within the naturally aligned 4 byte integer that includes
> +      * it.
> +      */
> +     shift = (unsigned long)ptr & 0x3;
> +     shift *= BITS_PER_BYTE;
> +     mask <<= shift;
> +
> +     /*
> +      * Calculate a pointer to the naturally aligned 4 byte integer that
> +      * includes our byte of interest, and load its value.
> +      */
> +     ptr32 = (volatile u32 *)((unsigned long)ptr & ~0x3);
> +
> +     asm volatile (
> +     "1:     ll.w    %0, %3          \n"
> +     "       andn    %1, %0, %4      \n"
> +     "       or      %1, %1, %5      \n"
> +     "       sc.w    %1, %2          \n"
> +     "       beqz    %1, 1b          \n"

The above depends on how the micro-architecture implements ll/sc with a
strong forward-progress guarantee, e.g.:

A. Just check whether there is a remote write on the snoop channel; that
   is a monitor style of cache coherency. I think this gives only a weak
   forward-progress guarantee and is not a good ll/sc implementation.

B. Lock the snoop channel and block other remote write requests until an
   sc/branch/interrupt/normal load/store happens. That is strong enough
   for qspinlock, and only an interrupt could break the ll/sc pair. (The
   ISA should write down such limitations in the spec, just like RISC-V.)

C. Fuse ll + alu + sc into one atomic bus transaction; see "Atomic
   transactions" in AMBA CHI (Arm Developer).

We are also preparing a similar patch for RISC-V, but I think your spec
should spell out some details about the ll/sc atomic forward-progress
guarantee.

For the code implementation only, I give

Reviewed-by: Guo Ren <guoren@xxxxxxxxxx>
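To make the A/B/C distinction concrete, here is the shape of loop whose
liveness is at stake, written with a compare-exchange builtin as a user-space
stand-in for the ll.w/sc.w sequence quoted above. It is only a sketch: on real
hardware the retry is driven by a failed sc.w, and whether that retry is
bounded is exactly the forward-progress question.

#include <stdint.h>

/* A fetch-and-add built from the same retry shape as "1: ll.w; ...; sc.w;
 * beqz 1b". With a weak, monitor-style reservation (option A) two CPUs can
 * keep cancelling each other's store-conditional and spin here forever;
 * options B and C are what bound the number of trips around this loop.
 */
static uint32_t fetch_add_retry(volatile uint32_t *p, uint32_t inc)
{
        uint32_t old = __atomic_load_n(p, __ATOMIC_RELAXED);

        while (!__atomic_compare_exchange_n(p, &old, old + inc, 0,
                                            __ATOMIC_RELAXED, __ATOMIC_RELAXED))
                ;       /* retry == branch back to the ll.w */

        return old;
}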
> +     : "=&r" (old32), "=&r" (temp), "=" GCC_OFF_SMALL_ASM() (*ptr32)
> +     : GCC_OFF_SMALL_ASM() (*ptr32), "Jr" (mask), "Jr" (val << shift)
> +     : "memory");
> +
> +     return (old32 & mask) >> shift;
> +}
> +
> +unsigned long __cmpxchg_small(volatile void *ptr, unsigned long old,
> +                           unsigned long new, unsigned int size)
> +{
> +     u32 old32, mask, temp;
> +     volatile u32 *ptr32;
> +     unsigned int shift;
> +
> +     /* Check that ptr is naturally aligned */
> +     WARN_ON((unsigned long)ptr & (size - 1));
> +
> +     /* Mask inputs to the correct size. */
> +     mask = GENMASK((size * BITS_PER_BYTE) - 1, 0);
> +     old &= mask;
> +     new &= mask;
> +
> +     /*
> +      * Calculate a shift & mask that correspond to the value we wish to
> +      * compare & exchange within the naturally aligned 4 byte integer
> +      * that includes it.
> +      */
> +     shift = (unsigned long)ptr & 0x3;
> +     shift *= BITS_PER_BYTE;
> +     old <<= shift;
> +     new <<= shift;
> +     mask <<= shift;
> +
> +     /*
> +      * Calculate a pointer to the naturally aligned 4 byte integer that
> +      * includes our byte of interest, and load its value.
> +      */
> +     ptr32 = (volatile u32 *)((unsigned long)ptr & ~0x3);
> +
> +     asm volatile (
> +     "1:     ll.w    %0, %3          \n"
> +     "       and     %1, %0, %4      \n"
> +     "       bne     %1, %5, 2f      \n"
> +     "       andn    %1, %0, %4      \n"
> +     "       or      %1, %1, %6      \n"
> +     "       sc.w    %1, %2          \n"
> +     "       beqz    %1, 1b          \n"
> +     "       b       3f              \n"
> +     "2:                             \n"
> +     __WEAK_LLSC_MB
> +     "3:                             \n"
> +     : "=&r" (old32), "=&r" (temp), "=" GCC_OFF_SMALL_ASM() (*ptr32)
> +     : GCC_OFF_SMALL_ASM() (*ptr32), "Jr" (mask), "Jr" (old), "Jr" (new)
> +     : "memory");
> +
> +     return (old32 & mask) >> shift;
> +}
> --
> 2.27.0
>

--
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/
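Closing illustration for the __cmpxchg_small() path quoted above: a user-space
model with a minimal check that the emulation returns the old sub-word value
and never disturbs the bytes sharing its 32-bit word. The builtins stand in
for the ll.w/sc.w loop and little-endian layout is assumed, so this is a
sketch, not the patch's code.

#include <assert.h>
#include <stdint.h>

/* Portable model of a sub-word cmpxchg: compare and exchange a byte or
 * halfword by operating on its naturally aligned containing 32-bit word.
 */
static unsigned long model_cmpxchg_small(volatile void *ptr, unsigned long old,
                                         unsigned long new, unsigned int size)
{
        uint32_t mask = (size == 1) ? 0xffu : 0xffffu;
        unsigned int shift = ((uintptr_t)ptr & 0x3) * 8;
        volatile uint32_t *ptr32 =
                (volatile uint32_t *)((uintptr_t)ptr & ~(uintptr_t)0x3);
        uint32_t old32 = __atomic_load_n(ptr32, __ATOMIC_RELAXED);
        uint32_t new32 = 0;

        do {
                if (((old32 >> shift) & mask) != (old & mask))
                        break;  /* mismatch: write nothing, report what we saw */
                new32 = (old32 & ~(mask << shift)) |
                        (((uint32_t)new & mask) << shift);
        } while (!__atomic_compare_exchange_n(ptr32, &old32, new32, 0,
                                              __ATOMIC_SEQ_CST, __ATOMIC_RELAXED));

        return (old32 >> shift) & mask;
}

int main(void)
{
        uint16_t v[2] __attribute__((aligned(4))) = { 0x1111, 0x2222 };

        /* Successful 16-bit exchange of v[1]; v[0] must survive untouched. */
        assert(model_cmpxchg_small(&v[1], 0x2222, 0xbeef, 2) == 0x2222);
        assert(v[0] == 0x1111 && v[1] == 0xbeef);

        /* Comparison failure: nothing is written, the current value comes back. */
        assert(model_cmpxchg_small(&v[0], 0xdead, 0x7777, 2) == 0x1111);
        assert(v[0] == 0x1111);
        return 0;
}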