On Thu, Jun 23, 2022 at 12:46 PM Huacai Chen <chenhuacai@xxxxxxxxxxx> wrote:
>
> LoongArch natively supports only 32-bit/64-bit xchg/cmpxchg, but percpu
> operations and qspinlock need 8-bit/16-bit xchg/cmpxchg. On NUMA systems,
> the performance of subword xchg/cmpxchg emulation is better than the
> generic implementation (especially for qspinlock; data will be shown in
> the next patch). This patch (of 2) adds subword xchg/cmpxchg emulation
> with ll/sc and enables its usage for percpu operations.
The xchg/cmpxchg operations are designed for multi-processor data-sharing
problems. The percpu data won't be shared with other harts, and disabling
irq/preemption is enough. Do you have the same issue with f97fc810798c
("arm64: percpu: Implement this_cpu operations")?

>
> Signed-off-by: Rui Wang <wangrui@xxxxxxxxxxx>
> Signed-off-by: Huacai Chen <chenhuacai@xxxxxxxxxxx>
> ---
>  arch/loongarch/include/asm/cmpxchg.h | 98 +++++++++++++++++++++++++++-
>  arch/loongarch/include/asm/percpu.h  |  8 +++
>  2 files changed, 105 insertions(+), 1 deletion(-)
>
> diff --git a/arch/loongarch/include/asm/cmpxchg.h b/arch/loongarch/include/asm/cmpxchg.h
> index 75b3a4478652..967e64bf2c37 100644
> --- a/arch/loongarch/include/asm/cmpxchg.h
> +++ b/arch/loongarch/include/asm/cmpxchg.h
> @@ -5,8 +5,9 @@
>  #ifndef __ASM_CMPXCHG_H
>  #define __ASM_CMPXCHG_H
>
> -#include <asm/barrier.h>
> +#include <linux/bits.h>
>  #include <linux/build_bug.h>
> +#include <asm/barrier.h>
>
>  #define __xchg_asm(amswap_db, m, val)           \
>  ({                                              \
> @@ -21,10 +22,53 @@
>          __ret;                                  \
>  })
>
> +static inline unsigned int __xchg_small(volatile void *ptr, unsigned int val,
> +                                        unsigned int size)
> +{
> +        unsigned int shift;
> +        u32 old32, mask, temp;
> +        volatile u32 *ptr32;
> +
> +        /* Mask value to the correct size. */
> +        mask = GENMASK((size * BITS_PER_BYTE) - 1, 0);
> +        val &= mask;
> +
> +        /*
> +         * Calculate a shift & mask that correspond to the value we wish to
> +         * exchange within the naturally aligned 4 byte integer that
> +         * includes it.
> +         */
> +        shift = (unsigned long)ptr & 0x3;
> +        shift *= BITS_PER_BYTE;
> +        mask <<= shift;
> +
> +        /*
> +         * Calculate a pointer to the naturally aligned 4 byte integer that
> +         * includes our byte of interest, and load its value.
> +         */
> +        ptr32 = (volatile u32 *)((unsigned long)ptr & ~0x3);
> +
> +        asm volatile (
> +        "1:     ll.w    %0, %3          \n"
> +        "       andn    %1, %0, %z4     \n"
> +        "       or      %1, %1, %z5     \n"
> +        "       sc.w    %1, %2          \n"
> +        "       beqz    %1, 1b          \n"
> +        : "=&r" (old32), "=&r" (temp), "=" GCC_OFF_SMALL_ASM() (*ptr32)
> +        : GCC_OFF_SMALL_ASM() (*ptr32), "Jr" (mask), "Jr" (val << shift)
> +        : "memory");
> +
> +        return (old32 & mask) >> shift;
> +}
> +
>  static inline unsigned long __xchg(volatile void *ptr, unsigned long x,
>                                     int size)
>  {
>          switch (size) {
> +        case 1:
> +        case 2:
> +                return __xchg_small(ptr, x, size);
> +
>          case 4:
>                  return __xchg_asm("amswap_db.w", (volatile u32 *)ptr, (u32)x);
>
> @@ -67,10 +111,62 @@ static inline unsigned long __xchg(volatile void *ptr, unsigned long x,
>          __ret;                                  \
>  })
>
> +static inline unsigned int __cmpxchg_small(volatile void *ptr, unsigned int old,
> +                                           unsigned int new, unsigned int size)
> +{
> +        unsigned int shift;
> +        u32 old32, mask, temp;
> +        volatile u32 *ptr32;
> +
> +        /* Mask inputs to the correct size. */
> +        mask = GENMASK((size * BITS_PER_BYTE) - 1, 0);
> +        old &= mask;
> +        new &= mask;
> +
> +        /*
> +         * Calculate a shift & mask that correspond to the value we wish to
> +         * compare & exchange within the naturally aligned 4 byte integer
> +         * that includes it.
> +         */
> +        shift = (unsigned long)ptr & 0x3;
> +        shift *= BITS_PER_BYTE;
> +        old <<= shift;
> +        new <<= shift;
> +        mask <<= shift;
> +
> +        /*
> +         * Calculate a pointer to the naturally aligned 4 byte integer that
> +         * includes our byte of interest, and load its value.
> +         */
> +        ptr32 = (volatile u32 *)((unsigned long)ptr & ~0x3);
> +
> +        asm volatile (
> +        "1:     ll.w    %0, %3          \n"
> +        "       and     %1, %0, %z4     \n"
> +        "       bne     %1, %z5, 2f     \n"
> +        "       andn    %1, %0, %z4     \n"
> +        "       or      %1, %1, %z6     \n"
> +        "       sc.w    %1, %2          \n"
> +        "       beqz    %1, 1b          \n"
> +        "       b       3f              \n"
> +        "2:                             \n"
> +        __WEAK_LLSC_MB
> +        "3:                             \n"
> +        : "=&r" (old32), "=&r" (temp), "=" GCC_OFF_SMALL_ASM() (*ptr32)
> +        : GCC_OFF_SMALL_ASM() (*ptr32), "Jr" (mask), "Jr" (old), "Jr" (new)
> +        : "memory");
> +
> +        return (old32 & mask) >> shift;
> +}
> +
>  static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
>                                        unsigned long new, unsigned int size)
>  {
>          switch (size) {
> +        case 1:
> +        case 2:
> +                return __cmpxchg_small(ptr, old, new, size);
> +
>          case 4:
>                  return __cmpxchg_asm("ll.w", "sc.w", (volatile u32 *)ptr,
>                                       (u32)old, new);
> diff --git a/arch/loongarch/include/asm/percpu.h b/arch/loongarch/include/asm/percpu.h
> index e6569f18c6dd..0bd6b0110198 100644
> --- a/arch/loongarch/include/asm/percpu.h
> +++ b/arch/loongarch/include/asm/percpu.h
> @@ -123,6 +123,10 @@ static inline unsigned long __percpu_xchg(void *ptr, unsigned long val,
>                                            int size)
>  {
>          switch (size) {
> +        case 1:
> +        case 2:
> +                return __xchg_small((volatile void *)ptr, val, size);
> +
>          case 4:
>                  return __xchg_asm("amswap.w", (volatile u32 *)ptr, (u32)val);
The percpu operations are local, and we shouldn't combine them with the
normal xchg/cmpxchg (they have different semantics: one is for local data,
the other for shared data). Please implement your own percpu ops here to
fix the irq disable/enable performance issue.

>
> @@ -204,9 +208,13 @@ do {                                            \
>  #define this_cpu_write_4(pcp, val) _percpu_write(pcp, val)
>  #define this_cpu_write_8(pcp, val) _percpu_write(pcp, val)
>
> +#define this_cpu_xchg_1(pcp, val) _percpu_xchg(pcp, val)
> +#define this_cpu_xchg_2(pcp, val) _percpu_xchg(pcp, val)
>  #define this_cpu_xchg_4(pcp, val) _percpu_xchg(pcp, val)
>  #define this_cpu_xchg_8(pcp, val) _percpu_xchg(pcp, val)
>
> +#define this_cpu_cmpxchg_1(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
> +#define this_cpu_cmpxchg_2(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
>  #define this_cpu_cmpxchg_4(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
>  #define this_cpu_cmpxchg_8(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
>
> --
> 2.27.0
>

--
Best Regards
Guo Ren

ML: https://lore.kernel.org/linux-csky/
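For reference, the irq-protected style of per-CPU op suggested in the review
can be sketched along the lines of the generic this_cpu_* fallbacks in
include/asm-generic/percpu.h. This is only a rough illustration, not part of
the patch; the macro name _percpu_cmpxchg_irq is invented here:

/*
 * Sketch only: a per-CPU cmpxchg that relies on interrupt masking instead
 * of an ll/sc loop.  Per-CPU data is never written by other CPUs, so
 * disabling interrupts is enough to make the read-modify-write atomic
 * with respect to everything that can run on the local CPU.
 */
#define _percpu_cmpxchg_irq(pcp, oval, nval)                            \
({                                                                      \
        typeof(pcp) __ret;                                              \
        typeof(pcp) *__p;                                               \
        unsigned long __flags;                                          \
                                                                        \
        local_irq_save(__flags);                                        \
        __p = raw_cpu_ptr(&(pcp));      /* this CPU's copy */           \
        __ret = *__p;                                                   \
        if (__ret == (oval))                                            \
                *__p = (nval);          /* plain local store */         \
        local_irq_restore(__flags);                                     \
        __ret;                                                          \
})

Because no other CPU ever touches the per-CPU variable, this avoids the
ll/sc retry loop and its memory-ordering cost entirely, which is the point
of keeping the percpu ops separate from the shared-memory xchg/cmpxchg in
the quoted patch.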