Current RISCV_ACQUIRE_BARRIER and RISCV_RELEASE_BARRIER are meant for
spin_lock, not for the atomic operations:

  __cmpxchg_release(ptr, old, new, size)
  ...
	__asm__ __volatile__ (
		RISCV_RELEASE_BARRIER
		"0:	lr.w %0, %2\n"

The sequence "fence rw, w" -> "lr.w" is invalid: the fence does not
order the subsequent load, so the lr.w could be reordered before the
fence. We need "fence rw, r" -> "lr.w" here. The atomic acquire case
is the same: there the barrier follows the sc.w (a write), so
"fence w, rw" is needed instead of "fence r, rw".

Fixes: 0123f4d76ca6 ("riscv/spinlock: Strengthen implementations with fences")
Signed-off-by: Guo Ren <guoren@xxxxxxxxxxxxxxxxx>
Signed-off-by: Guo Ren <guoren@xxxxxxxxxx>
Cc: Palmer Dabbelt <palmer@xxxxxxxxxxx>
Cc: Mark Rutland <mark.rutland@xxxxxxx>
Cc: Andrea Parri <parri.andrea@xxxxxxxxx>
Cc: Dan Lustig <dlustig@xxxxxxxxxx>
Cc: stable@xxxxxxxxxxxxxxx
---
 arch/riscv/include/asm/atomic.h  | 4 ++--
 arch/riscv/include/asm/cmpxchg.h | 8 ++++----
 arch/riscv/include/asm/fence.h   | 4 ++++
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h
index aef8aa9ac4f4..7cd66eba6ec3 100644
--- a/arch/riscv/include/asm/atomic.h
+++ b/arch/riscv/include/asm/atomic.h
@@ -20,10 +20,10 @@
 #include <asm/barrier.h>

 #define __atomic_acquire_fence()					\
-	__asm__ __volatile__(RISCV_ACQUIRE_BARRIER "" ::: "memory")
+	__asm__ __volatile__(RISCV_ATOMIC_ACQUIRE_BARRIER "" ::: "memory")

 #define __atomic_release_fence()					\
-	__asm__ __volatile__(RISCV_RELEASE_BARRIER "" ::: "memory");
+	__asm__ __volatile__(RISCV_ATOMIC_RELEASE_BARRIER "" ::: "memory");

 static __always_inline int arch_atomic_read(const atomic_t *v)
 {
diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
index 9269fceb86e0..605edc2fca3b 100644
--- a/arch/riscv/include/asm/cmpxchg.h
+++ b/arch/riscv/include/asm/cmpxchg.h
@@ -217,7 +217,7 @@
 			"	bne  %0, %z3, 1f\n"			\
 			"	sc.w %1, %z4, %2\n"			\
 			"	bnez %1, 0b\n"				\
-			RISCV_ACQUIRE_BARRIER				\
+			RISCV_ATOMIC_ACQUIRE_BARRIER			\
 			"1:\n"						\
 			: "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)	\
 			: "rJ" ((long)__old), "rJ" (__new)		\
@@ -229,7 +229,7 @@
 			"	bne %0, %z3, 1f\n"			\
 			"	sc.d %1, %z4, %2\n"			\
 			"	bnez %1, 0b\n"				\
-			RISCV_ACQUIRE_BARRIER				\
+			RISCV_ATOMIC_ACQUIRE_BARRIER			\
 			"1:\n"						\
 			: "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)	\
 			: "rJ" (__old), "rJ" (__new)			\
@@ -259,7 +259,7 @@
 	switch (size) {							\
 	case 4:								\
 		__asm__ __volatile__ (					\
-			RISCV_RELEASE_BARRIER				\
+			RISCV_ATOMIC_RELEASE_BARRIER			\
 			"0:	lr.w %0, %2\n"				\
 			"	bne  %0, %z3, 1f\n"			\
 			"	sc.w %1, %z4, %2\n"			\
@@ -271,7 +271,7 @@
 		break;							\
 	case 8:								\
 		__asm__ __volatile__ (					\
-			RISCV_RELEASE_BARRIER				\
+			RISCV_ATOMIC_RELEASE_BARRIER			\
 			"0:	lr.d %0, %2\n"				\
 			"	bne %0, %z3, 1f\n"			\
 			"	sc.d %1, %z4, %2\n"			\
diff --git a/arch/riscv/include/asm/fence.h b/arch/riscv/include/asm/fence.h
index 2b443a3a487f..4e446d64f04f 100644
--- a/arch/riscv/include/asm/fence.h
+++ b/arch/riscv/include/asm/fence.h
@@ -4,9 +4,13 @@
 #ifdef CONFIG_SMP
 #define RISCV_ACQUIRE_BARRIER		"\tfence r , rw\n"
 #define RISCV_RELEASE_BARRIER		"\tfence rw, w\n"
+#define RISCV_ATOMIC_ACQUIRE_BARRIER	"\tfence w , rw\n"
+#define RISCV_ATOMIC_RELEASE_BARRIER	"\tfence rw, r\n"
 #else
 #define RISCV_ACQUIRE_BARRIER
 #define RISCV_RELEASE_BARRIER
+#define RISCV_ATOMIC_ACQUIRE_BARRIER
+#define RISCV_ATOMIC_RELEASE_BARRIER
 #endif

 #endif	/* _ASM_RISCV_FENCE_H */

>
> > 2. Using .aqrl to replace the "fence rw, rw" is okay to the ISA
> > manual, right? And reducing one fence instruction gains better
> > performance:
> >	"0:	lr.w      %[p],  %[c]\n"
> >	"	sub       %[rc], %[p], %[o]\n"
> >	"	bltz      %[rc], 1f\n"
> > -	"	sc.w.rl   %[rc], %[rc], %[c]\n"
> > +	"	sc.w.aqrl %[rc], %[rc], %[c]\n"
> >	"	bnez      %[rc], 0b\n"
> > -	"	fence     rw, rw\n"
>
> Yes, using .aqrl is valid.
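For concreteness, here is a minimal standalone sketch of the
fully-ordered lr/sc loop discussed above. It is illustrative only, not
the kernel's actual __cmpxchg macro: the function name is made up, and
the plain "r" constraints stand in for the kernel's "rJ"/%z forms.

	/*
	 * Fully-ordered 32-bit cmpxchg built on an lr.w/sc.w.aqrl pair.
	 * Per the discussion above, the .aqrl annotation on the sc.w
	 * replaces the separate trailing "fence rw, rw".
	 */
	static inline int cmpxchg32_fully_ordered(int *ptr, int old, int new)
	{
		int ret, rc;

		__asm__ __volatile__ (
			"0:	lr.w      %0, %2\n"	/* load-reserved */
			"	bne       %0, %3, 1f\n"	/* bail out on mismatch */
			"	sc.w.aqrl %1, %4, %2\n"	/* store-conditional, RCsc */
			"	bnez      %1, 0b\n"	/* retry if reservation was lost */
			"1:\n"
			: "=&r" (ret), "=&r" (rc), "+A" (*ptr)
			: "r" (old), "r" (new)
			: "memory");

		return ret;
	}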
Thanks, and I think the below is also valid, right? (A standalone
sketch of these two variants appears at the end of this mail.)

-			RISCV_RELEASE_BARRIER				\
-			"	amoswap.w %0, %2, %1\n"			\
+			"	amoswap.w.rl %0, %2, %1\n"		\

-			"	amoswap.d %0, %2, %1\n"			\
-			RISCV_ACQUIRE_BARRIER				\
+			"	amoswap.d.aq %0, %2, %1\n"		\

>
> Dan
>
> >> Dan
> >>
> >>> The purpose of the whole patchset is to reduce the usage of
> >>> independent "fence rw, rw" instructions, and maximize the usage of
> >>> the .aq/.rl/.aqrl annotations of RISC-V.
> >>>
> >>>	__asm__ __volatile__ (					\
> >>>		"0:	lr.w %0, %2\n"				\
> >>>		"	bne  %0, %z3, 1f\n"			\
> >>>		"	sc.w.rl %1, %z4, %2\n"			\
> >>>		"	bnez %1, 0b\n"				\
> >>>		"	fence rw, rw\n"				\
> >>>		"1:\n"						\
> >>>
> >>>> we end up with u == 1, v == 1, r1 on P0 is 0 and r1 on P1 is 0,
> >>>> for the following litmus test?
> >>>>
> >>>> C lr-sc-aqrl-pair-vs-full-barrier
> >>>>
> >>>> {}
> >>>>
> >>>> P0(int *x, int *y, atomic_t *u)
> >>>> {
> >>>>	int r0;
> >>>>	int r1;
> >>>>
> >>>>	WRITE_ONCE(*x, 1);
> >>>>	r0 = atomic_cmpxchg(u, 0, 1);
> >>>>	r1 = READ_ONCE(*y);
> >>>> }
> >>>>
> >>>> P1(int *x, int *y, atomic_t *v)
> >>>> {
> >>>>	int r0;
> >>>>	int r1;
> >>>>
> >>>>	WRITE_ONCE(*y, 1);
> >>>>	r0 = atomic_cmpxchg(v, 0, 1);
> >>>>	r1 = READ_ONCE(*x);
> >>>> }
> >>>>
> >>>> exists (u=1 /\ v=1 /\ 0:r1=0 /\ 1:r1=0)
> >>>
> >>> I think my patchset won't affect the above sequence guarantee. The
> >>> current RISC-V implementation only gives RCsc ordering when the
> >>> compared value matches at least once (the trailing fence sits on
> >>> the success path only), so I prefer the RISC-V cmpxchg to be:
> >>>
> >>> -	"0:	lr.w %0, %2\n"				\
> >>> +	"0:	lr.w.rl %0, %2\n"			\
> >>>	"	bne  %0, %z3, 1f\n"			\
> >>>	"	sc.w.rl %1, %z4, %2\n"			\
> >>>	"	bnez %1, 0b\n"				\
> >>> -	"	fence rw, rw\n"				\
> >>>	"1:\n"						\
> >>> +	"	fence w, rw\n"				\
> >>>
> >>> to give an unconditional RCsc ordering for atomic_cmpxchg.
> >>>
> >>>> Regards,
> >>>> Boqun

--
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/
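Referring back to the amoswap proposal above, here is a minimal
standalone sketch of the two variants. Again this is illustrative only,
not the kernel's __xchg macros: the function names are made up, and the
plain "r" constraints stand in for the kernel's "rJ"/%z forms.

/*
 * xchg-release: the .rl annotation on the amoswap replaces the
 * separate leading RISCV_RELEASE_BARRIER fence.
 */
static inline int xchg32_release(int *ptr, int new)
{
	int ret;

	__asm__ __volatile__ (
		"	amoswap.w.rl %0, %2, %1\n"
		: "=r" (ret), "+A" (*ptr)
		: "r" (new)
		: "memory");

	return ret;
}

/*
 * xchg-acquire (RV64): the .aq annotation on the amoswap replaces the
 * separate trailing RISCV_ACQUIRE_BARRIER fence.
 */
static inline long xchg64_acquire(long *ptr, long new)
{
	long ret;

	__asm__ __volatile__ (
		"	amoswap.d.aq %0, %2, %1\n"
		: "=r" (ret), "+A" (*ptr)
		: "r" (new)
		: "memory");

	return ret;
}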