From: Guo Ren <guoren@xxxxxxxxxxxxxxxxx>

The current implementation follows 8e86f0b409a4 ("arm64: atomics: fix
use of acquire + release for full barrier semantics"). On RISC-V, the
acquire and release semantics can instead be folded into the AMO and
LR/SC instructions themselves, which lowers the instruction cost.

Reasons for the optimization:
 - One extra fence instruction is removed.
 - An LR/SC instruction carrying the acquire/release annotation is
   cheaper than ACQUIRE_BARRIER/RELEASE_BARRIER, which order all
   preceding loads/subsequent stores when only the LR/SC instruction
   itself needs to be protected (see the comparison sketch after the
   diff).
 - Moving the acquire/release annotation into the LR/SC loop should not
   cause extra performance problems from the micro-architecture design
   point of view, because LR and SC are already ordered sequentially
   within the loop by the RVWMO rules.

Signed-off-by: Guo Ren <guoren@xxxxxxxxxxxxxxxxx>
Signed-off-by: Guo Ren <guoren@xxxxxxxxxx>
Cc: Palmer Dabbelt <palmer@xxxxxxxxxxx>
Cc: Mark Rutland <mark.rutland@xxxxxxx>
---
 arch/riscv/include/asm/atomic.h  |  6 ++----
 arch/riscv/include/asm/cmpxchg.h | 18 ++++++------------
 2 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h
index 20ce8b83bc18..4aaf5b01e7c6 100644
--- a/arch/riscv/include/asm/atomic.h
+++ b/arch/riscv/include/asm/atomic.h
@@ -382,9 +382,8 @@ static __always_inline int arch_atomic_sub_if_positive(atomic_t *v, int offset)
 		"0:	lr.w     %[p],  %[c]\n"
 		"	sub      %[rc], %[p], %[o]\n"
 		"	bltz     %[rc], 1f\n"
-		"	sc.w.rl  %[rc], %[rc], %[c]\n"
+		"	sc.w.aqrl %[rc], %[rc], %[c]\n"
 		"	bnez     %[rc], 0b\n"
-		"	fence    rw, rw\n"
 		"1:\n"
 		: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
 		: [o]"r" (offset)
@@ -404,9 +403,8 @@ static __always_inline s64 arch_atomic64_sub_if_positive(atomic64_t *v, s64 offs
 		"0:	lr.d     %[p],  %[c]\n"
 		"	sub      %[rc], %[p], %[o]\n"
 		"	bltz     %[rc], 1f\n"
-		"	sc.d.rl  %[rc], %[rc], %[c]\n"
+		"	sc.d.aqrl %[rc], %[rc], %[c]\n"
 		"	bnez     %[rc], 0b\n"
-		"	fence    rw, rw\n"
 		"1:\n"
 		: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
 		: [o]"r" (offset)
diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
index 1af8db92250b..dfb51c98324d 100644
--- a/arch/riscv/include/asm/cmpxchg.h
+++ b/arch/riscv/include/asm/cmpxchg.h
@@ -215,9 +215,8 @@
 		__asm__ __volatile__ (				\
 			"0:	lr.w %0, %2\n"			\
 			"	bne  %0, %z3, 1f\n"		\
-			"	sc.w %1, %z4, %2\n"		\
+			"	sc.w.aq %1, %z4, %2\n"		\
 			"	bnez %1, 0b\n"			\
-			RISCV_ACQUIRE_BARRIER			\
 			"1:\n"					\
 			: "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)	\
 			: "rJ" ((long)__old), "rJ" (__new)	\
@@ -227,9 +226,8 @@
 		__asm__ __volatile__ (				\
 			"0:	lr.d %0, %2\n"			\
 			"	bne %0, %z3, 1f\n"		\
-			"	sc.d %1, %z4, %2\n"		\
+			"	sc.d.aq %1, %z4, %2\n"		\
 			"	bnez %1, 0b\n"			\
-			RISCV_ACQUIRE_BARRIER			\
 			"1:\n"					\
 			: "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)	\
 			: "rJ" (__old), "rJ" (__new)		\
@@ -259,8 +257,7 @@
 	switch (size) {						\
 	case 4:							\
 		__asm__ __volatile__ (				\
-			RISCV_RELEASE_BARRIER			\
-			"0:	lr.w %0, %2\n"			\
+			"0:	lr.w.rl %0, %2\n"		\
 			"	bne  %0, %z3, 1f\n"		\
 			"	sc.w %1, %z4, %2\n"		\
 			"	bnez %1, 0b\n"			\
@@ -271,8 +268,7 @@
 		break;						\
 	case 8:							\
 		__asm__ __volatile__ (				\
-			RISCV_RELEASE_BARRIER			\
-			"0:	lr.d %0, %2\n"			\
+			"0:	lr.d.rl %0, %2\n"		\
 			"	bne %0, %z3, 1f\n"		\
 			"	sc.d %1, %z4, %2\n"		\
 			"	bnez %1, 0b\n"			\
@@ -307,9 +303,8 @@
 		__asm__ __volatile__ (				\
 			"0:	lr.w %0, %2\n"			\
 			"	bne  %0, %z3, 1f\n"		\
-			"	sc.w.rl %1, %z4, %2\n"		\
+			"	sc.w.aqrl %1, %z4, %2\n"	\
 			"	bnez %1, 0b\n"			\
-			"	fence rw, rw\n"			\
 			"1:\n"					\
 			: "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)	\
 			: "rJ" ((long)__old), "rJ" (__new)	\
@@ -319,9 +314,8 @@
 		__asm__ __volatile__ (				\
 			"0:	lr.d %0, %2\n"			\
1f\n" \ - " sc.d.rl %1, %z4, %2\n" \ + " sc.d.aqrl %1, %z4, %2\n" \ " bnez %1, 0b\n" \ - " fence rw, rw\n" \ "1:\n" \ : "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr) \ : "rJ" (__old), "rJ" (__new) \ -- 2.25.1