Re: [PATCH v3 03/11] riscv: Implement cmpxchg8/16() using Zabha

Alexandre Ghiti <alex@xxxxxxxx> · Wed, 17 Jul 2024 17:34:19 +0200




On 17/07/2024 17:29, Conor Dooley wrote:
On Wed, Jul 17, 2024 at 10:26:34AM -0500, Andrew Jones wrote:
On Wed, Jul 17, 2024 at 08:19:49AM GMT, Alexandre Ghiti wrote:
-#define __arch_cmpxchg_masked(sc_sfx, prepend, append, r, p, o, n)	\
+#define __arch_cmpxchg_masked(sc_sfx, cas_sfx, prepend, append, r, p, o, n)	\
  ({									\
+	__label__ no_zabha_zacas, end;					\
+									\
+	if (IS_ENABLED(CONFIG_RISCV_ISA_ZABHA) &&			\
+	    IS_ENABLED(CONFIG_RISCV_ISA_ZACAS)) {			\
+		asm goto(ALTERNATIVE("j %[no_zabha_zacas]", "nop", 0,	\
+				     RISCV_ISA_EXT_ZABHA, 1)		\
+			 : : : : no_zabha_zacas);			\
+		asm goto(ALTERNATIVE("j %[no_zabha_zacas]", "nop", 0,	\
+				     RISCV_ISA_EXT_ZACAS, 1)		\
+			 : : : : no_zabha_zacas);			\
I came late to the call, but I guess trying to get rid of these asm gotos
was the topic of the discussion. The proposal was to try to use static
branches, but keep in mind that we've had trouble with static branches
inside macros in the past when those macros are used in many places[1].

[1] commit 0b1d60d6dd9e ("riscv: Fix build with CONFIG_CC_OPTIMIZE_FOR_SIZE=y")
The other half of the suggestion was not using an asm goto, but instead
trying to patch the whole thing in the alternative, for the problematic
section with llvm < 17.


And I'm not a big fan of this solution, since it would imply patching the
5-7 instructions for LR/SC into nops, which would probably slow down (a bit)
the amocas/amoswap sequence. I agree the slowdown should not be that big, but
it is just to fix an llvm issue, so it is not worth it to me!


+									\
+		__asm__ __volatile__ (					\
+			prepend						\
+			"	amocas" cas_sfx " %0, %z2, %1\n"	\
+			append						\
+			: "+&r" (r), "+A" (*(p))			\
+			: "rJ" (n)					\
+			: "memory");					\
+		goto end;						\
+	}								\
+									\
+no_zabha_zacas:;							\
unnecessary ;

  	u32 *__ptr32b = (u32 *)((ulong)(p) & ~0x3);			\
  	ulong __s = ((ulong)(p) & (0x4 - sizeof(*p))) * BITS_PER_BYTE;	\
  	ulong __mask = GENMASK(((sizeof(*p)) * BITS_PER_BYTE) - 1, 0)	\
@@ -133,6 +155,8 @@
  		: "memory");						\
  									\
  	r = (__typeof__(*(p)))((__retx & __mask) >> __s);		\
+									\
+end:;									\
  })
  
  #define __arch_cmpxchg(lr_sfx, sc_cas_sfx, prepend, append, r, p, co, o, n)	\