[PATCH] LoongArch: Support dbar with different hints

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Traditionally, LoongArch uses "dbar 0" (full completion barrier) for
everything. But the full completion barrier is a performance killer, so
Loongson-3A6000 and newer processors introduce different hints:

Bit4: ordering or completion (0: completion, 1: ordering)
Bit3: barrier for previous read (0: true, 1: false)
Bit2: barrier for previous write (0: true, 1: false)
Bit1: barrier for succeeding read (0: true, 1: false)
Bit0: barrier for succedding write (0: true, 1: false)

Hint 0x700: barrier for "read after read" from the same address, which
is needed by LL-SC loops.

This patch enable various hints for different memory barries, it brings
performance improvements for Loongson-3A6000 series, and doesn't impact
Loongson-3A5000 series because they treat all hints as "dbar 0".

Signed-off-by: Jun Yi <yijun@xxxxxxxxxxx>
Signed-off-by: Huacai Chen <chenhuacai@xxxxxxxxxxx>
---
 arch/loongarch/include/asm/barrier.h | 130 ++++++++++++---------------
 arch/loongarch/include/asm/io.h      |   2 +-
 arch/loongarch/kernel/smp.c          |   2 +-
 arch/loongarch/mm/tlbex.S            |   6 +-
 4 files changed, 60 insertions(+), 80 deletions(-)

diff --git a/arch/loongarch/include/asm/barrier.h b/arch/loongarch/include/asm/barrier.h
index cda977675854..0286ae7e3636 100644
--- a/arch/loongarch/include/asm/barrier.h
+++ b/arch/loongarch/include/asm/barrier.h
@@ -5,27 +5,56 @@
 #ifndef __ASM_BARRIER_H
 #define __ASM_BARRIER_H
 
-#define __sync()	__asm__ __volatile__("dbar 0" : : : "memory")
+/*
+ * Hint types:
+ *
+ * Bit4: ordering or completion (0: completion, 1: ordering)
+ * Bit3: barrier for previous read (0: true, 1: false)
+ * Bit2: barrier for previous write (0: true, 1: false)
+ * Bit1: barrier for succeeding read (0: true, 1: false)
+ * Bit0: barrier for succedding write (0: true, 1: false)
+ *
+ * Hint 0x700: barrier for "read after read" from the same address
+ */
+
+#define DBAR(hint) __asm__ __volatile__("dbar %0 " : : "I"(hint) : "memory")
+
+#define crwrw		0b00000
+#define cr_r_		0b00101
+#define c_w_w		0b01010
 
-#define fast_wmb()	__sync()
-#define fast_rmb()	__sync()
-#define fast_mb()	__sync()
-#define fast_iob()	__sync()
-#define wbflush()	__sync()
+#define orwrw		0b10000
+#define or_r_		0b10101
+#define o_w_w		0b11010
 
-#define wmb()		fast_wmb()
-#define rmb()		fast_rmb()
-#define mb()		fast_mb()
-#define iob()		fast_iob()
+#define orw_w		0b10010
+#define or_rw		0b10100
 
-#define __smp_mb()	__asm__ __volatile__("dbar 0" : : : "memory")
-#define __smp_rmb()	__asm__ __volatile__("dbar 0" : : : "memory")
-#define __smp_wmb()	__asm__ __volatile__("dbar 0" : : : "memory")
+#define c_sync()	DBAR(crwrw)
+#define c_rsync()	DBAR(cr_r_)
+#define c_wsync()	DBAR(c_w_w)
+
+#define o_sync()	DBAR(orwrw)
+#define o_rsync()	DBAR(or_r_)
+#define o_wsync()	DBAR(o_w_w)
+
+#define ldacq_mb()	DBAR(or_rw)
+#define strel_mb()	DBAR(orw_w)
+
+#define mb()		c_sync()
+#define rmb()		c_rsync()
+#define wmb()		c_wsync()
+#define iob()		c_sync()
+#define wbflush()	c_sync()
+
+#define __smp_mb()	o_sync()
+#define __smp_rmb()	o_rsync()
+#define __smp_wmb()	o_wsync()
 
 #ifdef CONFIG_SMP
-#define __WEAK_LLSC_MB		"	dbar 0  \n"
+#define __WEAK_LLSC_MB		"	dbar 0x700	\n"
 #else
-#define __WEAK_LLSC_MB		"		\n"
+#define __WEAK_LLSC_MB		"			\n"
 #endif
 
 #define __smp_mb__before_atomic()	barrier()
@@ -59,68 +88,19 @@ static inline unsigned long array_index_mask_nospec(unsigned long index,
 	return mask;
 }
 
-#define __smp_load_acquire(p)							\
-({										\
-	union { typeof(*p) __val; char __c[1]; } __u;				\
-	unsigned long __tmp = 0;							\
-	compiletime_assert_atomic_type(*p);					\
-	switch (sizeof(*p)) {							\
-	case 1:									\
-		*(__u8 *)__u.__c = *(volatile __u8 *)p;				\
-		__smp_mb();							\
-		break;								\
-	case 2:									\
-		*(__u16 *)__u.__c = *(volatile __u16 *)p;			\
-		__smp_mb();							\
-		break;								\
-	case 4:									\
-		__asm__ __volatile__(						\
-		"amor_db.w %[val], %[tmp], %[mem]	\n"				\
-		: [val] "=&r" (*(__u32 *)__u.__c)				\
-		: [mem] "ZB" (*(u32 *) p), [tmp] "r" (__tmp)			\
-		: "memory");							\
-		break;								\
-	case 8:									\
-		__asm__ __volatile__(						\
-		"amor_db.d %[val], %[tmp], %[mem]	\n"				\
-		: [val] "=&r" (*(__u64 *)__u.__c)				\
-		: [mem] "ZB" (*(u64 *) p), [tmp] "r" (__tmp)			\
-		: "memory");							\
-		break;								\
-	}									\
-	(typeof(*p))__u.__val;								\
+#define __smp_load_acquire(p)				\
+({							\
+	typeof(*p) ___p1 = READ_ONCE(*p);		\
+	compiletime_assert_atomic_type(*p);		\
+	ldacq_mb();					\
+	___p1;						\
 })
 
-#define __smp_store_release(p, v)						\
-do {										\
-	union { typeof(*p) __val; char __c[1]; } __u =				\
-		{ .__val = (__force typeof(*p)) (v) };				\
-	unsigned long __tmp;							\
-	compiletime_assert_atomic_type(*p);					\
-	switch (sizeof(*p)) {							\
-	case 1:									\
-		__smp_mb();							\
-		*(volatile __u8 *)p = *(__u8 *)__u.__c;				\
-		break;								\
-	case 2:									\
-		__smp_mb();							\
-		*(volatile __u16 *)p = *(__u16 *)__u.__c;			\
-		break;								\
-	case 4:									\
-		__asm__ __volatile__(						\
-		"amswap_db.w %[tmp], %[val], %[mem]	\n"			\
-		: [mem] "+ZB" (*(u32 *)p), [tmp] "=&r" (__tmp)			\
-		: [val] "r" (*(__u32 *)__u.__c)					\
-		: );								\
-		break;								\
-	case 8:									\
-		__asm__ __volatile__(						\
-		"amswap_db.d %[tmp], %[val], %[mem]	\n"			\
-		: [mem] "+ZB" (*(u64 *)p), [tmp] "=&r" (__tmp)			\
-		: [val] "r" (*(__u64 *)__u.__c)					\
-		: );								\
-		break;								\
-	}									\
+#define __smp_store_release(p, v)			\
+do {							\
+	compiletime_assert_atomic_type(*p);		\
+	strel_mb();					\
+	WRITE_ONCE(*p, v);				\
 } while (0)
 
 #define __smp_store_mb(p, v)							\
diff --git a/arch/loongarch/include/asm/io.h b/arch/loongarch/include/asm/io.h
index 545e2708fbf7..1c9410220040 100644
--- a/arch/loongarch/include/asm/io.h
+++ b/arch/loongarch/include/asm/io.h
@@ -62,7 +62,7 @@ extern pgprot_t pgprot_wc;
 #define ioremap_cache(offset, size)	\
 	ioremap_prot((offset), (size), pgprot_val(PAGE_KERNEL))
 
-#define mmiowb() asm volatile ("dbar 0" ::: "memory")
+#define mmiowb() wmb()
 
 /*
  * String version of I/O memory access operations.
diff --git a/arch/loongarch/kernel/smp.c b/arch/loongarch/kernel/smp.c
index ed167e244cda..8daa97148c8e 100644
--- a/arch/loongarch/kernel/smp.c
+++ b/arch/loongarch/kernel/smp.c
@@ -118,7 +118,7 @@ static u32 ipi_read_clear(int cpu)
 	action = iocsr_read32(LOONGARCH_IOCSR_IPI_STATUS);
 	/* Clear the ipi register to clear the interrupt */
 	iocsr_write32(action, LOONGARCH_IOCSR_IPI_CLEAR);
-	smp_mb();
+	wbflush();
 
 	return action;
 }
diff --git a/arch/loongarch/mm/tlbex.S b/arch/loongarch/mm/tlbex.S
index 244e2f5aeee5..240ced55586e 100644
--- a/arch/loongarch/mm/tlbex.S
+++ b/arch/loongarch/mm/tlbex.S
@@ -184,7 +184,7 @@ tlb_huge_update_load:
 	ertn
 
 nopage_tlb_load:
-	dbar		0
+	dbar		0x700
 	csrrd		ra, EXCEPTION_KS2
 	la_abs		t0, tlb_do_page_fault_0
 	jr		t0
@@ -333,7 +333,7 @@ tlb_huge_update_store:
 	ertn
 
 nopage_tlb_store:
-	dbar		0
+	dbar		0x700
 	csrrd		ra, EXCEPTION_KS2
 	la_abs		t0, tlb_do_page_fault_1
 	jr		t0
@@ -480,7 +480,7 @@ tlb_huge_update_modify:
 	ertn
 
 nopage_tlb_modify:
-	dbar		0
+	dbar		0x700
 	csrrd		ra, EXCEPTION_KS2
 	la_abs		t0, tlb_do_page_fault_1
 	jr		t0
-- 
2.39.1




[Index of Archives]     [Linux Kernel]     [Kernel Newbies]     [x86 Platform Driver]     [Netdev]     [Linux Wireless]     [Netfilter]     [Bugtraq]     [Linux Filesystems]     [Yosemite Discussion]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Samba]     [Device Mapper]

  Powered by Linux