Re: [PATCH 11/11] MIPS: Use queued spinlocks (qspinlock)

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hello Paul and all,

On 10/06/2017 02:26, Paul Burton wrote:
> This patch switches MIPS to make use of generically implemented queued
> spinlocks, rather than the ticket spinlocks used previously. This allows
> us to drop a whole load of inline assembly, share more generic code, and
> is also a performance win.

unfortunately this patch made spin_locks on Octeon two times slower.
Further patches didn't improve the situation and my measurement show that
currently in 5.9 the situation is even worse (which I still have to analyze).

I've performed two kinds of test for this particular patch, based on upstream
lock_torture test and using my own kernel module with a more tight loop
around spin_lock (which I could share on request).

The CONFIG_LOCK_TORTURE_TEST results were obtained with the following parameters:
modprobe locktorture nwriters_stress=6; sleep 62; rmmod locktorture;

Before the patch:
Writes:  Total: 40973197  Max/Min: 0/0   Fail: 0 

After the patch:
Writes:  Total: 34317903  Max/Min: 0/0   Fail: 0

This shows 17% performance loss.

However if I test a tighter loop:
        for (i = 0; i < NUM_ITERATIONS; i++) {                                                                                                                                  
                spin_lock(&locktest_spinlock);                                                                                                                                  
                locktest_counter++;                                                                                                                                             
                spin_unlock(&locktest_spinlock);                                                                                                                                
        }                                                                                                                                                                       

again with 6 threads, I get 1.499s (for 1M iterations) instead of 0.731s.
This is 105% overhead per lock-unlock combination in comparison to MIPS-specific
implementation!

What do you think, should I prepare a revert of this conversion to queued locks
or do you have ideas I might try on Octeon?
I suspect locks other than spin_lock might be affected as well.

> Results from running the AIM7 short workload on a MIPS Creator Ci40 (ie.
> 2 core 2 thread interAptiv CPU clocked at 546MHz) with v4.12-rc4
> pistachio_defconfig, with ftrace disabled due to a current bug, and both
> with & without use of queued rwlocks & spinlocks:
> 
>   Forks | v4.12-rc4 | +qlocks  | Change
>  -------|-----------|----------|--------
>      10 | 52630.32  | 53316.31 | +1.01%
>      20 | 51777.80  | 52623.15 | +1.02%
>      30 | 51645.92  | 52517.26 | +1.02%
>      40 | 51634.88  | 52419.89 | +1.02%
>      50 | 51506.75  | 52307.81 | +1.02%
>      60 | 51500.74  | 52322.72 | +1.02%
>      70 | 51434.81  | 52288.60 | +1.02%
>      80 | 51423.22  | 52434.85 | +1.02%
>      90 | 51428.65  | 52410.10 | +1.02%
> 
> The kernels used for these tests also had my "MIPS: Hardcode cpu_has_*
> where known at compile time due to ISA" patch applied, which allows the
> kernel_uses_llsc checks in cmpxchg() & xchg() to be optimised away at
> compile time.
> 
> Signed-off-by: Paul Burton <paul.burton@xxxxxxxxxx>
> Cc: Ralf Baechle <ralf@xxxxxxxxxxxxxx>
> Cc: linux-mips@xxxxxxxxxxxxxx
> ---
> 
>  arch/mips/Kconfig                      |   1 +
>  arch/mips/include/asm/Kbuild           |   1 +
>  arch/mips/include/asm/spinlock.h       | 210 +--------------------------------
>  arch/mips/include/asm/spinlock_types.h |  24 +---
>  4 files changed, 4 insertions(+), 232 deletions(-)
> 
> diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
> index 50c71273f569..398f0a55d4fa 100644
> --- a/arch/mips/Kconfig
> +++ b/arch/mips/Kconfig
> @@ -71,6 +71,7 @@ config MIPS
>  	select HAVE_REGS_AND_STACK_ACCESS_API
>  	select HAVE_COPY_THREAD_TLS
>  	select ARCH_USE_QUEUED_RWLOCKS
> +	select ARCH_USE_QUEUED_SPINLOCKS
>  
>  menu "Machine selection"
>  
> diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild
> index ae6cb47e9d22..7c8aab23bce8 100644
> --- a/arch/mips/include/asm/Kbuild
> +++ b/arch/mips/include/asm/Kbuild
> @@ -13,6 +13,7 @@ generic-y += parport.h
>  generic-y += percpu.h
>  generic-y += preempt.h
>  generic-y += qrwlock.h
> +generic-y += qspinlock.h
>  generic-y += sections.h
>  generic-y += segment.h
>  generic-y += serial.h
> diff --git a/arch/mips/include/asm/spinlock.h b/arch/mips/include/asm/spinlock.h
> index 3e7afff196cd..a7d21da16b6a 100644
> --- a/arch/mips/include/asm/spinlock.h
> +++ b/arch/mips/include/asm/spinlock.h
> @@ -9,217 +9,9 @@
>  #ifndef _ASM_SPINLOCK_H
>  #define _ASM_SPINLOCK_H
>  
> -#include <linux/compiler.h>
> -
> -#include <asm/barrier.h>
>  #include <asm/processor.h>
>  #include <asm/qrwlock.h>
> -#include <asm/compiler.h>
> -#include <asm/war.h>
> -
> -/*
> - * Your basic SMP spinlocks, allowing only a single CPU anywhere
> - *
> - * Simple spin lock operations.	 There are two variants, one clears IRQ's
> - * on the local processor, one does not.
> - *
> - * These are fair FIFO ticket locks
> - *
> - * (the type definitions are in asm/spinlock_types.h)
> - */
> -
> -
> -/*
> - * Ticket locks are conceptually two parts, one indicating the current head of
> - * the queue, and the other indicating the current tail. The lock is acquired
> - * by atomically noting the tail and incrementing it by one (thus adding
> - * ourself to the queue and noting our position), then waiting until the head
> - * becomes equal to the the initial value of the tail.
> - */
> -
> -static inline int arch_spin_is_locked(arch_spinlock_t *lock)
> -{
> -	u32 counters = ACCESS_ONCE(lock->lock);
> -
> -	return ((counters >> 16) ^ counters) & 0xffff;
> -}
> -
> -static inline int arch_spin_value_unlocked(arch_spinlock_t lock)
> -{
> -	return lock.h.serving_now == lock.h.ticket;
> -}
> -
> -#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
> -
> -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
> -{
> -	u16 owner = READ_ONCE(lock->h.serving_now);
> -	smp_rmb();
> -	for (;;) {
> -		arch_spinlock_t tmp = READ_ONCE(*lock);
> -
> -		if (tmp.h.serving_now == tmp.h.ticket ||
> -		    tmp.h.serving_now != owner)
> -			break;
> -
> -		cpu_relax();
> -	}
> -	smp_acquire__after_ctrl_dep();
> -}
> -
> -static inline int arch_spin_is_contended(arch_spinlock_t *lock)
> -{
> -	u32 counters = ACCESS_ONCE(lock->lock);
> -
> -	return (((counters >> 16) - counters) & 0xffff) > 1;
> -}
> -#define arch_spin_is_contended	arch_spin_is_contended
> -
> -static inline void arch_spin_lock(arch_spinlock_t *lock)
> -{
> -	int my_ticket;
> -	int tmp;
> -	int inc = 0x10000;
> -
> -	if (R10000_LLSC_WAR) {
> -		__asm__ __volatile__ (
> -		"	.set push		# arch_spin_lock	\n"
> -		"	.set noreorder					\n"
> -		"							\n"
> -		"1:	ll	%[ticket], %[ticket_ptr]		\n"
> -		"	addu	%[my_ticket], %[ticket], %[inc]		\n"
> -		"	sc	%[my_ticket], %[ticket_ptr]		\n"
> -		"	beqzl	%[my_ticket], 1b			\n"
> -		"	 nop						\n"
> -		"	srl	%[my_ticket], %[ticket], 16		\n"
> -		"	andi	%[ticket], %[ticket], 0xffff		\n"
> -		"	bne	%[ticket], %[my_ticket], 4f		\n"
> -		"	 subu	%[ticket], %[my_ticket], %[ticket]	\n"
> -		"2:							\n"
> -		"	.subsection 2					\n"
> -		"4:	andi	%[ticket], %[ticket], 0xffff		\n"
> -		"	sll	%[ticket], 5				\n"
> -		"							\n"
> -		"6:	bnez	%[ticket], 6b				\n"
> -		"	 subu	%[ticket], 1				\n"
> -		"							\n"
> -		"	lhu	%[ticket], %[serving_now_ptr]		\n"
> -		"	beq	%[ticket], %[my_ticket], 2b		\n"
> -		"	 subu	%[ticket], %[my_ticket], %[ticket]	\n"
> -		"	b	4b					\n"
> -		"	 subu	%[ticket], %[ticket], 1			\n"
> -		"	.previous					\n"
> -		"	.set pop					\n"
> -		: [ticket_ptr] "+" GCC_OFF_SMALL_ASM() (lock->lock),
> -		  [serving_now_ptr] "+m" (lock->h.serving_now),
> -		  [ticket] "=&r" (tmp),
> -		  [my_ticket] "=&r" (my_ticket)
> -		: [inc] "r" (inc));
> -	} else {
> -		__asm__ __volatile__ (
> -		"	.set push		# arch_spin_lock	\n"
> -		"	.set noreorder					\n"
> -		"							\n"
> -		"1:	ll	%[ticket], %[ticket_ptr]		\n"
> -		"	addu	%[my_ticket], %[ticket], %[inc]		\n"
> -		"	sc	%[my_ticket], %[ticket_ptr]		\n"
> -		"	beqz	%[my_ticket], 1b			\n"
> -		"	 srl	%[my_ticket], %[ticket], 16		\n"
> -		"	andi	%[ticket], %[ticket], 0xffff		\n"
> -		"	bne	%[ticket], %[my_ticket], 4f		\n"
> -		"	 subu	%[ticket], %[my_ticket], %[ticket]	\n"
> -		"2:	.insn						\n"
> -		"	.subsection 2					\n"
> -		"4:	andi	%[ticket], %[ticket], 0xffff		\n"
> -		"	sll	%[ticket], 5				\n"
> -		"							\n"
> -		"6:	bnez	%[ticket], 6b				\n"
> -		"	 subu	%[ticket], 1				\n"
> -		"							\n"
> -		"	lhu	%[ticket], %[serving_now_ptr]		\n"
> -		"	beq	%[ticket], %[my_ticket], 2b		\n"
> -		"	 subu	%[ticket], %[my_ticket], %[ticket]	\n"
> -		"	b	4b					\n"
> -		"	 subu	%[ticket], %[ticket], 1			\n"
> -		"	.previous					\n"
> -		"	.set pop					\n"
> -		: [ticket_ptr] "+" GCC_OFF_SMALL_ASM() (lock->lock),
> -		  [serving_now_ptr] "+m" (lock->h.serving_now),
> -		  [ticket] "=&r" (tmp),
> -		  [my_ticket] "=&r" (my_ticket)
> -		: [inc] "r" (inc));
> -	}
> -
> -	smp_llsc_mb();
> -}
> -
> -static inline void arch_spin_unlock(arch_spinlock_t *lock)
> -{
> -	unsigned int serving_now = lock->h.serving_now + 1;
> -	wmb();
> -	lock->h.serving_now = (u16)serving_now;
> -	nudge_writes();
> -}
> -
> -static inline unsigned int arch_spin_trylock(arch_spinlock_t *lock)
> -{
> -	int tmp, tmp2, tmp3;
> -	int inc = 0x10000;
> -
> -	if (R10000_LLSC_WAR) {
> -		__asm__ __volatile__ (
> -		"	.set push		# arch_spin_trylock	\n"
> -		"	.set noreorder					\n"
> -		"							\n"
> -		"1:	ll	%[ticket], %[ticket_ptr]		\n"
> -		"	srl	%[my_ticket], %[ticket], 16		\n"
> -		"	andi	%[now_serving], %[ticket], 0xffff	\n"
> -		"	bne	%[my_ticket], %[now_serving], 3f	\n"
> -		"	 addu	%[ticket], %[ticket], %[inc]		\n"
> -		"	sc	%[ticket], %[ticket_ptr]		\n"
> -		"	beqzl	%[ticket], 1b				\n"
> -		"	 li	%[ticket], 1				\n"
> -		"2:							\n"
> -		"	.subsection 2					\n"
> -		"3:	b	2b					\n"
> -		"	 li	%[ticket], 0				\n"
> -		"	.previous					\n"
> -		"	.set pop					\n"
> -		: [ticket_ptr] "+" GCC_OFF_SMALL_ASM() (lock->lock),
> -		  [ticket] "=&r" (tmp),
> -		  [my_ticket] "=&r" (tmp2),
> -		  [now_serving] "=&r" (tmp3)
> -		: [inc] "r" (inc));
> -	} else {
> -		__asm__ __volatile__ (
> -		"	.set push		# arch_spin_trylock	\n"
> -		"	.set noreorder					\n"
> -		"							\n"
> -		"1:	ll	%[ticket], %[ticket_ptr]		\n"
> -		"	srl	%[my_ticket], %[ticket], 16		\n"
> -		"	andi	%[now_serving], %[ticket], 0xffff	\n"
> -		"	bne	%[my_ticket], %[now_serving], 3f	\n"
> -		"	 addu	%[ticket], %[ticket], %[inc]		\n"
> -		"	sc	%[ticket], %[ticket_ptr]		\n"
> -		"	beqz	%[ticket], 1b				\n"
> -		"	 li	%[ticket], 1				\n"
> -		"2:	.insn						\n"
> -		"	.subsection 2					\n"
> -		"3:	b	2b					\n"
> -		"	 li	%[ticket], 0				\n"
> -		"	.previous					\n"
> -		"	.set pop					\n"
> -		: [ticket_ptr] "+" GCC_OFF_SMALL_ASM() (lock->lock),
> -		  [ticket] "=&r" (tmp),
> -		  [my_ticket] "=&r" (tmp2),
> -		  [now_serving] "=&r" (tmp3)
> -		: [inc] "r" (inc));
> -	}
> -
> -	smp_llsc_mb();
> -
> -	return tmp;
> -}
> +#include <asm/qspinlock.h>
>  
>  #define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
>  #define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
> diff --git a/arch/mips/include/asm/spinlock_types.h b/arch/mips/include/asm/spinlock_types.h
> index 3d38bfad9b49..177e722eb96c 100644
> --- a/arch/mips/include/asm/spinlock_types.h
> +++ b/arch/mips/include/asm/spinlock_types.h
> @@ -1,29 +1,7 @@
>  #ifndef _ASM_SPINLOCK_TYPES_H
>  #define _ASM_SPINLOCK_TYPES_H
>  
> -#include <linux/types.h>
> -
> -#include <asm/byteorder.h>
> -
> -typedef union {
> -	/*
> -	 * bits	 0..15 : serving_now
> -	 * bits 16..31 : ticket
> -	 */
> -	u32 lock;
> -	struct {
> -#ifdef __BIG_ENDIAN
> -		u16 ticket;
> -		u16 serving_now;
> -#else
> -		u16 serving_now;
> -		u16 ticket;
> -#endif
> -	} h;
> -} arch_spinlock_t;
> -
> -#define __ARCH_SPIN_LOCK_UNLOCKED	{ .lock = 0 }
> -
> +#include <asm-generic/qspinlock_types.h>
>  #include <asm-generic/qrwlock_types.h>
>  
>  #endif
> 

-- 
Best regards,
Alexander Sverdlin.



[Index of Archives]     [LKML Archive]     [Linux ARM Kernel]     [Linux ARM]     [Git]     [Yosemite News]     [Linux SCSI]     [Linux Hams]

  Powered by Linux