Hello, the following patch is my current approach to fixing this issue. I introduced big_cpu_relax(), which uses Will's implementation [1] on ARM64 without LSE atomics and the original cpu_relax() on any other CPU. Does anyone have a better idea of how to solve this issue properly? [1] https://lore.kernel.org/lkml/20170728092831.GA24839@xxxxxxx/ Zdenek Bouska -- Siemens, s.r.o Siemens Advanta Development diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 3918f2a67970..f3861ab9f541 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -367,6 +367,23 @@ static inline void spin_lock_prefetch(const void *ptr) "nop") : : "p" (ptr)); } +void armv8_big_cpu_relax(unsigned long pc); + +static inline void _big_cpu_relax(void) +{ + armv8_big_cpu_relax(_THIS_IP_); +} + +#define ARCH_HAS_BIG_CPU_RELAX +static inline void big_cpu_relax(void) +{ + if (system_uses_lse_atomics()) { + cpu_relax(); + } else { + _big_cpu_relax(); + } +} + extern unsigned long __ro_after_init signal_minsigstksz; /* sigframe size */ extern void __init minsigstksz_setup(void); diff --git a/arch/arm64/lib/delay.c b/arch/arm64/lib/delay.c index 5b7890139bc2..3f4fd24bd4b2 100644 --- a/arch/arm64/lib/delay.c +++ b/arch/arm64/lib/delay.c @@ -67,3 +67,29 @@ void __ndelay(unsigned long nsecs) __const_udelay(nsecs * 0x5UL); /* 2**32 / 1000000000 (rounded up) */ } EXPORT_SYMBOL(__ndelay); + +static DEFINE_PER_CPU(u64, __cpu_relax_data); + +#define CPU_RELAX_WFE_THRESHOLD 10000 +void armv8_big_cpu_relax(unsigned long pc) +{ + u64 new, old = raw_cpu_read(__cpu_relax_data); + u32 old_pc, new_pc; + bool wfe = false; + + old_pc = (u32)old; + new = new_pc = (u32)pc; + + if (old_pc == new_pc) { + if ((old >> 32) > CPU_RELAX_WFE_THRESHOLD) { + asm volatile("sevl; wfe; wfe\n" ::: "memory"); + wfe = true; + } else { + new = old + (1UL << 32); + } + } + + if (this_cpu_cmpxchg(__cpu_relax_data, old, new) == old && !wfe) + asm volatile("yield" ::: "memory"); +} 
+EXPORT_SYMBOL(armv8_big_cpu_relax); diff --git a/include/linux/processor.h b/include/linux/processor.h index dc78bdc7079a..3dc5e3fcb400 100644 --- a/include/linux/processor.h +++ b/include/linux/processor.h @@ -59,4 +59,8 @@ do { \ #endif +#ifndef ARCH_HAS_BIG_CPU_RELAX +#define big_cpu_relax() cpu_relax() +#endif + #endif /* _LINUX_PROCESSOR_H */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 8ce75495e04f..cc8445de1006 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -21,6 +21,7 @@ #include <linux/sched/isolation.h> #include <uapi/linux/sched/types.h> #include <linux/task_work.h> +#include <linux/processor.h> #include "internals.h" @@ -1101,7 +1102,7 @@ static void irq_finalize_oneshot(struct irq_desc *desc, if (unlikely(irqd_irq_inprogress(&desc->irq_data))) { raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); - cpu_relax(); + big_cpu_relax(); goto again; }