Refactor arm64's __delay() to poll for a mask/value condition (vcond) in
its wfet(), wfe(), and relaxed polling loops. The new __delay_until_ul()
helper returns early once (*addr & mask) == val, or after the requested
number of cycles has elapsed, whichever comes first. __delay() retains
its old behavior by polling a dummy word against a condition that can
never match.

Signed-off-by: Haris Okanovic <harisokn@xxxxxxxxxx>
---
 arch/arm64/lib/delay.c | 70 ++++++++++++++++++++++++++++++------------
 1 file changed, 50 insertions(+), 20 deletions(-)

diff --git a/arch/arm64/lib/delay.c b/arch/arm64/lib/delay.c
index cb2062e7e234..a7c3040af316 100644
--- a/arch/arm64/lib/delay.c
+++ b/arch/arm64/lib/delay.c
@@ -14,43 +14,73 @@
 #include <linux/timex.h>
 
 #include <clocksource/arm_arch_timer.h>
+#include <asm/readex.h>
 
-#define USECS_TO_CYCLES(time_usecs)			\
-	xloops_to_cycles((time_usecs) * 0x10C7UL)
-
-static inline unsigned long xloops_to_cycles(unsigned long xloops)
+static inline u64 xloops_to_cycles(u64 xloops)
 {
 	return (xloops * loops_per_jiffy * HZ) >> 32;
 }
 
-void __delay(unsigned long cycles)
+#define USECS_TO_XLOOPS(time_usecs) \
+	((time_usecs) * 0x10C7UL)
+
+#define USECS_TO_CYCLES(time_usecs) \
+	xloops_to_cycles(USECS_TO_XLOOPS(time_usecs))
+
+#define NSECS_TO_XLOOPS(time_nsecs) \
+	((time_nsecs) * 0x5UL)
+
+#define NSECS_TO_CYCLES(time_nsecs) \
+	xloops_to_cycles(NSECS_TO_XLOOPS(time_nsecs))
+
+static unsigned long __delay_until_ul(u64 cycles, unsigned long *addr, unsigned long mask, unsigned long val)
 {
-	cycles_t start = get_cycles();
+	u64 start = get_cycles();
+	unsigned long cur;
 
 	if (alternative_has_cap_unlikely(ARM64_HAS_WFXT)) {
 		u64 end = start + cycles;
 
-		/*
-		 * Start with WFIT. If an interrupt makes us resume
-		 * early, use a WFET loop to complete the delay.
-		 */
-		wfit(end);
-		while ((get_cycles() - start) < cycles)
+		do {
+			cur = __READ_ONCE_EX(*addr);
+			if ((cur & mask) == val) {
+				break;
+			}
 			wfet(end);
-	} else if (arch_timer_evtstrm_available()) {
-		const cycles_t timer_evt_period =
+		} while ((get_cycles() - start) < cycles);
+	} else if (arch_timer_evtstrm_available()) {
+		const u64 timer_evt_period =
 			USECS_TO_CYCLES(ARCH_TIMER_EVT_STREAM_PERIOD_US);
 
-		while ((get_cycles() - start + timer_evt_period) < cycles)
+		do {
+			cur = __READ_ONCE_EX(*addr);
+			if ((cur & mask) == val) {
+				break;
+			}
 			wfe();
+		} while ((get_cycles() - start + timer_evt_period) < cycles);
+	} else {
+		do {
+			cur = __READ_ONCE_EX(*addr);
+			if ((cur & mask) == val) {
+				break;
+			}
+			cpu_relax();
+		} while ((get_cycles() - start) < cycles);
 	}
 
-	while ((get_cycles() - start) < cycles)
-		cpu_relax();
+	return cur;
+}
+
+void __delay(unsigned long cycles)
+{
+	/* constant word for wfet()/wfe() to poll */
+	unsigned long dummy ____cacheline_aligned = 0;
+	__delay_until_ul(cycles, &dummy, 0, 1);
 }
 EXPORT_SYMBOL(__delay);
 
-inline void __const_udelay(unsigned long xloops)
+void __const_udelay(unsigned long xloops)
 {
 	__delay(xloops_to_cycles(xloops));
 }
@@ -58,12 +88,12 @@ EXPORT_SYMBOL(__const_udelay);
 
 void __udelay(unsigned long usecs)
 {
-	__const_udelay(usecs * 0x10C7UL);	/* 2**32 / 1000000 (rounded up) */
+	__delay(USECS_TO_CYCLES(usecs));
}
 EXPORT_SYMBOL(__udelay);
 
 void __ndelay(unsigned long nsecs)
 {
-	__const_udelay(nsecs * 0x5UL);	/* 2**32 / 1000000000 (rounded up) */
+	__delay(NSECS_TO_CYCLES(nsecs));
 }
 EXPORT_SYMBOL(__ndelay);
-- 
2.34.1
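
For illustration only (not part of the patch): __delay_until_ul() is static
here, but if a later change exposed it, a bounded wait on a status word could
be built directly on top of it. The helper name, its signature, and
USECS_TO_CYCLES() come from the diff above; STATUS_READY and wait_for_ready()
are hypothetical names invented for this sketch.

/*
 * Hypothetical caller sketch: wait up to 10us for a "ready" bit.
 * Returns true if the bit was observed before the cycle budget
 * expired. STATUS_READY and wait_for_ready() are illustrative,
 * not existing kernel APIs.
 */
#define STATUS_READY	0x1UL

static bool wait_for_ready(unsigned long *status)
{
	unsigned long cur;

	/*
	 * Returns as soon as (*status & STATUS_READY) == STATUS_READY,
	 * or once 10us worth of cycles have elapsed, whichever is first.
	 */
	cur = __delay_until_ul(USECS_TO_CYCLES(10), status,
			       STATUS_READY, STATUS_READY);

	return (cur & STATUS_READY) == STATUS_READY;
}

Note that mask and val are both STATUS_READY, so the wait exits when the bit
is set; __delay() instead passes mask=0, val=1, a condition that can never
hold, so it always runs out the full delay.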