On Thu, Apr 06, 2017 at 08:16:19AM -0700, Linus Torvalds wrote:
> In theory x86 could use monitor/mwait for it too, in practice I think
> it tends to still be too high latency (because it was originally just
> designed for the idle loop). mwait got extended to actually be useful,
> but I'm not sure what the latency is for the modern one.

I've been meaning to test mwait-c0 for this, but never got around to it.

Something like the below, which is ugly (because I couldn't be bothered
to resolve the header recursion and thus duplicates the monitor/mwait
functions) and broken (because it hard assumes the hardware can do
monitor/mwait).

But it builds and boots, no clue if it's better or worse. Changing mwait
eax to 0 would give us C1 and might also be worth a try I suppose.

---
 arch/x86/include/asm/barrier.h | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index bfb28ca..faab9cd 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -80,6 +80,36 @@ do {									\
 #define __smp_mb__before_atomic()	barrier()
 #define __smp_mb__after_atomic()	barrier()
 
+static inline void ___monitor(const void *eax, unsigned long ecx,
+			      unsigned long edx)
+{
+	/* "monitor %eax, %ecx, %edx;" */
+	asm volatile(".byte 0x0f, 0x01, 0xc8;"
+		     :: "a" (eax), "c" (ecx), "d"(edx));
+}
+
+static inline void ___mwait(unsigned long eax, unsigned long ecx)
+{
+	/* "mwait %eax, %ecx;" */
+	asm volatile(".byte 0x0f, 0x01, 0xc9;"
+		     :: "a" (eax), "c" (ecx));
+}
+
+#define smp_cond_load_acquire(ptr, cond_expr)				\
+({									\
+	typeof(ptr) __PTR = (ptr);					\
+	typeof(*ptr) VAL;						\
+	for (;;) {							\
+		___monitor(__PTR, 0, 0);				\
+		VAL = READ_ONCE(*__PTR);				\
+		if (cond_expr)						\
+			break;						\
+		___mwait(0xf0 /* C0 */, 0x01 /* INT */);		\
+	}								\
+	smp_acquire__after_ctrl_dep();					\
+	VAL;								\
+})
+
 #include <asm-generic/barrier.h>
 
 #endif /* _ASM_X86_BARRIER_H */