Hi all, [adding kernel folk who work on asm stuff] As a heads-up, GCC 12 (not yet released) appears to erroneously optimize away calls to functions with volatile asm. Szabolcs has raised an issue on the GCC bugzilla: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105160 ... which is a P1 release blocker, and is currently being investigated. Jeremy originally reported this as an issue with {readl,writel}_relaxed(), but the underlying problem doesn't have anything to do with those specifically. I'm dumping a bunch of info here largely for posterity / archival, and to find out who (from the kernel side) is willing and able to test proposed compiler fixes, once those are available. I'm happy to do so for aarch64; Peter, I assume you'd be happy to look at the x86 side? This is a generic issue, and I wrote test cases for aarch64 and x86_64. Those are inline later in this mail, and currently you can see them on compiler explorer: aarch64: https://godbolt.org/z/vMczqjYvs x86_64: https://godbolt.org/z/cveff9hq5 My aarch64 test case is: | #define sysreg_read(regname) \ | ({ \ | unsigned long __sr_val; \ | asm volatile( \ | "mrs %0, " #regname "\n" \ | : "=r" (__sr_val)); \ | \ | __sr_val; \ | }) | | #define sysreg_write(regname, __sw_val) \ | do { \ | asm volatile( \ | "msr " #regname ", %0\n" \ | : \ | : "r" (__sw_val)); \ | } while (0) | | #define isb() \ | do { \ | asm volatile( \ | "isb" \ | : \ | : \ | : "memory"); \ | } while (0) | | static unsigned long sctlr_read(void) | { | return sysreg_read(sctlr_el1); | } | | static void sctlr_write(unsigned long val) | { | sysreg_write(sctlr_el1, val); | } | | static void sctlr_rmw(void) | { | unsigned long val; | | val = sctlr_read(); | val |= 1UL << 7; | sctlr_write(val); | } | | void sctlr_read_multiple(void) | { | sctlr_read(); | sctlr_read(); | sctlr_read(); | sctlr_read(); | } | | void sctlr_write_multiple(void) | { | sctlr_write(0); | sctlr_write(0); | sctlr_write(0); | sctlr_write(0); | sctlr_write(0); | } | | void 
sctlr_rmw_multiple(void) | { | sctlr_rmw(); | sctlr_rmw(); | sctlr_rmw(); | sctlr_rmw(); | } | | void function(void) | { | sctlr_read_multiple(); | sctlr_write_multiple(); | sctlr_rmw_multiple(); | | isb(); | } Per compiler explorer (https://godbolt.org/z/vMczqjYvs) GCC trunk currently compiles this as: | sctlr_rmw: | mrs x0, sctlr_el1 | orr x0, x0, 128 | msr sctlr_el1, x0 | ret | sctlr_read_multiple: | mrs x0, sctlr_el1 | mrs x0, sctlr_el1 | mrs x0, sctlr_el1 | mrs x0, sctlr_el1 | ret | sctlr_write_multiple: | mov x0, 0 | msr sctlr_el1, x0 | msr sctlr_el1, x0 | msr sctlr_el1, x0 | msr sctlr_el1, x0 | msr sctlr_el1, x0 | ret | sctlr_rmw_multiple: | ret | function: | isb | ret Whereas GCC 11.2 compiles this as: | sctlr_rmw: | mrs x0, sctlr_el1 | orr x0, x0, 128 | msr sctlr_el1, x0 | ret | sctlr_read_multiple: | mrs x0, sctlr_el1 | mrs x0, sctlr_el1 | mrs x0, sctlr_el1 | mrs x0, sctlr_el1 | ret | sctlr_write_multiple: | mov x0, 0 | msr sctlr_el1, x0 | msr sctlr_el1, x0 | msr sctlr_el1, x0 | msr sctlr_el1, x0 | msr sctlr_el1, x0 | ret | sctlr_rmw_multiple: | stp x29, x30, [sp, -16]! | mov x29, sp | bl sctlr_rmw | bl sctlr_rmw | bl sctlr_rmw | bl sctlr_rmw | ldp x29, x30, [sp], 16 | ret | function: | stp x29, x30, [sp, -16]! 
| mov x29, sp | bl sctlr_read_multiple | bl sctlr_write_multiple | bl sctlr_rmw_multiple | isb | ldp x29, x30, [sp], 16 | ret My x86_64 test case is: | unsigned long rdmsr(unsigned long reg) | { | unsigned int lo, hi; | | asm volatile( | "rdmsr" | : "=d" (hi), "=a" (lo) | : "c" (reg) | ); | | return ((unsigned long)hi << 32) | lo; | } | | void wrmsr(unsigned long reg, unsigned long val) | { | unsigned int lo = val; | unsigned int hi = val >> 32; | | asm volatile( | "wrmsr" | : | : "d" (hi), "a" (lo), "c" (reg) | ); | } | | void msr_rmw_set_bits(unsigned long reg, unsigned long bits) | { | unsigned long val; | | val = rdmsr(reg); | val |= bits; | wrmsr(reg, val); | } | | void func_with_msr_side_effects(unsigned long reg) | { | msr_rmw_set_bits(reg, 1UL << 0); | msr_rmw_set_bits(reg, 1UL << 1); | msr_rmw_set_bits(reg, 1UL << 2); | msr_rmw_set_bits(reg, 1UL << 3); | } Per compiler explorer (https://godbolt.org/z/cveff9hq5) GCC trunk currently compiles this as: | msr_rmw_set_bits: | mov rcx, rdi | rdmsr | sal rdx, 32 | mov eax, eax | or rax, rsi | or rax, rdx | mov rdx, rax | shr rdx, 32 | wrmsr | ret | func_with_msr_side_effects: | ret While GCC 11.2 compiles that as: | msr_rmw_set_bits: | mov rcx, rdi | rdmsr | sal rdx, 32 | mov eax, eax | or rax, rsi | or rax, rdx | mov rdx, rax | shr rdx, 32 | wrmsr | ret | func_with_msr_side_effects: | push rbp | push rbx | mov rbx, rdi | mov rbp, rsi | call msr_rmw_set_bits | mov rsi, rbp | mov rdi, rbx | call msr_rmw_set_bits | mov rsi, rbp | mov rdi, rbx | call msr_rmw_set_bits | mov rsi, rbp | mov rdi, rbx | call msr_rmw_set_bits | pop rbx | pop rbp | ret Thanks, Mark.