On 2024-09-26 18:12, Linus Torvalds wrote:
> On Thu, 26 Sept 2024 at 08:54, Jonas Oberhauser
> <jonas.oberhauser@xxxxxxxxxxxxxxx> wrote:
>> No, the issue introduced by the compiler optimization (or by your
>> original patch) is that the CPU can speculatively load from the first
>> pointer as soon as it has completed the load of that pointer:
> You mean the compiler can do it. The inline asm has no impact on what
> the CPU does. The conditional isn't a barrier for the actual hardware.
> But once the compiler doesn't try to do it, the data dependency on the
> address does end up being an ordering constraint on the hardware too
> (I'm happy to say that I haven't heard from the crazies that want
> value prediction in a long time).
>
> Just use a barrier. Or make sure to use the proper ordered memory
> accesses when possible. Don't use an inline asm for the compare - we
> don't even have anything insane like that as a portable helper, and we
> shouldn't have it.
How does the compiler barrier help in any way here? I am concerned
about the compiler's SSA GVN (Global Value Numbering) optimization
passes, and I don't think a compiler barrier solves anything here.
(Or I'm missing something obvious.)
I was concerned about Jonas' suggestion to use "node2" rather than
"node" after the equality check as a way to ensure the intended
register is used to return the pointer: after the SSA GVN
optimization pass, AFAIU, this won't help in any way.
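
As a minimal sketch of that pattern (hypothetical function name,
reusing the READ_ONCE() and "p" from the examples below):

/* Hypothetical illustration of the "node2" suggestion. */
static int *read_stable_node(void)
{
        int *node, *node2;

        do {
                node = READ_ONCE(p);
                asm volatile ("" : : : "memory");
                node2 = READ_ONCE(p);
        } while (node != node2);
        /*
         * GVN has proven node == node2 on this path, so the
         * compiler is free to return the register holding the
         * result of the first load instead.
         */
        return node2;
}

Once both values are proven equal on the loop exit path, which
register holds the returned value is entirely the compiler's choice.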
I have a set of examples below showing that gcc uses the result of
the first load, while clang uses the result of the second load (on
both x86-64 and aarch64). The same holds when a load-acquire is used
as the second load, which I find odd. Hopefully mixing this gcc
optimization with CPU speculation still abides by the memory model.
Only the asm goto approach ensures that gcc uses the result of the
second load.
#include <stdbool.h>
/* Force the compiler to emit exactly one access to @x. */
#define READ_ONCE(x) \
        (*(__volatile__ __typeof__(x) *)&(x))
static inline
bool same_ptr(void *a, void *b)
{
        asm goto (
#ifdef __x86_64__
                "cmpq %[a], %[b]\n\t"
                "jne %l[ne]\n\t"
#elif defined(__aarch64__)
                "cmp %[a], %[b]\n\t"
                "bne %l[ne]\n\t"
#else
# error "unimplemented"
#endif
                : : [a] "r" (a), [b] "r" (b)
                : : ne);
        return true;
ne:
        return false;
}
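
/*
 * Note: because the asm goto compare has no outputs and its branch
 * targets are opaque to the compiler, the optimizer never learns
 * that a == b on the fall-through path, which AFAIU is what keeps
 * GVN from substituting one load result for the other.
 */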
int *p;
/* Two volatile loads separated by a compiler barrier. */
int fct_2_volatile(void)
{
        int *a, *b;

        do {
                a = READ_ONCE(p);
                asm volatile ("" : : : "memory");
                b = READ_ONCE(p);
        } while (a != b);
        return *b;
}
/* Same, with a load-acquire as the second load. */
int fct_volatile_acquire(void)
{
        int *a, *b;

        do {
                a = READ_ONCE(p);
                asm volatile ("" : : : "memory");
                b = __atomic_load_n(&p, __ATOMIC_ACQUIRE);
        } while (a != b);
        return *b;
}
/* Same, with the comparison hidden inside the asm goto. */
int fct_asm_compare(void)
{
        int *a, *b;

        do {
                a = READ_ONCE(p);
                asm volatile ("" : : : "memory");
                b = READ_ONCE(p);
        } while (!same_ptr(a, b));
        return *b;
}
x86-64 gcc 14.2:

fct_2_volatile:
        mov    rax,QWORD PTR [rip+0x0]        # 7 <fct_2_volatile+0x7>
        mov    rdx,QWORD PTR [rip+0x0]        # e <fct_2_volatile+0xe>
        cmp    rax,rdx
        jne    0 <fct_2_volatile>
        mov    eax,DWORD PTR [rax]
        ret
        cs nop WORD PTR [rax+rax*1+0x0]
fct_volatile_acquire:
        mov    rax,QWORD PTR [rip+0x0]        # 27 <fct_volatile_acquire+0x7>
        mov    rdx,QWORD PTR [rip+0x0]        # 2e <fct_volatile_acquire+0xe>
        cmp    rax,rdx
        jne    20 <fct_volatile_acquire>
        mov    eax,DWORD PTR [rax]
        ret
        cs nop WORD PTR [rax+rax*1+0x0]
fct_asm_compare:
        mov    rdx,QWORD PTR [rip+0x0]        # 47 <fct_asm_compare+0x7>
        mov    rax,QWORD PTR [rip+0x0]        # 4e <fct_asm_compare+0xe>
        cmp    rax,rdx
        jne    40 <fct_asm_compare>
        mov    eax,DWORD PTR [rax]
        ret
main:
        xor    eax,eax
        ret
x86-64 clang 19.1.0:

fct_2_volatile:
        mov    rcx,QWORD PTR [rip+0x0]        # 7 <fct_2_volatile+0x7>
        mov    rax,QWORD PTR [rip+0x0]        # e <fct_2_volatile+0xe>
        cmp    rcx,rax
        jne    0 <fct_2_volatile>
        mov    eax,DWORD PTR [rax]
        ret
        cs nop WORD PTR [rax+rax*1+0x0]
fct_volatile_acquire:
        mov    rcx,QWORD PTR [rip+0x0]        # 27 <fct_volatile_acquire+0x7>
        mov    rax,QWORD PTR [rip+0x0]        # 2e <fct_volatile_acquire+0xe>
        cmp    rcx,rax
        jne    20 <fct_volatile_acquire>
        mov    eax,DWORD PTR [rax]
        ret
        cs nop WORD PTR [rax+rax*1+0x0]
fct_asm_compare:
        mov    rcx,QWORD PTR [rip+0x0]        # 47 <fct_asm_compare+0x7>
        mov    rax,QWORD PTR [rip+0x0]        # 4e <fct_asm_compare+0xe>
        cmp    rax,rcx
        jne    40 <fct_asm_compare>
        mov    eax,DWORD PTR [rax]
        ret
        cs nop WORD PTR [rax+rax*1+0x0]
main:
        xor    eax,eax
        ret
ARM64 gcc 14.2.0:

fct_2_volatile:
        adrp    x0, .LANCHOR0
        add     x0, x0, :lo12:.LANCHOR0
.L2:
        ldr     x1, [x0]
        ldr     x2, [x0]
        cmp     x1, x2
        bne     .L2
        ldr     w0, [x1]
        ret
fct_volatile_acquire:
        adrp    x0, .LANCHOR0
        add     x0, x0, :lo12:.LANCHOR0
.L6:
        ldr     x1, [x0]
        ldar    x2, [x0]
        cmp     x1, x2
        bne     .L6
        ldr     w0, [x1]
        ret
fct_asm_compare:
        adrp    x1, .LANCHOR0
        add     x1, x1, :lo12:.LANCHOR0
.L9:
        ldr     x2, [x1]
        ldr     x0, [x1]
        cmp     x2, x0
        bne     .L9
        ldr     w0, [x0]
        ret
p:
        .zero   8
armv8-a clang (trunk):

fct_2_volatile:
        adrp    x8, 0 <fct_2_volatile>
        ldr     x10, [x8]
        ldr     x9, [x8]
        cmp     x10, x9
        b.ne    4 <fct_2_volatile+0x4>        // b.any
        ldr     w0, [x9]
        ret
fct_volatile_acquire:
        adrp    x8, 0 <fct_2_volatile>
        add     x8, x8, #0x0
        ldr     x10, [x8]
        ldar    x9, [x8]
        cmp     x10, x9
        b.ne    24 <fct_volatile_acquire+0x8>        // b.any
        ldr     w0, [x9]
        ret
fct_asm_compare:
        adrp    x8, 0 <fct_2_volatile>
        ldr     x9, [x8]
        ldr     x8, [x8]
        cmp     x9, x8
        b.ne    3c <fct_asm_compare>        // b.any
        ldr     w0, [x8]
        ret
main:
        mov     w0, wzr
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com