Re: [RFC PATCH 1/4] hazptr: Add initial implementation of hazard pointers

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 2024-09-26 18:12, Linus Torvalds wrote:
On Thu, 26 Sept 2024 at 08:54, Jonas Oberhauser
<jonas.oberhauser@xxxxxxxxxxxxxxx> wrote:

No, the issue introduced by the compiler optimization (or by your
original patch) is that the CPU can speculatively load from the first
pointer as soon as it has completed the load of that pointer:

You mean the compiler can do it. The inline asm has no impact on what
the CPU does. The conditional isn't a barrier for the actual hardware.
But once the compiler doesn't try to do it, the data dependency on the
address does end up being an ordering constraint on the hardware too
(I'm happy to say that I haven't heard from the crazies that want
value prediction in a long time).

Just use a barrier.  Or make sure to use the proper ordered memory
accesses when possible. Don't use an inline asm for the compare - we
don't even have anything insane like that as a portable helper, and we
shouldn't have it.

How does the compiler barrier help in any way here ?

I am concerned about the compiler SSA GVN (Global Value Numbering)
optimizations, and I don't think a compiler barrier solves anything.
(or I'm missing something obvious)

I was concerned about the suggestion from Jonas to use "node2"
rather than "node" after the equality check as a way to ensure
the intended register is used to return the pointer, because after
the SSA GVN optimization pass, AFAIU this won't help in any way.
I have a set of examples below that show gcc use the result of the
first load, and clang use the result of the second load (on
both x86-64 and aarch64). Likewise when a load-acquire is used as
second load, which I find odd. Hopefully mixing this optimization
from gcc with speculation still abide by the memory model.

Only the asm goto approach ensures that gcc uses the result from
the second load.

#include <stdbool.h>

#define READ_ONCE(x)   \
    (*(__volatile__  __typeof__(x) *)&(x))

static inline
bool same_ptr(void *a, void *b)
{
    asm goto (
#ifdef __x86_64__
        "cmpq %[a], %[b]\n\t"
        "jne %l[ne]\n\t"
#elif defined(__aarch64__)
        "cmp %[a], %[b]\n\t"
        "bne %l[ne]\n\t"
#else
# error "unimplemented"
#endif
        : : [a] "r" (a), [b] "r" (b)
        : : ne);
    return true;
ne:
    return false;
}

int *p;

int fct_2_volatile(void)
{
    int *a, *b;

    do {
        a = READ_ONCE(p);
        asm volatile ("" : : : "memory");
        b = READ_ONCE(p);
    } while (a != b);
    return *b;
}

int fct_volatile_acquire(void)
{
    int *a, *b;

    do {
        a = READ_ONCE(p);
        asm volatile ("" : : : "memory");
        b = __atomic_load_n(&p, __ATOMIC_ACQUIRE);
    } while (a != b);
    return *b;
}

int fct_asm_compare(void)
{
    int *a, *b;

    do {
        a = READ_ONCE(p);
        asm volatile ("" : : : "memory");
        b = READ_ONCE(p);
    } while (!same_ptr(a, b));
    return *b;
}

x86-64 gcc 14.2:

fct_2_volatile:
 mov    rax,QWORD PTR [rip+0x0]        # 7 <fct_2_volatile+0x7>
 mov    rdx,QWORD PTR [rip+0x0]        # e <fct_2_volatile+0xe>
 cmp    rax,rdx
 jne    0 <fct_2_volatile>
 mov    eax,DWORD PTR [rax]
 ret
 cs nop WORD PTR [rax+rax*1+0x0]
fct_volatile_acquire:
 mov    rax,QWORD PTR [rip+0x0]        # 27 <fct_volatile_acquire+0x7>
 mov    rdx,QWORD PTR [rip+0x0]        # 2e <fct_volatile_acquire+0xe>
 cmp    rax,rdx
 jne    20 <fct_volatile_acquire>
 mov    eax,DWORD PTR [rax]
 ret
 cs nop WORD PTR [rax+rax*1+0x0]
fct_asm_compare:
 mov    rdx,QWORD PTR [rip+0x0]        # 47 <fct_asm_compare+0x7>
 mov    rax,QWORD PTR [rip+0x0]        # 4e <fct_asm_compare+0xe>
 cmp    rax,rdx
 jne    40 <fct_asm_compare>
 mov    eax,DWORD PTR [rax]
 ret
main:
 xor    eax,eax
 ret

x86-64 clang 19.1.0:

fct_2_volatile:
 mov    rcx,QWORD PTR [rip+0x0]        # 7 <fct_2_volatile+0x7>
 mov    rax,QWORD PTR [rip+0x0]        # e <fct_2_volatile+0xe>
 cmp    rcx,rax
 jne    0 <fct_2_volatile>
 mov    eax,DWORD PTR [rax]
 ret
 cs nop WORD PTR [rax+rax*1+0x0]
fct_volatile_acquire:
 mov    rcx,QWORD PTR [rip+0x0]        # 27 <fct_volatile_acquire+0x7>
 mov    rax,QWORD PTR [rip+0x0]        # 2e <fct_volatile_acquire+0xe>
 cmp    rcx,rax
 jne    20 <fct_volatile_acquire>
 mov    eax,DWORD PTR [rax]
 ret
 cs nop WORD PTR [rax+rax*1+0x0]
fct_asm_compare:
 mov    rcx,QWORD PTR [rip+0x0]        # 47 <fct_asm_compare+0x7>
 mov    rax,QWORD PTR [rip+0x0]        # 4e <fct_asm_compare+0xe>
 cmp    rax,rcx
 jne    40 <fct_asm_compare>
 mov    eax,DWORD PTR [rax]
 ret
 cs nop WORD PTR [rax+rax*1+0x0]
main:
 xor    eax,eax
 ret

ARM64 gcc 14.2.0:

fct_2_volatile:
        adrp    x0, .LANCHOR0
        add     x0, x0, :lo12:.LANCHOR0
.L2:
        ldr     x1, [x0]
        ldr     x2, [x0]
        cmp     x1, x2
        bne     .L2
        ldr     w0, [x1]
        ret
fct_volatile_acquire:
        adrp    x0, .LANCHOR0
        add     x0, x0, :lo12:.LANCHOR0
.L6:
        ldr     x1, [x0]
        ldar    x2, [x0]
        cmp     x1, x2
        bne     .L6
        ldr     w0, [x1]
        ret
fct_asm_compare:
        adrp    x1, .LANCHOR0
        add     x1, x1, :lo12:.LANCHOR0
.L9:
        ldr     x2, [x1]
        ldr     x0, [x1]
        cmp x2, x0
        bne .L9

        ldr     w0, [x0]
        ret
p:
        .zero   8

armv8-a clang (trunk):

fct_2_volatile:
 adrp	x8, 0 <fct_2_volatile>
 ldr	x10, [x8]
 ldr	x9, [x8]
 cmp	x10, x9
 b.ne	4 <fct_2_volatile+0x4>  // b.any
 ldr	w0, [x9]
 ret
fct_volatile_acquire:
 adrp	x8, 0 <fct_2_volatile>
 add	x8, x8, #0x0
 ldr	x10, [x8]
 ldar	x9, [x8]
 cmp	x10, x9
 b.ne	24 <fct_volatile_acquire+0x8>  // b.any
 ldr	w0, [x9]
 ret
fct_asm_compare:
 adrp	x8, 0 <fct_2_volatile>
 ldr	x9, [x8]
 ldr	x8, [x8]
 cmp	x9, x8
 b.ne	3c <fct_asm_compare>  // b.any
 ldr	w0, [x8]
 ret
main:
 mov	w0, wzr

--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com





[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux