On Wed, Apr 26, 2023 at 9:53 PM Namhyung Kim <namhyung@xxxxxxxxx> wrote:
>
> On Wed, Apr 26, 2023 at 09:26:59PM -0700, Andrii Nakryiko wrote:
> > On Wed, Apr 26, 2023 at 7:21 PM Namhyung Kim <namhyung@xxxxxxxxxx> wrote:
> > >
> > > Hello Andrii,
> > >
> > > On Wed, Apr 26, 2023 at 6:19 PM Andrii Nakryiko
> > > <andrii.nakryiko@xxxxxxxxx> wrote:
> > > >
> > > > On Wed, Apr 26, 2023 at 5:14 PM Namhyung Kim <namhyung@xxxxxxxxxx> wrote:
> > > > >
> > > > > Hello,
> > > > >
> > > > > I'm having a problem loading the perf lock contention BPF program [1]
> > > > > on old kernels.  It has collect_lock_syms() to get the address of each
> > > > > CPU's run-queue lock.  Kernel 5.14 changed the name of the field, so
> > > > > there's a bpf_core_field_exists() check on the name like below.
> > > > >
> > > > >     if (bpf_core_field_exists(rq_new->__lock))
> > > > >             lock_addr = (__u64)&rq_new->__lock;
> > > > >     else
> > > > >             lock_addr = (__u64)&rq_old->lock;
> > > >
> > > > I suspect the compiler rewrites it to something like
> > > >
> > > >     lock_addr = (__u64)&rq_old->lock;
> > > >     if (bpf_core_field_exists(rq_new->__lock))
> > > >             lock_addr = (__u64)&rq_new->__lock;
> > > >
> > > > so the rq_old relocation always happens and ends up not being guarded
> > > > properly. You can try adding barrier_var(rq_new) and barrier_var(rq_old)
> > > > around the if and inside the branches; that should pessimize the
> > > > compiler.
> > > >
> > > > Alternatively, if you do
> > > >
> > > >     if (bpf_core_field_exists(rq_new->__lock))
> > > >             lock_addr = (__u64)&rq_new->__lock;
> > > >     else if (bpf_core_field_exists(rq_old->lock))
> > > >             lock_addr = (__u64)&rq_old->lock;
> > > >     else
> > > >             lock_addr = 0; /* or signal error somehow */
> > > >
> > > > it might work as well.
> > >
> > > Thanks a lot for your comment!
> > >
> > > I've tried the below code but no luck. :(
> >
> > Can you post the output of llvm-objdump -d <your.bpf.o> for the program
> > (collect_lock_syms?) containing the above code, or at least the relevant
> > portions with some buffer before/after, to get a sense of what's going
> > on?
>
> Sure.
>
> Here's the full source code:
>
>   SEC("raw_tp/bpf_test_finish")
>   int BPF_PROG(collect_lock_syms)
>   {
>           __u64 lock_addr;
>           __u32 lock_flag;
>
>           for (int i = 0; i < MAX_CPUS; i++) {
>                   struct rq *rq = bpf_per_cpu_ptr(&runqueues, i);
>                   struct rq___new *rq_new = (void *)rq;
>                   struct rq___old *rq_old = (void *)rq;
>
>                   if (rq == NULL)
>                           break;
>
>                   barrier_var(rq_old);
>                   barrier_var(rq_new);
>
>                   if (bpf_core_field_exists(rq_old->lock)) {
>                           barrier_var(rq_old);
>                           lock_addr = (__u64)&rq_old->lock;
>                   } else if (bpf_core_field_exists(rq_new->__lock)) {
>                           barrier_var(rq_new);
>                           lock_addr = (__u64)&rq_new->__lock;
>                   } else
>                           continue;
>
>                   lock_flag = LOCK_CLASS_RQLOCK;
>                   bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
>           }
>           return 0;
>   }
>
> And the disassembly is:
>
>   $ llvm-objdump -d util/bpf_skel/.tmp/lock_contention.bpf.o
>   ...
>   Disassembly of section raw_tp/bpf_test_finish:
>
>   0000000000000000 <collect_lock_syms>:
>          0:       b7 06 00 00 00 00 00 00 r6 = 0
>          1:       b7 07 00 00 01 00 00 00 r7 = 1
>          2:       b7 09 00 00 01 00 00 00 r9 = 1
>          3:       b7 08 00 00 00 00 00 00 r8 = 0
>
>   0000000000000020 <LBB3_1>:
>          4:       18 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r1 = 0 ll
>          6:       bf 62 00 00 00 00 00 00 r2 = r6
>          7:       85 00 00 00 99 00 00 00 call 153
>          8:       15 00 12 00 00 00 00 00 if r0 == 0 goto +18 <LBB3_8>
>          9:       bf 01 00 00 00 00 00 00 r1 = r0
>         10:       15 07 12 00 00 00 00 00 if r7 == 0 goto +18 <LBB3_4>
>         11:       0f 81 00 00 00 00 00 00 r1 += r8
>
>   0000000000000060 <LBB3_6>:
>         12:       63 9a f4 ff 00 00 00 00 *(u32 *)(r10 - 12) = r9
>         13:       7b 1a f8 ff 00 00 00 00 *(u64 *)(r10 - 8) = r1
>         14:       bf a2 00 00 00 00 00 00 r2 = r10
>         15:       07 02 00 00 f8 ff ff ff r2 += -8
>         16:       bf a3 00 00 00 00 00 00 r3 = r10
>         17:       07 03 00 00 f4 ff ff ff r3 += -12
>         18:       18 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r1 = 0 ll
>         20:       b7 04 00 00 00 00 00 00 r4 = 0
>         21:       85 00 00 00 02 00 00 00 call 2
>
>   00000000000000b0 <LBB3_7>:
>         22:       07 06 00 00 01 00 00 00 r6 += 1
>         23:       bf 61 00 00 00 00 00 00 r1 = r6
>         24:       67 01 00 00 20 00 00 00 r1 <<= 32
>         25:       77 01 00 00 20 00 00 00 r1 >>= 32
>         26:       55 01 e9 ff 00 04 00 00 if r1 != 1024 goto -23 <LBB3_1>
>
>   00000000000000d8 <LBB3_8>:
>         27:       b7 00 00 00 00 00 00 00 r0 = 0
>         28:       95 00 00 00 00 00 00 00 exit
>
>   00000000000000e8 <LBB3_4>:
>         29:       b7 01 00 00 01 00 00 00 r1 = 1
>         30:       15 01 f7 ff 00 00 00 00 if r1 == 0 goto -9 <LBB3_7>
>         31:       b7 01 00 00 00 00 00 00 r1 = 0
>         32:       0f 10 00 00 00 00 00 00 r0 += r1
>         33:       bf 01 00 00 00 00 00 00 r1 = r0
>         34:       05 00 e9 ff 00 00 00 00 goto -23 <LBB3_6>
>
> The error message is like this:
>
>   libbpf: prog 'collect_lock_syms': BPF program load failed: Invalid argument
>   libbpf: prog 'collect_lock_syms': -- BEGIN PROG LOAD LOG --
>   reg type unsupported for arg#0 function collect_lock_syms#380
>   0: R1=ctx(off=0,imm=0) R10=fp0
>   ; int BPF_PROG(collect_lock_syms)
>   0: (b7) r6 = 0                        ; R6_w=0
>   1: (b7) r7 = 0                        ; R7_w=0
>   2: (b7) r9 = 1                        ; R9_w=1
>   3: <invalid CO-RE relocation>
>   failed to resolve CO-RE relocation <byte_off> [381] struct rq___old.lock (0:0 @ offset 0)
>   processed 4 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0
>   -- END PROG LOAD LOG --
>   libbpf: prog 'collect_lock_syms': failed to load: -22
>   libbpf: failed to load object 'lock_contention_bpf'
>   libbpf: failed to load BPF skeleton 'lock_contention_bpf': -22
>   Failed to load lock-contention BPF skeleton
>   lock contention BPF setup failed
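[For context: the "<invalid CO-RE relocation>" at instruction 3 is libbpf
"poisoning" the byte-offset relocation it could not resolve against kernel
BTF; because that instruction executes unconditionally, the verifier cannot
discard it as dead code. The barrier_var() in the source is libbpf's
single-variable compiler barrier; modulo version-to-version spelling, it is
defined in tools/lib/bpf/bpf_helpers.h as:

    /* Empty inline asm that takes `var` as a read-write register
     * operand ("+r"), so the compiler must assume the asm may modify
     * `var` and cannot constant-fold it or move accesses to it across
     * this statement.  Some versions spell the constraint as
     * "=r"(var) : "0"(var), which is equivalent. */
    #define barrier_var(var) asm volatile("" : "+r"(var))

As the log shows, the barriers did not help here: the compiler still
materialized the rq_old->lock offset before any field-existence check.]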
OK, I didn't manage to force the compiler to behave as long as the
`&rq_old->lock` pattern was used, so I went for a different approach.

This works:

$ git diff
diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
index 8911e2a077d8..8d3cfbb3cc65 100644
--- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
+++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
@@ -418,32 +418,32 @@ int contention_end(u64 *ctx)

 extern struct rq runqueues __ksym;

-struct rq__old {
+struct rq___old {
        raw_spinlock_t lock;
 } __attribute__((preserve_access_index));

-struct rq__new {
+struct rq___new {
        raw_spinlock_t __lock;
 } __attribute__((preserve_access_index));

 SEC("raw_tp/bpf_test_finish")
 int BPF_PROG(collect_lock_syms)
 {
-       __u64 lock_addr;
+       __u64 lock_addr, lock_off;
        __u32 lock_flag;

+       if (bpf_core_field_exists(struct rq___new, __lock))
+               lock_off = offsetof(struct rq___new, __lock);
+       else
+               lock_off = offsetof(struct rq___old, lock);
+
        for (int i = 0; i < MAX_CPUS; i++) {
                struct rq *rq = bpf_per_cpu_ptr(&runqueues, i);
-               struct rq__new *rq_new = (void *)rq;
-               struct rq__old *rq_old = (void *)rq;

                if (rq == NULL)
                        break;

-               if (bpf_core_field_exists(rq_new->__lock))
-                       lock_addr = (__u64)&rq_new->__lock;
-               else
-                       lock_addr = (__u64)&rq_old->lock;
+               lock_addr = (__u64)(void *)rq + lock_off;

                lock_flag = LOCK_CLASS_RQLOCK;
                bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
        }

> Thanks,
> Namhyung
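[For the archives, here is the fix distilled into a self-contained sketch.
It assumes a vmlinux.h generated from kernel BTF plus libbpf's usual
headers; the program name probe_rq_lock and the bpf_printk() output are
illustrative only (perf's real collect_lock_syms loops over all CPUs and
stores each address into its lock_syms map):

    /* Probe which field name struct rq has, using libbpf "flavors":
     * the "___old"/"___new" suffixes (triple underscore) are stripped
     * when matching against kernel BTF, so both structs below describe
     * candidate layouts of the kernel's struct rq.  This is also why
     * the diff renames rq__old/rq__new to rq___old/rq___new. */
    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_core_read.h>
    #include <bpf/bpf_tracing.h>

    extern struct rq runqueues __ksym;

    struct rq___old {
            raw_spinlock_t lock;            /* field name before v5.14 */
    } __attribute__((preserve_access_index));

    struct rq___new {
            raw_spinlock_t __lock;          /* field name since v5.14 */
    } __attribute__((preserve_access_index));

    SEC("raw_tp/bpf_test_finish")
    int BPF_PROG(probe_rq_lock)
    {
            __u64 lock_off;
            struct rq *rq;

            /* The two-argument bpf_core_field_exists(type, field) plus
             * offsetof() never takes a pointer into the losing flavor,
             * so its unresolvable relocation stays inside a branch that
             * the verifier can discard once libbpf resolves the check
             * to a constant 0 or 1. */
            if (bpf_core_field_exists(struct rq___new, __lock))
                    lock_off = offsetof(struct rq___new, __lock);
            else
                    lock_off = offsetof(struct rq___old, lock);

            rq = bpf_per_cpu_ptr(&runqueues, 0);  /* CPU 0 only, for brevity */
            if (rq)
                    bpf_printk("cpu0 rq lock at %llx",
                               (__u64)(void *)rq + lock_off);
            return 0;
    }

    char LICENSE[] SEC("license") = "GPL";

The key design change is computing the field offset once, as plain
arithmetic, instead of forming a CO-RE-relocated pointer dereference in
each branch that the compiler is free to hoist.]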