On Fri, May 5, 2023 at 9:56 AM Arnaldo Carvalho de Melo <acme@xxxxxxxxxx> wrote:
>
> Em Fri, May 05, 2023 at 10:33:15AM -0300, Arnaldo Carvalho de Melo escreveu:
> > Em Fri, May 05, 2023 at 01:03:14AM +0200, Jiri Olsa escreveu:
> > That with the preserve_access_index isn't needed, we need just the
> > fields that we access in the tools, right?
>
> I'm now build testing this in many distro containers, without the two
> reverts, i.e. BPF skels continue as opt-out as in my pull request, to
> test the build and also to run the functionality tests on the tools
> using such BPF skels, see below, no touching of vmlinux nor BTF data
> during the build.
>
> - Arnaldo
>
> From 882adaee50bc27f85374aeb2fbaa5b76bef60d05 Mon Sep 17 00:00:00 2001
> From: Arnaldo Carvalho de Melo <acme@xxxxxxxxxx>
> Date: Thu, 4 May 2023 19:03:51 -0300
> Subject: [PATCH 1/1] perf bpf skels: Stop using vmlinux.h generated from BTF,
>  use subset of used structs + CO-RE
>
> Linus reported a build break due to using a vmlinux without a BTF elf
> section to generate the vmlinux.h header with bpftool for use in the BPF
> tools in tools/perf/util/bpf_skel/*.bpf.c.
>
> Instead add a vmlinux.h file with the structs needed, with the fields the
> tools need, marking the structs with __attribute__((preserve_access_index)),
> so that libbpf's CO-RE code can fix up the struct field offsets.
>
> In some cases the vmlinux.h file that was being generated by bpftool
> from the kernel BTF information was not needed at all, just including
> linux/bpf.h, sometimes linux/perf_event.h, was enough as non-UAPI
> types were not being used.
>
> To keep the patch small, include those UAPI headers from the trimmed down
> vmlinux.h file, that then provides the tools with just the structs and
> the subset of their fields needed for them.
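
Just to make the intent concrete (this is not part of the patch, only a
minimal sketch; the section name and the probe_sketch function are
hypothetical): a .bpf.c tool includes the trimmed header and dereferences
only the fields declared there, and because the structs carry
__attribute__((preserve_access_index)) the compiler records CO-RE
relocations that libbpf uses at load time to rewrite the offsets to
whatever the running kernel actually has:

/* Minimal sketch, not from the patch: only touches fields that exist in
 * the trimmed tools/perf/util/bpf_skel/vmlinux.h quoted below. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_core_read.h>

SEC("tracepoint/sched/sched_switch")
int probe_sketch(void *ctx)
{
        struct task_struct *task = (void *)bpf_get_current_task();
        pid_t tgid;
        u64 cgrp_id;

        /* CO-RE: if the running kernel lays out task_struct, css_set,
         * cgroup or kernfs_node differently, libbpf patches these
         * offsets when the program is loaded. */
        tgid = BPF_CORE_READ(task, tgid);
        cgrp_id = BPF_CORE_READ(task, cgroups, dfl_cgrp, kn, id);

        bpf_printk("tgid=%d cgroup id=%llu", tgid, cgrp_id);
        return 0;
}

char LICENSE[] SEC("license") = "GPL";
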
>
> Testing it:
>
> # perf lock contention -b find / > /dev/null
> ^C
>  contended   total wait     max wait     avg wait         type   caller
>
>          7     53.59 us     10.86 us      7.66 us     rwlock:R   start_this_handle+0xa0
>          2     30.35 us     21.99 us     15.17 us      rwsem:R   iterate_dir+0x52
>          1      9.04 us      9.04 us      9.04 us     rwlock:W   start_this_handle+0x291
>          1      8.73 us      8.73 us      8.73 us     spinlock   raw_spin_rq_lock_nested+0x1e
> #
> # perf lock contention -abl find / > /dev/null
> ^C
>  contended   total wait     max wait     avg wait            address   symbol
>
>          1    262.96 ms    262.96 ms    262.96 ms   ffff8e67502d0170    (mutex)
>         12    244.24 us     39.91 us     20.35 us   ffff8e6af56f8070   mmap_lock (rwsem)
>          7     30.28 us      6.85 us      4.33 us   ffff8e6c865f1d40   rq_lock (spinlock)
>          3      7.42 us      4.03 us      2.47 us   ffff8e6c864b1d40   rq_lock (spinlock)
>          2      3.72 us      2.19 us      1.86 us   ffff8e6c86571d40   rq_lock (spinlock)
>          1      2.42 us      2.42 us      2.42 us   ffff8e6c86471d40   rq_lock (spinlock)
>          4      2.11 us       559 ns       527 ns   ffffffff9a146c80   rcu_state (spinlock)
>          3      1.45 us       818 ns       482 ns   ffff8e674ae8384c    (rwlock)
>          1       870 ns       870 ns       870 ns   ffff8e68456ee060    (rwlock)
>          1       663 ns       663 ns       663 ns   ffff8e6c864f1d40   rq_lock (spinlock)
>          1       573 ns       573 ns       573 ns   ffff8e6c86531d40   rq_lock (spinlock)
>          1       472 ns       472 ns       472 ns   ffff8e6c86431740    (spinlock)
>          1       397 ns       397 ns       397 ns   ffff8e67413a4f04    (spinlock)
> #
> # perf test offcpu
>  95: perf record offcpu profiling tests                              : Ok
> #
> # perf kwork latency --use-bpf
> Starting trace, Hit <Ctrl+C> to stop and report
> ^C
>   Kwork Name                     | Cpu  | Avg delay    | Count | Max delay    | Max delay start | Max delay end   |
>  --------------------------------------------------------------------------------------------------------------------------------
>   (w)flush_memcg_stats_dwork     | 0000 | 1056.212 ms  |     2 | 2112.345 ms  | 550113.229573 s | 550115.341919 s |
>   (w)toggle_allocation_gate      | 0000 |   10.144 ms  |    62 |  416.389 ms  | 550113.453518 s | 550113.869907 s |
>   (w)0xffff8e6748e28080          | 0002 |    0.623 ms  |     1 |    0.623 ms  | 550110.989841 s | 550110.990464 s |
>   (w)vmstat_shepherd             | 0000 |    0.586 ms  |    10 |    2.828 ms  | 550111.971536 s | 550111.974364 s |
>   (w)vmstat_update               | 0007 |    0.363 ms  |     5 |    1.634 ms  | 550113.222520 s | 550113.224154 s |
>   (w)vmstat_update               | 0000 |    0.324 ms  |    10 |    2.827 ms  | 550111.971526 s | 550111.974354 s |
>   (w)0xffff8e674c5f4a58          | 0002 |    0.102 ms  |     5 |    0.134 ms  | 550110.989839 s | 550110.989972 s |
>   (w)psi_avgs_work               | 0001 |    0.086 ms  |     3 |    0.107 ms  | 550114.957852 s | 550114.957959 s |
>   (w)psi_avgs_work               | 0000 |    0.079 ms  |     5 |    0.100 ms  | 550118.605668 s | 550118.605768 s |
>   (w)kfree_rcu_monitor           | 0006 |    0.079 ms  |     1 |    0.079 ms  | 550110.925821 s | 550110.925900 s |
>   (w)psi_avgs_work               | 0004 |    0.079 ms  |     1 |    0.079 ms  | 550109.581835 s | 550109.581914 s |
>   (w)psi_avgs_work               | 0001 |    0.078 ms  |     1 |    0.078 ms  | 550109.197809 s | 550109.197887 s |
>   (w)psi_avgs_work               | 0002 |    0.077 ms  |     5 |    0.086 ms  | 550110.669819 s | 550110.669905 s |
> <SNIP>
> # strace -e bpf -o perf-stat-bpf-counters.output perf stat -e cycles --bpf-counters sleep 1
>
>  Performance counter stats for 'sleep 1':
>
>          6,197,983      cycles
>
>        1.003922848 seconds time elapsed
>
>        0.000000000 seconds user
>        0.002032000 seconds sys
>
> # head -7 perf-stat-bpf-counters.output
> bpf(BPF_OBJ_GET, {pathname="/sys/fs/bpf/perf_attr_map", bpf_fd=0, file_flags=0}, 16) = 3
> bpf(BPF_OBJ_GET_INFO_BY_FD, {info={bpf_fd=3, info_len=88, info=0x7ffcead64990}}, 16) = 0
> bpf(BPF_MAP_LOOKUP_ELEM, {map_fd=3, key=0x24129e0, value=0x7ffcead65a48, flags=BPF_ANY}, 32) = 0
> bpf(BPF_LINK_GET_FD_BY_ID, {link_id=1252}, 12) = -1 ENOENT (No such file or directory)
> bpf(BPF_PROG_LOAD,
>     {prog_type=BPF_PROG_TYPE_SOCKET_FILTER, insn_cnt=2, insns=0x7ffcead65780, license="GPL", log_level=0, log_size=0, log_buf=NULL, kern_version=KERNEL_VERSION(0, 0, 0), prog_flags=0, prog_name="", prog_ifindex=0, expected_attach_type=BPF_CGROUP_INET_INGRESS, prog_btf_fd=0, func_info_rec_size=0, func_info=NULL, func_info_cnt=0, line_info_rec_size=0, line_info=NULL, line_info_cnt=0, attach_btf_id=0, attach_prog_fd=0}, 116) = 4
> bpf(BPF_PROG_LOAD, {prog_type=BPF_PROG_TYPE_SOCKET_FILTER, insn_cnt=2, insns=0x7ffcead65920, license="GPL", log_level=0, log_size=0, log_buf=NULL, kern_version=KERNEL_VERSION(0, 0, 0), prog_flags=0, prog_name="", prog_ifindex=0, expected_attach_type=BPF_CGROUP_INET_INGRESS, prog_btf_fd=0, func_info_rec_size=0, func_info=NULL, func_info_cnt=0, line_info_rec_size=0, line_info=NULL, line_info_cnt=0, attach_btf_id=0, attach_prog_fd=0, fd_array=NULL}, 128) = 4
> bpf(BPF_BTF_LOAD, {btf="\237\353\1\0\30\0\0\0\0\0\0\0\20\0\0\0\20\0\0\0\5\0\0\0\1\0\0\0\0\0\0\1"..., btf_log_buf=NULL, btf_size=45, btf_log_size=0, btf_log_level=0}, 28) = 4
> #
>
> Reported-by: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
> Suggested-by: Andrii Nakryiko <andrii@xxxxxxxxxx>
> Cc: Adrian Hunter <adrian.hunter@xxxxxxxxx>
> Cc: Ian Rogers <irogers@xxxxxxxxxx>
> Cc: Jiri Olsa <jolsa@xxxxxxxxxx>
> Cc: Namhyung Kim <namhyung@xxxxxxxxxx>
> Co-developed-by: Jiri Olsa <jolsa@xxxxxxxxxx>
> Signed-off-by: Arnaldo Carvalho de Melo <acme@xxxxxxxxxx>
> ---
>  tools/perf/Makefile.perf            |  20 +---
>  tools/perf/util/bpf_skel/.gitignore |   1 -
>  tools/perf/util/bpf_skel/vmlinux.h  | 173 ++++++++++++++++++++++++++++
>  3 files changed, 174 insertions(+), 20 deletions(-)
>  create mode 100644 tools/perf/util/bpf_skel/vmlinux.h
>
> diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
> index 48aba186ceb50792..61c33d100b2bcc90 100644
> --- a/tools/perf/Makefile.perf
> +++ b/tools/perf/Makefile.perf
> @@ -1063,25 +1063,7 @@ $(BPFTOOL): | $(SKEL_TMP_OUT)
>         $(Q)CFLAGS= $(MAKE) -C ../bpf/bpftool \
>                 OUTPUT=$(SKEL_TMP_OUT)/ bootstrap
>
> -VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux)                          \
> -                     $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux)  \
> -                     ../../vmlinux                                    \
> -                     /sys/kernel/btf/vmlinux                          \
> -                     /boot/vmlinux-$(shell uname -r)
> -VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS))))
> -
> -$(SKEL_OUT)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL)
> -ifeq ($(VMLINUX_H),)
> -       $(QUIET_GEN)$(BPFTOOL) btf dump file $< format c > $@ || \
> -       (echo "Failure to generate vmlinux.h needed for the recommended BPF skeleton support." && \
> -       echo "To disable this use the build option NO_BPF_SKEL=1." && \
> -       echo "Alternatively point at a pre-generated vmlinux.h with VMLINUX_H=<path>." && \
> -       false)
> -else
> -       $(Q)cp "$(VMLINUX_H)" $@
> -endif

Perhaps we can define VMLINUX_H in the Makefile.config to be
tools/perf/util/bpf_skel/vmlinux.h, then someone can clear it like:

  make -C tools/perf VMLINUX_H=

to trigger generation, etc.
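
A rough sketch of the shape that could take (untested, reusing only the
variable names from the removed rules above, and assuming Makefile.config
sees $(srctree) as it does for other paths):

# Makefile.config (sketch): default to the checked-in, trimmed header.
VMLINUX_H ?= $(srctree)/tools/perf/util/bpf_skel/vmlinux.h

# Makefile.perf (sketch): generate from BTF only when the user explicitly
# cleared the default, e.g. "make -C tools/perf VMLINUX_H=".
$(SKEL_OUT)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL)
ifeq ($(VMLINUX_H),)
        $(QUIET_GEN)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@
else
        $(Q)cp "$(VMLINUX_H)" $@
endif
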
Such thoughts may be better kept for 6.5 :-)

Thanks,
Ian

> -$(SKEL_TMP_OUT)/%.bpf.o: util/bpf_skel/%.bpf.c $(LIBBPF) $(SKEL_OUT)/vmlinux.h | $(SKEL_TMP_OUT)
> +$(SKEL_TMP_OUT)/%.bpf.o: util/bpf_skel/%.bpf.c $(LIBBPF) | $(SKEL_TMP_OUT)
>         $(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -Wall -Werror $(BPF_INCLUDE) \
>           -c $(filter util/bpf_skel/%.bpf.c,$^) -o $@ && $(LLVM_STRIP) -g $@
>
> diff --git a/tools/perf/util/bpf_skel/.gitignore b/tools/perf/util/bpf_skel/.gitignore
> index cd01455e1b53c3d9..7a1c832825de8445 100644
> --- a/tools/perf/util/bpf_skel/.gitignore
> +++ b/tools/perf/util/bpf_skel/.gitignore
> @@ -1,4 +1,3 @@
>  # SPDX-License-Identifier: GPL-2.0-only
>  .tmp
>  *.skel.h
> -vmlinux.h
> diff --git a/tools/perf/util/bpf_skel/vmlinux.h b/tools/perf/util/bpf_skel/vmlinux.h
> new file mode 100644
> index 0000000000000000..449b1ea91fc48143
> --- /dev/null
> +++ b/tools/perf/util/bpf_skel/vmlinux.h
> @@ -0,0 +1,173 @@
> +#ifndef __VMLINUX_H
> +#define __VMLINUX_H
> +
> +#include <linux/bpf.h>
> +#include <linux/types.h>
> +#include <linux/perf_event.h>
> +#include <stdbool.h>
> +
> +// non-UAPI kernel data structures, used in the .bpf.c BPF tool component.
> +
> +// Just the fields used in these tools preserving the access index so that
> +// libbpf can fixup offsets with the ones used in the kernel when loading the
> +// BPF bytecode, if they differ from what is used here.
> +
> +typedef __u8 u8;
> +typedef __u32 u32;
> +typedef __u64 u64;
> +typedef __s64 s64;
> +
> +typedef int pid_t;
> +
> +enum cgroup_subsys_id {
> +        perf_event_cgrp_id = 8,
> +};
> +
> +enum {
> +        HI_SOFTIRQ = 0,
> +        TIMER_SOFTIRQ,
> +        NET_TX_SOFTIRQ,
> +        NET_RX_SOFTIRQ,
> +        BLOCK_SOFTIRQ,
> +        IRQ_POLL_SOFTIRQ,
> +        TASKLET_SOFTIRQ,
> +        SCHED_SOFTIRQ,
> +        HRTIMER_SOFTIRQ,
> +        RCU_SOFTIRQ,    /* Preferable RCU should always be the last softirq */
> +
> +        NR_SOFTIRQS
> +};
> +
> +typedef struct {
> +        s64 counter;
> +} __attribute__((preserve_access_index)) atomic64_t;
> +
> +typedef atomic64_t atomic_long_t;
> +
> +struct raw_spinlock {
> +        int rawlock;
> +} __attribute__((preserve_access_index));
> +
> +typedef struct raw_spinlock raw_spinlock_t;
> +
> +typedef struct {
> +        struct raw_spinlock rlock;
> +} __attribute__((preserve_access_index)) spinlock_t;
> +
> +struct sighand_struct {
> +        spinlock_t siglock;
> +} __attribute__((preserve_access_index));
> +
> +struct rw_semaphore {
> +        atomic_long_t owner;
> +} __attribute__((preserve_access_index));
> +
> +struct mutex {
> +        atomic_long_t owner;
> +} __attribute__((preserve_access_index));
> +
> +struct kernfs_node {
> +        u64 id;
> +} __attribute__((preserve_access_index));
> +
> +struct cgroup {
> +        struct kernfs_node *kn;
> +        int level;
> +} __attribute__((preserve_access_index));
> +
> +struct cgroup_subsys_state {
> +        struct cgroup *cgroup;
> +} __attribute__((preserve_access_index));
> +
> +struct css_set {
> +        struct cgroup_subsys_state *subsys[13];
> +        struct cgroup *dfl_cgrp;
> +} __attribute__((preserve_access_index));
> +
> +struct mm_struct {
> +        struct rw_semaphore mmap_lock;
> +} __attribute__((preserve_access_index));
> +
> +struct task_struct {
> +        unsigned int flags;
> +        struct mm_struct *mm;
> +        pid_t pid;
> +        pid_t tgid;
> +        char comm[16];
> +        struct sighand_struct *sighand;
> +        struct css_set *cgroups;
> +} __attribute__((preserve_access_index));
> +
> +struct trace_entry {
> +        short unsigned int type;
> +        unsigned char flags;
> +        unsigned char preempt_count;
> +        int pid;
> +} __attribute__((preserve_access_index));
> +
> +struct trace_event_raw_irq_handler_entry {
> +        struct trace_entry ent;
> +        int irq;
> +        u32 __data_loc_name;
> +        char __data[];
> +} __attribute__((preserve_access_index));
> +
> +struct trace_event_raw_irq_handler_exit {
> +        struct trace_entry ent;
> +        int irq;
> +        int ret;
> +        char __data[];
> +} __attribute__((preserve_access_index));
> +
> +struct trace_event_raw_softirq {
> +        struct trace_entry ent;
> +        unsigned int vec;
> +        char __data[];
> +} __attribute__((preserve_access_index));
> +
> +struct trace_event_raw_workqueue_execute_start {
> +        struct trace_entry ent;
> +        void *work;
> +        void *function;
> +        char __data[];
> +} __attribute__((preserve_access_index));
> +
> +struct trace_event_raw_workqueue_execute_end {
> +        struct trace_entry ent;
> +        void *work;
> +        void *function;
> +        char __data[];
> +} __attribute__((preserve_access_index));
> +
> +struct trace_event_raw_workqueue_activate_work {
> +        struct trace_entry ent;
> +        void *work;
> +        char __data[];
> +} __attribute__((preserve_access_index));
> +
> +struct perf_sample_data {
> +        u64 addr;
> +        u64 period;
> +        union perf_sample_weight weight;
> +        u64 txn;
> +        union perf_mem_data_src data_src;
> +        u64 ip;
> +        struct {
> +                u32 pid;
> +                u32 tid;
> +        } tid_entry;
> +        u64 time;
> +        u64 id;
> +        struct {
> +                u32 cpu;
> +        } cpu_entry;
> +        u64 phys_addr;
> +        u64 data_page_size;
> +        u64 code_page_size;
> +} __attribute__((__aligned__(64))) __attribute__((preserve_access_index));
> +
> +struct bpf_perf_event_data_kern {
> +        struct perf_sample_data *data;
> +        struct perf_event *event;
> +} __attribute__((preserve_access_index));
> +#endif // __VMLINUX_H
> --
> 2.39.2
>