Hello,

On Wed, Sep 04, 2024 at 12:31:02PM +0000, Tengda Wu wrote:
> bperf has a nice ability to share PMUs, but it still does not support
> inherit events during fork(), resulting in some deviations in its stat
> results compared with perf.
>
> perf stat result:
> $ ./perf stat -e cycles,instructions -- ./perf test -w sqrtloop
>
>  Performance counter stats for './perf test -w sqrtloop':
>
>      2,316,038,116      cycles
>      2,859,350,725      instructions              #    1.23  insn per cycle
>
>        1.009603637 seconds time elapsed
>
>        1.004196000 seconds user
>        0.003950000 seconds sys
>
> bperf stat result:
> $ ./perf stat --bpf-counters -e cycles,instructions -- ./perf test -w sqrtloop
>
>  Performance counter stats for './perf test -w sqrtloop':
>
>         18,762,093      cycles
>         23,487,766      instructions              #    1.25  insn per cycle
>
>        1.008913769 seconds time elapsed
>
>        1.003248000 seconds user
>        0.004069000 seconds sys
>
> In order to support event inheritance, two new bpf programs are added
> to monitor the fork and exit of tasks respectively. When a task is
> created, add it to the filter map to enable counting, and reuse the
> `accum_key` of its parent task to count together with the parent task.
> When a task exits, remove it from the filter map to disable counting.

Right, I think we may need this for other BPF programs.

>
> After support:
> $ ./perf stat --bpf-counters -e cycles,instructions -- ./perf test -w sqrtloop
>
>  Performance counter stats for './perf test -w sqrtloop':
>
>      2,316,543,537      cycles
>      2,859,677,779      instructions              #    1.23  insn per cycle
>
>        1.009566332 seconds time elapsed
>
>        1.004414000 seconds user
>        0.003545000 seconds sys
>
> Signed-off-by: Tengda Wu <wutengda@xxxxxxxxxxxxxxx>
> ---
>  tools/perf/util/bpf_counter.c                 |  9 +--
>  tools/perf/util/bpf_skel/bperf_follower.bpf.c | 75 +++++++++++++++++--
>  tools/perf/util/bpf_skel/bperf_u.h            |  5 ++
>  3 files changed, 78 insertions(+), 11 deletions(-)
>
> diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c
> index 7a8af60e0f51..e07ff04b934f 100644
> --- a/tools/perf/util/bpf_counter.c
> +++ b/tools/perf/util/bpf_counter.c
> @@ -529,9 +529,6 @@ static int bperf__load(struct evsel *evsel, struct target *target)
>          /* set up reading map */
>          bpf_map__set_max_entries(evsel->follower_skel->maps.accum_readings,
>                                   filter_entry_cnt);
> -        /* set up follower filter based on target */
> -        bpf_map__set_max_entries(evsel->follower_skel->maps.filter,
> -                                 filter_entry_cnt);
>          err = bperf_follower_bpf__load(evsel->follower_skel);
>          if (err) {
>                  pr_err("Failed to load follower skeleton\n");
> @@ -543,6 +540,7 @@ static int bperf__load(struct evsel *evsel, struct target *target)
>          for (i = 0; i < filter_entry_cnt; i++) {
>                  int filter_map_fd;
>                  __u32 key;
> +                struct bperf_filter_value fval = { i, 0 };
>
>                  if (filter_type == BPERF_FILTER_PID ||
>                      filter_type == BPERF_FILTER_TGID)
> @@ -553,10 +551,11 @@ static int bperf__load(struct evsel *evsel, struct target *target)
>                          break;
>
>                  filter_map_fd = bpf_map__fd(evsel->follower_skel->maps.filter);
> -                bpf_map_update_elem(filter_map_fd, &key, &i, BPF_ANY);
> +                bpf_map_update_elem(filter_map_fd, &key, &fval, BPF_ANY);
>          }
>
>          evsel->follower_skel->bss->type = filter_type;
> +        evsel->follower_skel->bss->init_filter_entries = filter_entry_cnt;

Do you need this in the BPF program?
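AFAICS the BPF programs never read init_filter_entries themselves; it is
only read back from the .bss by bperf__read().  So the count could also
stay on the userspace side.  An untested sketch of what I mean, using a
hypothetical bperf_filter_entry_cnt field added to struct evsel:

        /* hypothetical field next to follower_skel in struct evsel */
        __u32 bperf_filter_entry_cnt;

        /* bperf__load(): remember the count instead of writing it to .bss */
        evsel->bperf_filter_entry_cnt = filter_entry_cnt;

        /* bperf__read(): iterate over the remembered count */
        for (i = 0; i < evsel->bperf_filter_entry_cnt; i++) {
                ...
        }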
>
>          err = bperf_follower_bpf__attach(evsel->follower_skel);
>
> @@ -623,7 +622,7 @@ static int bperf__read(struct evsel *evsel)
>          bperf_sync_counters(evsel);
>          reading_map_fd = bpf_map__fd(skel->maps.accum_readings);
>
> -        for (i = 0; i < bpf_map__max_entries(skel->maps.accum_readings); i++) {
> +        for (i = 0; i < skel->bss->init_filter_entries; i++) {
>                  struct perf_cpu entry;
>                  __u32 cpu;
>
> diff --git a/tools/perf/util/bpf_skel/bperf_follower.bpf.c b/tools/perf/util/bpf_skel/bperf_follower.bpf.c
> index f193998530d4..59fab421526a 100644
> --- a/tools/perf/util/bpf_skel/bperf_follower.bpf.c
> +++ b/tools/perf/util/bpf_skel/bperf_follower.bpf.c
> @@ -5,6 +5,8 @@
>  #include <bpf/bpf_tracing.h>
>  #include "bperf_u.h"
>
> +#define MAX_ENTRIES 102400
> +
>  struct {
>          __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
>          __uint(key_size, sizeof(__u32));
> @@ -22,25 +24,29 @@ struct {
>  struct {
>          __uint(type, BPF_MAP_TYPE_HASH);
>          __uint(key_size, sizeof(__u32));
> -        __uint(value_size, sizeof(__u32));
> +        __uint(value_size, sizeof(struct bperf_filter_value));
> +        __uint(max_entries, MAX_ENTRIES);
> +        __uint(map_flags, BPF_F_NO_PREALLOC);
>  } filter SEC(".maps");
>
>  enum bperf_filter_type type = 0;
>  int enabled = 0;
> +__u32 init_filter_entries = 0;
>
>  SEC("fexit/XXX")
>  int BPF_PROG(fexit_XXX)
>  {
>          struct bpf_perf_event_value *diff_val, *accum_val;
>          __u32 filter_key, zero = 0;
> -        __u32 *accum_key;
> +        __u32 accum_key;
> +        struct bperf_filter_value *fval;
>
>          if (!enabled)
>                  return 0;
>
>          switch (type) {
>          case BPERF_FILTER_GLOBAL:
> -                accum_key = &zero;
> +                accum_key = zero;
>                  goto do_add;
>          case BPERF_FILTER_CPU:
>                  filter_key = bpf_get_smp_processor_id();
> @@ -55,16 +61,20 @@ int BPF_PROG(fexit_XXX)
>                  return 0;
>          }
>
> -        accum_key = bpf_map_lookup_elem(&filter, &filter_key);
> -        if (!accum_key)
> +        fval = bpf_map_lookup_elem(&filter, &filter_key);
> +        if (!fval)
>                  return 0;
>
> +        accum_key = fval->accum_key;
> +        if (fval->exited)
> +                bpf_map_delete_elem(&filter, &filter_key);
> +
>  do_add:
>          diff_val = bpf_map_lookup_elem(&diff_readings, &zero);
>          if (!diff_val)
>                  return 0;
>
> -        accum_val = bpf_map_lookup_elem(&accum_readings, accum_key);
> +        accum_val = bpf_map_lookup_elem(&accum_readings, &accum_key);
>          if (!accum_val)
>                  return 0;
>
> @@ -75,4 +85,57 @@ int BPF_PROG(fexit_XXX)
>          return 0;
>  }
>
> +SEC("tp_btf/task_newtask")
> +int BPF_PROG(on_newtask, struct task_struct *task, __u64 clone_flags)
> +{
> +        __u32 parent_pid, child_pid;
> +        struct bperf_filter_value *parent_fval;
> +        struct bperf_filter_value child_fval = { 0 };
> +
> +        if (!enabled)
> +                return 0;
> +
> +        if (type != BPERF_FILTER_PID && type != BPERF_FILTER_TGID)
> +                return 0;

I think you can load/attach this program only if the type is either PID
or TGID.  Then it'd reduce the overhead.
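Something like the below in bperf__load(), before bperf_follower_bpf__load(),
should do it (untested sketch; it assumes the skeleton exposes the two new
programs as progs.on_newtask and progs.on_exittask), and IIUC the skeleton
attach step would then skip them as well:

        if (filter_type != BPERF_FILTER_PID &&
            filter_type != BPERF_FILTER_TGID) {
                /* no fork/exit tracking needed for non-task targets */
                bpf_program__set_autoload(evsel->follower_skel->progs.on_newtask, false);
                bpf_program__set_autoload(evsel->follower_skel->progs.on_exittask, false);
        }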
> +
> +        parent_pid = bpf_get_current_pid_tgid() >> 32;
> +        child_pid = task->pid;
> +
> +        /* Check if the current task is one of the target tasks to be counted */
> +        parent_fval = bpf_map_lookup_elem(&filter, &parent_pid);
> +        if (!parent_fval)
> +                return 0;
> +
> +        /* Start counting for the new task by adding it into filter map,
> +         * inherit the accum key of its parent task so that they can be
> +         * counted together.
> +         */
> +        child_fval.accum_key = parent_fval->accum_key;
> +        child_fval.exited = 0;
> +        bpf_map_update_elem(&filter, &child_pid, &child_fval, BPF_NOEXIST);
> +
> +        return 0;
> +}
> +
> +SEC("tp_btf/sched_process_exit")
> +int BPF_PROG(on_exittask, struct task_struct *task)
> +{
> +        __u32 pid;
> +        struct bperf_filter_value *fval;
> +
> +        if (!enabled)
> +                return 0;
> +
> +        if (type != BPERF_FILTER_PID && type != BPERF_FILTER_TGID)
> +                return 0;

Ditto.

Thanks,
Namhyung

> +
> +        /* Stop counting for this task by removing it from filter map */
> +        pid = task->pid;
> +        fval = bpf_map_lookup_elem(&filter, &pid);
> +        if (fval)
> +                fval->exited = 1;
> +
> +        return 0;
> +}
> +
>  char LICENSE[] SEC("license") = "Dual BSD/GPL";
> diff --git a/tools/perf/util/bpf_skel/bperf_u.h b/tools/perf/util/bpf_skel/bperf_u.h
> index 1ce0c2c905c1..4a4a753980be 100644
> --- a/tools/perf/util/bpf_skel/bperf_u.h
> +++ b/tools/perf/util/bpf_skel/bperf_u.h
> @@ -11,4 +11,9 @@ enum bperf_filter_type {
>          BPERF_FILTER_TGID,
>  };
>
> +struct bperf_filter_value {
> +        __u32 accum_key;
> +        __u8 exited;
> +};
> +
>  #endif /* __BPERF_STAT_U_H */
> --
> 2.34.1
>