On 2024/9/5 12:01, Namhyung Kim wrote: > Hello, > > On Wed, Sep 04, 2024 at 12:31:02PM +0000, Tengda Wu wrote: >> bperf has a nice ability to share PMUs, but it still does not support >> inherit events during fork(), resulting in some deviations in its stat >> results compared with perf. >> >> perf stat result: >> $ ./perf stat -e cycles,instructions -- ./perf test -w sqrtloop >> >> Performance counter stats for './perf test -w sqrtloop': >> >> 2,316,038,116 cycles >> 2,859,350,725 instructions # 1.23 insn per cycle >> >> 1.009603637 seconds time elapsed >> >> 1.004196000 seconds user >> 0.003950000 seconds sys >> >> bperf stat result: >> $ ./perf stat --bpf-counters -e cycles,instructions -- ./perf test -w sqrtloop >> >> Performance counter stats for './perf test -w sqrtloop': >> >> 18,762,093 cycles >> 23,487,766 instructions # 1.25 insn per cycle >> >> 1.008913769 seconds time elapsed >> >> 1.003248000 seconds user >> 0.004069000 seconds sys >> >> In order to support event inheritance, two new bpf programs are added >> to monitor the fork and exit of tasks respectively. When a task is >> created, add it to the filter map to enable counting, and reuse the >> `accum_key` of its parent task to count together with the parent task. >> When a task exits, remove it from the filter map to disable counting. > > Right, I think we may need this for other BPF programs. > >> >> After support: >> $ ./perf stat --bpf-counters -e cycles,instructions -- ./perf test -w sqrtloop >> >> Performance counter stats for './perf test -w sqrtloop': >> >> 2,316,543,537 cycles >> 2,859,677,779 instructions # 1.23 insn per cycle >> >> 1.009566332 seconds time elapsed >> >> 1.004414000 seconds user >> 0.003545000 seconds sys >> >> Signed-off-by: Tengda Wu <wutengda@xxxxxxxxxxxxxxx> >> --- >> tools/perf/util/bpf_counter.c | 9 +-- >> tools/perf/util/bpf_skel/bperf_follower.bpf.c | 75 +++++++++++++++++-- >> tools/perf/util/bpf_skel/bperf_u.h | 5 ++ >> 3 files changed, 78 insertions(+), 11 deletions(-) >> >> diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c >> index 7a8af60e0f51..e07ff04b934f 100644 >> --- a/tools/perf/util/bpf_counter.c >> +++ b/tools/perf/util/bpf_counter.c >> @@ -529,9 +529,6 @@ static int bperf__load(struct evsel *evsel, struct target *target) >> /* set up reading map */ >> bpf_map__set_max_entries(evsel->follower_skel->maps.accum_readings, >> filter_entry_cnt); >> - /* set up follower filter based on target */ >> - bpf_map__set_max_entries(evsel->follower_skel->maps.filter, >> - filter_entry_cnt); >> err = bperf_follower_bpf__load(evsel->follower_skel); >> if (err) { >> pr_err("Failed to load follower skeleton\n"); >> @@ -543,6 +540,7 @@ static int bperf__load(struct evsel *evsel, struct target *target) >> for (i = 0; i < filter_entry_cnt; i++) { >> int filter_map_fd; >> __u32 key; >> + struct bperf_filter_value fval = { i, 0 }; >> >> if (filter_type == BPERF_FILTER_PID || >> filter_type == BPERF_FILTER_TGID) >> @@ -553,10 +551,11 @@ static int bperf__load(struct evsel *evsel, struct target *target) >> break; >> >> filter_map_fd = bpf_map__fd(evsel->follower_skel->maps.filter); >> - bpf_map_update_elem(filter_map_fd, &key, &i, BPF_ANY); >> + bpf_map_update_elem(filter_map_fd, &key, &fval, BPF_ANY); >> } >> >> evsel->follower_skel->bss->type = filter_type; >> + evsel->follower_skel->bss->init_filter_entries = filter_entry_cnt; > > Do you need this in the BPF program? No need. Actually, this variable is only used in bperf__read(). Since bperf__read() does not have a `struct target` parameter, it is not possible to check the filter_entry_cnt value again through bperf_check_target(). We can only put it in a global scope, one way is the current implementation, and another way is to declare a global variable in bpf_counter. Maybe the second way is more appropriate? If so, I will modify it. > >> >> err = bperf_follower_bpf__attach(evsel->follower_skel); >> >> @@ -623,7 +622,7 @@ static int bperf__read(struct evsel *evsel) >> bperf_sync_counters(evsel); >> reading_map_fd = bpf_map__fd(skel->maps.accum_readings); >> >> - for (i = 0; i < bpf_map__max_entries(skel->maps.accum_readings); i++) { >> + for (i = 0; i < skel->bss->init_filter_entries; i++) { >> struct perf_cpu entry; >> __u32 cpu; >> >> diff --git a/tools/perf/util/bpf_skel/bperf_follower.bpf.c b/tools/perf/util/bpf_skel/bperf_follower.bpf.c >> index f193998530d4..59fab421526a 100644 >> --- a/tools/perf/util/bpf_skel/bperf_follower.bpf.c >> +++ b/tools/perf/util/bpf_skel/bperf_follower.bpf.c >> @@ -5,6 +5,8 @@ >> #include <bpf/bpf_tracing.h> >> #include "bperf_u.h" >> >> +#define MAX_ENTRIES 102400 >> + >> struct { >> __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); >> __uint(key_size, sizeof(__u32)); >> @@ -22,25 +24,29 @@ struct { >> struct { >> __uint(type, BPF_MAP_TYPE_HASH); >> __uint(key_size, sizeof(__u32)); >> - __uint(value_size, sizeof(__u32)); >> + __uint(value_size, sizeof(struct bperf_filter_value)); >> + __uint(max_entries, MAX_ENTRIES); >> + __uint(map_flags, BPF_F_NO_PREALLOC); >> } filter SEC(".maps"); >> >> enum bperf_filter_type type = 0; >> int enabled = 0; >> +__u32 init_filter_entries = 0; >> >> SEC("fexit/XXX") >> int BPF_PROG(fexit_XXX) >> { >> struct bpf_perf_event_value *diff_val, *accum_val; >> __u32 filter_key, zero = 0; >> - __u32 *accum_key; >> + __u32 accum_key; >> + struct bperf_filter_value *fval; >> >> if (!enabled) >> return 0; >> >> switch (type) { >> case BPERF_FILTER_GLOBAL: >> - accum_key = &zero; >> + accum_key = zero; >> goto do_add; >> case BPERF_FILTER_CPU: >> filter_key = bpf_get_smp_processor_id(); >> @@ -55,16 +61,20 @@ int BPF_PROG(fexit_XXX) >> return 0; >> } >> >> - accum_key = bpf_map_lookup_elem(&filter, &filter_key); >> - if (!accum_key) >> + fval = bpf_map_lookup_elem(&filter, &filter_key); >> + if (!fval) >> return 0; >> >> + accum_key = fval->accum_key; >> + if (fval->exited) >> + bpf_map_delete_elem(&filter, &filter_key); >> + >> do_add: >> diff_val = bpf_map_lookup_elem(&diff_readings, &zero); >> if (!diff_val) >> return 0; >> >> - accum_val = bpf_map_lookup_elem(&accum_readings, accum_key); >> + accum_val = bpf_map_lookup_elem(&accum_readings, &accum_key); >> if (!accum_val) >> return 0; >> >> @@ -75,4 +85,57 @@ int BPF_PROG(fexit_XXX) >> return 0; >> } >> >> +SEC("tp_btf/task_newtask") >> +int BPF_PROG(on_newtask, struct task_struct *task, __u64 clone_flags) >> +{ >> + __u32 parent_pid, child_pid; >> + struct bperf_filter_value *parent_fval; >> + struct bperf_filter_value child_fval = { 0 }; >> + >> + if (!enabled) >> + return 0; >> + >> + if (type != BPERF_FILTER_PID && type != BPERF_FILTER_TGID) >> + return 0; > > I think you can load/attach this program only if the type is either > PID or TGID. Then it'd reduce the overhead. Great suggestion, I'll give it a try. > >> + >> + parent_pid = bpf_get_current_pid_tgid() >> 32; >> + child_pid = task->pid; >> + >> + /* Check if the current task is one of the target tasks to be counted */ >> + parent_fval = bpf_map_lookup_elem(&filter, &parent_pid); >> + if (!parent_fval) >> + return 0; >> + >> + /* Start counting for the new task by adding it into filter map, >> + * inherit the accum key of its parent task so that they can be >> + * counted together. >> + */ >> + child_fval.accum_key = parent_fval->accum_key; >> + child_fval.exited = 0; >> + bpf_map_update_elem(&filter, &child_pid, &child_fval, BPF_NOEXIST); >> + >> + return 0; >> +} >> + >> +SEC("tp_btf/sched_process_exit") >> +int BPF_PROG(on_exittask, struct task_struct *task) >> +{ >> + __u32 pid; >> + struct bperf_filter_value *fval; >> + >> + if (!enabled) >> + return 0; >> + >> + if (type != BPERF_FILTER_PID && type != BPERF_FILTER_TGID) >> + return 0; > > Ditto. > > Thanks, > Namhyung > >> + >> + /* Stop counting for this task by removing it from filter map */ >> + pid = task->pid; >> + fval = bpf_map_lookup_elem(&filter, &pid); >> + if (fval) >> + fval->exited = 1; >> + >> + return 0; >> +} >> + >> char LICENSE[] SEC("license") = "Dual BSD/GPL"; >> diff --git a/tools/perf/util/bpf_skel/bperf_u.h b/tools/perf/util/bpf_skel/bperf_u.h >> index 1ce0c2c905c1..4a4a753980be 100644 >> --- a/tools/perf/util/bpf_skel/bperf_u.h >> +++ b/tools/perf/util/bpf_skel/bperf_u.h >> @@ -11,4 +11,9 @@ enum bperf_filter_type { >> BPERF_FILTER_TGID, >> }; >> >> +struct bperf_filter_value { >> + __u32 accum_key; >> + __u8 exited; >> +}; >> + >> #endif /* __BPERF_STAT_U_H */ >> -- >> 2.34.1 >>