Re: [PATCH -next 1/2] perf stat: Support inherit events during fork() for bperf

Tengda Wu <wutengda@xxxxxxxxxxxxxxx> · Thu, 5 Sep 2024 14:44:14 +0800

On 2024/9/5 12:01, Namhyung Kim wrote:
> Hello,
> 
> On Wed, Sep 04, 2024 at 12:31:02PM +0000, Tengda Wu wrote:
>> bperf has a nice ability to share PMUs, but it still does not support
>> inherit events during fork(), resulting in some deviations in its stat
>> results compared with perf.
>>
>> perf stat result:
>>   $ ./perf stat -e cycles,instructions -- ./perf test -w sqrtloop
>>
>>    Performance counter stats for './perf test -w sqrtloop':
>>
>>        2,316,038,116      cycles
>>        2,859,350,725      instructions                     #    1.23  insn per cycle
>>
>>          1.009603637 seconds time elapsed
>>
>>          1.004196000 seconds user
>>          0.003950000 seconds sys
>>
>> bperf stat result:
>>   $ ./perf stat --bpf-counters -e cycles,instructions -- ./perf test -w sqrtloop
>>
>>    Performance counter stats for './perf test -w sqrtloop':
>>
>>           18,762,093      cycles
>>           23,487,766      instructions                     #    1.25  insn per cycle
>>
>>          1.008913769 seconds time elapsed
>>
>>          1.003248000 seconds user
>>          0.004069000 seconds sys
>>
>> In order to support event inheritance, two new bpf programs are added
>> to monitor the fork and exit of tasks respectively. When a task is
>> created, add it to the filter map to enable counting, and reuse the
>> `accum_key` of its parent task to count together with the parent task.
>> When a task exits, remove it from the filter map to disable counting.
> 
> Right, I think we may need this for other BPF programs.
> 
>>
>> After support:
>>   $ ./perf stat --bpf-counters -e cycles,instructions -- ./perf test -w sqrtloop
>>
>>    Performance counter stats for './perf test -w sqrtloop':
>>
>>        2,316,543,537      cycles
>>        2,859,677,779      instructions                     #    1.23  insn per cycle
>>
>>          1.009566332 seconds time elapsed
>>
>>          1.004414000 seconds user
>>          0.003545000 seconds sys
>>
>> Signed-off-by: Tengda Wu <wutengda@xxxxxxxxxxxxxxx>
>> ---
>>  tools/perf/util/bpf_counter.c                 |  9 +--
>>  tools/perf/util/bpf_skel/bperf_follower.bpf.c | 75 +++++++++++++++++--
>>  tools/perf/util/bpf_skel/bperf_u.h            |  5 ++
>>  3 files changed, 78 insertions(+), 11 deletions(-)
>>
>> diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c
>> index 7a8af60e0f51..e07ff04b934f 100644
>> --- a/tools/perf/util/bpf_counter.c
>> +++ b/tools/perf/util/bpf_counter.c
>> @@ -529,9 +529,6 @@ static int bperf__load(struct evsel *evsel, struct target *target)
>>  	/* set up reading map */
>>  	bpf_map__set_max_entries(evsel->follower_skel->maps.accum_readings,
>>  				 filter_entry_cnt);
>> -	/* set up follower filter based on target */
>> -	bpf_map__set_max_entries(evsel->follower_skel->maps.filter,
>> -				 filter_entry_cnt);
>>  	err = bperf_follower_bpf__load(evsel->follower_skel);
>>  	if (err) {
>>  		pr_err("Failed to load follower skeleton\n");
>> @@ -543,6 +540,7 @@ static int bperf__load(struct evsel *evsel, struct target *target)
>>  	for (i = 0; i < filter_entry_cnt; i++) {
>>  		int filter_map_fd;
>>  		__u32 key;
>> +		struct bperf_filter_value fval = { i, 0 };
>>  
>>  		if (filter_type == BPERF_FILTER_PID ||
>>  		    filter_type == BPERF_FILTER_TGID)
>> @@ -553,10 +551,11 @@ static int bperf__load(struct evsel *evsel, struct target *target)
>>  			break;
>>  
>>  		filter_map_fd = bpf_map__fd(evsel->follower_skel->maps.filter);
>> -		bpf_map_update_elem(filter_map_fd, &key, &i, BPF_ANY);
>> +		bpf_map_update_elem(filter_map_fd, &key, &fval, BPF_ANY);
>>  	}
>>  
>>  	evsel->follower_skel->bss->type = filter_type;
>> +	evsel->follower_skel->bss->init_filter_entries = filter_entry_cnt;
> 
> Do you need this in the BPF program?

No need. Actually, this variable is only used in bperf__read(). 
Since bperf__read() does not have a `struct target` parameter, it is not possible to check the
filter_entry_cnt value again through bperf_check_target(). We can only put it in a global scope,
one way is the current implementation, and another way is to declare a global variable in bpf_counter.
Maybe the second way is more appropriate? If so, I will modify it.

> 
>>  
>>  	err = bperf_follower_bpf__attach(evsel->follower_skel);
>>  
>> @@ -623,7 +622,7 @@ static int bperf__read(struct evsel *evsel)
>>  	bperf_sync_counters(evsel);
>>  	reading_map_fd = bpf_map__fd(skel->maps.accum_readings);
>>  
>> -	for (i = 0; i < bpf_map__max_entries(skel->maps.accum_readings); i++) {
>> +	for (i = 0; i < skel->bss->init_filter_entries; i++) {
>>  		struct perf_cpu entry;
>>  		__u32 cpu;
>>  
>> diff --git a/tools/perf/util/bpf_skel/bperf_follower.bpf.c b/tools/perf/util/bpf_skel/bperf_follower.bpf.c
>> index f193998530d4..59fab421526a 100644
>> --- a/tools/perf/util/bpf_skel/bperf_follower.bpf.c
>> +++ b/tools/perf/util/bpf_skel/bperf_follower.bpf.c
>> @@ -5,6 +5,8 @@
>>  #include <bpf/bpf_tracing.h>
>>  #include "bperf_u.h"
>>  
>> +#define MAX_ENTRIES 102400
>> +
>>  struct {
>>  	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
>>  	__uint(key_size, sizeof(__u32));
>> @@ -22,25 +24,29 @@ struct {
>>  struct {
>>  	__uint(type, BPF_MAP_TYPE_HASH);
>>  	__uint(key_size, sizeof(__u32));
>> -	__uint(value_size, sizeof(__u32));
>> +	__uint(value_size, sizeof(struct bperf_filter_value));
>> +	__uint(max_entries, MAX_ENTRIES);
>> +	__uint(map_flags, BPF_F_NO_PREALLOC);
>>  } filter SEC(".maps");
>>  
>>  enum bperf_filter_type type = 0;
>>  int enabled = 0;
>> +__u32 init_filter_entries = 0;
>>  
>>  SEC("fexit/XXX")
>>  int BPF_PROG(fexit_XXX)
>>  {
>>  	struct bpf_perf_event_value *diff_val, *accum_val;
>>  	__u32 filter_key, zero = 0;
>> -	__u32 *accum_key;
>> +	__u32 accum_key;
>> +	struct bperf_filter_value *fval;
>>  
>>  	if (!enabled)
>>  		return 0;
>>  
>>  	switch (type) {
>>  	case BPERF_FILTER_GLOBAL:
>> -		accum_key = &zero;
>> +		accum_key = zero;
>>  		goto do_add;
>>  	case BPERF_FILTER_CPU:
>>  		filter_key = bpf_get_smp_processor_id();
>> @@ -55,16 +61,20 @@ int BPF_PROG(fexit_XXX)
>>  		return 0;
>>  	}
>>  
>> -	accum_key = bpf_map_lookup_elem(&filter, &filter_key);
>> -	if (!accum_key)
>> +	fval = bpf_map_lookup_elem(&filter, &filter_key);
>> +	if (!fval)
>>  		return 0;
>>  
>> +	accum_key = fval->accum_key;
>> +	if (fval->exited)
>> +		bpf_map_delete_elem(&filter, &filter_key);
>> +
>>  do_add:
>>  	diff_val = bpf_map_lookup_elem(&diff_readings, &zero);
>>  	if (!diff_val)
>>  		return 0;
>>  
>> -	accum_val = bpf_map_lookup_elem(&accum_readings, accum_key);
>> +	accum_val = bpf_map_lookup_elem(&accum_readings, &accum_key);
>>  	if (!accum_val)
>>  		return 0;
>>  
>> @@ -75,4 +85,57 @@ int BPF_PROG(fexit_XXX)
>>  	return 0;
>>  }
>>  
>> +SEC("tp_btf/task_newtask")
>> +int BPF_PROG(on_newtask, struct task_struct *task, __u64 clone_flags)
>> +{
>> +	__u32 parent_pid, child_pid;
>> +	struct bperf_filter_value *parent_fval;
>> +	struct bperf_filter_value child_fval = { 0 };
>> +
>> +	if (!enabled)
>> +		return 0;
>> +
>> +	if (type != BPERF_FILTER_PID && type != BPERF_FILTER_TGID)
>> +		return 0;
> 
> I think you can load/attach this program only if the type is either
> PID or TGID.  Then it'd reduce the overhead.

Great suggestion, I'll give it a try.

> 
>> +
>> +	parent_pid = bpf_get_current_pid_tgid() >> 32;
>> +	child_pid = task->pid;
>> +
>> +	/* Check if the current task is one of the target tasks to be counted */
>> +	parent_fval = bpf_map_lookup_elem(&filter, &parent_pid);
>> +	if (!parent_fval)
>> +		return 0;
>> +
>> +	/* Start counting for the new task by adding it into filter map,
>> +	 * inherit the accum key of its parent task so that they can be
>> +	 * counted together.
>> +	 */
>> +	child_fval.accum_key = parent_fval->accum_key;
>> +	child_fval.exited = 0;
>> +	bpf_map_update_elem(&filter, &child_pid, &child_fval, BPF_NOEXIST);
>> +
>> +	return 0;
>> +}
>> +
>> +SEC("tp_btf/sched_process_exit")
>> +int BPF_PROG(on_exittask, struct task_struct *task)
>> +{
>> +	__u32 pid;
>> +	struct bperf_filter_value *fval;
>> +
>> +	if (!enabled)
>> +		return 0;
>> +
>> +	if (type != BPERF_FILTER_PID && type != BPERF_FILTER_TGID)
>> +		return 0;
> 
> Ditto.
> 
> Thanks,
> Namhyung
> 
>> +
>> +	/* Stop counting for this task by removing it from filter map */
>> +	pid = task->pid;
>> +	fval = bpf_map_lookup_elem(&filter, &pid);
>> +	if (fval)
>> +		fval->exited = 1;
>> +
>> +	return 0;
>> +}
>> +
>>  char LICENSE[] SEC("license") = "Dual BSD/GPL";
>> diff --git a/tools/perf/util/bpf_skel/bperf_u.h b/tools/perf/util/bpf_skel/bperf_u.h
>> index 1ce0c2c905c1..4a4a753980be 100644
>> --- a/tools/perf/util/bpf_skel/bperf_u.h
>> +++ b/tools/perf/util/bpf_skel/bperf_u.h
>> @@ -11,4 +11,9 @@ enum bperf_filter_type {
>>  	BPERF_FILTER_TGID,
>>  };
>>  
>> +struct bperf_filter_value {
>> +	__u32 accum_key;
>> +	__u8 exited;
>> +};
>> +
>>  #endif /* __BPERF_STAT_U_H */
>> -- 
>> 2.34.1
>>