2023-01-19 16:22 UTC+0800 ~ Tonghao Zhang <tong@xxxxxxxxxxxxx>
>> On Jan 18, 2023, at 6:41 PM, Quentin Monnet <quentin@xxxxxxxxxxxxx> wrote:
>>
>> 2023-01-17 12:49 UTC+0800 ~ tong@xxxxxxxxxxxxx
>>> From: Tonghao Zhang <tong@xxxxxxxxxxxxx>
>>>
>>> The number of online cpu may be not equal to possible cpu.
>>> bpftool prog profile, can not create pmu event on possible
>>> but not online cpu.
>>
>> s/not/on/ ?
>>
>>>
>>> $ dmidecode -s system-product-name
>>> PowerEdge R620
>>> $ cat /sys/devices/system/cpu/online
>>> 0-31
>>> $ cat /sys/devices/system/cpu/possible
>>> 0-47
>>>
>>> To fix this issue, use online cpu instead of possible, to
>>> create perf event and other resource.
>>>
>>> Signed-off-by: Tonghao Zhang <tong@xxxxxxxxxxxxx>
>>> Cc: Quentin Monnet <quentin@xxxxxxxxxxxxx>
>>> Cc: Alexei Starovoitov <ast@xxxxxxxxxx>
>>> Cc: Daniel Borkmann <daniel@xxxxxxxxxxxxx>
>>> Cc: Andrii Nakryiko <andrii@xxxxxxxxxx>
>>> Cc: Martin KaFai Lau <martin.lau@xxxxxxxxx>
>>> Cc: Song Liu <song@xxxxxxxxxx>
>>> Cc: Yonghong Song <yhs@xxxxxx>
>>> Cc: John Fastabend <john.fastabend@xxxxxxxxx>
>>> Cc: KP Singh <kpsingh@xxxxxxxxxx>
>>> Cc: Stanislav Fomichev <sdf@xxxxxxxxxx>
>>> Cc: Hao Luo <haoluo@xxxxxxxxxx>
>>> Cc: Jiri Olsa <jolsa@xxxxxxxxxx>
>>> ---
>>>  tools/bpf/bpftool/prog.c | 5 +++--
>>>  1 file changed, 3 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
>>> index cfc9fdc1e863..08b352dd799e 100644
>>> --- a/tools/bpf/bpftool/prog.c
>>> +++ b/tools/bpf/bpftool/prog.c
>>> @@ -2056,6 +2056,7 @@ static int profile_parse_metrics(int argc, char **argv)
>>>
>>>  static void profile_read_values(struct profiler_bpf *obj)
>>>  {
>>> +	__u32 possible_cpus = libbpf_num_possible_cpus();
>>>  	__u32 m, cpu, num_cpu = obj->rodata->num_cpu;
>>>  	int reading_map_fd, count_map_fd;
>>>  	__u64 counts[num_cpu];
>>> @@ -2080,7 +2081,7 @@ static void profile_read_values(struct profiler_bpf *obj)
>>>  		profile_total_count += counts[cpu];
>>>
>>>  	for (m = 0; m < ARRAY_SIZE(metrics); m++) {
>>> -		struct bpf_perf_event_value values[num_cpu];
>>> +		struct bpf_perf_event_value values[possible_cpus];
>>>
>>>  		if (!metrics[m].selected)
>>>  			continue;
>>> @@ -2321,7 +2322,7 @@ static int do_profile(int argc, char **argv)
>>>  	if (num_metric <= 0)
>>>  		goto out;
>>>
>>> -	num_cpu = libbpf_num_possible_cpus();
>>> +	num_cpu = libbpf_num_online_cpus();
>>>  	if (num_cpu <= 0) {
>>>  		p_err("failed to identify number of CPUs");
>>>  		goto out;
>>
>> Thanks, but it doesn't seem to be enough to solve the issue. How did you
>> test it? With your series applied locally, I'm trying the following
>> (Intel x86_64, CPUs: 0..7):
>>
>> # echo 0 > /sys/devices/system/cpu/cpu2/online
>> # ./bpftool prog profile id 1525 duration 1 cycles instructions
>> Error: failed to create event cycles on cpu 2
>>
>> It seems that we're still trying to open the perf events on the offline
>> CPU in profile_open_perf_events(), because even though we try to use
>> fewer of the possible CPUs we're still referencing them in order by
> Hi
> Thanks for your review and comment.
> I didn't test the case where the offline CPU is not the last one.
>> their index. So it works if I only disabled the last CPUs instead (#7,
>> then #6, ...), but to work with _any_ CPU disabled, we would need to
>> retrieve the list of online CPUs.
>>
> Yes. Alternatively, to fix it, we can check the errno returned by
> perf_event_open(): if it is ENODEV, we can skip profiling on this CPU.
> The patch below fixes this issue.
>
> diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
> index 620032042576..deec7196b48c 100644
> --- a/tools/bpf/bpftool/common.c
> +++ b/tools/bpf/bpftool/common.c
> @@ -55,6 +55,24 @@ void p_err(const char *fmt, ...)
>  	va_end(ap);
>  }
>
> +void p_warn(const char *fmt, ...)
> +{
> +	va_list ap;
> +
> +	va_start(ap, fmt);
> +	if (json_output) {
> +		jsonw_start_object(json_wtr);
> +		jsonw_name(json_wtr, "warning");
> +		jsonw_vprintf_enquote(json_wtr, fmt, ap);
> +		jsonw_end_object(json_wtr);
> +	} else {
> +		fprintf(stderr, "Warn: ");
> +		vfprintf(stderr, fmt, ap);
> +		fprintf(stderr, "\n");
> +	}
> +	va_end(ap);
> +}
> +
>  void p_info(const char *fmt, ...)
>  {
>  	va_list ap;
> diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
> index a84224b6a604..e62edec9e13a 100644
> --- a/tools/bpf/bpftool/main.h
> +++ b/tools/bpf/bpftool/main.h
> @@ -86,6 +86,7 @@ extern struct btf *base_btf;
>  extern struct hashmap *refs_table;
>
>  void __printf(1, 2) p_err(const char *fmt, ...);
> +void __printf(1, 2) p_warn(const char *fmt, ...);
>  void __printf(1, 2) p_info(const char *fmt, ...);
>
>  bool is_prefix(const char *pfx, const char *str);
> diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
> index cfc9fdc1e863..d9363ba01ec0 100644
> --- a/tools/bpf/bpftool/prog.c
> +++ b/tools/bpf/bpftool/prog.c
> @@ -2233,10 +2233,36 @@ static void profile_close_perf_events(struct profiler_bpf *obj)
>  	profile_perf_event_cnt = 0;
>  }
>
> +static int profile_open_perf_event(int mid, int cpu, int map_fd)
> +{
> +	int pmu_fd;
> +
> +	pmu_fd = syscall(__NR_perf_event_open, &metrics[mid].attr,
> +			 -1/*pid*/, cpu, -1/*group_fd*/, 0);
> +	if (pmu_fd < 0) {
> +		if (errno == ENODEV) {
> +			p_warn("cpu %d may be offline, skip %s metric profiling.",
> +			       cpu, metrics[mid].name);

Nit: I think it's fine to keep this at the info level (p_info()).

> +			profile_perf_event_cnt++;
> +			return 0;
> +		}
> +		return -1;
> +	}
> +
> +	if (bpf_map_update_elem(map_fd,
> +				&profile_perf_event_cnt,
> +				&pmu_fd, BPF_ANY) ||
> +	    ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0))
> +		return -1;
> +
> +	profile_perf_events[profile_perf_event_cnt++] = pmu_fd;
> +	return 0;
> +}
> +
>  static int profile_open_perf_events(struct profiler_bpf *obj)
>  {
>  	unsigned int cpu, m;
> -	int map_fd, pmu_fd;
> +	int map_fd;
>
>  	profile_perf_events = calloc(
>  		sizeof(int), obj->rodata->num_cpu * obj->rodata->num_metric);
> @@ -2255,17 +2281,11 @@ static int profile_open_perf_events(struct profiler_bpf *obj)
>  		if (!metrics[m].selected)
>  			continue;
>  		for (cpu = 0; cpu < obj->rodata->num_cpu; cpu++) {
> -			pmu_fd = syscall(__NR_perf_event_open, &metrics[m].attr,
> -					 -1/*pid*/, cpu, -1/*group_fd*/, 0);
> -			if (pmu_fd < 0 ||
> -			    bpf_map_update_elem(map_fd, &profile_perf_event_cnt,
> -						&pmu_fd, BPF_ANY) ||
> -			    ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0)) {
> +			if (profile_open_perf_event(m, cpu, map_fd)) {
>  				p_err("failed to create event %s on cpu %d",
>  				      metrics[m].name, cpu);
>  				return -1;
>  			}
> -			profile_perf_events[profile_perf_event_cnt++] = pmu_fd;
>  		}
>  	}
>  	return 0;
>
>
> ----
> Best Regards, Tonghao <tong@xxxxxxxxxxxxx>
>

I haven't tested this patch, but yes, looks like it should address the issue.
Could you submit a proper v2, please?

Quentin
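For reference, Quentin's suggestion of retrieving the list of online CPUs
could look roughly like the sketch below, which parses the comma-separated
range list in /sys/devices/system/cpu/online (e.g. "0-31" or "0,2-7") into a
per-CPU flag array. This is a minimal illustration only, under the assumption
that scanning sysfs is acceptable here: the helper name read_online_cpus() is
hypothetical and not an existing bpftool or libbpf API.

/* Minimal sketch, not bpftool code: read_online_cpus() is a hypothetical
 * helper that fills online[0..max_cpus-1] from the comma-separated range
 * list in /sys/devices/system/cpu/online (e.g. "0-31" or "0,2-7").
 */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

static int read_online_cpus(bool *online, int max_cpus)
{
	char buf[1024];
	char *s, *end;
	FILE *f;

	f = fopen("/sys/devices/system/cpu/online", "r");
	if (!f)
		return -errno;
	if (!fgets(buf, sizeof(buf), f)) {
		fclose(f);
		return -EINVAL;
	}
	fclose(f);

	for (int i = 0; i < max_cpus; i++)
		online[i] = false;

	/* Walk entries of the form "N" or "N-M", separated by commas. */
	s = buf;
	while (*s && *s != '\n') {
		long start = strtol(s, &end, 10);
		long stop = start;

		if (end == s)
			return -EINVAL;
		if (*end == '-') {
			s = end + 1;
			stop = strtol(s, &end, 10);
			if (end == s)
				return -EINVAL;
		}
		for (long c = start; c <= stop && c < max_cpus; c++)
			online[c] = true;
		s = (*end == ',') ? end + 1 : end;
	}
	return 0;
}

int main(void)
{
	bool online[4096];

	if (read_online_cpus(online, 4096))
		return 1;
	for (int cpu = 0; cpu < 4096; cpu++)
		if (online[cpu])
			printf("cpu %d is online\n", cpu);
	return 0;
}

With such a mask, profile_open_perf_events() could skip offline CPUs by index
up front, instead of interpreting ENODEV after the perf_event_open() call as
the patch above does.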