On Thu, Jul 29, 2021 at 10:36 AM Yonghong Song <yhs@xxxxxx> wrote:
>
>
>
> On 7/26/21 9:12 AM, Andrii Nakryiko wrote:
> > Introduce a new type of BPF link - BPF perf link. This brings perf_event-based
> > BPF program attachments (perf_event, tracepoints, kprobes, and uprobes) into
> > the common BPF link infrastructure, allowing to list all active perf_event
> > based attachments, auto-detaching BPF program from perf_event when link's FD
> > is closed, get generic BPF link fdinfo/get_info functionality.
> >
> > BPF_LINK_CREATE command expects perf_event's FD as target_fd. No extra flags
> > are currently supported.
> >
> > Force-detaching and atomic BPF program updates are not yet implemented, but
> > with perf_event-based BPF links we now have common framework for this without
> > the need to extend ioctl()-based perf_event interface.
> >
> > One interesting consideration is a new value for bpf_attach_type, which
> > BPF_LINK_CREATE command expects. Generally, it's either 1-to-1 mapping from
> > bpf_attach_type to bpf_prog_type, or many-to-1 mapping from a subset of
> > bpf_attach_types to one bpf_prog_type (e.g., see BPF_PROG_TYPE_SK_SKB or
> > BPF_PROG_TYPE_CGROUP_SOCK). In this case, though, we have three different
> > program types (KPROBE, TRACEPOINT, PERF_EVENT) using the same perf_event-based
> > mechanism, so it's many bpf_prog_types to one bpf_attach_type. I chose to
> > define a single BPF_PERF_EVENT attach type for all of them and adjust
> > link_create()'s logic for checking correspondence between attach type and
> > program type.
> >
> > The alternative would be to define three new attach types (e.g., BPF_KPROBE,
> > BPF_TRACEPOINT, and BPF_PERF_EVENT), but that seemed like unnecessary overkill
> > and BPF_KPROBE will cause naming conflicts with BPF_KPROBE() macro, defined by
> > libbpf. I chose to not do this to avoid unnecessary proliferation of
> > bpf_attach_type enum values and not have to deal with naming conflicts.
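
As an illustrative aside, not part of the patch itself: the expected
user-space flow would look roughly like the sketch below. It assumes UAPI
headers that already define BPF_PERF_EVENT, a prog_fd from BPF_PROG_LOAD,
and a perf_fd from perf_event_open(); error handling is elided.

/* Illustrative user-space sketch (not part of this patch): attach an
 * already-loaded kprobe/tracepoint/perf_event BPF program to an existing
 * perf_event through the new link type.
 */
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int perf_link_create(int prog_fd, int perf_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.link_create.prog_fd = prog_fd;
	attr.link_create.target_fd = perf_fd;	/* perf_event FD goes into target_fd */
	attr.link_create.attach_type = BPF_PERF_EVENT;
	attr.link_create.flags = 0;		/* no extra flags are supported */

	/* returns a link FD; closing it auto-detaches the program */
	return syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));
}

libbpf's low-level bpf_link_create() wrapper should be able to express the
same call once the new attach type is known to it.
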
> >
> > Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> > Signed-off-by: Andrii Nakryiko <andrii@xxxxxxxxxx>
> > ---
> >  include/linux/bpf_types.h      |   3 +
> >  include/linux/trace_events.h   |   3 +
> >  include/uapi/linux/bpf.h       |   2 +
> >  kernel/bpf/syscall.c           | 105 ++++++++++++++++++++++++++++++---
> >  kernel/events/core.c           |  10 ++--
> >  tools/include/uapi/linux/bpf.h |   2 +
> >  6 files changed, 112 insertions(+), 13 deletions(-)
> >
> > diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> > index a9db1eae6796..0a1ada7f174d 100644
> > --- a/include/linux/bpf_types.h
> > +++ b/include/linux/bpf_types.h
> > @@ -135,3 +135,6 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter)
> >  #ifdef CONFIG_NET
> >  BPF_LINK_TYPE(BPF_LINK_TYPE_NETNS, netns)
> >  #endif
> > +#ifdef CONFIG_PERF_EVENTS
> > +BPF_LINK_TYPE(BPF_LINK_TYPE_PERF_EVENT, perf)
> > +#endif
> > diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
> > index ad413b382a3c..8ac92560d3a3 100644
> > --- a/include/linux/trace_events.h
> > +++ b/include/linux/trace_events.h
> > @@ -803,6 +803,9 @@ extern void ftrace_profile_free_filter(struct perf_event *event);
> >  void perf_trace_buf_update(void *record, u16 type);
> >  void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp);
> >
> > +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog);
> > +void perf_event_free_bpf_prog(struct perf_event *event);
> > +
> >  void bpf_trace_run1(struct bpf_prog *prog, u64 arg1);
> >  void bpf_trace_run2(struct bpf_prog *prog, u64 arg1, u64 arg2);
> >  void bpf_trace_run3(struct bpf_prog *prog, u64 arg1, u64 arg2,
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 2db6925e04f4..00b1267ab4f0 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -993,6 +993,7 @@ enum bpf_attach_type {
> >  	BPF_SK_SKB_VERDICT,
> >  	BPF_SK_REUSEPORT_SELECT,
> >  	BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
> > +	BPF_PERF_EVENT,
> >  	__MAX_BPF_ATTACH_TYPE
> >  };
> >
> > @@ -1006,6 +1007,7 @@ enum bpf_link_type {
> >  	BPF_LINK_TYPE_ITER = 4,
> >  	BPF_LINK_TYPE_NETNS = 5,
> >  	BPF_LINK_TYPE_XDP = 6,
> > +	BPF_LINK_TYPE_PERF_EVENT = 6,
>
> As Jiri has pointed out, BPF_LINK_TYPE_PERF_EVENT = 7.

yep, fixed

>
> >
> >  	MAX_BPF_LINK_TYPE,
> >  };
> > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> > index 9a2068e39d23..80c03bedd6e6 100644
> > --- a/kernel/bpf/syscall.c
> > +++ b/kernel/bpf/syscall.c
> > @@ -2906,6 +2906,79 @@ static const struct bpf_link_ops bpf_raw_tp_link_lops = {
> >  	.fill_link_info = bpf_raw_tp_link_fill_link_info,
> >  };
> >
> > +#ifdef CONFIG_PERF_EVENTS
> > +struct bpf_perf_link {
> > +	struct bpf_link link;
> > +	struct file *perf_file;
> > +};
> > +
> > +static void bpf_perf_link_release(struct bpf_link *link)
> > +{
> > +	struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link);
> > +	struct perf_event *event = perf_link->perf_file->private_data;
> > +
> > +	perf_event_free_bpf_prog(event);
> > +	fput(perf_link->perf_file);
> > +}
> > +
> > +static void bpf_perf_link_dealloc(struct bpf_link *link)
> > +{
> > +	struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link);
> > +
> > +	kfree(perf_link);
> > +}
> > +
> > +static const struct bpf_link_ops bpf_perf_link_lops = {
> > +	.release = bpf_perf_link_release,
> > +	.dealloc = bpf_perf_link_dealloc,
> > +};
> > +
> > +static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
> > +{
> > +	struct bpf_link_primer link_primer;
> > +	struct bpf_perf_link *link;
> > +	struct perf_event *event;
> > +	struct file *perf_file;
> > +	int err;
> > +
> > +	if (attr->link_create.flags)
> > +		return -EINVAL;
> > +
> > +	perf_file = perf_event_get(attr->link_create.target_fd);
> > +	if (IS_ERR(perf_file))
> > +		return PTR_ERR(perf_file);
> > +
> > +	link = kzalloc(sizeof(*link), GFP_USER);
>
> add __GFP_NOWARN flag?

I looked at a few other bpf_link allocation sites in this file, and they
don't use the NOWARN flag. I think the idea behind the NOWARN flag is to
avoid memory allocation warnings when the amount of allocated memory
depends on a user-specified parameter (like the size of a map value). In
this case it's just a single fixed-size kernel object, so while users can
create lots of them, each one is fixed in size. It's similar to any other
kernel object (e.g., struct file). So I think it's good as is.

>
> > +	if (!link) {
> > +		err = -ENOMEM;
> > +		goto out_put_file;
> > +	}
> > +	bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog);
> > +	link->perf_file = perf_file;
> > +
> > +	err = bpf_link_prime(&link->link, &link_primer);
> > +	if (err) {
> > +		kfree(link);
> > +		goto out_put_file;
> > +	}
> > +
> > +	event = perf_file->private_data;
> > +	err = perf_event_set_bpf_prog(event, prog);
> > +	if (err) {
> > +		bpf_link_cleanup(&link_primer);
>
> Do you need kfree(link) here?

bpf_link_cleanup() will call kfree() in a deferred fashion. This is
because bpf_link_prime() allocates an anon_inode file internally, so the
link needs to be freed carefully, and that's what bpf_link_cleanup() is
for.

>
> > +		goto out_put_file;
> > +	}
> > +	/* perf_event_set_bpf_prog() doesn't take its own refcnt on prog */
> > +	bpf_prog_inc(prog);
> > +
> > +	return bpf_link_settle(&link_primer);
> > +
> > +out_put_file:
> > +	fput(perf_file);
> > +	return err;
> > +}
> > +#endif /* CONFIG_PERF_EVENTS */
> > +
> >  #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
> >
> [...]
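
To spell out the lifecycle contract a bit more for anyone skimming the
thread, here is a condensed, illustrative sketch of the prime/settle/cleanup
pattern. This is not verbatim kernel code; my_link, my_link_ops and
do_actual_attach() are made-up placeholders. The point is that before
bpf_link_prime() a plain kfree() is correct, while after a successful prime
the anon_inode file owns the link and failure paths must go through
bpf_link_cleanup().

/* Condensed sketch of the bpf_link error-handling contract discussed
 * above -- not verbatim kernel code. my_link, my_link_ops and
 * do_actual_attach() are placeholders.
 */
struct my_link {
	struct bpf_link link;
};

static const struct bpf_link_ops my_link_ops = {
	/* .release and .dealloc callbacks elided for brevity */
};

static int my_link_attach(struct bpf_prog *prog)
{
	struct bpf_link_primer primer;
	struct my_link *link;
	int err;

	link = kzalloc(sizeof(*link), GFP_USER);
	if (!link)
		return -ENOMEM;
	bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &my_link_ops, prog);

	err = bpf_link_prime(&link->link, &primer);
	if (err) {
		/* not primed yet, nothing else owns the link: plain kfree() */
		kfree(link);
		return err;
	}

	/* placeholder for the actual attach step, e.g. perf_event_set_bpf_prog() */
	err = do_actual_attach(link);
	if (err) {
		/* primed: the anon_inode file owns the link, so this frees it
		 * in a deferred fashion; no direct kfree() here
		 */
		bpf_link_cleanup(&primer);
		return err;
	}

	/* exposes the link FD to user space */
	return bpf_link_settle(&primer);
}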