On 7/26/21 9:12 AM, Andrii Nakryiko wrote:
Introduce a new type of BPF link - BPF perf link. This brings perf_event-based
BPF program attachments (perf_event, tracepoints, kprobes, and uprobes) into
the common BPF link infrastructure, allowing to list all active perf_event
based attachments, auto-detaching BPF program from perf_event when link's FD
is closed, get generic BPF link fdinfo/get_info functionality.
BPF_LINK_CREATE command expects perf_event's FD as target_fd. No extra flags
are currently supported.
Force-detaching and atomic BPF program updates are not yet implemented, but
with perf_event-based BPF links we now have common framework for this without
the need to extend ioctl()-based perf_event interface.
One interesting consideration is a new value for bpf_attach_type, which
BPF_LINK_CREATE command expects. Generally, it's either 1-to-1 mapping from
bpf_attach_type to bpf_prog_type, or many-to-1 mapping from a subset of
bpf_attach_types to one bpf_prog_type (e.g., see BPF_PROG_TYPE_SK_SKB or
BPF_PROG_TYPE_CGROUP_SOCK). In this case, though, we have three different
program types (KPROBE, TRACEPOINT, PERF_EVENT) using the same perf_event-based
mechanism, so it's many bpf_prog_types to one bpf_attach_type. I chose to
define a single BPF_PERF_EVENT attach type for all of them and adjust
link_create()'s logic for checking correspondence between attach type and
program type.
The alternative would be to define three new attach types (e.g., BPF_KPROBE,
BPF_TRACEPOINT, and BPF_PERF_EVENT), but that seemed like unnecessary overkill
and BPF_KPROBE will cause naming conflicts with BPF_KPROBE() macro, defined by
libbpf. I chose to not do this to avoid unnecessary proliferation of
bpf_attach_type enum values and not have to deal with naming conflicts.
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Signed-off-by: Andrii Nakryiko <andrii@xxxxxxxxxx>
---
include/linux/bpf_types.h | 3 +
include/linux/trace_events.h | 3 +
include/uapi/linux/bpf.h | 2 +
kernel/bpf/syscall.c | 105 ++++++++++++++++++++++++++++++---
kernel/events/core.c | 10 ++--
tools/include/uapi/linux/bpf.h | 2 +
6 files changed, 112 insertions(+), 13 deletions(-)
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index a9db1eae6796..0a1ada7f174d 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -135,3 +135,6 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter)
#ifdef CONFIG_NET
BPF_LINK_TYPE(BPF_LINK_TYPE_NETNS, netns)
#endif
+#ifdef CONFIG_PERF_EVENTS
+BPF_LINK_TYPE(BPF_LINK_TYPE_PERF_EVENT, perf)
+#endif
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index ad413b382a3c..8ac92560d3a3 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -803,6 +803,9 @@ extern void ftrace_profile_free_filter(struct perf_event *event);
void perf_trace_buf_update(void *record, u16 type);
void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp);
+int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog);
+void perf_event_free_bpf_prog(struct perf_event *event);
+
void bpf_trace_run1(struct bpf_prog *prog, u64 arg1);
void bpf_trace_run2(struct bpf_prog *prog, u64 arg1, u64 arg2);
void bpf_trace_run3(struct bpf_prog *prog, u64 arg1, u64 arg2,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2db6925e04f4..00b1267ab4f0 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -993,6 +993,7 @@ enum bpf_attach_type {
BPF_SK_SKB_VERDICT,
BPF_SK_REUSEPORT_SELECT,
BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
+ BPF_PERF_EVENT,
__MAX_BPF_ATTACH_TYPE
};
@@ -1006,6 +1007,7 @@ enum bpf_link_type {
BPF_LINK_TYPE_ITER = 4,
BPF_LINK_TYPE_NETNS = 5,
BPF_LINK_TYPE_XDP = 6,
+ BPF_LINK_TYPE_PERF_EVENT = 6,
As Jiri has pointed out, BPF_LINK_TYPE_PERF_EVENT = 7.
MAX_BPF_LINK_TYPE,
};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 9a2068e39d23..80c03bedd6e6 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2906,6 +2906,79 @@ static const struct bpf_link_ops bpf_raw_tp_link_lops = {
.fill_link_info = bpf_raw_tp_link_fill_link_info,
};
+#ifdef CONFIG_PERF_EVENTS
+struct bpf_perf_link {
+ struct bpf_link link;
+ struct file *perf_file;
+};
+
+static void bpf_perf_link_release(struct bpf_link *link)
+{
+ struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link);
+ struct perf_event *event = perf_link->perf_file->private_data;
+
+ perf_event_free_bpf_prog(event);
+ fput(perf_link->perf_file);
+}
+
+static void bpf_perf_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link);
+
+ kfree(perf_link);
+}
+
+static const struct bpf_link_ops bpf_perf_link_lops = {
+ .release = bpf_perf_link_release,
+ .dealloc = bpf_perf_link_dealloc,
+};
+
+static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct bpf_link_primer link_primer;
+ struct bpf_perf_link *link;
+ struct perf_event *event;
+ struct file *perf_file;
+ int err;
+
+ if (attr->link_create.flags)
+ return -EINVAL;
+
+ perf_file = perf_event_get(attr->link_create.target_fd);
+ if (IS_ERR(perf_file))
+ return PTR_ERR(perf_file);
+
+ link = kzalloc(sizeof(*link), GFP_USER);
add __GFP_NOWARN flag?
+ if (!link) {
+ err = -ENOMEM;
+ goto out_put_file;
+ }
+ bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog);
+ link->perf_file = perf_file;
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
+ kfree(link);
+ goto out_put_file;
+ }
+
+ event = perf_file->private_data;
+ err = perf_event_set_bpf_prog(event, prog);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
Do you need kfree(link) here?
+ goto out_put_file;
+ }
+ /* perf_event_set_bpf_prog() doesn't take its own refcnt on prog */
+ bpf_prog_inc(prog);
+
+ return bpf_link_settle(&link_primer);
+
+out_put_file:
+ fput(perf_file);
+ return err;
+}
+#endif /* CONFIG_PERF_EVENTS */
+
#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
[...]