bpf progs are attached to cgroups as usual, and the notion of effective
progs remains the same.  The perf event / tracepoint's event id is used
as the attachment 'subtype', which is passed along during attachment via
bpf_attr by reusing the replace_bpf_fd field.  After the effective progs
are calculated, a perf_event is allocated from the 'subtype'/'id' value
for each cpu, filtering on the perf cgroup that corresponds to the bpf
cgroup (assuming a unified hierarchy).  The effective bpf prog array is
then attached to each newly allocated perf_event and subsequently
enabled by activate_effective_progs().

Change-Id: I07a4dcaa0a682bafa496f05411365100d6c84fff
Signed-off-by: Kenny Ho <Kenny.Ho@xxxxxxx>
---
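Notes for reviewers (below the cut, not for the commit message): the
following is a rough, untested sketch of the userspace side this change
expects.  The helper names and paths are illustrative only, and it
assumes uapi headers that define the BPF_CGROUP_TRACEPOINT attach type
introduced by this series.  The tracepoint id is read from tracefs and
carried in bpf_attr's replace_bpf_fd as the attach 'subtype';
BPF_F_ALLOW_MULTI must be set and BPF_F_REPLACE must not be, matching
the checks added in __cgroup_bpf_attach().

#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* Read a tracepoint's event id, e.g. from
 * /sys/kernel/debug/tracing/events/sched/sched_switch/id
 */
static int read_tracepoint_id(const char *id_path)
{
	char buf[32];
	int n, fd = open(id_path, O_RDONLY);

	if (fd < 0)
		return -1;
	n = read(fd, buf, sizeof(buf) - 1);
	close(fd);
	if (n <= 0)
		return -1;
	buf[n] = '\0';
	return atoi(buf);
}

/* Attach an already-loaded prog fd to a cgroup for one tracepoint. */
static int attach_prog_to_cgroup_tp(int prog_fd, const char *cgroup_path,
				    int tp_id)
{
	union bpf_attr attr;
	int ret, cgroup_fd = open(cgroup_path, O_RDONLY);

	if (cgroup_fd < 0)
		return -1;

	memset(&attr, 0, sizeof(attr));
	attr.target_fd = cgroup_fd;
	attr.attach_bpf_fd = prog_fd;
	attr.attach_type = BPF_CGROUP_TRACEPOINT; /* added by this series */
	attr.attach_flags = BPF_F_ALLOW_MULTI;    /* required by the new check */
	attr.replace_bpf_fd = tp_id;              /* field reused as 'subtype' */

	ret = syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
	close(cgroup_fd);
	return ret;
}

Reusing replace_bpf_fd keeps bpf_attr unchanged, at the cost of carrying
an event id in a field named like an fd; a dedicated subtype field would
make the uapi clearer if extending bpf_attr is acceptable.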
 include/linux/bpf-cgroup.h | 15 ++++--
 include/linux/perf_event.h |  4 ++
 kernel/bpf/cgroup.c        | 96 +++++++++++++++++++++++++++++++-------
 kernel/cgroup/cgroup.c     |  9 ++--
 kernel/events/core.c       | 45 ++++++++++++++++++
 5 files changed, 142 insertions(+), 27 deletions(-)

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index a5e4d9b19470..b6e22fd2aa6e 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -154,6 +154,11 @@ struct cgroup_bpf {
 
 	/* cgroup_bpf is released using a work queue */
 	struct work_struct release_work;
+
+	/* list of perf events (per child cgroups) for tracepoint/kprobe/uprobe bpf attachment to cgroup */
+	/* TODO: array of tp type with array of events for each cgroup
+	 * currently only one tp type supported at a time */
+	struct list_head per_cg_events;
 };
 
 int cgroup_bpf_inherit(struct cgroup *cgrp);
@@ -161,21 +166,21 @@ void cgroup_bpf_offline(struct cgroup *cgrp);
 
 int __cgroup_bpf_attach(struct cgroup *cgrp,
 			struct bpf_prog *prog, struct bpf_prog *replace_prog,
-			struct bpf_cgroup_link *link,
+			struct bpf_cgroup_link *link, int bpf_attach_subtype,
 			enum bpf_attach_type type, u32 flags);
 int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 			struct bpf_cgroup_link *link,
-			enum bpf_attach_type type);
+			enum bpf_attach_type type, int bpf_attach_subtype);
 int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 		       union bpf_attr __user *uattr);
 
 /* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */
 int cgroup_bpf_attach(struct cgroup *cgrp,
 		      struct bpf_prog *prog, struct bpf_prog *replace_prog,
-		      struct bpf_cgroup_link *link, enum bpf_attach_type type,
-		      u32 flags);
+		      struct bpf_cgroup_link *link, int bpf_attach_subtype,
+		      enum bpf_attach_type type, u32 flags);
 int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
-		      enum bpf_attach_type type);
+		      enum bpf_attach_type type, int bpf_attach_subtype);
 int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 		     union bpf_attr __user *uattr);
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 9c440db65c18..5a149d8865a1 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -776,6 +776,7 @@ struct perf_event {
 
 #ifdef CONFIG_CGROUP_PERF
 	struct perf_cgroup		*cgrp; /* cgroup event is attach to */
+	struct list_head		bpf_cg_list;
 #endif
 
 #ifdef CONFIG_SECURITY
@@ -982,6 +983,9 @@ extern void perf_pmu_resched(struct pmu *pmu);
 extern int perf_event_refresh(struct perf_event *event, int refresh);
 extern void perf_event_update_userpage(struct perf_event *event);
 extern int perf_event_release_kernel(struct perf_event *event);
+extern int perf_event_create_for_all_cpus(struct perf_event_attr *attr,
+					  struct cgroup *cgroup,
+					  struct list_head *entries);
 extern struct perf_event *
 perf_event_create_kernel_counter(struct perf_event_attr *attr,
 				int cpu,
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 03145d45e3d5..0ecf465ddfb2 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -14,6 +14,8 @@
 #include <linux/string.h>
 #include <linux/bpf.h>
 #include <linux/bpf-cgroup.h>
+#include <linux/perf_event.h>
+#include <linux/trace_events.h>
 #include <net/sock.h>
 #include <net/bpf_sk_storage.h>
 
@@ -112,6 +114,8 @@ static void cgroup_bpf_release(struct work_struct *work)
 	struct bpf_prog_array *old_array;
 	struct list_head *storages = &cgrp->bpf.storages;
 	struct bpf_cgroup_storage *storage, *stmp;
+	struct list_head *events = &cgrp->bpf.per_cg_events;
+	struct perf_event *event, *etmp;
 
 	unsigned int atype;
 
@@ -141,6 +145,10 @@ static void cgroup_bpf_release(struct work_struct *work)
 		bpf_cgroup_storage_free(storage);
 	}
 
+	list_for_each_entry_safe(event, etmp, events, bpf_cg_list) {
+		perf_event_release_kernel(event);
+	}
+
 	mutex_unlock(&cgroup_mutex);
 
 	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
@@ -226,13 +234,16 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp,
  */
 static int compute_effective_progs(struct cgroup *cgrp,
 				   enum cgroup_bpf_attach_type atype,
+				   int bpf_attach_subtype,
 				   struct bpf_prog_array **array)
 {
 	struct bpf_prog_array_item *item;
 	struct bpf_prog_array *progs;
 	struct bpf_prog_list *pl;
 	struct cgroup *p = cgrp;
-	int cnt = 0;
+	struct perf_event *event, *etmp;
+	struct perf_event_attr attr = {};
+	int rc, cnt = 0;
 
 	/* count number of effective programs by walking parents */
 	do {
@@ -245,6 +256,21 @@ static int compute_effective_progs(struct cgroup *cgrp,
 	if (!progs)
 		return -ENOMEM;
 
+	if (atype == CGROUP_TRACEPOINT) {
+		/* TODO: only create event for cgroup that can have process */
+
+		attr.config = bpf_attach_subtype;
+		attr.type = PERF_TYPE_TRACEPOINT;
+		attr.sample_type = PERF_SAMPLE_RAW;
+		attr.sample_period = 1;
+		attr.wakeup_events = 1;
+
+		rc = perf_event_create_for_all_cpus(&attr, cgrp,
+						    &cgrp->bpf.per_cg_events);
+		if (rc)
+			goto err;
+	}
+
 	/* populate the array with effective progs */
 	cnt = 0;
 	p = cgrp;
@@ -264,20 +290,41 @@ static int compute_effective_progs(struct cgroup *cgrp,
 		}
 	} while ((p = cgroup_parent(p)));
 
+	if (atype == CGROUP_TRACEPOINT) {
+		list_for_each_entry_safe(event, etmp, &cgrp->bpf.per_cg_events, bpf_cg_list) {
+			rc = perf_event_attach_bpf_prog_array(event, progs);
+			if (rc)
+				goto err_attach;
+		}
+	}
+
 	*array = progs;
 	return 0;
+err_attach:
+	list_for_each_entry_safe(event, etmp, &cgrp->bpf.per_cg_events, bpf_cg_list)
+		perf_event_release_kernel(event);
+err:
+	bpf_prog_array_free(progs);
+	return rc;
 }
 
 static void activate_effective_progs(struct cgroup *cgrp,
 				     enum cgroup_bpf_attach_type atype,
 				     struct bpf_prog_array *old_array)
 {
-	old_array = rcu_replace_pointer(cgrp->bpf.effective[atype], old_array,
-					lockdep_is_held(&cgroup_mutex));
-	/* free prog array after grace period, since __cgroup_bpf_run_*()
-	 * might be still walking the array
-	 */
-	bpf_prog_array_free(old_array);
+	struct perf_event *event, *etmp;
+
+	if (atype == CGROUP_TRACEPOINT)
+		list_for_each_entry_safe(event, etmp, &cgrp->bpf.per_cg_events, bpf_cg_list)
+			perf_event_enable(event);
+	else {
+		old_array = rcu_replace_pointer(cgrp->bpf.effective[atype], old_array,
+						lockdep_is_held(&cgroup_mutex));
+		/* free prog array after grace period, since __cgroup_bpf_run_*()
+		 * might be still walking the array
+		 */
+		bpf_prog_array_free(old_array);
+	}
 }
 
 /**
@@ -306,9 +353,10 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
 		INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
 
 	INIT_LIST_HEAD(&cgrp->bpf.storages);
+	INIT_LIST_HEAD(&cgrp->bpf.per_cg_events);
 
 	for (i = 0; i < NR; i++)
-		if (compute_effective_progs(cgrp, i, &arrays[i]))
+		if (compute_effective_progs(cgrp, i, -1, &arrays[i]))
 			goto cleanup;
 
 	for (i = 0; i < NR; i++)
@@ -328,7 +376,8 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
 }
 
 static int update_effective_progs(struct cgroup *cgrp,
-				  enum cgroup_bpf_attach_type atype)
+				  enum cgroup_bpf_attach_type atype,
+				  int bpf_attach_subtype)
 {
 	struct cgroup_subsys_state *css;
 	int err;
@@ -340,7 +389,8 @@ static int update_effective_progs(struct cgroup *cgrp,
 		if (percpu_ref_is_zero(&desc->bpf.refcnt))
 			continue;
 
-		err = compute_effective_progs(desc, atype, &desc->bpf.inactive);
+		err = compute_effective_progs(desc, atype, bpf_attach_subtype,
+					      &desc->bpf.inactive);
 		if (err)
 			goto cleanup;
 	}
@@ -424,6 +474,7 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
  * @prog: A program to attach
  * @link: A link to attach
  * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set
+ * @bpf_attach_subtype: Type ID of perf tracing event for tracepoint/kprobe/uprobe
  * @type: Type of attach operation
  * @flags: Option flags
 *
@@ -432,7 +483,7 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
  */
 int __cgroup_bpf_attach(struct cgroup *cgrp,
 			struct bpf_prog *prog, struct bpf_prog *replace_prog,
-			struct bpf_cgroup_link *link,
+			struct bpf_cgroup_link *link, int bpf_attach_subtype,
 			enum bpf_attach_type type, u32 flags)
 {
 	u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
@@ -454,6 +505,14 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
 	if (!!replace_prog != !!(flags & BPF_F_REPLACE))
 		/* replace_prog implies BPF_F_REPLACE, and vice versa */
 		return -EINVAL;
+	if ((type == BPF_CGROUP_TRACEPOINT) &&
+	    ((flags & BPF_F_REPLACE) || (bpf_attach_subtype < 0) || !(flags & BPF_F_ALLOW_MULTI)))
+		/* replace fd is used to pass the subtype */
+		/* subtype is required for BPF_CGROUP_TRACEPOINT */
+		/* not allow multi BPF progs for the attach type for now */
+		return -EINVAL;
+
+	/* TODO check bpf_attach_subtype is valid */
 
 	atype = to_cgroup_bpf_attach_type(type);
 	if (atype < 0)
@@ -499,7 +558,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
 	bpf_cgroup_storages_assign(pl->storage, storage);
 	cgrp->bpf.flags[atype] = saved_flags;
 
-	err = update_effective_progs(cgrp, atype);
+	err = update_effective_progs(cgrp, atype, bpf_attach_subtype);
 	if (err)
 		goto cleanup;
 
@@ -679,7 +738,8 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
  * Must be called with cgroup_mutex held.
  */
 int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
-			struct bpf_cgroup_link *link, enum bpf_attach_type type)
+			struct bpf_cgroup_link *link, enum bpf_attach_type type,
+			int bpf_attach_subtype)
 {
 	enum cgroup_bpf_attach_type atype;
 	struct bpf_prog *old_prog;
@@ -708,7 +768,7 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 	pl->prog = NULL;
 	pl->link = NULL;
 
-	err = update_effective_progs(cgrp, atype);
+	err = update_effective_progs(cgrp, atype, bpf_attach_subtype);
 	if (err)
 		goto cleanup;
 
@@ -809,7 +869,7 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr,
 		}
 	}
 
-	ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL,
+	ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL, attr->replace_bpf_fd,
 				attr->attach_type, attr->attach_flags);
 
 	if (replace_prog)
@@ -832,7 +892,7 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
 	if (IS_ERR(prog))
 		prog = NULL;
 
-	ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type);
+	ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, attr->replace_bpf_fd);
 	if (prog)
 		bpf_prog_put(prog);
 
@@ -861,7 +921,7 @@ static void bpf_cgroup_link_release(struct bpf_link *link)
 	}
 
 	WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link,
-				    cg_link->type));
+				    cg_link->type, -1));
 
 	cg = cg_link->cgroup;
 	cg_link->cgroup = NULL;
@@ -961,7 +1021,7 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 		goto out_put_cgroup;
 	}
 
-	err = cgroup_bpf_attach(cgrp, NULL, NULL, link,
+	err = cgroup_bpf_attach(cgrp, NULL, NULL, link, -1,
 				link->type, BPF_F_ALLOW_MULTI);
 	if (err) {
 		bpf_link_cleanup(&link_primer);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index a645b212b69b..17a1269dc2f9 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6626,25 +6626,26 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
 #ifdef CONFIG_CGROUP_BPF
 int cgroup_bpf_attach(struct cgroup *cgrp,
 		      struct bpf_prog *prog, struct bpf_prog *replace_prog,
-		      struct bpf_cgroup_link *link,
+		      struct bpf_cgroup_link *link, int bpf_attach_subtype,
 		      enum bpf_attach_type type, u32 flags)
 {
 	int ret;
 
 	mutex_lock(&cgroup_mutex);
-	ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
+	ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link,
+				  bpf_attach_subtype, type, flags);
 	mutex_unlock(&cgroup_mutex);
 	return ret;
 }
 
 int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
-		      enum bpf_attach_type type)
+		      enum bpf_attach_type type, int bpf_attach_subtype)
 {
 	int ret;
 
 	mutex_lock(&cgroup_mutex);
-	ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
+	ret = __cgroup_bpf_detach(cgrp, prog, NULL, type, bpf_attach_subtype);
 	mutex_unlock(&cgroup_mutex);
 	return ret;
 }
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d34e00749c9b..71056af4322b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -12511,6 +12511,51 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 }
 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
 
+int perf_event_create_for_all_cpus(struct perf_event_attr *attr,
+				   struct cgroup *cgroup,
+				   struct list_head *entries)
+{
+	struct perf_event **events;
+	struct perf_cgroup *perf_cgrp;
+	int cpu, i = 0;
+
+	events = kzalloc(sizeof(struct perf_event *) * num_possible_cpus(),
+			 GFP_KERNEL);
+
+	if (!events)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		/* allocate first, connect the cgroup later */
+		events[i] = perf_event_create_kernel_counter(attr, cpu, NULL, NULL, NULL);
+
+		if (IS_ERR(events[i]))
+			goto err;
+ + i++; + } + + perf_cgrp = cgroup_tryget_perf_cgroup(cgroup); + if (!perf_cgrp) + goto err; + + for (i--; i >= 0; i--) { + events[i]->cgrp = perf_cgrp; + + list_add(&events[i]->bpf_cg_list, entries); + } + + kfree(events); + return 0; + +err: + for (i--; i >= 0; i--) + free_event(events[i]); + + kfree(events); + return -ENOMEM; +} + void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) { struct perf_event_context *src_ctx; -- 2.25.1