On Mon, May 9, 2022 at 5:18 PM Yosry Ahmed <yosryahmed@xxxxxxxxxx> wrote: > > This patch introduces a new bpf program type CGROUP_SUBSYS_RSTAT, > with new corresponding link and attach types. > > The main purpose of these programs is to allow BPF programs to collect > and maintain hierarchical cgroup stats easily and efficiently by making > use of the rstat framework in the kernel. > > Those programs attach to a cgroup subsystem. They typically contain logic > to aggregate per-cpu and per-cgroup stats collected by other BPF programs. > > Currently, only rstat flusher programs can be attached to cgroup > subsystems, but this can be extended later if a use-case arises. > > See the selftest in the final patch for a practical example. > > Signed-off-by: Yosry Ahmed <yosryahmed@xxxxxxxxxx> > --- > include/linux/bpf-cgroup-subsys.h | 30 ++++++ > include/linux/bpf_types.h | 2 + > include/linux/cgroup-defs.h | 4 + > include/uapi/linux/bpf.h | 12 +++ > kernel/bpf/Makefile | 1 + > kernel/bpf/cgroup_subsys.c | 166 ++++++++++++++++++++++++++++++ > kernel/bpf/syscall.c | 6 ++ > kernel/cgroup/cgroup.c | 1 + > tools/include/uapi/linux/bpf.h | 12 +++ > 9 files changed, 234 insertions(+) > create mode 100644 include/linux/bpf-cgroup-subsys.h > create mode 100644 kernel/bpf/cgroup_subsys.c > > diff --git a/include/linux/bpf-cgroup-subsys.h b/include/linux/bpf-cgroup-subsys.h > new file mode 100644 > index 000000000000..4dcde06b5599 > --- /dev/null > +++ b/include/linux/bpf-cgroup-subsys.h > @@ -0,0 +1,30 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +/* > + * Copyright 2022 Google LLC. 
> + */ > +#ifndef _BPF_CGROUP_SUBSYS_H_ > +#define _BPF_CGROUP_SUBSYS_H_ > + > +#include <linux/bpf.h> > + > +struct cgroup_subsys_bpf { > + /* Head of the list of BPF rstat flushers attached to this subsystem */ > + struct list_head rstat_flushers; > + spinlock_t flushers_lock; > +}; > + > +struct bpf_subsys_rstat_flusher { > + struct bpf_prog *prog; > + /* List of BPF rstat flushers, anchored at subsys->bpf */ > + struct list_head list; > +}; > + > +struct bpf_cgroup_subsys_link { > + struct bpf_link link; > + struct cgroup_subsys *ss; > +}; > + > +int cgroup_subsys_bpf_link_attach(const union bpf_attr *attr, > + struct bpf_prog *prog); > + In the next version I will make sure everything here is also defined when CONFIG_BPF_SYSCALL is not set, and move the structs that do not need to be in the header into the .c file. > +#endif // _BPF_CGROUP_SUBSYS_H_ > diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h > index 3e24ad0c4b3c..854ee958b0e4 100644 > --- a/include/linux/bpf_types.h > +++ b/include/linux/bpf_types.h > @@ -56,6 +56,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl, > struct bpf_sysctl, struct bpf_sysctl_kern) > BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCKOPT, cg_sockopt, > struct bpf_sockopt, struct bpf_sockopt_kern) > +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT, cgroup_subsys_rstat, > + struct bpf_rstat_ctx, struct bpf_rstat_ctx) > #endif > #ifdef CONFIG_BPF_LIRC_MODE2 > BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2, > diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h > index 1bfcfb1af352..3bd6eed1fa13 100644 > --- a/include/linux/cgroup-defs.h > +++ b/include/linux/cgroup-defs.h > @@ -20,6 +20,7 @@ > #include <linux/u64_stats_sync.h> > #include <linux/workqueue.h> > #include <linux/bpf-cgroup-defs.h> > +#include <linux/bpf-cgroup-subsys.h> > #include <linux/psi_types.h> > > #ifdef CONFIG_CGROUPS > @@ -706,6 +707,9 @@ struct cgroup_subsys { > * specifies the mask of subsystems that this one depends on. 
> */ > unsigned int depends_on; > + > + /* used to store bpf programs.*/ > + struct cgroup_subsys_bpf bpf; > }; > > extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem; > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index d14b10b85e51..0f4855fa85db 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -952,6 +952,7 @@ enum bpf_prog_type { > BPF_PROG_TYPE_LSM, > BPF_PROG_TYPE_SK_LOOKUP, > BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ > + BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT, > }; > > enum bpf_attach_type { > @@ -998,6 +999,7 @@ enum bpf_attach_type { > BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, > BPF_PERF_EVENT, > BPF_TRACE_KPROBE_MULTI, > + BPF_CGROUP_SUBSYS_RSTAT, > __MAX_BPF_ATTACH_TYPE > }; > > @@ -1013,6 +1015,7 @@ enum bpf_link_type { > BPF_LINK_TYPE_XDP = 6, > BPF_LINK_TYPE_PERF_EVENT = 7, > BPF_LINK_TYPE_KPROBE_MULTI = 8, > + BPF_LINK_TYPE_CGROUP_SUBSYS = 9, > > MAX_BPF_LINK_TYPE, > }; > @@ -1482,6 +1485,9 @@ union bpf_attr { > */ > __u64 bpf_cookie; > } perf_event; > + struct { > + __u64 name; > + } cgroup_subsys; > struct { > __u32 flags; > __u32 cnt; > @@ -6324,6 +6330,12 @@ struct bpf_cgroup_dev_ctx { > __u32 minor; > }; > > +struct bpf_rstat_ctx { > + __u64 cgroup_id; > + __u64 parent_cgroup_id; /* 0 if root */ > + __s32 cpu; > +}; > + > struct bpf_raw_tracepoint_args { > __u64 args[0]; > }; > diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile > index c1a9be6a4b9f..6caf4a61e543 100644 > --- a/kernel/bpf/Makefile > +++ b/kernel/bpf/Makefile > @@ -25,6 +25,7 @@ ifeq ($(CONFIG_PERF_EVENTS),y) > obj-$(CONFIG_BPF_SYSCALL) += stackmap.o > endif > obj-$(CONFIG_CGROUP_BPF) += cgroup.o > +obj-$(CONFIG_CGROUP_BPF) += cgroup_subsys.o In the next version I will replace this with: ifeq ($(CONFIG_CGROUPS),y) obj-$(CONFIG_BPF_SYSCALL) += cgroup_subsys.o endif since this program type does not attach to cgroups and depends only on CONFIG_CGROUPS and CONFIG_BPF_SYSCALL, not on CONFIG_CGROUP_BPF. 
> ifeq ($(CONFIG_INET),y) > obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o > endif > diff --git a/kernel/bpf/cgroup_subsys.c b/kernel/bpf/cgroup_subsys.c > new file mode 100644 > index 000000000000..9673ce6aa84a > --- /dev/null > +++ b/kernel/bpf/cgroup_subsys.c > @@ -0,0 +1,166 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* > + * Functions to manage eBPF programs attached to cgroup subsystems > + * > + * Copyright 2022 Google LLC. > + */ > + > +#include <linux/bpf-cgroup-subsys.h> > +#include <linux/filter.h> > + > +#include "../cgroup/cgroup-internal.h" > + > + > +static int cgroup_subsys_bpf_attach(struct cgroup_subsys *ss, struct bpf_prog *prog) > +{ > + struct bpf_subsys_rstat_flusher *rstat_flusher; > + > + rstat_flusher = kmalloc(sizeof(*rstat_flusher), GFP_KERNEL); > + if (!rstat_flusher) > + return -ENOMEM; > + rstat_flusher->prog = prog; > + > + spin_lock(&ss->bpf.flushers_lock); > + list_add(&rstat_flusher->list, &ss->bpf.rstat_flushers); > + spin_unlock(&ss->bpf.flushers_lock); > + > + return 0; > +} > + > +static void cgroup_subsys_bpf_detach(struct cgroup_subsys *ss, struct bpf_prog *prog) > +{ > + struct bpf_subsys_rstat_flusher *rstat_flusher = NULL; > + > + spin_lock(&ss->bpf.flushers_lock); > + list_for_each_entry(rstat_flusher, &ss->bpf.rstat_flushers, list) > + if (rstat_flusher->prog == prog) > + break; > + > + if (rstat_flusher) { > + list_del(&rstat_flusher->list); > + bpf_prog_put(rstat_flusher->prog); > + kfree(rstat_flusher); > + } > + spin_unlock(&ss->bpf.flushers_lock); > +} > + > +static void bpf_cgroup_subsys_link_release(struct bpf_link *link) > +{ > + struct bpf_cgroup_subsys_link *ss_link = container_of(link, > + struct bpf_cgroup_subsys_link, > + link); > + if (ss_link->ss) { > + cgroup_subsys_bpf_detach(ss_link->ss, ss_link->link.prog); > + ss_link->ss = NULL; > + } > +} > + > +static int bpf_cgroup_subsys_link_detach(struct bpf_link *link) > +{ > + bpf_cgroup_subsys_link_release(link); > + return 0; > +} > + > +static void 
bpf_cgroup_subsys_link_dealloc(struct bpf_link *link) > +{ > + struct bpf_cgroup_subsys_link *ss_link = container_of(link, > + struct bpf_cgroup_subsys_link, > + link); > + kfree(ss_link); > +} > + > +static const struct bpf_link_ops bpf_cgroup_subsys_link_lops = { > + .detach = bpf_cgroup_subsys_link_detach, > + .release = bpf_cgroup_subsys_link_release, > + .dealloc = bpf_cgroup_subsys_link_dealloc, > +}; > + > +int cgroup_subsys_bpf_link_attach(const union bpf_attr *attr, > + struct bpf_prog *prog) > +{ > + struct bpf_link_primer link_primer; > + struct bpf_cgroup_subsys_link *link; > + struct cgroup_subsys *ss, *attach_ss = NULL; > + const char __user *ss_name_user; > + char ss_name[MAX_CGROUP_TYPE_NAMELEN]; > + int ssid, err; > + > + if (attr->link_create.target_fd || attr->link_create.flags) > + return -EINVAL; > + > + ss_name_user = u64_to_user_ptr(attr->link_create.cgroup_subsys.name); > + if (strncpy_from_user(ss_name, ss_name_user, sizeof(ss_name) - 1) < 0) > + return -EFAULT; > + > + for_each_subsys(ss, ssid) > + if (!strcmp(ss_name, ss->name) || > + !strcmp(ss_name, ss->legacy_name)) > + attach_ss = ss; > + > + if (!attach_ss) > + return -EINVAL; > + > + link = kzalloc(sizeof(*link), GFP_USER); > + if (!link) > + return -ENOMEM; > + > + bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP_SUBSYS, > + &bpf_cgroup_subsys_link_lops, > + prog); > + link->ss = attach_ss; > + > + err = bpf_link_prime(&link->link, &link_primer); > + if (err) { > + kfree(link); > + return err; > + } > + > + err = cgroup_subsys_bpf_attach(attach_ss, prog); > + if (err) { > + bpf_link_cleanup(&link_primer); > + return err; > + } > + > + return bpf_link_settle(&link_primer); > +} > + > +static const struct bpf_func_proto * > +cgroup_subsys_rstat_func_proto(enum bpf_func_id func_id, > + const struct bpf_prog *prog) > +{ > + return bpf_base_func_proto(func_id); > +} > + > +static bool cgroup_subsys_rstat_is_valid_access(int off, int size, > + enum bpf_access_type type, > + const struct 
bpf_prog *prog, > + struct bpf_insn_access_aux *info) > +{ > + if (type == BPF_WRITE) > + return false; > + > + if (off < 0 || off + size > sizeof(struct bpf_rstat_ctx)) > + return false; > + /* The verifier guarantees that size > 0 */ > + if (off % size != 0) > + return false; > + > + switch (off) { > + case offsetof(struct bpf_rstat_ctx, cgroup_id): > + return size == sizeof(__u64); > + case offsetof(struct bpf_rstat_ctx, parent_cgroup_id): > + return size == sizeof(__u64); > + case offsetof(struct bpf_rstat_ctx, cpu): > + return size == sizeof(__s32); > + default: > + return false; > + } > +} > + > +const struct bpf_prog_ops cgroup_subsys_rstat_prog_ops = { > +}; > + > +const struct bpf_verifier_ops cgroup_subsys_rstat_verifier_ops = { > + .get_func_proto = cgroup_subsys_rstat_func_proto, > + .is_valid_access = cgroup_subsys_rstat_is_valid_access, > +}; > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > index cdaa1152436a..48149c54d969 100644 > --- a/kernel/bpf/syscall.c > +++ b/kernel/bpf/syscall.c > @@ -3,6 +3,7 @@ > */ > #include <linux/bpf.h> > #include <linux/bpf-cgroup.h> > +#include <linux/bpf-cgroup-subsys.h> > #include <linux/bpf_trace.h> > #include <linux/bpf_lirc.h> > #include <linux/bpf_verifier.h> > @@ -3194,6 +3195,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) > return BPF_PROG_TYPE_SK_LOOKUP; > case BPF_XDP: > return BPF_PROG_TYPE_XDP; > + case BPF_CGROUP_SUBSYS_RSTAT: > + return BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT; > default: > return BPF_PROG_TYPE_UNSPEC; > } > @@ -4341,6 +4344,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) > else > ret = bpf_kprobe_multi_link_attach(attr, prog); > break; > + case BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT: > + ret = cgroup_subsys_bpf_link_attach(attr, prog); > + break; > default: > ret = -EINVAL; > } > diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c > index adb820e98f24..7b1448013009 100644 > --- a/kernel/cgroup/cgroup.c > +++ b/kernel/cgroup/cgroup.c > @@ 
-5745,6 +5745,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) > > idr_init(&ss->css_idr); > INIT_LIST_HEAD(&ss->cfts); > + INIT_LIST_HEAD(&ss->bpf.rstat_flushers); > > /* Create the root cgroup state for this subsystem */ > ss->root = &cgrp_dfl_root; > diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h > index d14b10b85e51..0f4855fa85db 100644 > --- a/tools/include/uapi/linux/bpf.h > +++ b/tools/include/uapi/linux/bpf.h > @@ -952,6 +952,7 @@ enum bpf_prog_type { > BPF_PROG_TYPE_LSM, > BPF_PROG_TYPE_SK_LOOKUP, > BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ > + BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT, > }; > > enum bpf_attach_type { > @@ -998,6 +999,7 @@ enum bpf_attach_type { > BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, > BPF_PERF_EVENT, > BPF_TRACE_KPROBE_MULTI, > + BPF_CGROUP_SUBSYS_RSTAT, > __MAX_BPF_ATTACH_TYPE > }; > > @@ -1013,6 +1015,7 @@ enum bpf_link_type { > BPF_LINK_TYPE_XDP = 6, > BPF_LINK_TYPE_PERF_EVENT = 7, > BPF_LINK_TYPE_KPROBE_MULTI = 8, > + BPF_LINK_TYPE_CGROUP_SUBSYS = 9, > > MAX_BPF_LINK_TYPE, > }; > @@ -1482,6 +1485,9 @@ union bpf_attr { > */ > __u64 bpf_cookie; > } perf_event; > + struct { > + __u64 name; > + } cgroup_subsys; > struct { > __u32 flags; > __u32 cnt; > @@ -6324,6 +6330,12 @@ struct bpf_cgroup_dev_ctx { > __u32 minor; > }; > > +struct bpf_rstat_ctx { > + __u64 cgroup_id; > + __u64 parent_cgroup_id; /* 0 if root */ > + __s32 cpu; > +}; > + > struct bpf_raw_tracepoint_args { > __u64 args[0]; > }; > -- > 2.36.0.512.ge40c2bad7a-goog >