On Tue, May 10, 2022 at 11:07 AM Yosry Ahmed <yosryahmed@xxxxxxxxxx> wrote: > > On Mon, May 9, 2022 at 5:18 PM Yosry Ahmed <yosryahmed@xxxxxxxxxx> wrote: > > > > This patch introduces a new bpf program type CGROUP_SUBSYS_RSTAT, > > with new corresponding link and attach types. > > > > The main purpose of these programs is to allow BPF programs to collect > > and maintain hierarchical cgroup stats easily and efficiently by making > > use of the rstat framework in the kernel. > > > > Those programs attach to a cgroup subsystem. They typically contain logic > > to aggregate per-cpu and per-cgroup stats collected by other BPF programs. > > > > Currently, only rstat flusher programs can be attached to cgroup > > subsystems, but this can be extended later if a use-case arises. > > > > See the selftest in the final patch for a practical example. > > > > Signed-off-by: Yosry Ahmed <yosryahmed@xxxxxxxxxx> > > --- > > include/linux/bpf-cgroup-subsys.h | 30 ++++++ > > include/linux/bpf_types.h | 2 + > > include/linux/cgroup-defs.h | 4 + > > include/uapi/linux/bpf.h | 12 +++ > > kernel/bpf/Makefile | 1 + > > kernel/bpf/cgroup_subsys.c | 166 ++++++++++++++++++++++++++++++ > > kernel/bpf/syscall.c | 6 ++ > > kernel/cgroup/cgroup.c | 1 + > > tools/include/uapi/linux/bpf.h | 12 +++ > > 9 files changed, 234 insertions(+) > > create mode 100644 include/linux/bpf-cgroup-subsys.h > > create mode 100644 kernel/bpf/cgroup_subsys.c > > > > diff --git a/include/linux/bpf-cgroup-subsys.h b/include/linux/bpf-cgroup-subsys.h > > new file mode 100644 > > index 000000000000..4dcde06b5599 > > --- /dev/null > > +++ b/include/linux/bpf-cgroup-subsys.h > > @@ -0,0 +1,30 @@ > > +/* SPDX-License-Identifier: GPL-2.0 */ > > +/* > > + * Copyright 2022 Google LLC. 
> > + */ > > +#ifndef _BPF_CGROUP_SUBSYS_H_ > > +#define _BPF_CGROUP_SUBSYS_H_ > > + > > +#include <linux/bpf.h> > > + > > +struct cgroup_subsys_bpf { > > + /* Head of the list of BPF rstat flushers attached to this subsystem */ > > + struct list_head rstat_flushers; > > + spinlock_t flushers_lock; > > +}; > > + > > +struct bpf_subsys_rstat_flusher { > > + struct bpf_prog *prog; > > + /* List of BPF rstat flushers, anchored at subsys->bpf */ > > + struct list_head list; > > +}; > > + > > +struct bpf_cgroup_subsys_link { > > + struct bpf_link link; > > + struct cgroup_subsys *ss; > > +}; > > + > > +int cgroup_subsys_bpf_link_attach(const union bpf_attr *attr, > > + struct bpf_prog *prog); > > + > > In the next version I will make sure everything here is also defined > for when CONFIG_BPF_SYSCALL is not set, and move the structs that can > be moved to the .c file there. > > > +#endif // _BPF_CGROUP_SUBSYS_H_ > > diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h > > index 3e24ad0c4b3c..854ee958b0e4 100644 > > --- a/include/linux/bpf_types.h > > +++ b/include/linux/bpf_types.h > > @@ -56,6 +56,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl, > > struct bpf_sysctl, struct bpf_sysctl_kern) > > BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCKOPT, cg_sockopt, > > struct bpf_sockopt, struct bpf_sockopt_kern) > > +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT, cgroup_subsys_rstat, > > + struct bpf_rstat_ctx, struct bpf_rstat_ctx) > > #endif > > #ifdef CONFIG_BPF_LIRC_MODE2 > > BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2, > > diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h > > index 1bfcfb1af352..3bd6eed1fa13 100644 > > --- a/include/linux/cgroup-defs.h > > +++ b/include/linux/cgroup-defs.h > > @@ -20,6 +20,7 @@ > > #include <linux/u64_stats_sync.h> > > #include <linux/workqueue.h> > > #include <linux/bpf-cgroup-defs.h> > > +#include <linux/bpf-cgroup-subsys.h> > > #include <linux/psi_types.h> > > > > #ifdef CONFIG_CGROUPS > 
> @@ -706,6 +707,9 @@ struct cgroup_subsys { > > * specifies the mask of subsystems that this one depends on. > > */ > > unsigned int depends_on; > > + > > + /* used to store bpf programs.*/ > > + struct cgroup_subsys_bpf bpf; > > }; > > > > extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem; > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > > index d14b10b85e51..0f4855fa85db 100644 > > --- a/include/uapi/linux/bpf.h > > +++ b/include/uapi/linux/bpf.h > > @@ -952,6 +952,7 @@ enum bpf_prog_type { > > BPF_PROG_TYPE_LSM, > > BPF_PROG_TYPE_SK_LOOKUP, > > BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ > > + BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT, > > }; > > > > enum bpf_attach_type { > > @@ -998,6 +999,7 @@ enum bpf_attach_type { > > BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, > > BPF_PERF_EVENT, > > BPF_TRACE_KPROBE_MULTI, > > + BPF_CGROUP_SUBSYS_RSTAT, > > __MAX_BPF_ATTACH_TYPE > > }; > > > > @@ -1013,6 +1015,7 @@ enum bpf_link_type { > > BPF_LINK_TYPE_XDP = 6, > > BPF_LINK_TYPE_PERF_EVENT = 7, > > BPF_LINK_TYPE_KPROBE_MULTI = 8, > > + BPF_LINK_TYPE_CGROUP_SUBSYS = 9, > > > > MAX_BPF_LINK_TYPE, > > }; > > @@ -1482,6 +1485,9 @@ union bpf_attr { > > */ > > __u64 bpf_cookie; > > } perf_event; > > + struct { > > + __u64 name; > > + } cgroup_subsys; > > struct { > > __u32 flags; > > __u32 cnt; > > @@ -6324,6 +6330,12 @@ struct bpf_cgroup_dev_ctx { > > __u32 minor; > > }; > > > > +struct bpf_rstat_ctx { > > + __u64 cgroup_id; > > + __u64 parent_cgroup_id; /* 0 if root */ > > + __s32 cpu; > > +}; > > + > > struct bpf_raw_tracepoint_args { > > __u64 args[0]; > > }; > > diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile > > index c1a9be6a4b9f..6caf4a61e543 100644 > > --- a/kernel/bpf/Makefile > > +++ b/kernel/bpf/Makefile > > @@ -25,6 +25,7 @@ ifeq ($(CONFIG_PERF_EVENTS),y) > > obj-$(CONFIG_BPF_SYSCALL) += stackmap.o > > endif > > obj-$(CONFIG_CGROUP_BPF) += cgroup.o > > +obj-$(CONFIG_CGROUP_BPF) += cgroup_subsys.o > > In the next version 
I will replace this with: > ifeq ($(CONFIG_CGROUPS),y) > obj-$(CONFIG_BPF_SYSCALL) += cgroup_subsys.o > endif > > , as this program type doesn't attach to cgroups and does not depend > on CONFIG_CGROUP_BPF, only CONFIG_CGROUPS and CONFIG_BPF_SYSCALL. On second thought, it might be simpler and cleaner to leave this code under CONFIG_CGROUP_BPF. > > > ifeq ($(CONFIG_INET),y) > > obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o > > endif > > diff --git a/kernel/bpf/cgroup_subsys.c b/kernel/bpf/cgroup_subsys.c > > new file mode 100644 > > index 000000000000..9673ce6aa84a > > --- /dev/null > > +++ b/kernel/bpf/cgroup_subsys.c > > @@ -0,0 +1,166 @@ > > +// SPDX-License-Identifier: GPL-2.0-only > > +/* > > + * Functions to manage eBPF programs attached to cgroup subsystems > > + * > > + * Copyright 2022 Google LLC. > > + */ > > + > > +#include <linux/bpf-cgroup-subsys.h> > > +#include <linux/filter.h> > > + > > +#include "../cgroup/cgroup-internal.h" > > + > > + > > +static int cgroup_subsys_bpf_attach(struct cgroup_subsys *ss, struct bpf_prog *prog) > > +{ > > + struct bpf_subsys_rstat_flusher *rstat_flusher; > > + > > + rstat_flusher = kmalloc(sizeof(*rstat_flusher), GFP_KERNEL); > > + if (!rstat_flusher) > > + return -ENOMEM; > > + rstat_flusher->prog = prog; > > + > > + spin_lock(&ss->bpf.flushers_lock); > > + list_add(&rstat_flusher->list, &ss->bpf.rstat_flushers); > > + spin_unlock(&ss->bpf.flushers_lock); > > + > > + return 0; > > +} > > + > > +static void cgroup_subsys_bpf_detach(struct cgroup_subsys *ss, struct bpf_prog *prog) > > +{ > > + struct bpf_subsys_rstat_flusher *rstat_flusher = NULL; > > + > > + spin_lock(&ss->bpf.flushers_lock); > > + list_for_each_entry(rstat_flusher, &ss->bpf.rstat_flushers, list) > > + if (rstat_flusher->prog == prog) > > + break; > > + > > + if (rstat_flusher) { > > + list_del(&rstat_flusher->list); > > + bpf_prog_put(rstat_flusher->prog); > > + kfree(rstat_flusher); > > + } > > + spin_unlock(&ss->bpf.flushers_lock); > > +} > > + > > 
+static void bpf_cgroup_subsys_link_release(struct bpf_link *link) > > +{ > > + struct bpf_cgroup_subsys_link *ss_link = container_of(link, > > + struct bpf_cgroup_subsys_link, > > + link); > > + if (ss_link->ss) { > > + cgroup_subsys_bpf_detach(ss_link->ss, ss_link->link.prog); > > + ss_link->ss = NULL; > > + } > > +} > > + > > +static int bpf_cgroup_subsys_link_detach(struct bpf_link *link) > > +{ > > + bpf_cgroup_subsys_link_release(link); > > + return 0; > > +} > > + > > +static void bpf_cgroup_subsys_link_dealloc(struct bpf_link *link) > > +{ > > + struct bpf_cgroup_subsys_link *ss_link = container_of(link, > > + struct bpf_cgroup_subsys_link, > > + link); > > + kfree(ss_link); > > +} > > + > > +static const struct bpf_link_ops bpf_cgroup_subsys_link_lops = { > > + .detach = bpf_cgroup_subsys_link_detach, > > + .release = bpf_cgroup_subsys_link_release, > > + .dealloc = bpf_cgroup_subsys_link_dealloc, > > +}; > > + > > +int cgroup_subsys_bpf_link_attach(const union bpf_attr *attr, > > + struct bpf_prog *prog) > > +{ > > + struct bpf_link_primer link_primer; > > + struct bpf_cgroup_subsys_link *link; > > + struct cgroup_subsys *ss, *attach_ss = NULL; > > + const char __user *ss_name_user; > > + char ss_name[MAX_CGROUP_TYPE_NAMELEN]; > > + int ssid, err; > > + > > + if (attr->link_create.target_fd || attr->link_create.flags) > > + return -EINVAL; > > + > > + ss_name_user = u64_to_user_ptr(attr->link_create.cgroup_subsys.name); > > + if (strncpy_from_user(ss_name, ss_name_user, sizeof(ss_name) - 1) < 0) > > + return -EFAULT; > > + > > + for_each_subsys(ss, ssid) > > + if (!strcmp(ss_name, ss->name) || > > + !strcmp(ss_name, ss->legacy_name)) > > + attach_ss = ss; > > + > > + if (!attach_ss) > > + return -EINVAL; > > + > > + link = kzalloc(sizeof(*link), GFP_USER); > > + if (!link) > > + return -ENOMEM; > > + > > + bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP_SUBSYS, > > + &bpf_cgroup_subsys_link_lops, > > + prog); > > + link->ss = attach_ss; > > + > > + err = 
bpf_link_prime(&link->link, &link_primer); > > + if (err) { > > + kfree(link); > > + return err; > > + } > > + > > + err = cgroup_subsys_bpf_attach(attach_ss, prog); > > + if (err) { > > + bpf_link_cleanup(&link_primer); > > + return err; > > + } > > + > > + return bpf_link_settle(&link_primer); > > +} > > + > > +static const struct bpf_func_proto * > > +cgroup_subsys_rstat_func_proto(enum bpf_func_id func_id, > > + const struct bpf_prog *prog) > > +{ > > + return bpf_base_func_proto(func_id); > > +} > > + > > +static bool cgroup_subsys_rstat_is_valid_access(int off, int size, > > + enum bpf_access_type type, > > + const struct bpf_prog *prog, > > + struct bpf_insn_access_aux *info) > > +{ > > + if (type == BPF_WRITE) > > + return false; > > + > > + if (off < 0 || off + size > sizeof(struct bpf_rstat_ctx)) > > + return false; > > + /* The verifier guarantees that size > 0 */ > > + if (off % size != 0) > > + return false; > > + > > + switch (off) { > > + case offsetof(struct bpf_rstat_ctx, cgroup_id): > > + return size == sizeof(__u64); > > + case offsetof(struct bpf_rstat_ctx, parent_cgroup_id): > > + return size == sizeof(__u64); > > + case offsetof(struct bpf_rstat_ctx, cpu): > > + return size == sizeof(__s32); > > + default: > > + return false; > > + } > > +} > > + > > +const struct bpf_prog_ops cgroup_subsys_rstat_prog_ops = { > > +}; > > + > > +const struct bpf_verifier_ops cgroup_subsys_rstat_verifier_ops = { > > + .get_func_proto = cgroup_subsys_rstat_func_proto, > > + .is_valid_access = cgroup_subsys_rstat_is_valid_access, > > +}; > > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > > index cdaa1152436a..48149c54d969 100644 > > --- a/kernel/bpf/syscall.c > > +++ b/kernel/bpf/syscall.c > > @@ -3,6 +3,7 @@ > > */ > > #include <linux/bpf.h> > > #include <linux/bpf-cgroup.h> > > +#include <linux/bpf-cgroup-subsys.h> > > #include <linux/bpf_trace.h> > > #include <linux/bpf_lirc.h> > > #include <linux/bpf_verifier.h> > > @@ -3194,6 +3195,8 @@ 
attach_type_to_prog_type(enum bpf_attach_type attach_type) > > return BPF_PROG_TYPE_SK_LOOKUP; > > case BPF_XDP: > > return BPF_PROG_TYPE_XDP; > > + case BPF_CGROUP_SUBSYS_RSTAT: > > + return BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT; > > default: > > return BPF_PROG_TYPE_UNSPEC; > > } > > @@ -4341,6 +4344,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) > > else > > ret = bpf_kprobe_multi_link_attach(attr, prog); > > break; > > + case BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT: > > + ret = cgroup_subsys_bpf_link_attach(attr, prog); > > + break; > > default: > > ret = -EINVAL; > > } > > diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c > > index adb820e98f24..7b1448013009 100644 > > --- a/kernel/cgroup/cgroup.c > > +++ b/kernel/cgroup/cgroup.c > > @@ -5745,6 +5745,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) > > > > idr_init(&ss->css_idr); > > INIT_LIST_HEAD(&ss->cfts); > > + INIT_LIST_HEAD(&ss->bpf.rstat_flushers); > > > > /* Create the root cgroup state for this subsystem */ > > ss->root = &cgrp_dfl_root; > > diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h > > index d14b10b85e51..0f4855fa85db 100644 > > --- a/tools/include/uapi/linux/bpf.h > > +++ b/tools/include/uapi/linux/bpf.h > > @@ -952,6 +952,7 @@ enum bpf_prog_type { > > BPF_PROG_TYPE_LSM, > > BPF_PROG_TYPE_SK_LOOKUP, > > BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ > > + BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT, > > }; > > > > enum bpf_attach_type { > > @@ -998,6 +999,7 @@ enum bpf_attach_type { > > BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, > > BPF_PERF_EVENT, > > BPF_TRACE_KPROBE_MULTI, > > + BPF_CGROUP_SUBSYS_RSTAT, > > __MAX_BPF_ATTACH_TYPE > > }; > > > > @@ -1013,6 +1015,7 @@ enum bpf_link_type { > > BPF_LINK_TYPE_XDP = 6, > > BPF_LINK_TYPE_PERF_EVENT = 7, > > BPF_LINK_TYPE_KPROBE_MULTI = 8, > > + BPF_LINK_TYPE_CGROUP_SUBSYS = 9, > > > > MAX_BPF_LINK_TYPE, > > }; > > @@ -1482,6 +1485,9 @@ union bpf_attr { > > */ > > 
__u64 bpf_cookie; > > } perf_event; > > + struct { > > + __u64 name; > > + } cgroup_subsys; > > struct { > > __u32 flags; > > __u32 cnt; > > @@ -6324,6 +6330,12 @@ struct bpf_cgroup_dev_ctx { > > __u32 minor; > > }; > > > > +struct bpf_rstat_ctx { > > + __u64 cgroup_id; > > + __u64 parent_cgroup_id; /* 0 if root */ > > + __s32 cpu; > > +}; > > + > > struct bpf_raw_tracepoint_args { > > __u64 args[0]; > > }; > > -- > > 2.36.0.512.ge40c2bad7a-goog > >