This patch introduces a new bpf program type, RSTAT_FLUSH, with new corresponding link and attach types. These programs acts as a callback for the rstat framework to call when a stats flush is ongoing. It allows BPF programs to collect and maintain hierarchical stats cgroup stats efficiently by integrating with the rstat framework. See the selftest in the final patch for a practical example. Signed-off-by: Yosry Ahmed <yosryahmed@xxxxxxxxxx> --- include/linux/bpf-rstat.h | 25 +++++ include/linux/bpf_types.h | 4 + include/uapi/linux/bpf.h | 9 ++ kernel/bpf/Makefile | 3 + kernel/bpf/rstat.c | 166 +++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 6 ++ tools/include/uapi/linux/bpf.h | 9 ++ 7 files changed, 222 insertions(+) create mode 100644 include/linux/bpf-rstat.h create mode 100644 kernel/bpf/rstat.c diff --git a/include/linux/bpf-rstat.h b/include/linux/bpf-rstat.h new file mode 100644 index 000000000000..23cad23b5fc2 --- /dev/null +++ b/include/linux/bpf-rstat.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2022 Google LLC. + */ +#ifndef _BPF_RSTAT_H_ +#define _BPF_RSTAT_H_ + +#include <linux/bpf.h> + +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_CGROUPS) + +int bpf_rstat_link_attach(const union bpf_attr *attr, + struct bpf_prog *prog); + +#else /* defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_CGROUPS) */ + +static inline int bpf_rstat_link_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -ENOTSUPP; +} + +#endif /* defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_CGROUPS) */ + +#endif /* _BPF_RSTAT */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 2b9112b80171..ff92299f76a9 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -77,6 +77,10 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LSM, lsm, void *, void *) #endif /* CONFIG_BPF_LSM */ #endif +#ifdef CONFIG_CGROUPS +BPF_PROG_TYPE(BPF_PROG_TYPE_RSTAT_FLUSH, rstat_flush, + struct bpf_rstat_flush_ctx, struct bpf_rstat_flush_ctx) +#endif /* CONFIG_CGROUPS */ BPF_PROG_TYPE(BPF_PROG_TYPE_SYSCALL, bpf_syscall, void *, void *) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0210f85131b3..968e3cb02580 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -952,6 +952,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LSM, BPF_PROG_TYPE_SK_LOOKUP, BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ + BPF_PROG_TYPE_RSTAT_FLUSH, }; enum bpf_attach_type { @@ -998,6 +999,7 @@ enum bpf_attach_type { BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, BPF_PERF_EVENT, BPF_TRACE_KPROBE_MULTI, + BPF_RSTAT_FLUSH, __MAX_BPF_ATTACH_TYPE }; @@ -1014,6 +1016,7 @@ enum bpf_link_type { BPF_LINK_TYPE_PERF_EVENT = 7, BPF_LINK_TYPE_KPROBE_MULTI = 8, BPF_LINK_TYPE_STRUCT_OPS = 9, + BPF_LINK_TYPE_RSTAT = 10, MAX_BPF_LINK_TYPE, }; @@ -6359,6 +6362,12 @@ struct bpf_cgroup_dev_ctx { __u32 minor; }; +struct bpf_rstat_flush_ctx { + __bpf_md_ptr(struct cgroup *, cgrp); + __bpf_md_ptr(struct cgroup *, parent); + __s32 cpu; +}; + struct bpf_raw_tracepoint_args { __u64 args[0]; }; diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 057ba8e01e70..0487133b799f 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -36,6 +36,9 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o obj-${CONFIG_BPF_LSM} += bpf_lsm.o endif obj-$(CONFIG_BPF_PRELOAD) += preload/ +ifeq ($(CONFIG_CGROUPS),y) +obj-$(CONFIG_BPF_SYSCALL) += rstat.o +endif obj-$(CONFIG_BPF_SYSCALL) += relo_core.o $(obj)/relo_core.o: $(srctree)/tools/lib/bpf/relo_core.c FORCE diff --git a/kernel/bpf/rstat.c b/kernel/bpf/rstat.c new file mode 100644 index 000000000000..5f529002d4b9 --- /dev/null +++ b/kernel/bpf/rstat.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Functions to manage eBPF programs attached to cgroup subsystems + * + * Copyright 2022 Google LLC. + */ + +#include <linux/bpf-rstat.h> +#include <linux/btf_ids.h> +#include <linux/cgroup.h> +#include <linux/filter.h> + +static LIST_HEAD(bpf_rstat_flushers); +static DEFINE_SPINLOCK(bpf_rstat_flushers_lock); + + +struct bpf_rstat_flusher { + struct bpf_prog *prog; + /* List of BPF rtstat flushers, anchored at subsys->bpf */ + struct list_head list; +}; + +struct bpf_rstat_link { + struct bpf_link link; + struct bpf_rstat_flusher *flusher; +}; + +static int bpf_rstat_flush_attach(struct bpf_prog *prog, + struct bpf_rstat_link *rlink) +{ + struct bpf_rstat_flusher *flusher; + + flusher = kmalloc(sizeof(*flusher), GFP_KERNEL); + if (!flusher) + return -ENOMEM; + + flusher->prog = prog; + rlink->flusher = flusher; + + spin_lock(&bpf_rstat_flushers_lock); + list_add(&flusher->list, &bpf_rstat_flushers); + spin_unlock(&bpf_rstat_flushers_lock); + + return 0; +} + +static void bpf_rstat_flush_detach(struct bpf_rstat_link *rstat_link) +{ + struct bpf_rstat_flusher *flusher = rstat_link->flusher; + + if (!flusher) + return; + + spin_lock(&bpf_rstat_flushers_lock); + list_del(&flusher->list); + bpf_prog_put(flusher->prog); + kfree(flusher); + spin_unlock(&bpf_rstat_flushers_lock); +} + +static const struct bpf_func_proto * +bpf_rstat_flush_func_proto(enum bpf_func_id func_id, + const struct bpf_prog *prog) +{ + return bpf_base_func_proto(func_id); +} + +BTF_ID_LIST_SINGLE(bpf_cgroup_btf_ids, struct, cgroup) + +static bool bpf_rstat_flush_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (type == BPF_WRITE) + return false; + + if (off < 0 || off + size > sizeof(struct bpf_rstat_flush_ctx)) + return false; + /* The verifier guarantees that size > 0 */ + if (off % size != 0) + return false; + + switch (off) { + case bpf_ctx_range_ptr(struct bpf_rstat_flush_ctx, cgrp): + info->reg_type = PTR_TO_BTF_ID; + info->btf_id = bpf_cgroup_btf_ids[0]; + info->btf = bpf_get_btf_vmlinux(); + return !IS_ERR(info->btf) && info->btf && size == sizeof(__u64); + case bpf_ctx_range_ptr(struct bpf_rstat_flush_ctx, parent): + info->reg_type = PTR_TO_BTF_ID_OR_NULL; + info->btf_id = bpf_cgroup_btf_ids[0]; + info->btf = bpf_get_btf_vmlinux(); + return !IS_ERR(info->btf) && info->btf && size == sizeof(__u64); + case bpf_ctx_range(struct bpf_rstat_flush_ctx, cpu): + return size == sizeof(__s32); + default: + return false; + } +} + +const struct bpf_prog_ops rstat_flush_prog_ops = { +}; + +const struct bpf_verifier_ops rstat_flush_verifier_ops = { + .get_func_proto = bpf_rstat_flush_func_proto, + .is_valid_access = bpf_rstat_flush_is_valid_access, +}; + +static void bpf_rstat_link_release(struct bpf_link *link) +{ + struct bpf_rstat_link *rlink; + + rlink = container_of(link, + struct bpf_rstat_link, + link); + + /* rstat flushers are currently the only supported rstat programs */ + bpf_rstat_flush_detach(rlink); +} + +static void bpf_rstat_link_dealloc(struct bpf_link *link) +{ + struct bpf_rstat_link *rlink = container_of(link, + struct bpf_rstat_link, + link); + kfree(rlink); +} + +static const struct bpf_link_ops bpf_rstat_link_lops = { + .release = bpf_rstat_link_release, + .dealloc = bpf_rstat_link_dealloc, +}; + +int bpf_rstat_link_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + struct bpf_link_primer link_primer; + struct bpf_rstat_link *link; + int err; + + if (attr->link_create.target_fd || attr->link_create.flags) + return -EINVAL; + + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) + return -ENOMEM; + + bpf_link_init(&link->link, BPF_LINK_TYPE_RSTAT, + &bpf_rstat_link_lops, prog); + + err = bpf_link_prime(&link->link, &link_primer); + if (err) { + kfree(link); + return err; + } + + /* rstat flushers are currently the only supported rstat programs */ + err = bpf_rstat_flush_attach(prog, link); + if (err) { + bpf_link_cleanup(&link_primer); + return err; + } + + return bpf_link_settle(&link_primer); +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 72e53489165d..ffeed8379b35 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3,6 +3,7 @@ */ #include <linux/bpf.h> #include <linux/bpf-cgroup.h> +#include <linux/bpf-rstat.h> #include <linux/bpf_trace.h> #include <linux/bpf_lirc.h> #include <linux/bpf_verifier.h> @@ -3416,6 +3417,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_SK_LOOKUP; case BPF_XDP: return BPF_PROG_TYPE_XDP; + case BPF_RSTAT_FLUSH: + return BPF_PROG_TYPE_RSTAT_FLUSH; default: return BPF_PROG_TYPE_UNSPEC; } @@ -4564,6 +4567,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) else ret = bpf_kprobe_multi_link_attach(attr, prog); break; + case BPF_PROG_TYPE_RSTAT_FLUSH: + ret = bpf_rstat_link_attach(attr, prog); + break; default: ret = -EINVAL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0210f85131b3..968e3cb02580 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -952,6 +952,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LSM, BPF_PROG_TYPE_SK_LOOKUP, BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ + BPF_PROG_TYPE_RSTAT_FLUSH, }; enum bpf_attach_type { @@ -998,6 +999,7 @@ enum bpf_attach_type { BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, BPF_PERF_EVENT, BPF_TRACE_KPROBE_MULTI, + BPF_RSTAT_FLUSH, __MAX_BPF_ATTACH_TYPE }; @@ -1014,6 +1016,7 @@ enum bpf_link_type { BPF_LINK_TYPE_PERF_EVENT = 7, BPF_LINK_TYPE_KPROBE_MULTI = 8, BPF_LINK_TYPE_STRUCT_OPS = 9, + BPF_LINK_TYPE_RSTAT = 10, MAX_BPF_LINK_TYPE, }; @@ -6359,6 +6362,12 @@ struct bpf_cgroup_dev_ctx { __u32 minor; }; +struct bpf_rstat_flush_ctx { + __bpf_md_ptr(struct cgroup *, cgrp); + __bpf_md_ptr(struct cgroup *, parent); + __s32 cpu; +}; + struct bpf_raw_tracepoint_args { __u64 args[0]; }; -- 2.36.0.550.gb090851708-goog