Re: [RFC PATCH bpf-next 1/9] bpf: introduce CGROUP_SUBSYS_RSTAT program type

Yosry Ahmed <yosryahmed@xxxxxxxxxx> · Tue, 10 May 2022 11:07:59 -0700

On Mon, May 9, 2022 at 5:18 PM Yosry Ahmed <yosryahmed@xxxxxxxxxx> wrote:
>
> This patch introduces a new bpf program type CGROUP_SUBSYS_RSTAT,
> with new corresponding link and attach types.
>
> The main purpose of these programs is to allow BPF programs to collect
> and maintain hierarchical cgroup stats easily and efficiently by making
> use of the rstat framework in the kernel.
>
> Those programs attach to a cgroup subsystem. They typically contain logic
> to aggregate per-cpu and per-cgroup stats collected by other BPF programs.
>
> Currently, only rstat flusher programs can be attached to cgroup
> subsystems, but this can be extended later if a use-case arises.
>
> See the selftest in the final patch for a practical example.
>
> Signed-off-by: Yosry Ahmed <yosryahmed@xxxxxxxxxx>
> ---
>  include/linux/bpf-cgroup-subsys.h |  30 ++++++
>  include/linux/bpf_types.h         |   2 +
>  include/linux/cgroup-defs.h       |   4 +
>  include/uapi/linux/bpf.h          |  12 +++
>  kernel/bpf/Makefile               |   1 +
>  kernel/bpf/cgroup_subsys.c        | 166 ++++++++++++++++++++++++++++++
>  kernel/bpf/syscall.c              |   6 ++
>  kernel/cgroup/cgroup.c            |   1 +
>  tools/include/uapi/linux/bpf.h    |  12 +++
>  9 files changed, 234 insertions(+)
>  create mode 100644 include/linux/bpf-cgroup-subsys.h
>  create mode 100644 kernel/bpf/cgroup_subsys.c
>
> diff --git a/include/linux/bpf-cgroup-subsys.h b/include/linux/bpf-cgroup-subsys.h
> new file mode 100644
> index 000000000000..4dcde06b5599
> --- /dev/null
> +++ b/include/linux/bpf-cgroup-subsys.h
> @@ -0,0 +1,30 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright 2022 Google LLC.
> + */
> +#ifndef _BPF_CGROUP_SUBSYS_H_
> +#define _BPF_CGROUP_SUBSYS_H_
> +
> +#include <linux/bpf.h>
> +
> +struct cgroup_subsys_bpf {
> +       /* Head of the list of BPF rstat flushers attached to this subsystem */
> +       struct list_head rstat_flushers;
> +       spinlock_t flushers_lock;
> +};
> +
> +struct bpf_subsys_rstat_flusher {
> +       struct bpf_prog *prog;
> +       /* List of BPF rstat flushers, anchored at subsys->bpf */
> +       struct list_head list;
> +};
> +
> +struct bpf_cgroup_subsys_link {
> +       struct bpf_link link;
> +       struct cgroup_subsys *ss;
> +};
> +
> +int cgroup_subsys_bpf_link_attach(const union bpf_attr *attr,
> +                                 struct bpf_prog *prog);
> +

In the next version I will make sure everything here is also defined
when CONFIG_BPF_SYSCALL is not set, and I will move the structs that
do not need to be in the header into the .c file.

> +#endif  // _BPF_CGROUP_SUBSYS_H_
> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> index 3e24ad0c4b3c..854ee958b0e4 100644
> --- a/include/linux/bpf_types.h
> +++ b/include/linux/bpf_types.h
> @@ -56,6 +56,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl,
>               struct bpf_sysctl, struct bpf_sysctl_kern)
>  BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCKOPT, cg_sockopt,
>               struct bpf_sockopt, struct bpf_sockopt_kern)
> +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT, cgroup_subsys_rstat,
> +             struct bpf_rstat_ctx, struct bpf_rstat_ctx)
>  #endif
>  #ifdef CONFIG_BPF_LIRC_MODE2
>  BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2,
> diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
> index 1bfcfb1af352..3bd6eed1fa13 100644
> --- a/include/linux/cgroup-defs.h
> +++ b/include/linux/cgroup-defs.h
> @@ -20,6 +20,7 @@
>  #include <linux/u64_stats_sync.h>
>  #include <linux/workqueue.h>
>  #include <linux/bpf-cgroup-defs.h>
> +#include <linux/bpf-cgroup-subsys.h>
>  #include <linux/psi_types.h>
>
>  #ifdef CONFIG_CGROUPS
> @@ -706,6 +707,9 @@ struct cgroup_subsys {
>          * specifies the mask of subsystems that this one depends on.
>          */
>         unsigned int depends_on;
> +
> +       /* used to store bpf programs.*/
> +       struct cgroup_subsys_bpf bpf;
>  };
>
>  extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index d14b10b85e51..0f4855fa85db 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -952,6 +952,7 @@ enum bpf_prog_type {
>         BPF_PROG_TYPE_LSM,
>         BPF_PROG_TYPE_SK_LOOKUP,
>         BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
> +       BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT,
>  };
>
>  enum bpf_attach_type {
> @@ -998,6 +999,7 @@ enum bpf_attach_type {
>         BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
>         BPF_PERF_EVENT,
>         BPF_TRACE_KPROBE_MULTI,
> +       BPF_CGROUP_SUBSYS_RSTAT,
>         __MAX_BPF_ATTACH_TYPE
>  };
>
> @@ -1013,6 +1015,7 @@ enum bpf_link_type {
>         BPF_LINK_TYPE_XDP = 6,
>         BPF_LINK_TYPE_PERF_EVENT = 7,
>         BPF_LINK_TYPE_KPROBE_MULTI = 8,
> +       BPF_LINK_TYPE_CGROUP_SUBSYS = 9,
>
>         MAX_BPF_LINK_TYPE,
>  };
> @@ -1482,6 +1485,9 @@ union bpf_attr {
>                                  */
>                                 __u64           bpf_cookie;
>                         } perf_event;
> +                       struct {
> +                               __u64           name;
> +                       } cgroup_subsys;
>                         struct {
>                                 __u32           flags;
>                                 __u32           cnt;
> @@ -6324,6 +6330,12 @@ struct bpf_cgroup_dev_ctx {
>         __u32 minor;
>  };
>
> +struct bpf_rstat_ctx {
> +       __u64 cgroup_id;
> +       __u64 parent_cgroup_id; /* 0 if root */
> +       __s32 cpu;
> +};
> +
>  struct bpf_raw_tracepoint_args {
>         __u64 args[0];
>  };
> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
> index c1a9be6a4b9f..6caf4a61e543 100644
> --- a/kernel/bpf/Makefile
> +++ b/kernel/bpf/Makefile
> @@ -25,6 +25,7 @@ ifeq ($(CONFIG_PERF_EVENTS),y)
>  obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
>  endif
>  obj-$(CONFIG_CGROUP_BPF) += cgroup.o
> +obj-$(CONFIG_CGROUP_BPF) += cgroup_subsys.o

In the next version I will replace this with:
ifeq ($(CONFIG_CGROUPS),y)
obj-$(CONFIG_BPF_SYSCALL) += cgroup_subsys.o
endif

as this program type doesn't attach to cgroups and does not depend
on CONFIG_CGROUP_BPF, only on CONFIG_CGROUPS and CONFIG_BPF_SYSCALL.

>  ifeq ($(CONFIG_INET),y)
>  obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o
>  endif
> diff --git a/kernel/bpf/cgroup_subsys.c b/kernel/bpf/cgroup_subsys.c
> new file mode 100644
> index 000000000000..9673ce6aa84a
> --- /dev/null
> +++ b/kernel/bpf/cgroup_subsys.c
> @@ -0,0 +1,166 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Functions to manage eBPF programs attached to cgroup subsystems
> + *
> + * Copyright 2022 Google LLC.
> + */
> +
> +#include <linux/bpf-cgroup-subsys.h>
> +#include <linux/filter.h>
> +
> +#include "../cgroup/cgroup-internal.h"
> +
> +
> +static int cgroup_subsys_bpf_attach(struct cgroup_subsys *ss, struct bpf_prog *prog)
> +{
> +       struct bpf_subsys_rstat_flusher *rstat_flusher;
> +
> +       rstat_flusher = kmalloc(sizeof(*rstat_flusher), GFP_KERNEL);
> +       if (!rstat_flusher)
> +               return -ENOMEM;
> +       rstat_flusher->prog = prog;
> +
> +       spin_lock(&ss->bpf.flushers_lock);
> +       list_add(&rstat_flusher->list, &ss->bpf.rstat_flushers);
> +       spin_unlock(&ss->bpf.flushers_lock);
> +
> +       return 0;
> +}
> +
> +static void cgroup_subsys_bpf_detach(struct cgroup_subsys *ss, struct bpf_prog *prog)
> +{
> +       struct bpf_subsys_rstat_flusher *rstat_flusher = NULL;
> +
> +       spin_lock(&ss->bpf.flushers_lock);
> +       list_for_each_entry(rstat_flusher, &ss->bpf.rstat_flushers, list)
> +               if (rstat_flusher->prog == prog)
> +                       break;
> +
> +       if (rstat_flusher) {
> +               list_del(&rstat_flusher->list);
> +               bpf_prog_put(rstat_flusher->prog);
> +               kfree(rstat_flusher);
> +       }
> +       spin_unlock(&ss->bpf.flushers_lock);
> +}
> +
> +static void bpf_cgroup_subsys_link_release(struct bpf_link *link)
> +{
> +       struct bpf_cgroup_subsys_link *ss_link = container_of(link,
> +                                                      struct bpf_cgroup_subsys_link,
> +                                                      link);
> +       if (ss_link->ss) {
> +               cgroup_subsys_bpf_detach(ss_link->ss, ss_link->link.prog);
> +               ss_link->ss = NULL;
> +       }
> +}
> +
> +static int bpf_cgroup_subsys_link_detach(struct bpf_link *link)
> +{
> +       bpf_cgroup_subsys_link_release(link);
> +       return 0;
> +}
> +
> +static void bpf_cgroup_subsys_link_dealloc(struct bpf_link *link)
> +{
> +       struct bpf_cgroup_subsys_link *ss_link = container_of(link,
> +                                                      struct bpf_cgroup_subsys_link,
> +                                                      link);
> +       kfree(ss_link);
> +}
> +
> +static const struct bpf_link_ops bpf_cgroup_subsys_link_lops = {
> +       .detach = bpf_cgroup_subsys_link_detach,
> +       .release = bpf_cgroup_subsys_link_release,
> +       .dealloc = bpf_cgroup_subsys_link_dealloc,
> +};
> +
> +int cgroup_subsys_bpf_link_attach(const union bpf_attr *attr,
> +                                 struct bpf_prog *prog)
> +{
> +       struct bpf_link_primer link_primer;
> +       struct bpf_cgroup_subsys_link *link;
> +       struct cgroup_subsys *ss, *attach_ss = NULL;
> +       const char __user *ss_name_user;
> +       char ss_name[MAX_CGROUP_TYPE_NAMELEN];
> +       int ssid, err;
> +
> +       if (attr->link_create.target_fd || attr->link_create.flags)
> +               return -EINVAL;
> +
> +       ss_name_user = u64_to_user_ptr(attr->link_create.cgroup_subsys.name);
> +       if (strncpy_from_user(ss_name, ss_name_user, sizeof(ss_name) - 1) < 0)
> +               return -EFAULT;
> +
> +       for_each_subsys(ss, ssid)
> +               if (!strcmp(ss_name, ss->name) ||
> +                   !strcmp(ss_name, ss->legacy_name))
> +                       attach_ss = ss;
> +
> +       if (!attach_ss)
> +               return -EINVAL;
> +
> +       link = kzalloc(sizeof(*link), GFP_USER);
> +       if (!link)
> +               return -ENOMEM;
> +
> +       bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP_SUBSYS,
> +                     &bpf_cgroup_subsys_link_lops,
> +                     prog);
> +       link->ss = attach_ss;
> +
> +       err = bpf_link_prime(&link->link, &link_primer);
> +       if (err) {
> +               kfree(link);
> +               return err;
> +       }
> +
> +       err = cgroup_subsys_bpf_attach(attach_ss, prog);
> +       if (err) {
> +               bpf_link_cleanup(&link_primer);
> +               return err;
> +       }
> +
> +       return bpf_link_settle(&link_primer);
> +}
> +
> +static const struct bpf_func_proto *
> +cgroup_subsys_rstat_func_proto(enum bpf_func_id func_id,
> +                              const struct bpf_prog *prog)
> +{
> +       return bpf_base_func_proto(func_id);
> +}
> +
> +static bool cgroup_subsys_rstat_is_valid_access(int off, int size,
> +                                          enum bpf_access_type type,
> +                                          const struct bpf_prog *prog,
> +                                          struct bpf_insn_access_aux *info)
> +{
> +       if (type == BPF_WRITE)
> +               return false;
> +
> +       if (off < 0 || off + size > sizeof(struct bpf_rstat_ctx))
> +               return false;
> +       /* The verifier guarantees that size > 0 */
> +       if (off % size != 0)
> +               return false;
> +
> +       switch (off) {
> +       case offsetof(struct bpf_rstat_ctx, cgroup_id):
> +               return size == sizeof(__u64);
> +       case offsetof(struct bpf_rstat_ctx, parent_cgroup_id):
> +               return size == sizeof(__u64);
> +       case offsetof(struct bpf_rstat_ctx, cpu):
> +               return size == sizeof(__s32);
> +       default:
> +               return false;
> +       }
> +}
> +
> +const struct bpf_prog_ops cgroup_subsys_rstat_prog_ops = {
> +};
> +
> +const struct bpf_verifier_ops cgroup_subsys_rstat_verifier_ops = {
> +       .get_func_proto         = cgroup_subsys_rstat_func_proto,
> +       .is_valid_access        = cgroup_subsys_rstat_is_valid_access,
> +};
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index cdaa1152436a..48149c54d969 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -3,6 +3,7 @@
>   */
>  #include <linux/bpf.h>
>  #include <linux/bpf-cgroup.h>
> +#include <linux/bpf-cgroup-subsys.h>
>  #include <linux/bpf_trace.h>
>  #include <linux/bpf_lirc.h>
>  #include <linux/bpf_verifier.h>
> @@ -3194,6 +3195,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
>                 return BPF_PROG_TYPE_SK_LOOKUP;
>         case BPF_XDP:
>                 return BPF_PROG_TYPE_XDP;
> +       case BPF_CGROUP_SUBSYS_RSTAT:
> +               return BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT;
>         default:
>                 return BPF_PROG_TYPE_UNSPEC;
>         }
> @@ -4341,6 +4344,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
>                 else
>                         ret = bpf_kprobe_multi_link_attach(attr, prog);
>                 break;
> +       case BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT:
> +               ret = cgroup_subsys_bpf_link_attach(attr, prog);
> +               break;
>         default:
>                 ret = -EINVAL;
>         }
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index adb820e98f24..7b1448013009 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -5745,6 +5745,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
>
>         idr_init(&ss->css_idr);
>         INIT_LIST_HEAD(&ss->cfts);
> +       INIT_LIST_HEAD(&ss->bpf.rstat_flushers);
>
>         /* Create the root cgroup state for this subsystem */
>         ss->root = &cgrp_dfl_root;
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index d14b10b85e51..0f4855fa85db 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -952,6 +952,7 @@ enum bpf_prog_type {
>         BPF_PROG_TYPE_LSM,
>         BPF_PROG_TYPE_SK_LOOKUP,
>         BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
> +       BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT,
>  };
>
>  enum bpf_attach_type {
> @@ -998,6 +999,7 @@ enum bpf_attach_type {
>         BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
>         BPF_PERF_EVENT,
>         BPF_TRACE_KPROBE_MULTI,
> +       BPF_CGROUP_SUBSYS_RSTAT,
>         __MAX_BPF_ATTACH_TYPE
>  };
>
> @@ -1013,6 +1015,7 @@ enum bpf_link_type {
>         BPF_LINK_TYPE_XDP = 6,
>         BPF_LINK_TYPE_PERF_EVENT = 7,
>         BPF_LINK_TYPE_KPROBE_MULTI = 8,
> +       BPF_LINK_TYPE_CGROUP_SUBSYS = 9,
>
>         MAX_BPF_LINK_TYPE,
>  };
> @@ -1482,6 +1485,9 @@ union bpf_attr {
>                                  */
>                                 __u64           bpf_cookie;
>                         } perf_event;
> +                       struct {
> +                               __u64           name;
> +                       } cgroup_subsys;
>                         struct {
>                                 __u32           flags;
>                                 __u32           cnt;
> @@ -6324,6 +6330,12 @@ struct bpf_cgroup_dev_ctx {
>         __u32 minor;
>  };
>
> +struct bpf_rstat_ctx {
> +       __u64 cgroup_id;
> +       __u64 parent_cgroup_id; /* 0 if root */
> +       __s32 cpu;
> +};
> +
>  struct bpf_raw_tracepoint_args {
>         __u64 args[0];
>  };
> --
> 2.36.0.512.ge40c2bad7a-goog
>