On Thu, Feb 13, 2025 at 8:48 AM Yonghong Song <yonghong.song@xxxxxxxxx> wrote:
>
> Currently for bpf progs in a cgroup hierarchy, the effective prog array
> is computed from the bottom cgroup to upper cgroups (post-ordering). For
> example, the following cgroup hierarchy
>   root cgroup: p1, p2
>   subcgroup: p3, p4
> has BPF_F_ALLOW_MULTI at both cgroup levels.
> The effective cgroup array ordering looks like
>   p3 p4 p1 p2
> and at run time, progs will execute based on that order.
>
> But in some cases, it is desirable to have a root prog execute earlier
> than children progs (pre-ordering). For example,
>   - prog p1 intends to collect original pkt dest addresses.
>   - prog p3 will modify original pkt dest addresses to a proxy address
>     for security reasons.
> The end result is that prog p1 gets the proxy address, which is not what
> it wants. Putting p1 in every child cgroup is not desirable either, as it
> will duplicate itself in many child cgroups. And this is exactly a use
> case we are encountering in Meta.
>
> To fix this issue, let us introduce a flag BPF_F_PREORDER. If the flag
> is specified at attachment time, the prog has higher priority and the
> ordering with that flag will be from top to bottom (pre-ordering).
> In the above example,
>   root cgroup: p1, p2
>   subcgroup: p3, p4
> let us say p2 and p4 are marked with BPF_F_PREORDER. The final
> effective array ordering will be
>   p2 p4 p3 p1
>
> Suggested-by: Andrii Nakryiko <andrii@xxxxxxxxxx>
> Signed-off-by: Yonghong Song <yonghong.song@xxxxxxxxx>
> ---
>  include/linux/bpf-cgroup.h     |  1 +
>  include/uapi/linux/bpf.h       |  1 +
>  kernel/bpf/cgroup.c            | 33 +++++++++++++++++++++++++--------
>  kernel/bpf/syscall.c           |  3 ++-
>  tools/include/uapi/linux/bpf.h |  1 +
>  5 files changed, 30 insertions(+), 9 deletions(-)
>

LGTM, see one suggestion below, but it doesn't change the essence

Acked-by: Andrii Nakryiko <andrii@xxxxxxxxxx>
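As a usage illustration (not part of the patch; this assumes a libbpf
recent enough to provide bpf_prog_attach_opts(), and the helper name
attach_preorder() is made up for the example), attaching a prog with
the new flag could look like:

#include <bpf/bpf.h>
#include <bpf/libbpf.h>

/* BPF_F_PREORDER is added by this patch; define it locally if the
 * installed uapi headers predate it
 */
#ifndef BPF_F_PREORDER
#define BPF_F_PREORDER (1U << 6)
#endif

/* prog_fd: a loaded cgroup prog, cgroup_fd: an open cgroup directory */
static int attach_preorder(int prog_fd, int cgroup_fd)
{
        LIBBPF_OPTS(bpf_prog_attach_opts, opts,
                .flags = BPF_F_ALLOW_MULTI | BPF_F_PREORDER,
        );

        return bpf_prog_attach_opts(prog_fd, cgroup_fd,
                                    BPF_CGROUP_INET_INGRESS, &opts);
}

With BPF_F_PREORDER set, this prog would run before progs attached in
child cgroups instead of after them.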
> diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> index 7fc69083e745..9de7adb68294 100644
> --- a/include/linux/bpf-cgroup.h
> +++ b/include/linux/bpf-cgroup.h
> @@ -111,6 +111,7 @@ struct bpf_prog_list {
>         struct bpf_prog *prog;
>         struct bpf_cgroup_link *link;
>         struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE];
> +       u32 flags;
>  };
>
>  int cgroup_bpf_inherit(struct cgroup *cgrp);
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index fff6cdb8d11a..beac5cdf2d2c 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1207,6 +1207,7 @@ enum bpf_perf_event_type {
>  #define BPF_F_BEFORE           (1U << 3)
>  #define BPF_F_AFTER            (1U << 4)
>  #define BPF_F_ID               (1U << 5)
> +#define BPF_F_PREORDER         (1U << 6)
>  #define BPF_F_LINK             BPF_F_LINK /* 1 << 13 */
>
>  /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index 46e5db65dbc8..31d33058174c 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -369,7 +369,7 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
>  /* count number of elements in the list.
>   * it's slow but the list cannot be long
>   */
> -static u32 prog_list_length(struct hlist_head *head)
> +static u32 prog_list_length(struct hlist_head *head, int *preorder_cnt)
>  {
>         struct bpf_prog_list *pl;
>         u32 cnt = 0;
>
> @@ -377,6 +377,8 @@ static u32 prog_list_length(struct hlist_head *head)
>         hlist_for_each_entry(pl, head, node) {
>                 if (!prog_list_prog(pl))
>                         continue;
> +               if (preorder_cnt && (pl->flags & BPF_F_PREORDER))
> +                       (*preorder_cnt)++;
>                 cnt++;
>         }
>         return cnt;
> @@ -400,7 +402,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp,
>
>                 if (flags & BPF_F_ALLOW_MULTI)
>                         return true;
> -               cnt = prog_list_length(&p->bpf.progs[atype]);
> +               cnt = prog_list_length(&p->bpf.progs[atype], NULL);
>                 WARN_ON_ONCE(cnt > 1);
>                 if (cnt == 1)
>                         return !!(flags & BPF_F_ALLOW_OVERRIDE);
> @@ -423,12 +425,12 @@ static int compute_effective_progs(struct cgroup *cgrp,
>         struct bpf_prog_array *progs;
>         struct bpf_prog_list *pl;
>         struct cgroup *p = cgrp;
> -       int cnt = 0;
> +       int i, cnt = 0, preorder_cnt = 0, fstart, bstart, init_bstart;
>
>         /* count number of effective programs by walking parents */
>         do {
>                 if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
> -                       cnt += prog_list_length(&p->bpf.progs[atype]);
> +                       cnt += prog_list_length(&p->bpf.progs[atype], &preorder_cnt);
>                 p = cgroup_parent(p);
>         } while (p);
>
> @@ -439,20 +441,34 @@ static int compute_effective_progs(struct cgroup *cgrp,
>         /* populate the array with effective progs */
>         cnt = 0;
>         p = cgrp;
> +       fstart = preorder_cnt;
> +       bstart = preorder_cnt - 1;
>         do {
>                 if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
>                         continue;
>
> +               init_bstart = bstart;
>                 hlist_for_each_entry(pl, &p->bpf.progs[atype], node) {
>                         if (!prog_list_prog(pl))
>                                 continue;
>
> -                       item = &progs->items[cnt];
> +                       if (pl->flags & BPF_F_PREORDER) {
> +                               item = &progs->items[bstart];
> +                               bstart--;
> +                       } else {
> +                               item = &progs->items[fstart];
> +                               fstart++;
> +                       }
>                         item->prog = prog_list_prog(pl);
>                         bpf_cgroup_storages_assign(item->cgroup_storage,
>                                                    pl->storage);
>                         cnt++;
>                 }
> +
> +               /* reverse pre-ordering progs at this cgroup level */
> +               for (i = 0; i < (init_bstart - bstart)/2; i++)
> +                       swap(progs->items[init_bstart - i], progs->items[bstart + 1 + i]);

nit: this is a bit mind-bending to read and verify, let's do it a bit
more bullet-proof way:

for (i = bstart + 1, j = init_bstart; i < j; i++, j--)
        swap(progs->items[i], progs->items[j]);

?
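To make the index arithmetic easier to double-check, here is a small
standalone simulation (my own sketch in plain userspace C, not kernel
code) of the fstart/bstart fill and the per-level reversal, using the
example from the commit message (p2 and p4 marked BPF_F_PREORDER):

#include <stdbool.h>
#include <stdio.h>

struct prog { const char *name; bool preorder; };

static void swap_prog(struct prog *a, struct prog *b)
{
        struct prog tmp = *a; *a = *b; *b = tmp;
}

int main(void)
{
        /* levels[0] is the subcgroup, levels[1] its parent (root),
         * mirroring the child-to-root walk in compute_effective_progs()
         */
        struct prog levels[2][2] = {
                { { "p3", false }, { "p4", true } },
                { { "p1", false }, { "p2", true } },
        };
        struct prog eff[4];
        int preorder_cnt = 2;          /* p2 and p4 */
        int fstart = preorder_cnt;     /* non-preorder progs fill forward */
        int bstart = preorder_cnt - 1; /* preorder progs fill backward */
        int lvl, i, j, init_bstart;

        for (lvl = 0; lvl < 2; lvl++) {
                init_bstart = bstart;
                for (i = 0; i < 2; i++) {
                        if (levels[lvl][i].preorder)
                                eff[bstart--] = levels[lvl][i];
                        else
                                eff[fstart++] = levels[lvl][i];
                }
                /* restore attach order among this level's preorder
                 * progs, written as the simpler loop suggested above
                 */
                for (i = bstart + 1, j = init_bstart; i < j; i++, j--)
                        swap_prog(&eff[i], &eff[j]);
        }

        for (i = 0; i < 4; i++)        /* prints: p2 p4 p3 p1 */
                printf("%s ", eff[i].name);
        printf("\n");
        return 0;
}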
> +
>         } while ((p = cgroup_parent(p)));
>
>         *array = progs;
> @@ -663,7 +679,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
>                  */
>                 return -EPERM;
>
> -       if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
> +       if (prog_list_length(progs, NULL) >= BPF_CGROUP_MAX_PROGS)
>                 return -E2BIG;
>
>         pl = find_attach_entry(progs, prog, link, replace_prog,
> @@ -698,6 +714,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
>
>         pl->prog = prog;
>         pl->link = link;
> +       pl->flags = flags;
>         bpf_cgroup_storages_assign(pl->storage, storage);
>         cgrp->bpf.flags[atype] = saved_flags;
>
> @@ -1073,7 +1090,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
>                                               lockdep_is_held(&cgroup_mutex));
>                 total_cnt += bpf_prog_array_length(effective);
>         } else {
> -               total_cnt += prog_list_length(&cgrp->bpf.progs[atype]);
> +               total_cnt += prog_list_length(&cgrp->bpf.progs[atype], NULL);
>         }
>  }
>
> @@ -1105,7 +1122,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
>                 u32 id;
>
>                 progs = &cgrp->bpf.progs[atype];
> -               cnt = min_t(int, prog_list_length(progs), total_cnt);
> +               cnt = min_t(int, prog_list_length(progs, NULL), total_cnt);
>                 i = 0;
>                 hlist_for_each_entry(pl, progs, node) {
>                         prog = prog_list_prog(pl);
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index c420edbfb7c8..18de4d301368 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -4168,7 +4168,8 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
>  #define BPF_F_ATTACH_MASK_BASE \
>         (BPF_F_ALLOW_OVERRIDE | \
>          BPF_F_ALLOW_MULTI |    \
> -        BPF_F_REPLACE)
> +        BPF_F_REPLACE |        \
> +        BPF_F_PREORDER)
>
>  #define BPF_F_ATTACH_MASK_MPROG \
>         (BPF_F_REPLACE | \
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index fff6cdb8d11a..beac5cdf2d2c 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -1207,6 +1207,7 @@ enum bpf_perf_event_type {
>  #define BPF_F_BEFORE           (1U << 3)
>  #define BPF_F_AFTER            (1U << 4)
>  #define BPF_F_ID               (1U << 5)
> +#define BPF_F_PREORDER         (1U << 6)
>  #define BPF_F_LINK             BPF_F_LINK /* 1 << 13 */
>
>  /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the
> --
> 2.43.5
>