On Thu, Feb 13, 2025 at 8:48 AM Yonghong Song <yonghong.song@xxxxxxxxx> wrote:
>
> Currently for bpf progs in a cgroup hierarchy, the effective prog array
> is computed from the bottom cgroup to upper cgroups (post-ordering). For
> example, the following cgroup hierarchy
>   root cgroup: p1, p2
>   subcgroup: p3, p4
> has BPF_F_ALLOW_MULTI at both cgroup levels.
> The effective cgroup array ordering looks like
>   p3 p4 p1 p2
> and at run time, progs will execute based on that order.
>
> But in some cases, it is desirable to have a root prog execute earlier
> than children progs (pre-ordering). For example,
>   - prog p1 intends to collect original pkt dest addresses.
>   - prog p3 will modify original pkt dest addresses to a proxy address
>     for security reasons.
> The end result is that prog p1 gets the proxy address, which is not what
> it wants. Putting p1 in every child cgroup is not desirable either, as it
> will duplicate itself in many child cgroups. And this is exactly a use
> case we are encountering in Meta.
>
> To fix this issue, let us introduce a flag BPF_F_PREORDER. If the flag
> is specified at attachment time, the prog has higher priority and the
> ordering with that flag will be from top to bottom (pre-ordering).
> In the above example,
>   root cgroup: p1, p2
>   subcgroup: p3, p4
> let us say p2 and p4 are marked with BPF_F_PREORDER. The final
> effective array ordering will be
>   p2 p4 p3 p1
>
> Suggested-by: Andrii Nakryiko <andrii@xxxxxxxxxx>
> Signed-off-by: Yonghong Song <yonghong.song@xxxxxxxxx>
> ---
>  include/linux/bpf-cgroup.h     |  1 +
>  include/uapi/linux/bpf.h       |  1 +
>  kernel/bpf/cgroup.c            | 33 +++++++++++++++++++++++++--------
>  kernel/bpf/syscall.c           |  3 ++-
>  tools/include/uapi/linux/bpf.h |  1 +
>  5 files changed, 30 insertions(+), 9 deletions(-)
>

LGTM, see one suggestion below, but it doesn't change the essence

Acked-by: Andrii Nakryiko <andrii@xxxxxxxxxx>
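As a usage illustration (not part of the patch; this assumes a libbpf
recent enough to provide bpf_prog_attach_opts(), and the helper name
attach_preorder() is made up for the example), attaching a prog with
the new flag could look like:

#include <bpf/bpf.h>
#include <bpf/libbpf.h>

/* BPF_F_PREORDER is added by this patch; define it locally if the
 * installed uapi headers predate it
 */
#ifndef BPF_F_PREORDER
#define BPF_F_PREORDER (1U << 6)
#endif

/* prog_fd: a loaded cgroup prog, cgroup_fd: an open cgroup directory */
static int attach_preorder(int prog_fd, int cgroup_fd)
{
        LIBBPF_OPTS(bpf_prog_attach_opts, opts,
                .flags = BPF_F_ALLOW_MULTI | BPF_F_PREORDER,
        );

        return bpf_prog_attach_opts(prog_fd, cgroup_fd,
                                    BPF_CGROUP_INET_INGRESS, &opts);
}

With BPF_F_PREORDER set, this prog would run before progs attached in
child cgroups instead of after them.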
> diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> index 7fc69083e745..9de7adb68294 100644
> --- a/include/linux/bpf-cgroup.h
> +++ b/include/linux/bpf-cgroup.h
> @@ -111,6 +111,7 @@ struct bpf_prog_list {
>         struct bpf_prog *prog;
>         struct bpf_cgroup_link *link;
>         struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE];
> +       u32 flags;
>  };
>
>  int cgroup_bpf_inherit(struct cgroup *cgrp);
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index fff6cdb8d11a..beac5cdf2d2c 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1207,6 +1207,7 @@ enum bpf_perf_event_type {
>  #define BPF_F_BEFORE           (1U << 3)
>  #define BPF_F_AFTER            (1U << 4)
>  #define BPF_F_ID               (1U << 5)
> +#define BPF_F_PREORDER         (1U << 6)
>  #define BPF_F_LINK             BPF_F_LINK /* 1 << 13 */
>
>  /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index 46e5db65dbc8..31d33058174c 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -369,7 +369,7 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
>  /* count number of elements in the list.
>   * it's slow but the list cannot be long
>   */
> -static u32 prog_list_length(struct hlist_head *head)
> +static u32 prog_list_length(struct hlist_head *head, int *preorder_cnt)
>  {
>         struct bpf_prog_list *pl;
>         u32 cnt = 0;
>
> @@ -377,6 +377,8 @@ static u32 prog_list_length(struct hlist_head *head)
>         hlist_for_each_entry(pl, head, node) {
>                 if (!prog_list_prog(pl))
>                         continue;
> +               if (preorder_cnt && (pl->flags & BPF_F_PREORDER))
> +                       (*preorder_cnt)++;
>                 cnt++;
>         }
>         return cnt;
> @@ -400,7 +402,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp,
>
>                 if (flags & BPF_F_ALLOW_MULTI)
>                         return true;
> -               cnt = prog_list_length(&p->bpf.progs[atype]);
> +               cnt = prog_list_length(&p->bpf.progs[atype], NULL);
>                 WARN_ON_ONCE(cnt > 1);
>                 if (cnt == 1)
>                         return !!(flags & BPF_F_ALLOW_OVERRIDE);
> @@ -423,12 +425,12 @@ static int compute_effective_progs(struct cgroup *cgrp,
>         struct bpf_prog_array *progs;
>         struct bpf_prog_list *pl;
>         struct cgroup *p = cgrp;
> -       int cnt = 0;
> +       int i, cnt = 0, preorder_cnt = 0, fstart, bstart, init_bstart;
>
>         /* count number of effective programs by walking parents */
>         do {
>                 if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
> -                       cnt += prog_list_length(&p->bpf.progs[atype]);
> +                       cnt += prog_list_length(&p->bpf.progs[atype], &preorder_cnt);
>                 p = cgroup_parent(p);
>         } while (p);
>
> @@ -439,20 +441,34 @@ static int compute_effective_progs(struct cgroup *cgrp,
>         /* populate the array with effective progs */
>         cnt = 0;
>         p = cgrp;
> +       fstart = preorder_cnt;
> +       bstart = preorder_cnt - 1;
>         do {
>                 if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
>                         continue;
>
> +               init_bstart = bstart;
>                 hlist_for_each_entry(pl, &p->bpf.progs[atype], node) {
>                         if (!prog_list_prog(pl))
>                                 continue;
>
> -                       item = &progs->items[cnt];
> +                       if (pl->flags & BPF_F_PREORDER) {
> +                               item = &progs->items[bstart];
> +                               bstart--;
> +                       } else {
> +                               item = &progs->items[fstart];
> +                               fstart++;
> +                       }
>                         item->prog = prog_list_prog(pl);
>                         bpf_cgroup_storages_assign(item->cgroup_storage,
>                                                    pl->storage);
>                         cnt++;
>                 }
> +
> +               /* reverse pre-ordering progs at this cgroup level */
> +               for (i = 0; i < (init_bstart - bstart)/2; i++)
> +                       swap(progs->items[init_bstart - i], progs->items[bstart + 1 + i]);

nit: this is a bit mind-bending to read and verify, let's do it a bit
more bullet-proof way:

for (i = bstart + 1, j = init_bstart; i < j; i++, j--)
        swap(progs->items[i], progs->items[j]);

?
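To make the index arithmetic easier to double-check, here is a small
standalone simulation (my own sketch in plain userspace C, not kernel
code) of the fstart/bstart fill and the per-level reversal, using the
example from the commit message (p2 and p4 marked BPF_F_PREORDER):

#include <stdbool.h>
#include <stdio.h>

struct prog { const char *name; bool preorder; };

static void swap_prog(struct prog *a, struct prog *b)
{
        struct prog tmp = *a; *a = *b; *b = tmp;
}

int main(void)
{
        /* levels[0] is the subcgroup, levels[1] its parent (root),
         * mirroring the child-to-root walk in compute_effective_progs()
         */
        struct prog levels[2][2] = {
                { { "p3", false }, { "p4", true } },
                { { "p1", false }, { "p2", true } },
        };
        struct prog eff[4];
        int preorder_cnt = 2;          /* p2 and p4 */
        int fstart = preorder_cnt;     /* non-preorder progs fill forward */
        int bstart = preorder_cnt - 1; /* preorder progs fill backward */
        int lvl, i, j, init_bstart;

        for (lvl = 0; lvl < 2; lvl++) {
                init_bstart = bstart;
                for (i = 0; i < 2; i++) {
                        if (levels[lvl][i].preorder)
                                eff[bstart--] = levels[lvl][i];
                        else
                                eff[fstart++] = levels[lvl][i];
                }
                /* restore attach order among this level's preorder
                 * progs, written as the simpler loop suggested above
                 */
                for (i = bstart + 1, j = init_bstart; i < j; i++, j--)
                        swap_prog(&eff[i], &eff[j]);
        }

        for (i = 0; i < 4; i++)        /* prints: p2 p4 p3 p1 */
                printf("%s ", eff[i].name);
        printf("\n");
        return 0;
}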
> +
>         } while ((p = cgroup_parent(p)));
>
>         *array = progs;
> @@ -663,7 +679,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
>                  */
>                 return -EPERM;
>
> -       if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
> +       if (prog_list_length(progs, NULL) >= BPF_CGROUP_MAX_PROGS)
>                 return -E2BIG;
>
>         pl = find_attach_entry(progs, prog, link, replace_prog,
> @@ -698,6 +714,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
>
>         pl->prog = prog;
>         pl->link = link;
> +       pl->flags = flags;
>         bpf_cgroup_storages_assign(pl->storage, storage);
>         cgrp->bpf.flags[atype] = saved_flags;
>
> @@ -1073,7 +1090,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
>                                               lockdep_is_held(&cgroup_mutex));
>                 total_cnt += bpf_prog_array_length(effective);
>         } else {
> -               total_cnt += prog_list_length(&cgrp->bpf.progs[atype]);
> +               total_cnt += prog_list_length(&cgrp->bpf.progs[atype], NULL);
>         }
>  }
>
> @@ -1105,7 +1122,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
>                 u32 id;
>
>                 progs = &cgrp->bpf.progs[atype];
> -               cnt = min_t(int, prog_list_length(progs), total_cnt);
> +               cnt = min_t(int, prog_list_length(progs, NULL), total_cnt);
>                 i = 0;
>                 hlist_for_each_entry(pl, progs, node) {
>                         prog = prog_list_prog(pl);
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index c420edbfb7c8..18de4d301368 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -4168,7 +4168,8 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
>  #define BPF_F_ATTACH_MASK_BASE \
>         (BPF_F_ALLOW_OVERRIDE | \
>          BPF_F_ALLOW_MULTI |    \
> -        BPF_F_REPLACE)
> +        BPF_F_REPLACE |        \
> +        BPF_F_PREORDER)
>
>  #define BPF_F_ATTACH_MASK_MPROG \
>         (BPF_F_REPLACE | \
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index fff6cdb8d11a..beac5cdf2d2c 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -1207,6 +1207,7 @@ enum bpf_perf_event_type {
>  #define BPF_F_BEFORE           (1U << 3)
>  #define BPF_F_AFTER            (1U << 4)
>  #define BPF_F_ID               (1U << 5)
> +#define BPF_F_PREORDER         (1U << 6)
>  #define BPF_F_LINK             BPF_F_LINK /* 1 << 13 */
>
>  /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the
> --
> 2.43.5
>