Currently for bpf progs in a cgroup hierarchy, the effective prog array is computed from bottom cgroup to upper cgroups. For example, the following cgroup hierarchy root cgroup: p1, p2 subcgroup: p3, p4 have BPF_F_ALLOW_MULTI for both cgroup levels. The effective cgroup array ordering looks like p3 p4 p1 p2 and at run time, the progs will execute based on that order. But in some cases, it is desirable to have root prog executes earlier than children progs. For example, - prog p1 intends to collect original pkt dest addresses. - prog p3 will modify original pkt dest addresses to a proxy address for security reason. The end result is that prog p1 gets proxy address which is not what it wants. Also, putting p1 to every child cgroup is not desirable either as it will duplicate itself in many child cgroups. And this is exactly a use case we are encountering in Meta. To fix this issue, let us introduce a flag BPF_F_PRIO_TOPDOWN. If the flag is specified at attachment time, the prog has higher priority and the ordering with that flag will be from top to bottom. For example, in the above example, root cgroup: p1, p2 subcgroup: p3, p4 Let us say p1, p2 and p4 are marked with BPF_F_PRIO_TOPDOWN. The final effective array ordering will be p1 p2 p4 p3 Suggested-by: Andrii Nakryiko <andrii@xxxxxxxxxx> Signed-off-by: Yonghong Song <yonghong.song@xxxxxxxxx> --- include/linux/bpf-cgroup.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/cgroup.c | 37 +++++++++++++++++++++++++++++++--- kernel/bpf/syscall.c | 3 ++- tools/include/uapi/linux/bpf.h | 1 + 5 files changed, 39 insertions(+), 4 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 7fc69083e745..3d4f221df9ef 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -111,6 +111,7 @@ struct bpf_prog_list { struct bpf_prog *prog; struct bpf_cgroup_link *link; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]; + bool is_prio_topdown; }; int cgroup_bpf_inherit(struct cgroup *cgrp); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index fff6cdb8d11a..7ae8e8751e78 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1207,6 +1207,7 @@ enum bpf_perf_event_type { #define BPF_F_BEFORE (1U << 3) #define BPF_F_AFTER (1U << 4) #define BPF_F_ID (1U << 5) +#define BPF_F_PRIO_TOPDOWN (1U << 6) #define BPF_F_LINK BPF_F_LINK /* 1 << 13 */ /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 46e5db65dbc8..f31250c6025b 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -382,6 +382,21 @@ static u32 prog_list_length(struct hlist_head *head) return cnt; } +static u32 prog_list_length_with_topdown_cnt(struct hlist_head *head, int *topdown_cnt) +{ + struct bpf_prog_list *pl; + u32 cnt = 0; + + hlist_for_each_entry(pl, head, node) { + if (!prog_list_prog(pl)) + continue; + cnt++; + if (pl->is_prio_topdown) + (*topdown_cnt) += 1; + } + return cnt; +} + /* if parent has non-overridable prog attached, * disallow attaching new programs to the descendent cgroup. * if parent has overridable or multi-prog, allow attaching @@ -423,12 +438,13 @@ static int compute_effective_progs(struct cgroup *cgrp, struct bpf_prog_array *progs; struct bpf_prog_list *pl; struct cgroup *p = cgrp; - int cnt = 0; + int i, cnt = 0, topdown_cnt = 0, fstart, bstart, init_bstart; /* count number of effective programs by walking parents */ do { if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) - cnt += prog_list_length(&p->bpf.progs[atype]); + cnt += prog_list_length_with_topdown_cnt(&p->bpf.progs[atype], + &topdown_cnt); p = cgroup_parent(p); } while (p); @@ -439,20 +455,34 @@ static int compute_effective_progs(struct cgroup *cgrp, /* populate the array with effective progs */ cnt = 0; p = cgrp; + fstart = topdown_cnt; + bstart = topdown_cnt - 1; do { if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) continue; + init_bstart = bstart; hlist_for_each_entry(pl, &p->bpf.progs[atype], node) { if (!prog_list_prog(pl)) continue; - item = &progs->items[cnt]; + if (!pl->is_prio_topdown) { + item = &progs->items[fstart]; + fstart++; + } else { + item = &progs->items[bstart]; + bstart--; + } item->prog = prog_list_prog(pl); bpf_cgroup_storages_assign(item->cgroup_storage, pl->storage); cnt++; } + + /* reverse topdown priority progs ordering at this cgroup level */ + for (i = 0; i < (init_bstart - bstart)/2; i++) + swap(progs->items[init_bstart - i], progs->items[bstart + 1 + i]); + } while ((p = cgroup_parent(p))); *array = progs; @@ -698,6 +728,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, pl->prog = prog; pl->link = link; + pl->is_prio_topdown = !!(flags & BPF_F_PRIO_TOPDOWN); bpf_cgroup_storages_assign(pl->storage, storage); cgrp->bpf.flags[atype] = saved_flags; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c420edbfb7c8..2711f6769263 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4168,7 +4168,8 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, #define BPF_F_ATTACH_MASK_BASE \ (BPF_F_ALLOW_OVERRIDE | \ BPF_F_ALLOW_MULTI | \ - BPF_F_REPLACE) + BPF_F_REPLACE | \ + BPF_F_PRIO_TOPDOWN) #define BPF_F_ATTACH_MASK_MPROG \ (BPF_F_REPLACE | \ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2acf9b336371..de962fbb7c4c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1207,6 +1207,7 @@ enum bpf_perf_event_type { #define BPF_F_BEFORE (1U << 3) #define BPF_F_AFTER (1U << 4) #define BPF_F_ID (1U << 5) +#define BPF_F_PRIO_TOPDOWN (1U << 6) #define BPF_F_LINK BPF_F_LINK /* 1 << 13 */ /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the -- 2.43.5