On 14.08.23 17:54, Alexander Mikhalitsyn wrote: > On Mon, Aug 14, 2023 at 4:32 PM Michael Weiß > <michael.weiss@xxxxxxxxxxxxxxxxxxx> wrote: >> >> Introduce the BPF_F_CGROUP_DEVICE_GUARD flag for BPF_PROG_LOAD >> which allows to set a cgroup device program to be a device guard. >> Later this may be used to guard actions on device nodes in >> non-initial userns. For this reason we provide the helper function >> cgroup_bpf_device_guard_enabled() to check if a task has a cgroups >> device program which is a device guard in its effective set of bpf >> programs. >> >> Signed-off-by: Michael Weiß <michael.weiss@xxxxxxxxxxxxxxxxxxx> > > Hi Michael! > > Thanks for working on this. It's also very useful for the LXC system > containers project. > >> --- >> include/linux/bpf-cgroup.h | 7 +++++++ >> include/linux/bpf.h | 1 + >> include/uapi/linux/bpf.h | 5 +++++ >> kernel/bpf/cgroup.c | 30 ++++++++++++++++++++++++++++++ >> kernel/bpf/syscall.c | 5 ++++- >> tools/include/uapi/linux/bpf.h | 5 +++++ >> 6 files changed, 52 insertions(+), 1 deletion(-) >> >> diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h >> index 57e9e109257e..112b6093f9fd 100644 >> --- a/include/linux/bpf-cgroup.h >> +++ b/include/linux/bpf-cgroup.h >> @@ -184,6 +184,8 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk, >> return array != &bpf_empty_prog_array.hdr; >> } >> >> +bool cgroup_bpf_device_guard_enabled(struct task_struct *task); >> + >> /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */ >> #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ >> ({ \ >> @@ -476,6 +478,11 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, >> return 0; >> } >> >> +static bool cgroup_bpf_device_guard_enabled(struct task_struct *task) >> +{ >> + return false; >> +} >> + >> #define cgroup_bpf_enabled(atype) (0) >> #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) ({ 0; }) >> #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype) ({ 0; }) >> diff --git a/include/linux/bpf.h b/include/linux/bpf.h >> index f58895830ada..313cce8aee05 100644 >> --- a/include/linux/bpf.h >> +++ b/include/linux/bpf.h >> @@ -1384,6 +1384,7 @@ struct bpf_prog_aux { >> bool sleepable; >> bool tail_call_reachable; >> bool xdp_has_frags; >> + bool cgroup_device_guard; >> /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ >> const struct btf_type *attach_func_proto; >> /* function name for valid attach_btf_id */ >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h >> index 60a9d59beeab..3be57f7957b1 100644 >> --- a/include/uapi/linux/bpf.h >> +++ b/include/uapi/linux/bpf.h >> @@ -1165,6 +1165,11 @@ enum bpf_link_type { >> */ >> #define BPF_F_XDP_DEV_BOUND_ONLY (1U << 6) >> >> +/* If BPF_F_CGROUP_DEVICE_GUARD is used in BPF_PROG_LOAD command, the loaded >> + * program will be allowed to guard device access inside user namespaces. >> + */ >> +#define BPF_F_CGROUP_DEVICE_GUARD (1U << 7) >> + >> /* link_create.kprobe_multi.flags used in LINK_CREATE command for >> * BPF_TRACE_KPROBE_MULTI attach type to create return probe. >> */ >> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c >> index 5b2741aa0d9b..230693ca4cdb 100644 >> --- a/kernel/bpf/cgroup.c >> +++ b/kernel/bpf/cgroup.c >> @@ -2505,6 +2505,36 @@ const struct bpf_verifier_ops cg_sockopt_verifier_ops = { >> const struct bpf_prog_ops cg_sockopt_prog_ops = { >> }; >> >> +bool >> +cgroup_bpf_device_guard_enabled(struct task_struct *task) >> +{ >> + bool ret; >> + const struct bpf_prog_array *array; >> + const struct bpf_prog_array_item *item; >> + const struct bpf_prog *prog; >> + struct cgroup *cgrp = task_dfl_cgroup(task); >> + >> + ret = false; >> + >> + array = rcu_access_pointer(cgrp->bpf.effective[CGROUP_DEVICE]); >> + if (array == &bpf_empty_prog_array.hdr) >> + return ret; >> + >> + mutex_lock(&cgroup_mutex); > > -> cgroup_lock(); > >> + array = rcu_dereference_protected(cgrp->bpf.effective[CGROUP_DEVICE], >> + lockdep_is_held(&cgroup_mutex)); >> + item = &array->items[0]; >> + while ((prog = READ_ONCE(item->prog))) { >> + if (prog->aux->cgroup_device_guard) { >> + ret = true; >> + break; >> + } >> + item++; >> + } >> + mutex_unlock(&cgroup_mutex); > > Don't we want to make this function specific to "current" task? This > allows to make locking lighter like in > __cgroup_bpf_check_dev_permission > https://github.com/torvalds/linux/blob/2ccdd1b13c591d306f0401d98dedc4bdcd02b421/kernel/bpf/cgroup.c#L1531 > Here we have only RCU read lock. > > AFAIK, cgroup_mutex is a heavily-contended lock. Seems legit. So definitely we should do that. Thanks. Cheers, Michael > > Kind regards, > Alex > >> + return ret; >> +} >> + >> /* Common helpers for cgroup hooks. */ >> const struct bpf_func_proto * >> cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) >> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c >> index a2aef900519c..33ea67c702c1 100644 >> --- a/kernel/bpf/syscall.c >> +++ b/kernel/bpf/syscall.c >> @@ -2564,7 +2564,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) >> BPF_F_SLEEPABLE | >> BPF_F_TEST_RND_HI32 | >> BPF_F_XDP_HAS_FRAGS | >> - BPF_F_XDP_DEV_BOUND_ONLY)) >> + BPF_F_XDP_DEV_BOUND_ONLY | >> + BPF_F_CGROUP_DEVICE_GUARD)) >> return -EINVAL; >> >> if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && >> @@ -2651,6 +2652,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) >> prog->aux->dev_bound = !!attr->prog_ifindex; >> prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE; >> prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS; >> + prog->aux->cgroup_device_guard = >> + attr->prog_flags & BPF_F_CGROUP_DEVICE_GUARD; >> >> err = security_bpf_prog_alloc(prog->aux); >> if (err) >> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h >> index 60a9d59beeab..3be57f7957b1 100644 >> --- a/tools/include/uapi/linux/bpf.h >> +++ b/tools/include/uapi/linux/bpf.h >> @@ -1165,6 +1165,11 @@ enum bpf_link_type { >> */ >> #define BPF_F_XDP_DEV_BOUND_ONLY (1U << 6) >> >> +/* If BPF_F_CGROUP_DEVICE_GUARD is used in BPF_PROG_LOAD command, the loaded >> + * program will be allowed to guard device access inside user namespaces. >> + */ >> +#define BPF_F_CGROUP_DEVICE_GUARD (1U << 7) >> + >> /* link_create.kprobe_multi.flags used in LINK_CREATE command for >> * BPF_TRACE_KPROBE_MULTI attach type to create return probe. >> */ >> >> -- >> 2.30.2 >>