Re: [PATCH RFC 1/4] bpf: add cgroup device guard to flag a cgroup device prog

Michael Weiß <michael.weiss@xxxxxxxxxxxxxxxxxxx> · Thu, 17 Aug 2023 17:50:29 +0200

On 14.08.23 17:54, Alexander Mikhalitsyn wrote:
> On Mon, Aug 14, 2023 at 4:32 PM Michael Weiß
> <michael.weiss@xxxxxxxxxxxxxxxxxxx> wrote:
>>
>> Introduce the BPF_F_CGROUP_DEVICE_GUARD flag for BPF_PROG_LOAD
>> which allows to set a cgroup device program to be a device guard.
>> Later this may be used to guard actions on device nodes in
>> non-initial userns. For this reason we provide the helper function
>> cgroup_bpf_device_guard_enabled() to check if a task has a cgroups
>> device program which is a device guard in its effective set of bpf
>> programs.
>>
>> Signed-off-by: Michael Weiß <michael.weiss@xxxxxxxxxxxxxxxxxxx>
> 
> Hi Michael!
> 
> Thanks for working on this. It's also very useful for the LXC system
> containers project.
> 
>> ---
>>  include/linux/bpf-cgroup.h     |  7 +++++++
>>  include/linux/bpf.h            |  1 +
>>  include/uapi/linux/bpf.h       |  5 +++++
>>  kernel/bpf/cgroup.c            | 30 ++++++++++++++++++++++++++++++
>>  kernel/bpf/syscall.c           |  5 ++++-
>>  tools/include/uapi/linux/bpf.h |  5 +++++
>>  6 files changed, 52 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
>> index 57e9e109257e..112b6093f9fd 100644
>> --- a/include/linux/bpf-cgroup.h
>> +++ b/include/linux/bpf-cgroup.h
>> @@ -184,6 +184,8 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
>>         return array != &bpf_empty_prog_array.hdr;
>>  }
>>
>> +bool cgroup_bpf_device_guard_enabled(struct task_struct *task);
>> +
>>  /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
>>  #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)                            \
>>  ({                                                                           \
>> @@ -476,6 +478,11 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
>>         return 0;
>>  }
>>
>> +static bool cgroup_bpf_device_guard_enabled(struct task_struct *task)
>> +{
>> +       return false;
>> +}
>> +
>>  #define cgroup_bpf_enabled(atype) (0)
>>  #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) ({ 0; })
>>  #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype) ({ 0; })
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index f58895830ada..313cce8aee05 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -1384,6 +1384,7 @@ struct bpf_prog_aux {
>>         bool sleepable;
>>         bool tail_call_reachable;
>>         bool xdp_has_frags;
>> +       bool cgroup_device_guard;
>>         /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
>>         const struct btf_type *attach_func_proto;
>>         /* function name for valid attach_btf_id */
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 60a9d59beeab..3be57f7957b1 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -1165,6 +1165,11 @@ enum bpf_link_type {
>>   */
>>  #define BPF_F_XDP_DEV_BOUND_ONLY       (1U << 6)
>>
>> +/* If BPF_F_CGROUP_DEVICE_GUARD is used in BPF_PROG_LOAD command, the loaded
>> + * program will be allowed to guard device access inside user namespaces.
>> + */
>> +#define BPF_F_CGROUP_DEVICE_GUARD      (1U << 7)
>> +
>>  /* link_create.kprobe_multi.flags used in LINK_CREATE command for
>>   * BPF_TRACE_KPROBE_MULTI attach type to create return probe.
>>   */
>> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
>> index 5b2741aa0d9b..230693ca4cdb 100644
>> --- a/kernel/bpf/cgroup.c
>> +++ b/kernel/bpf/cgroup.c
>> @@ -2505,6 +2505,36 @@ const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
>>  const struct bpf_prog_ops cg_sockopt_prog_ops = {
>>  };
>>
>> +bool
>> +cgroup_bpf_device_guard_enabled(struct task_struct *task)
>> +{
>> +       bool ret;
>> +       const struct bpf_prog_array *array;
>> +       const struct bpf_prog_array_item *item;
>> +       const struct bpf_prog *prog;
>> +       struct cgroup *cgrp = task_dfl_cgroup(task);
>> +
>> +       ret = false;
>> +
>> +       array = rcu_access_pointer(cgrp->bpf.effective[CGROUP_DEVICE]);
>> +       if (array == &bpf_empty_prog_array.hdr)
>> +               return ret;
>> +
>> +       mutex_lock(&cgroup_mutex);
> 
> -> cgroup_lock();
> 
>> +       array = rcu_dereference_protected(cgrp->bpf.effective[CGROUP_DEVICE],
>> +                                             lockdep_is_held(&cgroup_mutex));
>> +       item = &array->items[0];
>> +       while ((prog = READ_ONCE(item->prog))) {
>> +               if (prog->aux->cgroup_device_guard) {
>> +                       ret = true;
>> +                       break;
>> +               }
>> +               item++;
>> +       }
>> +       mutex_unlock(&cgroup_mutex);
> 
> Don't we want to make this function specific to "current" task? This
> allows to make locking lighter like in
> __cgroup_bpf_check_dev_permission
> https://github.com/torvalds/linux/blob/2ccdd1b13c591d306f0401d98dedc4bdcd02b421/kernel/bpf/cgroup.c#L1531
> Here we have only RCU read lock.
> 
> AFAIK, cgroup_mutex is a heavily-contended lock.

Seems legit. So definitely we should do that. Thanks.

Cheers,
Michael
> 
> Kind regards,
> Alex
> 
>> +       return ret;
>> +}
>> +
>>  /* Common helpers for cgroup hooks. */
>>  const struct bpf_func_proto *
>>  cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index a2aef900519c..33ea67c702c1 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -2564,7 +2564,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
>>                                  BPF_F_SLEEPABLE |
>>                                  BPF_F_TEST_RND_HI32 |
>>                                  BPF_F_XDP_HAS_FRAGS |
>> -                                BPF_F_XDP_DEV_BOUND_ONLY))
>> +                                BPF_F_XDP_DEV_BOUND_ONLY |
>> +                                BPF_F_CGROUP_DEVICE_GUARD))
>>                 return -EINVAL;
>>
>>         if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
>> @@ -2651,6 +2652,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
>>         prog->aux->dev_bound = !!attr->prog_ifindex;
>>         prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
>>         prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
>> +       prog->aux->cgroup_device_guard =
>> +               attr->prog_flags & BPF_F_CGROUP_DEVICE_GUARD;
>>
>>         err = security_bpf_prog_alloc(prog->aux);
>>         if (err)
>> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
>> index 60a9d59beeab..3be57f7957b1 100644
>> --- a/tools/include/uapi/linux/bpf.h
>> +++ b/tools/include/uapi/linux/bpf.h
>> @@ -1165,6 +1165,11 @@ enum bpf_link_type {
>>   */
>>  #define BPF_F_XDP_DEV_BOUND_ONLY       (1U << 6)
>>
>> +/* If BPF_F_CGROUP_DEVICE_GUARD is used in BPF_PROG_LOAD command, the loaded
>> + * program will be allowed to guard device access inside user namespaces.
>> + */
>> +#define BPF_F_CGROUP_DEVICE_GUARD      (1U << 7)
>> +
>>  /* link_create.kprobe_multi.flags used in LINK_CREATE command for
>>   * BPF_TRACE_KPROBE_MULTI attach type to create return probe.
>>   */
>>
>> --
>> 2.30.2
>>