Re: [PATCH net-next 1/3] bpf, seccomp: Add eBPF filter capabilities

Sargun Dhillon <sargun@xxxxxxxxx> · Fri, 16 Feb 2018 22:29:32 -0800



On Tue, Feb 13, 2018 at 12:34 PM, Kees Cook <keescook@xxxxxxxxxxxx> wrote:
> On Tue, Feb 13, 2018 at 7:42 AM, Sargun Dhillon <sargun@xxxxxxxxx> wrote:
>> From: Sargun Dhillon <sargun@xxxxxxxxxxx>
>>
>> This introduces the BPF_PROG_TYPE_SECCOMP bpf program type. It is meant
>> to be used for seccomp filters as an alternative to cBPF filters. The
>> program type has relatively limited capabilities in terms of helpers,
>> but that can be extended later on.
>>
>> It also introduces a new mechanism to attach these filters via the
>> prctl and seccomp syscalls -- SECCOMP_MODE_FILTER_EXTENDED, and
>> SECCOMP_SET_MODE_FILTER_EXTENDED respectively.
>>
>> Signed-off-by: Sargun Dhillon <sargun@xxxxxxxxx>
>> ---
>>  arch/Kconfig                 |   7 ++
>>  include/linux/bpf_types.h    |   3 +
>>  include/uapi/linux/bpf.h     |   2 +
>>  include/uapi/linux/seccomp.h |  15 +++--
>>  kernel/bpf/syscall.c         |   1 +
>>  kernel/seccomp.c             | 148 +++++++++++++++++++++++++++++++++++++------
>>  6 files changed, 150 insertions(+), 26 deletions(-)
>>
>> diff --git a/arch/Kconfig b/arch/Kconfig
>> index 76c0b54443b1..db675888577c 100644
>> --- a/arch/Kconfig
>> +++ b/arch/Kconfig
>> @@ -401,6 +401,13 @@ config SECCOMP_FILTER
>>
>>           See Documentation/prctl/seccomp_filter.txt for details.
>>
>> +config SECCOMP_FILTER_EXTENDED
>> +       bool "Extended BPF seccomp filters"
>> +       depends on SECCOMP_FILTER && BPF_SYSCALL
>> +       help
>> +         Enables seccomp filters to be written in eBPF, as opposed
>> +         to just cBPF filters.
>> +
>>  config HAVE_GCC_PLUGINS
>>         bool
>>         help
>> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
>> index 19b8349a3809..945c65c4e461 100644
>> --- a/include/linux/bpf_types.h
>> +++ b/include/linux/bpf_types.h
>> @@ -22,6 +22,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event)
>>  #ifdef CONFIG_CGROUP_BPF
>>  BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
>>  #endif
>> +#ifdef CONFIG_SECCOMP_FILTER_EXTENDED
>> +BPF_PROG_TYPE(BPF_PROG_TYPE_SECCOMP, seccomp)
>> +#endif
>>
>>  BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
>>  BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index db6bdc375126..5f96cb7ed954 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -1,3 +1,4 @@
>> +
>>  /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
>>  /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
>>   *
>> @@ -133,6 +134,7 @@ enum bpf_prog_type {
>>         BPF_PROG_TYPE_SOCK_OPS,
>>         BPF_PROG_TYPE_SK_SKB,
>>         BPF_PROG_TYPE_CGROUP_DEVICE,
>> +       BPF_PROG_TYPE_SECCOMP,
>>  };
>>
>>  enum bpf_attach_type {
>> diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
>> index 2a0bd9dd104d..7da8b39f2a6a 100644
>> --- a/include/uapi/linux/seccomp.h
>> +++ b/include/uapi/linux/seccomp.h
>> @@ -7,14 +7,17 @@
>>
>>
>>  /* Valid values for seccomp.mode and prctl(PR_SET_SECCOMP, <mode>) */
>> -#define SECCOMP_MODE_DISABLED  0 /* seccomp is not in use. */
>> -#define SECCOMP_MODE_STRICT    1 /* uses hard-coded filter. */
>> -#define SECCOMP_MODE_FILTER    2 /* uses user-supplied filter. */
>> +#define SECCOMP_MODE_DISABLED          0 /* seccomp is not in use. */
>> +#define SECCOMP_MODE_STRICT            1 /* uses hard-coded filter. */
>> +#define SECCOMP_MODE_FILTER            2 /* uses user-supplied filter. */
>> +#define SECCOMP_MODE_FILTER_EXTENDED   3 /* uses eBPF filter from fd */
>
> This MODE flag isn't needed: it's still using a filter, and the
> interface changes should be sufficient with
> SECCOMP_SET_MODE_FILTER_EXTENDED below.
>
>>  /* Valid operations for seccomp syscall. */
>> -#define SECCOMP_SET_MODE_STRICT                0
>> -#define SECCOMP_SET_MODE_FILTER                1
>> -#define SECCOMP_GET_ACTION_AVAIL       2
>> +#define SECCOMP_SET_MODE_STRICT                        0
>> +#define SECCOMP_SET_MODE_FILTER                        1
>> +#define SECCOMP_GET_ACTION_AVAIL               2
>> +#define SECCOMP_SET_MODE_FILTER_EXTENDED       3
>
> It seems like this should be a FILTER flag, not a syscall op change?
>
>> +
>>
>>  /* Valid flags for SECCOMP_SET_MODE_FILTER */
>>  #define SECCOMP_FILTER_FLAG_TSYNC      1
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index e24aa3241387..86d6ec8b916d 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -1202,6 +1202,7 @@ static int bpf_prog_load(union bpf_attr *attr)
>>
>>         if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
>>             type != BPF_PROG_TYPE_CGROUP_SKB &&
>> +           type != BPF_PROG_TYPE_SECCOMP &&
>>             !capable(CAP_SYS_ADMIN))
>>                 return -EPERM;
>
> So only init_ns-CAP_SYS_ADMIN would be able to use seccomp eBPF?
>
No, this is specifically so non-init CAP_SYS_ADMIN cal load BPF
filters that are either socket_filter, cgroup_skb, or seccomp.

>> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
>> index 940fa408a288..b30dd25c1cb8 100644
>> --- a/kernel/seccomp.c
>> +++ b/kernel/seccomp.c
>> @@ -37,6 +37,7 @@
>>  #include <linux/security.h>
>>  #include <linux/tracehook.h>
>>  #include <linux/uaccess.h>
>> +#include <linux/bpf.h>
>>
>>  /**
>>   * struct seccomp_filter - container for seccomp BPF programs
>> @@ -367,17 +368,6 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
>>
>>         BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));
>>
>> -       /*
>> -        * Installing a seccomp filter requires that the task has
>> -        * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
>> -        * This avoids scenarios where unprivileged tasks can affect the
>> -        * behavior of privileged children.
>> -        */
>> -       if (!task_no_new_privs(current) &&
>> -           security_capable_noaudit(current_cred(), current_user_ns(),
>> -                                    CAP_SYS_ADMIN) != 0)
>> -               return ERR_PTR(-EACCES);
>> -
>>         /* Allocate a new seccomp_filter */
>>         sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN);
>>         if (!sfilter)
>> @@ -423,6 +413,48 @@ seccomp_prepare_user_filter(const char __user *user_filter)
>>         return filter;
>>  }
>>
>> +#ifdef CONFIG_SECCOMP_FILTER_EXTENDED
>> +/**
>> + * seccomp_prepare_extended_filter - prepares a user-supplied eBPF fd
>> + * @user_filter: pointer to the user data containing an fd.
>> + *
>> + * Returns 0 on success and non-zero otherwise.
>> + */
>> +static struct seccomp_filter *
>> +seccomp_prepare_extended_filter(const char __user *user_fd)
>> +{
>> +       struct seccomp_filter *sfilter;
>> +       struct bpf_prog *fp;
>> +       int fd;
>> +
>> +       /* Fetch the fd from userspace */
>> +       if (get_user(fd, (int __user *)user_fd))
>> +               return ERR_PTR(-EFAULT);
>> +
>> +       /* Allocate a new seccomp_filter */
>> +       sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN);
>> +       if (!sfilter)
>> +               return ERR_PTR(-ENOMEM);
>> +
>> +       fp = bpf_prog_get_type(fd, BPF_PROG_TYPE_SECCOMP);
>> +       if (IS_ERR(fp)) {
>> +               kfree(sfilter);
>> +               return ERR_CAST(fp);
>> +       }
>> +
>> +       sfilter->prog = fp;
>> +       refcount_set(&sfilter->usage, 1);
>> +
>> +       return sfilter;
>> +}
>> +#else
>> +static struct seccomp_filter *
>> +seccomp_prepare_extended_filter(const char __user *filter_fd)
>> +{
>> +       return ERR_PTR(-EINVAL);
>> +}
>> +#endif
>> +
>>  /**
>>   * seccomp_attach_filter: validate and attach filter
>>   * @flags:  flags to change filter behavior
>> @@ -492,7 +524,10 @@ void get_seccomp_filter(struct task_struct *tsk)
>>  static inline void seccomp_filter_free(struct seccomp_filter *filter)
>>  {
>>         if (filter) {
>> -               bpf_prog_destroy(filter->prog);
>> +               if (bpf_prog_was_classic(filter->prog))
>> +                       bpf_prog_destroy(filter->prog);
>> +               else
>> +                       bpf_prog_put(filter->prog);
>>                 kfree(filter);
>>         }
>>  }
>> @@ -842,18 +877,36 @@ static long seccomp_set_mode_strict(void)
>>   * Returns 0 on success or -EINVAL on failure.
>>   */
>>  static long seccomp_set_mode_filter(unsigned int flags,
>> -                                   const char __user *filter)
>> +                                   const char __user *filter,
>> +                                   unsigned long filter_type)
>
> I think this can just live in flags?
>
>>  {
>> -       const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
>> +       /* We use SECCOMP_MODE_FILTER for both eBPF and cBPF filters */
>> +       const unsigned long filter_mode = SECCOMP_MODE_FILTER;
>>         struct seccomp_filter *prepared = NULL;
>>         long ret = -EINVAL;
>>
>>         /* Validate flags. */
>>         if (flags & ~SECCOMP_FILTER_FLAG_MASK)
>>                 return -EINVAL;
>> +       /*
>> +        * Installing a seccomp filter requires that the task has
>> +        * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
>> +        * This avoids scenarios where unprivileged tasks can affect the
>> +        * behavior of privileged children.
>> +        */
>> +       if (!task_no_new_privs(current) &&
>> +           security_capable_noaudit(current_cred(), current_user_ns(),
>> +                                    CAP_SYS_ADMIN) != 0)
>> +               return -EACCES;
>
> This changes the order of checks -- before, too-large filters would
> get EINVAL even if they lacked the needed permissions. As long as this
> doesn't break anything in the real world, it should be fine, but I
> might want to instead create a perm-check function and just call it in
> both functions. (And likely write a self-test that checks this order,
> if it doesn't already exist.)
>
>>
>>         /* Prepare the new filter before holding any locks. */
>> -       prepared = seccomp_prepare_user_filter(filter);
>> +       if (filter_type == SECCOMP_SET_MODE_FILTER_EXTENDED)
>> +               prepared = seccomp_prepare_extended_filter(filter);
>> +       else if (filter_type == SECCOMP_SET_MODE_FILTER)
>> +               prepared = seccomp_prepare_user_filter(filter);
>> +       else
>> +               return -EINVAL;
>> +
>>         if (IS_ERR(prepared))
>>                 return PTR_ERR(prepared);
>>
>> @@ -867,7 +920,7 @@ static long seccomp_set_mode_filter(unsigned int flags,
>>
>>         spin_lock_irq(&current->sighand->siglock);
>>
>> -       if (!seccomp_may_assign_mode(seccomp_mode))
>> +       if (!seccomp_may_assign_mode(filter_mode))
>>                 goto out;
>>
>>         ret = seccomp_attach_filter(flags, prepared);
>> @@ -876,7 +929,7 @@ static long seccomp_set_mode_filter(unsigned int flags,
>>         /* Do not free the successfully attached filter. */
>>         prepared = NULL;
>>
>> -       seccomp_assign_mode(current, seccomp_mode);
>> +       seccomp_assign_mode(current, filter_mode);
>
> With a filter flag, the above hunks don't need to be changed, for example.
>
>>  out:
>>         spin_unlock_irq(&current->sighand->siglock);
>>         if (flags & SECCOMP_FILTER_FLAG_TSYNC)
>> @@ -926,7 +979,9 @@ static long do_seccomp(unsigned int op, unsigned int flags,
>>                         return -EINVAL;
>>                 return seccomp_set_mode_strict();
>>         case SECCOMP_SET_MODE_FILTER:
>> -               return seccomp_set_mode_filter(flags, uargs);
>> +               return seccomp_set_mode_filter(flags, uargs, op);
>> +       case SECCOMP_SET_MODE_FILTER_EXTENDED:
>> +               return seccomp_set_mode_filter(flags, uargs, op);
>
> And this isn't needed, since it would be passed as a flag.
>
>>         case SECCOMP_GET_ACTION_AVAIL:
>>                 if (flags != 0)
>>                         return -EINVAL;
>> @@ -969,6 +1024,10 @@ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
>>                 op = SECCOMP_SET_MODE_FILTER;
>>                 uargs = filter;
>>                 break;
>> +       case SECCOMP_MODE_FILTER_EXTENDED:
>> +               op = SECCOMP_SET_MODE_FILTER_EXTENDED;
>> +               uargs = filter;
>> +               break;
>
> Same.
>
>>         default:
>>                 return -EINVAL;
>>         }
>> @@ -1040,8 +1099,7 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
>>         if (IS_ERR(filter))
>>                 return PTR_ERR(filter);
>>
>> -       fprog = filter->prog->orig_prog;
>> -       if (!fprog) {
>> +       if (!bpf_prog_was_classic(filter->prog)) {
>>                 /* This must be a new non-cBPF filter, since we save
>>                  * every cBPF filter's orig_prog above when
>>                  * CONFIG_CHECKPOINT_RESTORE is enabled.
>> @@ -1050,6 +1108,7 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
>>                 goto out;
>>         }
>>
>> +       fprog = filter->prog->orig_prog;
>
> I wonder if it would be easier to review to split eBPF install from
> the eBPF "get filter" changes as separate patches?
>
Yes, will respin. Thanks for your feedback. I appreciate the quick review.

>>         ret = fprog->len;
>>         if (!data)
>>                 goto out;
>> @@ -1239,6 +1298,55 @@ static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write,
>>         return 0;
>>  }
>>
>> +#ifdef CONFIG_SECCOMP_FILTER_EXTENDED
>> +static bool seccomp_is_valid_access(int off, int size,
>> +                                   enum bpf_access_type type,
>> +                                   struct bpf_insn_access_aux *info)
>> +{
>> +       if (type != BPF_READ)
>> +               return false;
>> +
>> +       if (off < 0 || off + size > sizeof(struct seccomp_data))
>> +               return false;
>> +
>> +       switch (off) {
>> +       case bpf_ctx_range_till(struct seccomp_data, args[0], args[5]):
>> +               return (size == sizeof(__u64));
>> +       case bpf_ctx_range(struct seccomp_data, nr):
>> +               return (size == FIELD_SIZEOF(struct seccomp_data, nr));
>> +       case bpf_ctx_range(struct seccomp_data, arch):
>> +               return (size == FIELD_SIZEOF(struct seccomp_data, arch));
>> +       case bpf_ctx_range(struct seccomp_data, instruction_pointer):
>> +               return (size == FIELD_SIZEOF(struct seccomp_data,
>> +                                            instruction_pointer));
>> +       }
>> +
>> +       return false;
>> +}
>> +
>> +static const struct bpf_func_proto *
>> +seccomp_func_proto(enum bpf_func_id func_id)
>> +{
>> +       switch (func_id) {
>> +       case BPF_FUNC_get_current_uid_gid:
>> +               return &bpf_get_current_uid_gid_proto;
>> +       case BPF_FUNC_trace_printk:
>> +               if (capable(CAP_SYS_ADMIN))
>> +                       return bpf_get_trace_printk_proto();
>> +       default:
>> +               return NULL;
>> +       }
>> +}
>
> This makes me so uncomfortable. :) Why is uid/gid needed? Why add
> printk support here? (And why is it CAP_SYS_ADMIN checked if the
> entire filter is CAP_SYS_ADMIN checked before being attached?)
>
See comment above. Anyone can load filters. You can load the filter as
a normal user, drop privliged, and install the filter later with
cap_sys_admin, or no_new_privs.
>> +
>> +const struct bpf_prog_ops seccomp_prog_ops = {
>> +};
>> +
>> +const struct bpf_verifier_ops seccomp_verifier_ops = {
>> +       .get_func_proto         = seccomp_func_proto,
>> +       .is_valid_access        = seccomp_is_valid_access,
>> +};
>> +#endif /* CONFIG_SECCOMP_FILTER_EXTENDED */
>> +
>>  static struct ctl_path seccomp_sysctl_path[] = {
>>         { .procname = "kernel", },
>>         { .procname = "seccomp", },
>> --
>> 2.14.1
>>
>
> -Kees
>
> --
> Kees Cook
> Pixel Security
_______________________________________________
Containers mailing list
Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx
https://lists.linuxfoundation.org/mailman/listinfo/containers