On Tue, Feb 13, 2018 at 7:42 AM, Sargun Dhillon <sargun@xxxxxxxxx> wrote: > From: Sargun Dhillon <sargun@xxxxxxxxxxx> > > This introduces the BPF_PROG_TYPE_SECCOMP bpf program type. It is meant > to be used for seccomp filters as an alternative to cBPF filters. The > program type has relatively limited capabilities in terms of helpers, > but that can be extended later on. > > It also introduces a new mechanism to attach these filters via the > prctl and seccomp syscalls -- SECCOMP_MODE_FILTER_EXTENDED, and > SECCOMP_SET_MODE_FILTER_EXTENDED respectively. > > Signed-off-by: Sargun Dhillon <sargun@xxxxxxxxx> > --- > arch/Kconfig | 7 ++ > include/linux/bpf_types.h | 3 + > include/uapi/linux/bpf.h | 2 + > include/uapi/linux/seccomp.h | 15 +++-- > kernel/bpf/syscall.c | 1 + > kernel/seccomp.c | 148 +++++++++++++++++++++++++++++++++++++------ > 6 files changed, 150 insertions(+), 26 deletions(-) > > diff --git a/arch/Kconfig b/arch/Kconfig > index 76c0b54443b1..db675888577c 100644 > --- a/arch/Kconfig > +++ b/arch/Kconfig > @@ -401,6 +401,13 @@ config SECCOMP_FILTER > > See Documentation/prctl/seccomp_filter.txt for details. > > +config SECCOMP_FILTER_EXTENDED > + bool "Extended BPF seccomp filters" > + depends on SECCOMP_FILTER && BPF_SYSCALL > + help > + Enables seccomp filters to be written in eBPF, as opposed > + to just cBPF filters. > + > config HAVE_GCC_PLUGINS > bool > help > diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h > index 19b8349a3809..945c65c4e461 100644 > --- a/include/linux/bpf_types.h > +++ b/include/linux/bpf_types.h > @@ -22,6 +22,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event) > #ifdef CONFIG_CGROUP_BPF > BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) > #endif > +#ifdef CONFIG_SECCOMP_FILTER_EXTENDED > +BPF_PROG_TYPE(BPF_PROG_TYPE_SECCOMP, seccomp) > +#endif > > BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) > BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index db6bdc375126..5f96cb7ed954 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -1,3 +1,4 @@ > + > /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ > /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com > * > @@ -133,6 +134,7 @@ enum bpf_prog_type { > BPF_PROG_TYPE_SOCK_OPS, > BPF_PROG_TYPE_SK_SKB, > BPF_PROG_TYPE_CGROUP_DEVICE, > + BPF_PROG_TYPE_SECCOMP, > }; > > enum bpf_attach_type { > diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h > index 2a0bd9dd104d..7da8b39f2a6a 100644 > --- a/include/uapi/linux/seccomp.h > +++ b/include/uapi/linux/seccomp.h > @@ -7,14 +7,17 @@ > > > /* Valid values for seccomp.mode and prctl(PR_SET_SECCOMP, <mode>) */ > -#define SECCOMP_MODE_DISABLED 0 /* seccomp is not in use. */ > -#define SECCOMP_MODE_STRICT 1 /* uses hard-coded filter. */ > -#define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */ > +#define SECCOMP_MODE_DISABLED 0 /* seccomp is not in use. */ > +#define SECCOMP_MODE_STRICT 1 /* uses hard-coded filter. */ > +#define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */ > +#define SECCOMP_MODE_FILTER_EXTENDED 3 /* uses eBPF filter from fd */ This MODE flag isn't needed: it's still using a filter, and the interface changes should be sufficient with SECCOMP_SET_MODE_FILTER_EXTENDED below. > /* Valid operations for seccomp syscall. */ > -#define SECCOMP_SET_MODE_STRICT 0 > -#define SECCOMP_SET_MODE_FILTER 1 > -#define SECCOMP_GET_ACTION_AVAIL 2 > +#define SECCOMP_SET_MODE_STRICT 0 > +#define SECCOMP_SET_MODE_FILTER 1 > +#define SECCOMP_GET_ACTION_AVAIL 2 > +#define SECCOMP_SET_MODE_FILTER_EXTENDED 3 It seems like this should be a FILTER flag, not a syscall op change? > + > > /* Valid flags for SECCOMP_SET_MODE_FILTER */ > #define SECCOMP_FILTER_FLAG_TSYNC 1 > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > index e24aa3241387..86d6ec8b916d 100644 > --- a/kernel/bpf/syscall.c > +++ b/kernel/bpf/syscall.c > @@ -1202,6 +1202,7 @@ static int bpf_prog_load(union bpf_attr *attr) > > if (type != BPF_PROG_TYPE_SOCKET_FILTER && > type != BPF_PROG_TYPE_CGROUP_SKB && > + type != BPF_PROG_TYPE_SECCOMP && > !capable(CAP_SYS_ADMIN)) > return -EPERM; So only init_ns-CAP_SYS_ADMIN would be able to use seccomp eBPF? > diff --git a/kernel/seccomp.c b/kernel/seccomp.c > index 940fa408a288..b30dd25c1cb8 100644 > --- a/kernel/seccomp.c > +++ b/kernel/seccomp.c > @@ -37,6 +37,7 @@ > #include <linux/security.h> > #include <linux/tracehook.h> > #include <linux/uaccess.h> > +#include <linux/bpf.h> > > /** > * struct seccomp_filter - container for seccomp BPF programs > @@ -367,17 +368,6 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) > > BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter)); > > - /* > - * Installing a seccomp filter requires that the task has > - * CAP_SYS_ADMIN in its namespace or be running with no_new_privs. > - * This avoids scenarios where unprivileged tasks can affect the > - * behavior of privileged children. > - */ > - if (!task_no_new_privs(current) && > - security_capable_noaudit(current_cred(), current_user_ns(), > - CAP_SYS_ADMIN) != 0) > - return ERR_PTR(-EACCES); > - > /* Allocate a new seccomp_filter */ > sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN); > if (!sfilter) > @@ -423,6 +413,48 @@ seccomp_prepare_user_filter(const char __user *user_filter) > return filter; > } > > +#ifdef CONFIG_SECCOMP_FILTER_EXTENDED > +/** > + * seccomp_prepare_extended_filter - prepares a user-supplied eBPF fd > + * @user_filter: pointer to the user data containing an fd. > + * > + * Returns 0 on success and non-zero otherwise. > + */ > +static struct seccomp_filter * > +seccomp_prepare_extended_filter(const char __user *user_fd) > +{ > + struct seccomp_filter *sfilter; > + struct bpf_prog *fp; > + int fd; > + > + /* Fetch the fd from userspace */ > + if (get_user(fd, (int __user *)user_fd)) > + return ERR_PTR(-EFAULT); > + > + /* Allocate a new seccomp_filter */ > + sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN); > + if (!sfilter) > + return ERR_PTR(-ENOMEM); > + > + fp = bpf_prog_get_type(fd, BPF_PROG_TYPE_SECCOMP); > + if (IS_ERR(fp)) { > + kfree(sfilter); > + return ERR_CAST(fp); > + } > + > + sfilter->prog = fp; > + refcount_set(&sfilter->usage, 1); > + > + return sfilter; > +} > +#else > +static struct seccomp_filter * > +seccomp_prepare_extended_filter(const char __user *filter_fd) > +{ > + return ERR_PTR(-EINVAL); > +} > +#endif > + > /** > * seccomp_attach_filter: validate and attach filter > * @flags: flags to change filter behavior > @@ -492,7 +524,10 @@ void get_seccomp_filter(struct task_struct *tsk) > static inline void seccomp_filter_free(struct seccomp_filter *filter) > { > if (filter) { > - bpf_prog_destroy(filter->prog); > + if (bpf_prog_was_classic(filter->prog)) > + bpf_prog_destroy(filter->prog); > + else > + bpf_prog_put(filter->prog); > kfree(filter); > } > } > @@ -842,18 +877,36 @@ static long seccomp_set_mode_strict(void) > * Returns 0 on success or -EINVAL on failure. > */ > static long seccomp_set_mode_filter(unsigned int flags, > - const char __user *filter) > + const char __user *filter, > + unsigned long filter_type) I think this can just live in flags? > { > - const unsigned long seccomp_mode = SECCOMP_MODE_FILTER; > + /* We use SECCOMP_MODE_FILTER for both eBPF and cBPF filters */ > + const unsigned long filter_mode = SECCOMP_MODE_FILTER; > struct seccomp_filter *prepared = NULL; > long ret = -EINVAL; > > /* Validate flags. */ > if (flags & ~SECCOMP_FILTER_FLAG_MASK) > return -EINVAL; > + /* > + * Installing a seccomp filter requires that the task has > + * CAP_SYS_ADMIN in its namespace or be running with no_new_privs. > + * This avoids scenarios where unprivileged tasks can affect the > + * behavior of privileged children. > + */ > + if (!task_no_new_privs(current) && > + security_capable_noaudit(current_cred(), current_user_ns(), > + CAP_SYS_ADMIN) != 0) > + return -EACCES; This changes the order of checks -- before, too-large filters would get EINVAL even if they lacked the needed permissions. As long as this doesn't break anything in the real world, it should be fine, but I might want to instead create a perm-check function and just call it in both functions. (And likely write a self-test that checks this order, if it doesn't already exist.) > > /* Prepare the new filter before holding any locks. */ > - prepared = seccomp_prepare_user_filter(filter); > + if (filter_type == SECCOMP_SET_MODE_FILTER_EXTENDED) > + prepared = seccomp_prepare_extended_filter(filter); > + else if (filter_type == SECCOMP_SET_MODE_FILTER) > + prepared = seccomp_prepare_user_filter(filter); > + else > + return -EINVAL; > + > if (IS_ERR(prepared)) > return PTR_ERR(prepared); > > @@ -867,7 +920,7 @@ static long seccomp_set_mode_filter(unsigned int flags, > > spin_lock_irq(¤t->sighand->siglock); > > - if (!seccomp_may_assign_mode(seccomp_mode)) > + if (!seccomp_may_assign_mode(filter_mode)) > goto out; > > ret = seccomp_attach_filter(flags, prepared); > @@ -876,7 +929,7 @@ static long seccomp_set_mode_filter(unsigned int flags, > /* Do not free the successfully attached filter. */ > prepared = NULL; > > - seccomp_assign_mode(current, seccomp_mode); > + seccomp_assign_mode(current, filter_mode); With a filter flag, the above hunks don't need to be changed, for example. > out: > spin_unlock_irq(¤t->sighand->siglock); > if (flags & SECCOMP_FILTER_FLAG_TSYNC) > @@ -926,7 +979,9 @@ static long do_seccomp(unsigned int op, unsigned int flags, > return -EINVAL; > return seccomp_set_mode_strict(); > case SECCOMP_SET_MODE_FILTER: > - return seccomp_set_mode_filter(flags, uargs); > + return seccomp_set_mode_filter(flags, uargs, op); > + case SECCOMP_SET_MODE_FILTER_EXTENDED: > + return seccomp_set_mode_filter(flags, uargs, op); And this isn't needed, since it would be passed as a flag. > case SECCOMP_GET_ACTION_AVAIL: > if (flags != 0) > return -EINVAL; > @@ -969,6 +1024,10 @@ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) > op = SECCOMP_SET_MODE_FILTER; > uargs = filter; > break; > + case SECCOMP_MODE_FILTER_EXTENDED: > + op = SECCOMP_SET_MODE_FILTER_EXTENDED; > + uargs = filter; > + break; Same. > default: > return -EINVAL; > } > @@ -1040,8 +1099,7 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, > if (IS_ERR(filter)) > return PTR_ERR(filter); > > - fprog = filter->prog->orig_prog; > - if (!fprog) { > + if (!bpf_prog_was_classic(filter->prog)) { > /* This must be a new non-cBPF filter, since we save > * every cBPF filter's orig_prog above when > * CONFIG_CHECKPOINT_RESTORE is enabled. > @@ -1050,6 +1108,7 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, > goto out; > } > > + fprog = filter->prog->orig_prog; I wonder if it would be easier to review to split eBPF install from the eBPF "get filter" changes as separate patches? > ret = fprog->len; > if (!data) > goto out; > @@ -1239,6 +1298,55 @@ static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write, > return 0; > } > > +#ifdef CONFIG_SECCOMP_FILTER_EXTENDED > +static bool seccomp_is_valid_access(int off, int size, > + enum bpf_access_type type, > + struct bpf_insn_access_aux *info) > +{ > + if (type != BPF_READ) > + return false; > + > + if (off < 0 || off + size > sizeof(struct seccomp_data)) > + return false; > + > + switch (off) { > + case bpf_ctx_range_till(struct seccomp_data, args[0], args[5]): > + return (size == sizeof(__u64)); > + case bpf_ctx_range(struct seccomp_data, nr): > + return (size == FIELD_SIZEOF(struct seccomp_data, nr)); > + case bpf_ctx_range(struct seccomp_data, arch): > + return (size == FIELD_SIZEOF(struct seccomp_data, arch)); > + case bpf_ctx_range(struct seccomp_data, instruction_pointer): > + return (size == FIELD_SIZEOF(struct seccomp_data, > + instruction_pointer)); > + } > + > + return false; > +} > + > +static const struct bpf_func_proto * > +seccomp_func_proto(enum bpf_func_id func_id) > +{ > + switch (func_id) { > + case BPF_FUNC_get_current_uid_gid: > + return &bpf_get_current_uid_gid_proto; > + case BPF_FUNC_trace_printk: > + if (capable(CAP_SYS_ADMIN)) > + return bpf_get_trace_printk_proto(); > + default: > + return NULL; > + } > +} This makes me so uncomfortable. :) Why is uid/gid needed? Why add printk support here? (And why is it CAP_SYS_ADMIN checked if the entire filter is CAP_SYS_ADMIN checked before being attached?) > + > +const struct bpf_prog_ops seccomp_prog_ops = { > +}; > + > +const struct bpf_verifier_ops seccomp_verifier_ops = { > + .get_func_proto = seccomp_func_proto, > + .is_valid_access = seccomp_is_valid_access, > +}; > +#endif /* CONFIG_SECCOMP_FILTER_EXTENDED */ > + > static struct ctl_path seccomp_sysctl_path[] = { > { .procname = "kernel", }, > { .procname = "seccomp", }, > -- > 2.14.1 > -Kees -- Kees Cook Pixel Security _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linuxfoundation.org/mailman/listinfo/containers