On 11 September 2015 at 02:21, Tycho Andersen <tycho.andersen@xxxxxxxxxxxxx> wrote: > This is the final bit needed to support seccomp filters created via the bpf > syscall. The patch adds a new seccomp operation SECCOMP_MODE_FILTER_EBPF, > which takes exactly one command (presumably to be expanded upon later when > seccomp EBPFs support more interesting things) and an argument struct > similar to that of bpf(), although the size is explicit in the struct to > avoid changing the signature of seccomp(). > > v2: Don't abuse seccomp's third argument; use a separate command and a > pointer to a structure instead. Hi Tycho, Here, I'm entering broken record territory :-). Seems like a man-pages patch is warranted here also? Cheers, Michael > Signed-off-by: Tycho Andersen <tycho.andersen@xxxxxxxxxxxxx> > CC: Kees Cook <keescook@xxxxxxxxxxxx> > CC: Will Drewry <wad@xxxxxxxxxxxx> > CC: Oleg Nesterov <oleg@xxxxxxxxxx> > CC: Andy Lutomirski <luto@xxxxxxxxxxxxxx> > CC: Pavel Emelyanov <xemul@xxxxxxxxxxxxx> > CC: Serge E. Hallyn <serge.hallyn@xxxxxxxxxx> > CC: Alexei Starovoitov <ast@xxxxxxxxxx> > CC: Daniel Borkmann <daniel@xxxxxxxxxxxxx> > --- > include/uapi/linux/seccomp.h | 16 +++++ > kernel/seccomp.c | 135 ++++++++++++++++++++++++++++++++++++++----- > 2 files changed, 138 insertions(+), 13 deletions(-) > > diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h > index 0f238a4..a8694e2 100644 > --- a/include/uapi/linux/seccomp.h > +++ b/include/uapi/linux/seccomp.h > @@ -13,10 +13,14 @@ > /* Valid operations for seccomp syscall. */ > #define SECCOMP_SET_MODE_STRICT 0 > #define SECCOMP_SET_MODE_FILTER 1 > +#define SECCOMP_MODE_FILTER_EBPF 2 > > /* Valid flags for SECCOMP_SET_MODE_FILTER */ > #define SECCOMP_FILTER_FLAG_TSYNC 1 > > +/* Valid cmds for SECCOMP_MODE_FILTER_EBPF */ > +#define SECCOMP_EBPF_ADD_FD 0 > + > /* > * All BPF programs must return a 32-bit value. > * The bottom 16-bits are for optional return data. > @@ -51,4 +55,16 @@ struct seccomp_data { > __u64 args[6]; > }; > > +struct seccomp_ebpf { > + unsigned int size; > + > + union { > + /* SECCOMP_EBPF_ADD_FD */ > + struct { > + unsigned int add_flags; > + __u32 add_fd; > + }; > + }; > +}; > + > #endif /* _UAPI_LINUX_SECCOMP_H */ > diff --git a/kernel/seccomp.c b/kernel/seccomp.c > index 1856f69..e78175a 100644 > --- a/kernel/seccomp.c > +++ b/kernel/seccomp.c > @@ -65,6 +65,9 @@ struct seccomp_filter { > /* Limit any path through the tree to 256KB worth of instructions. */ > #define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter)) > > +static long seccomp_install_filter(unsigned int flags, > + struct seccomp_filter *prepared); > + > /* > * Endianness is explicitly ignored and left for BPF program authors to manage > * as per the specific architecture. > @@ -356,17 +359,6 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) > > BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter)); > > - /* > - * Installing a seccomp filter requires that the task has > - * CAP_SYS_ADMIN in its namespace or be running with no_new_privs. > - * This avoids scenarios where unprivileged tasks can affect the > - * behavior of privileged children. > - */ > - if (!task_no_new_privs(current) && > - security_capable_noaudit(current_cred(), current_user_ns(), > - CAP_SYS_ADMIN) != 0) > - return ERR_PTR(-EACCES); > - > /* Allocate a new seccomp_filter */ > sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN); > if (!sfilter) > @@ -510,8 +502,105 @@ static void seccomp_send_sigsys(int syscall, int reason) > info.si_syscall = syscall; > force_sig_info(SIGSYS, &info, current); > } > + > #endif /* CONFIG_SECCOMP_FILTER */ > > +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SECCOMP_FILTER) > +static struct seccomp_filter *seccomp_prepare_ebpf(int fd) > +{ > + struct seccomp_filter *ret; > + struct bpf_prog *prog; > + > + prog = bpf_prog_get(fd); > + if (IS_ERR(prog)) > + return (struct seccomp_filter *) prog; > + > + if (prog->type != BPF_PROG_TYPE_SECCOMP) { > + bpf_prog_put(prog); > + return ERR_PTR(-EINVAL); > + } > + > + ret = kzalloc(sizeof(*ret), GFP_KERNEL | __GFP_NOWARN); > + if (!ret) { > + bpf_prog_put(prog); > + return ERR_PTR(-ENOMEM); > + } > + > + ret->prog = prog; > + atomic_set(&ret->usage, 1); > + > + /* Intentionally don't bpf_prog_put() here, because the underlying prog > + * is refcounted too and we're holding a reference from the struct > + * seccomp_filter object. > + */ > + return ret; > +} > + > +static long seccomp_ebpf_add_fd(struct seccomp_ebpf *ebpf) > +{ > + struct seccomp_filter *prepared; > + > + prepared = seccomp_prepare_ebpf(ebpf->add_fd); > + if (IS_ERR(prepared)) > + return PTR_ERR(prepared); > + > + return seccomp_install_filter(ebpf->add_flags, prepared); > +} > + > +static long seccomp_mode_filter_ebpf(unsigned int cmd, const char __user *uargs) > +{ > + const struct seccomp_ebpf __user *uebpf; > + struct seccomp_ebpf ebpf; > + unsigned int size; > + long ret = -EFAULT; > + > + uebpf = (const struct seccomp_ebpf __user *) uargs; > + > + if (get_user(size, &uebpf->size) != 0) > + return -EFAULT; > + > + /* If we're handed a bigger struct than we know of, > + * ensure all the unknown bits are 0 - i.e. new > + * user-space does not rely on any kernel feature > + * extensions we dont know about yet. > + */ > + if (size > sizeof(ebpf)) { > + unsigned char __user *addr; > + unsigned char __user *end; > + unsigned char val; > + > + addr = (void __user *)uebpf + sizeof(ebpf); > + end = (void __user *)uebpf + size; > + > + for (; addr < end; addr++) { > + int err = get_user(val, addr); > + > + if (err) > + return err; > + if (val) > + return -E2BIG; > + } > + size = sizeof(ebpf); > + } > + > + if (copy_from_user(&ebpf, uebpf, size) != 0) > + return -EFAULT; > + > + switch (cmd) { > + case SECCOMP_EBPF_ADD_FD: > + ret = seccomp_ebpf_add_fd(&ebpf); > + break; > + } > + > + return ret; > +} > +#else > +static long seccomp_mode_filter_ebpf(unsigned int cmd, const char __user *uargs) > +{ > + return -EINVAL; > +} > +#endif > + > /* > * Secure computing mode 1 allows only read/write/exit/sigreturn. > * To be fully secure this must be combined with rlimit > @@ -760,9 +849,7 @@ out: > static long seccomp_set_mode_filter(unsigned int flags, > const char __user *filter) > { > - const unsigned long seccomp_mode = SECCOMP_MODE_FILTER; > struct seccomp_filter *prepared = NULL; > - long ret = -EINVAL; > > /* Validate flags. */ > if (flags & ~SECCOMP_FILTER_FLAG_MASK) > @@ -773,6 +860,26 @@ static long seccomp_set_mode_filter(unsigned int flags, > if (IS_ERR(prepared)) > return PTR_ERR(prepared); > > + return seccomp_install_filter(flags, prepared); > +} > + > +static long seccomp_install_filter(unsigned int flags, > + struct seccomp_filter *prepared) > +{ > + const unsigned long seccomp_mode = SECCOMP_MODE_FILTER; > + long ret = -EINVAL; > + > + /* > + * Installing a seccomp filter requires that the task has > + * CAP_SYS_ADMIN in its namespace or be running with no_new_privs. > + * This avoids scenarios where unprivileged tasks can affect the > + * behavior of privileged children. > + */ > + if (!task_no_new_privs(current) && > + security_capable_noaudit(current_cred(), current_user_ns(), > + CAP_SYS_ADMIN) != 0) > + return -EACCES; > + > /* > * Make sure we cannot change seccomp or nnp state via TSYNC > * while another thread is in the middle of calling exec. > @@ -875,6 +982,8 @@ static long do_seccomp(unsigned int op, unsigned int flags, > return seccomp_set_mode_strict(); > case SECCOMP_SET_MODE_FILTER: > return seccomp_set_mode_filter(flags, uargs); > + case SECCOMP_MODE_FILTER_EBPF: > + return seccomp_mode_filter_ebpf(flags, uargs); > default: > return -EINVAL; > } > -- > 2.1.4 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-api" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html -- Michael Kerrisk Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/ Linux/UNIX System Programming Training: http://man7.org/training/ -- To unsubscribe from this list: send the line "unsubscribe linux-api" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html