On Fri, Sep 11, 2015 at 02:37:59PM +0200, Daniel Borkmann wrote: > On 09/11/2015 02:21 AM, Tycho Andersen wrote: > >This is the final bit needed to support seccomp filters created via the bpf > >syscall. The patch adds a new seccomp operation SECCOMP_MODE_FILTER_EBPF, > >which takes exactly one command (presumably to be expanded upon later when > >seccomp EBPFs support more interesting things) and an argument struct > >similar to that of bpf(), although the size is explicit in the struct to > >avoid changing the signature of seccomp(). > > > >v2: Don't abuse seccomp's third argument; use a separate command and a > > pointer to a structure instead. > > Comments below ... > > >Signed-off-by: Tycho Andersen <tycho.andersen@xxxxxxxxxxxxx> > >CC: Kees Cook <keescook@xxxxxxxxxxxx> > >CC: Will Drewry <wad@xxxxxxxxxxxx> > >CC: Oleg Nesterov <oleg@xxxxxxxxxx> > >CC: Andy Lutomirski <luto@xxxxxxxxxxxxxx> > >CC: Pavel Emelyanov <xemul@xxxxxxxxxxxxx> > >CC: Serge E. Hallyn <serge.hallyn@xxxxxxxxxx> > >CC: Alexei Starovoitov <ast@xxxxxxxxxx> > >CC: Daniel Borkmann <daniel@xxxxxxxxxxxxx> > >--- > > include/uapi/linux/seccomp.h | 16 +++++ > > kernel/seccomp.c | 135 ++++++++++++++++++++++++++++++++++++++----- > > 2 files changed, 138 insertions(+), 13 deletions(-) > > > >diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h > >index 0f238a4..a8694e2 100644 > >--- a/include/uapi/linux/seccomp.h > >+++ b/include/uapi/linux/seccomp.h > >@@ -13,10 +13,14 @@ > > /* Valid operations for seccomp syscall. */ > > #define SECCOMP_SET_MODE_STRICT 0 > > #define SECCOMP_SET_MODE_FILTER 1 > >+#define SECCOMP_MODE_FILTER_EBPF 2 > > Should this be SECCOMP_SET_MODE_FILTER_EBPF or just SECCOMP_SET_MODE_EBPF? I just stole the name Kees gave it in the previous thread, but I think that perhaps there are other plans for manipulating seccomp ebpfs (?). The command is SECCOMP_EBPF_ADD_FD, so it seems like we could add a command like SECCOMP_EBPF_SOMETHING in the future. 
> > /* Valid flags for SECCOMP_SET_MODE_FILTER */ > > #define SECCOMP_FILTER_FLAG_TSYNC 1 > > > >+/* Valid cmds for SECCOMP_MODE_FILTER_EBPF */ > >+#define SECCOMP_EBPF_ADD_FD 0 > >+ > > /* > > * All BPF programs must return a 32-bit value. > > * The bottom 16-bits are for optional return data. > >@@ -51,4 +55,16 @@ struct seccomp_data { > > __u64 args[6]; > > }; > > > >+struct seccomp_ebpf { > >+ unsigned int size; > >+ > >+ union { > >+ /* SECCOMP_EBPF_ADD_FD */ > >+ struct { > >+ unsigned int add_flags; > >+ __u32 add_fd; > >+ }; > >+ }; > >+}; > >+ > > #endif /* _UAPI_LINUX_SECCOMP_H */ > >diff --git a/kernel/seccomp.c b/kernel/seccomp.c > >index 1856f69..e78175a 100644 > >--- a/kernel/seccomp.c > >+++ b/kernel/seccomp.c > >@@ -65,6 +65,9 @@ struct seccomp_filter { > > /* Limit any path through the tree to 256KB worth of instructions. */ > > #define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter)) > > > >+static long seccomp_install_filter(unsigned int flags, > >+ struct seccomp_filter *prepared); > >+ > > /* > > * Endianness is explicitly ignored and left for BPF program authors to manage > > * as per the specific architecture. > >@@ -356,17 +359,6 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) > > > > BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter)); > > > >- /* > >- * Installing a seccomp filter requires that the task has > >- * CAP_SYS_ADMIN in its namespace or be running with no_new_privs. > >- * This avoids scenarios where unprivileged tasks can affect the > >- * behavior of privileged children. 
> >- */ > >- if (!task_no_new_privs(current) && > >- security_capable_noaudit(current_cred(), current_user_ns(), > >- CAP_SYS_ADMIN) != 0) > >- return ERR_PTR(-EACCES); > >- > > /* Allocate a new seccomp_filter */ > > sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN); > > if (!sfilter) > >@@ -510,8 +502,105 @@ static void seccomp_send_sigsys(int syscall, int reason) > > info.si_syscall = syscall; > > force_sig_info(SIGSYS, &info, current); > > } > >+ > > #endif /* CONFIG_SECCOMP_FILTER */ > > > >+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SECCOMP_FILTER) > >+static struct seccomp_filter *seccomp_prepare_ebpf(int fd) > >+{ > >+ struct seccomp_filter *ret; > >+ struct bpf_prog *prog; > >+ > >+ prog = bpf_prog_get(fd); > >+ if (IS_ERR(prog)) > >+ return (struct seccomp_filter *) prog; > > ERR_CAST() > > >+ > >+ if (prog->type != BPF_PROG_TYPE_SECCOMP) { > >+ bpf_prog_put(prog); > >+ return ERR_PTR(-EINVAL); > >+ } > >+ > >+ ret = kzalloc(sizeof(*ret), GFP_KERNEL | __GFP_NOWARN); > >+ if (!ret) { > >+ bpf_prog_put(prog); > >+ return ERR_PTR(-ENOMEM); > >+ } > >+ > >+ ret->prog = prog; > >+ atomic_set(&ret->usage, 1); > >+ > >+ /* Intentionally don't bpf_prog_put() here, because the underlying prog > >+ * is refcounted too and we're holding a reference from the struct > >+ * seccomp_filter object. 
> >+ */ > >+ return ret; > >+} > >+ > >+static long seccomp_ebpf_add_fd(struct seccomp_ebpf *ebpf) > >+{ > >+ struct seccomp_filter *prepared; > >+ > >+ prepared = seccomp_prepare_ebpf(ebpf->add_fd); > >+ if (IS_ERR(prepared)) > >+ return PTR_ERR(prepared); > >+ > >+ return seccomp_install_filter(ebpf->add_flags, prepared); > >+} > >+ > >+static long seccomp_mode_filter_ebpf(unsigned int cmd, const char __user *uargs) > >+{ > >+ const struct seccomp_ebpf __user *uebpf; > >+ struct seccomp_ebpf ebpf; > >+ unsigned int size; > >+ long ret = -EFAULT; > >+ > >+ uebpf = (const struct seccomp_ebpf __user *) uargs; > >+ > >+ if (get_user(size, &uebpf->size) != 0) > >+ return -EFAULT; > >+ > >+ /* If we're handed a bigger struct than we know of, > >+ * ensure all the unknown bits are 0 - i.e. new > >+ * user-space does not rely on any kernel feature > >+ * extensions we dont know about yet. > >+ */ > >+ if (size > sizeof(ebpf)) { > >+ unsigned char __user *addr; > >+ unsigned char __user *end; > >+ unsigned char val; > >+ > >+ addr = (void __user *)uebpf + sizeof(ebpf); > >+ end = (void __user *)uebpf + size; > >+ > >+ for (; addr < end; addr++) { > >+ int err = get_user(val, addr); > >+ > >+ if (err) > >+ return err; > >+ if (val) > >+ return -E2BIG; > >+ } > >+ size = sizeof(ebpf); > >+ } > >+ > >+ if (copy_from_user(&ebpf, uebpf, size) != 0) > >+ return -EFAULT; > > Not sure it's worth adding all this bpf(2)-alike interface complexity into > this, but fair enough, I guess there are some very good reasons and bigger > additions coming then ... I'm not sure what bigger additions are coming, although it seems Andy might have something. I think this is just an attempt to future proof things. 
> >+ switch (cmd) { > >+ case SECCOMP_EBPF_ADD_FD: > >+ ret = seccomp_ebpf_add_fd(&ebpf); > >+ break; > >+ } > >+ > >+ return ret; > >+} > >+#else > >+static long seccomp_mode_filter_ebpf(unsigned int cmd, const char __user *uargs) > >+{ > >+ return -EINVAL; > >+} > >+#endif > >+ > > /* > > * Secure computing mode 1 allows only read/write/exit/sigreturn. > > * To be fully secure this must be combined with rlimit > >@@ -760,9 +849,7 @@ out: > > static long seccomp_set_mode_filter(unsigned int flags, > > const char __user *filter) > > { > >- const unsigned long seccomp_mode = SECCOMP_MODE_FILTER; > > struct seccomp_filter *prepared = NULL; > >- long ret = -EINVAL; > > > > /* Validate flags. */ > > if (flags & ~SECCOMP_FILTER_FLAG_MASK) > >@@ -773,6 +860,26 @@ static long seccomp_set_mode_filter(unsigned int flags, > > if (IS_ERR(prepared)) > > return PTR_ERR(prepared); > > > >+ return seccomp_install_filter(flags, prepared); > > I (truly) hope, I'm overlooking something ;) ... > > ... but why do all the (classic) seccomp-BPF preparation work (which is rather > a lot) up to this point, where you have it ready, only to *then* find out we don't > have the actual permissions ?! Yes, this seems dumb. I was trying to avoid having the check in two places, but that's probably what's necessary. > Plus, when seccomp_install_filter() fails with -EACCES, who is releasing all the > allocated foo resp. dropping taken program refs !? Yes, seccomp_install_filter is /supposed/ to free things if the install fails, although it doesn't in the permissions case because of the copy-paste error, doh. > I see the same in seccomp_ebpf_add_fd(). Same as above, seccomp_install_filter is supposed to call seccomp_filter_free in case of an error, but it doesn't. Thanks for the look. I'll make the changes for the next set. 
Tycho > So, an unprivileged child could increase the parent's bpf_prog's reference count > w/o having the actual permissions to do so, and thus controlling it to the point > where the next bpf_prog_put() would unintentionally release it? > > (So yeah, I'm hoping I misread something ... ;)) > > >+} > >+ > >+static long seccomp_install_filter(unsigned int flags, > >+ struct seccomp_filter *prepared) > >+{ > >+ const unsigned long seccomp_mode = SECCOMP_MODE_FILTER; > >+ long ret = -EINVAL; > >+ > >+ /* > >+ * Installing a seccomp filter requires that the task has > >+ * CAP_SYS_ADMIN in its namespace or be running with no_new_privs. > >+ * This avoids scenarios where unprivileged tasks can affect the > >+ * behavior of privileged children. > >+ */ > >+ if (!task_no_new_privs(current) && > >+ security_capable_noaudit(current_cred(), current_user_ns(), > >+ CAP_SYS_ADMIN) != 0) > >+ return -EACCES; > >+ > > /* > > * Make sure we cannot change seccomp or nnp state via TSYNC > > * while another thread is in the middle of calling exec. > >@@ -875,6 +982,8 @@ static long do_seccomp(unsigned int op, unsigned int flags, > > return seccomp_set_mode_strict(); > > case SECCOMP_SET_MODE_FILTER: > > return seccomp_set_mode_filter(flags, uargs); > >+ case SECCOMP_MODE_FILTER_EBPF: > >+ return seccomp_mode_filter_ebpf(flags, uargs); > > default: > > return -EINVAL; > > } > > > -- To unsubscribe from this list: send the line "unsubscribe linux-api" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html