On Thu, Jan 12, 2012 at 2:53 AM, Serge Hallyn <serge.hallyn@xxxxxxxxxxxxx> wrote: > Quoting Will Drewry (wad@xxxxxxxxxxxx): >> This patch adds support for seccomp mode 2. This mode enables dynamic >> enforcement of system call filtering policy in the kernel as specified >> by a userland task. The policy is expressed in terms of a BPF program, >> as is used for userland-exposed socket filtering. Instead of network >> data, the BPF program is evaluated over struct user_regs_struct at the >> time of the system call (as retrieved using regviews). >> >> A filter program may be installed by a userland task by calling >> prctl(PR_ATTACH_SECCOMP_FILTER, &fprog); >> where fprog is of type struct sock_fprog. >> >> If the first filter program allows subsequent prctl(2) calls, then >> additional filter programs may be attached. All attached programs >> must be evaluated before a system call will be allowed to proceed. >> >> To avoid CONFIG_COMPAT related landmines, once a filter program is >> installed using specific is_compat_task() and current->personality, it >> is not allowed to make system calls or attach additional filters which >> use a different combination of is_compat_task() and >> current->personality. >> >> Filter programs may _only_ cross the execve(2) barrier if last filter >> program was attached by a task with CAP_SYS_ADMIN capabilities in its >> user namespace. Once a task-local filter program is attached from a >> process without privileges, execve will fail. This ensures that only >> privileged parent task can affect its privileged children (e.g., setuid >> binary). >> >> There are a number of benefits to this approach. A few of which are >> as follows: >> - BPF has been exposed to userland for a long time. >> - Userland already knows its ABI: expected register layout and system >> call numbers. 
>> - Full register information is provided which may be relevant for >> certain syscalls (fork, rt_sigreturn) or for other userland >> filtering tactics (checking the PC). >> - No time-of-check-time-of-use vulnerable data accesses are possible. >> >> This patch includes its own BPF evaluator, but relies on the >> net/core/filter.c BPF checking code. It is possible to share >> evaluators, but the performance sensitive nature of the network >> filtering path makes it an iterative optimization which (I think :) can >> be tackled separately via separate patchsets. (And at some point sharing >> BPF JIT code!) >> >> Signed-off-by: Will Drewry <wad@xxxxxxxxxxxx> > > Hey Will, > > A few comments below, but otherwise > > Acked-by: Serge Hallyn <serge.hallyn@xxxxxxxxxxxxx> Thanks! Unimportant responses below. Fixes will be incorporated in the next round (along with Oleg's feedback). cheers, will > thanks, > -serge > >> --- >> fs/exec.c | 5 + >> include/linux/prctl.h | 3 + >> include/linux/seccomp.h | 70 +++++- >> kernel/Makefile | 1 + >> kernel/fork.c | 4 + >> kernel/seccomp.c | 8 + >> kernel/seccomp_filter.c | 639 +++++++++++++++++++++++++++++++++++++++++++++++ >> kernel/sys.c | 4 + >> security/Kconfig | 12 + >> 9 files changed, 743 insertions(+), 3 deletions(-) >> create mode 100644 kernel/seccomp_filter.c >> >> diff --git a/fs/exec.c b/fs/exec.c >> index 3625464..e9cc89c 100644 >> --- a/fs/exec.c >> +++ b/fs/exec.c >> @@ -44,6 +44,7 @@ >> #include <linux/namei.h> >> #include <linux/mount.h> >> #include <linux/security.h> >> +#include <linux/seccomp.h> >> #include <linux/syscalls.h> >> #include <linux/tsacct_kern.h> >> #include <linux/cn_proc.h> >> @@ -1477,6 +1478,10 @@ static int do_execve_common(const char *filename, >> if (retval) >> goto out_ret; >> >> + retval = seccomp_check_exec(); >> + if (retval) >> + goto out_ret; >> + >> retval = -ENOMEM; >> bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); >> if (!bprm) >> diff --git a/include/linux/prctl.h b/include/linux/prctl.h >> 
index a3baeb2..15e2460 100644 >> --- a/include/linux/prctl.h >> +++ b/include/linux/prctl.h >> @@ -64,6 +64,9 @@ >> #define PR_GET_SECCOMP 21 >> #define PR_SET_SECCOMP 22 >> >> +/* Set process seccomp filters */ >> +#define PR_ATTACH_SECCOMP_FILTER 36 >> + >> /* Get/set the capability bounding set (as per security/commoncap.c) */ >> #define PR_CAPBSET_READ 23 >> #define PR_CAPBSET_DROP 24 >> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h >> index cc7a4e9..99d163e 100644 >> --- a/include/linux/seccomp.h >> +++ b/include/linux/seccomp.h >> @@ -5,9 +5,28 @@ >> #ifdef CONFIG_SECCOMP >> >> #include <linux/thread_info.h> >> +#include <linux/types.h> >> #include <asm/seccomp.h> >> >> -typedef struct { int mode; } seccomp_t; >> +struct seccomp_filter; >> +/** >> + * struct seccomp_struct - the state of a seccomp'ed process >> + * >> + * @mode: >> + * if this is 0, seccomp is not in use. >> + * is 1, the process is under standard seccomp rules. >> + * is 2, the process is only allowed to make system calls where >> + * associated filters evaluate successfully. >> + * @filter: Metadata for filter if using CONFIG_SECCOMP_FILTER. >> + * @filter must only be accessed from the context of current as there >> + * is no guard. 
>> + */ >> +typedef struct seccomp_struct { >> + int mode; >> +#ifdef CONFIG_SECCOMP_FILTER >> + struct seccomp_filter *filter; >> +#endif >> +} seccomp_t; >> >> extern void __secure_computing(int); >> static inline void secure_computing(int this_syscall) >> @@ -28,8 +47,7 @@ static inline int seccomp_mode(seccomp_t *s) >> >> #include <linux/errno.h> >> >> -typedef struct { } seccomp_t; >> - >> +typedef struct seccomp_struct { } seccomp_t; >> #define secure_computing(x) do { } while (0) >> >> static inline long prctl_get_seccomp(void) >> @@ -49,4 +67,50 @@ static inline int seccomp_mode(seccomp_t *s) >> >> #endif /* CONFIG_SECCOMP */ >> >> +#ifdef CONFIG_SECCOMP_FILTER >> + >> +#define seccomp_filter_init_task(_tsk) do { \ >> + (_tsk)->seccomp.filter = NULL; \ >> +} while (0); >> + >> +/* No locking is needed here because the task_struct will >> + * have no parallel consumers. >> + */ >> +#define seccomp_filter_free_task(_tsk) do { \ >> + put_seccomp_filter((_tsk)->seccomp.filter); \ >> +} while (0); >> + >> +extern int seccomp_check_exec(void); >> + >> +extern long prctl_attach_seccomp_filter(char __user *); >> + >> +extern struct seccomp_filter *get_seccomp_filter(struct seccomp_filter *); >> +extern void put_seccomp_filter(struct seccomp_filter *); >> + >> +extern int seccomp_test_filters(int); >> +extern void seccomp_filter_log_failure(int); >> +extern void seccomp_filter_fork(struct task_struct *child, >> + struct task_struct *parent); >> + >> +#else /* CONFIG_SECCOMP_FILTER */ >> + >> +#include <linux/errno.h> >> + >> +struct seccomp_filter { }; >> +#define seccomp_filter_init_task(_tsk) do { } while (0); >> +#define seccomp_filter_fork(_tsk, _orig) do { } while (0); >> +#define seccomp_filter_free_task(_tsk) do { } while (0); >> + >> +static inline int seccomp_check_exec(void) >> +{ >> + return 0; >> +} >> + >> + >> +static inline long prctl_attach_seccomp_filter(char __user *a2) >> +{ >> + return -ENOSYS; >> +} >> + >> +#endif /* CONFIG_SECCOMP_FILTER */ >> 
#endif /* _LINUX_SECCOMP_H */ >> diff --git a/kernel/Makefile b/kernel/Makefile >> index e898c5b..0584090 100644 >> --- a/kernel/Makefile >> +++ b/kernel/Makefile >> @@ -79,6 +79,7 @@ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o >> obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o >> obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ >> obj-$(CONFIG_SECCOMP) += seccomp.o >> +obj-$(CONFIG_SECCOMP_FILTER) += seccomp_filter.o >> obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o >> obj-$(CONFIG_TREE_RCU) += rcutree.o >> obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o >> diff --git a/kernel/fork.c b/kernel/fork.c >> index da4a6a1..cc1d628 100644 >> --- a/kernel/fork.c >> +++ b/kernel/fork.c >> @@ -34,6 +34,7 @@ >> #include <linux/cgroup.h> >> #include <linux/security.h> >> #include <linux/hugetlb.h> >> +#include <linux/seccomp.h> >> #include <linux/swap.h> >> #include <linux/syscalls.h> >> #include <linux/jiffies.h> >> @@ -166,6 +167,7 @@ void free_task(struct task_struct *tsk) >> free_thread_info(tsk->stack); >> rt_mutex_debug_task_free(tsk); >> ftrace_graph_exit_task(tsk); >> + seccomp_filter_free_task(tsk); >> free_task_struct(tsk); >> } >> EXPORT_SYMBOL(free_task); >> @@ -1209,6 +1211,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, >> /* Perform scheduler related setup. Assign this task to a CPU. 
*/ >> sched_fork(p); >> >> + seccomp_filter_init_task(p); >> retval = perf_event_init_task(p); >> if (retval) >> goto bad_fork_cleanup_policy; >> @@ -1375,6 +1378,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, >> if (clone_flags & CLONE_THREAD) >> threadgroup_fork_read_unlock(current); >> perf_event_fork(p); >> + seccomp_filter_fork(p, current); >> return p; >> >> bad_fork_free_pid: >> diff --git a/kernel/seccomp.c b/kernel/seccomp.c >> index 57d4b13..78719be 100644 >> --- a/kernel/seccomp.c >> +++ b/kernel/seccomp.c >> @@ -47,6 +47,14 @@ void __secure_computing(int this_syscall) >> return; >> } while (*++syscall); >> break; >> +#ifdef CONFIG_SECCOMP_FILTER >> + case 2: >> + if (seccomp_test_filters(this_syscall) == 0) >> + return; >> + >> + seccomp_filter_log_failure(this_syscall); >> + break; >> +#endif >> default: >> BUG(); >> } >> diff --git a/kernel/seccomp_filter.c b/kernel/seccomp_filter.c >> new file mode 100644 >> index 0000000..4770847 >> --- /dev/null >> +++ b/kernel/seccomp_filter.c >> @@ -0,0 +1,639 @@ >> +/* bpf program-based system call filtering >> + * >> + * This program is free software; you can redistribute it and/or modify >> + * it under the terms of the GNU General Public License as published by >> + * the Free Software Foundation; either version 2 of the License, or >> + * (at your option) any later version. >> + * >> + * This program is distributed in the hope that it will be useful, >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >> + * GNU General Public License for more details. >> + * >> + * You should have received a copy of the GNU General Public License >> + * along with this program; if not, write to the Free Software >> + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
>> + * >> + * Copyright (C) 2011 The Chromium OS Authors <chromium-os-dev@xxxxxxxxxxxx> >> + */ >> + >> +#include <linux/capability.h> >> +#include <linux/compat.h> >> +#include <linux/err.h> >> +#include <linux/errno.h> >> +#include <linux/rculist.h> >> +#include <linux/filter.h> >> +#include <linux/kallsyms.h> >> +#include <linux/kref.h> >> +#include <linux/module.h> >> +#include <linux/pid.h> >> +#include <linux/prctl.h> >> +#include <linux/ptrace.h> >> +#include <linux/ratelimit.h> >> +#include <linux/reciprocal_div.h> >> +#include <linux/regset.h> >> +#include <linux/seccomp.h> >> +#include <linux/security.h> >> +#include <linux/sched.h> >> +#include <linux/slab.h> >> +#include <linux/uaccess.h> >> +#include <linux/user.h> >> + >> + >> +/** >> + * struct seccomp_filter - container for seccomp BPF programs >> + * >> + * @usage: reference count to manage the object lifetime. >> + * get/put helpers should be used when accessing an instance >> + * outside of a lifetime-guarded section. In general, this >> + * is only needed for handling filters shared across tasks. >> + * @creator: pointer to the pid that created this filter >> + * @parent: pointer to the ancestor which this filter will be composed with. >> + * @flags: provide information about filter from creation time. >> + * @personality: personality of the process at filter creation time. >> + * @insns: the BPF program instructions to evaluate >> + * @count: the number of instructions in the program. >> + * >> + * seccomp_filter objects should never be modified after being attached >> + * to a task_struct (other than @usage). 
>> + */ >> +struct seccomp_filter { >> + struct kref usage; >> + struct pid *creator; >> + struct seccomp_filter *parent; >> + struct { >> + uint32_t admin:1, /* can allow execve */ >> + compat:1, /* CONFIG_COMPAT */ >> + __reserved:30; >> + } flags; >> + int personality; >> + unsigned short count; /* Instruction count */ >> + struct sock_filter insns[0]; >> +}; >> + >> +static unsigned int seccomp_run_filter(const u8 *buf, >> + const size_t buflen, >> + const struct sock_filter *); >> + >> +/** >> + * seccomp_filter_alloc - allocates a new filter object >> + * @padding: size of the insns[0] array in bytes >> + * >> + * The @padding should be a multiple of >> + * sizeof(struct sock_filter). >> + * >> + * Returns ERR_PTR on error or an allocated object. >> + */ >> +static struct seccomp_filter *seccomp_filter_alloc(unsigned long padding) >> +{ >> + struct seccomp_filter *f; >> + unsigned long bpf_blocks = padding / sizeof(struct sock_filter); >> + >> + /* Drop oversized requests. */ >> + if (bpf_blocks == 0 || bpf_blocks > BPF_MAXINSNS) >> + return ERR_PTR(-EINVAL); >> + >> + /* Padding should always be in sock_filter increments. */ >> + BUG_ON(padding % sizeof(struct sock_filter)); > > I still think the BUG_ON here is harsh given that the progsize is passed > in by userspace. Was there a reason not to return -EINVAL here? I've changed it in the next revision. As is, I don't believe userspace can control the size of padding directly, just the increment since it specifies its length in terms of bpf blocks (sizeof(struct sock_filter)). But EINVAL is certainly less aggressive :) >> + >> + f = kzalloc(sizeof(struct seccomp_filter) + padding, GFP_KERNEL); >> + if (!f) >> + return ERR_PTR(-ENOMEM); >> + kref_init(&f->usage); >> + f->creator = get_task_pid(current, PIDTYPE_PID); >> + f->count = bpf_blocks; >> + return f; >> +} >> + >> +/** >> + * seccomp_filter_free - frees the allocated filter. >> + * @filter: NULL or live object to be completely destructed. 
>> + */ >> +static void seccomp_filter_free(struct seccomp_filter *filter) >> +{ >> + if (!filter) >> + return; >> + put_seccomp_filter(filter->parent); >> + put_pid(filter->creator); >> + kfree(filter); >> +} >> + >> +static void __put_seccomp_filter(struct kref *kref) >> +{ >> + struct seccomp_filter *orig = >> + container_of(kref, struct seccomp_filter, usage); >> + seccomp_filter_free(orig); >> +} >> + >> +void seccomp_filter_log_failure(int syscall) >> +{ >> + pr_info("%s[%d]: system call %d blocked at 0x%lx\n", >> + current->comm, task_pid_nr(current), syscall, >> + KSTK_EIP(current)); >> +} >> + >> +/* put_seccomp_filter - decrements the ref count of @orig and may free. */ >> +void put_seccomp_filter(struct seccomp_filter *orig) >> +{ >> + if (!orig) >> + return; >> + kref_put(&orig->usage, __put_seccomp_filter); >> +} >> + >> +/* get_seccomp_filter - increments the reference count of @orig. */ >> +struct seccomp_filter *get_seccomp_filter(struct seccomp_filter *orig) >> +{ >> + if (!orig) >> + return NULL; >> + kref_get(&orig->usage); >> + return orig; >> +} >> + >> +static int seccomp_check_personality(struct seccomp_filter *filter) >> +{ >> + if (filter->personality != current->personality) >> + return -EACCES; >> +#ifdef CONFIG_COMPAT >> + if (filter->flags.compat != (!!(is_compat_task()))) >> + return -EACCES; >> +#endif >> + return 0; >> +} >> + >> +static const struct user_regset * >> +find_prstatus(const struct user_regset_view *view) >> +{ >> + const struct user_regset *regset; >> + int n; >> + >> + /* Skip 0. */ >> + for (n = 1; n < view->n; ++n) { >> + regset = view->regsets + n; >> + if (regset->core_note_type == NT_PRSTATUS) >> + return regset; >> + } >> + >> + return NULL; >> +} >> + >> +/** >> + * seccomp_get_regs - returns a pointer to struct user_regs_struct >> + * @scratch: preallocated storage of size @available >> + * @available: pointer to the size of scratch. >> + * >> + * Returns NULL if the registers cannot be acquired or copied. 
>> + * Returns a populated pointer to @scratch by default. >> + * Otherwise, returns a pointer to a a u8 array containing the struct >> + * user_regs_struct appropriate for the task personality. The pointer >> + * may be to the beginning of @scratch or to an externally managed data >> + * structure. On success, @available should be updated with the >> + * valid region size of the returned pointer. >> + * >> + * If the architecture overrides the linkage, then the pointer may pointer to >> + * another location. >> + */ >> +__weak u8 *seccomp_get_regs(u8 *scratch, size_t *available) >> +{ >> + /* regset is usually returned based on task personality, not current >> + * system call convention. This behavior makes it unsafe to execute >> + * BPF programs over regviews if is_compat_task or the personality >> + * have changed since the program was installed. >> + */ >> + const struct user_regset_view *view = task_user_regset_view(current); >> + const struct user_regset *regset = &view->regsets[0]; >> + size_t scratch_size = *available; >> + if (regset->core_note_type != NT_PRSTATUS) { >> + /* The architecture should override this method for speed. */ >> + regset = find_prstatus(view); >> + if (!regset) >> + return NULL; >> + } >> + *available = regset->n * regset->size; >> + /* Make sure the scratch space isn't exceeded. */ >> + if (*available > scratch_size) >> + *available = scratch_size; >> + if (regset->get(current, regset, 0, *available, scratch, NULL)) >> + return NULL; >> + return scratch; >> +} >> + >> +/** >> + * seccomp_test_filters - tests 'current' against the given syscall >> + * @syscall: number of the system call to test >> + * >> + * Returns 0 on ok and non-zero on error/failure. 
>> + */ >> +int seccomp_test_filters(int syscall) >> +{ >> + struct seccomp_filter *filter; >> + u8 regs_tmp[sizeof(struct user_regs_struct)], *regs; >> + size_t regs_size = sizeof(struct user_regs_struct); >> + int ret = -EACCES; >> + >> + filter = current->seccomp.filter; /* uses task ref */ >> + if (!filter) >> + goto out; >> + >> + /* All filters in the list are required to share the same system call >> + * convention so only the first filter is ever checked. >> + */ >> + if (seccomp_check_personality(filter)) >> + goto out; >> + >> + /* Grab the user_regs_struct. Normally, regs == &regs_tmp, but >> + * that is not mandatory. E.g., it may return a point to >> + * task_pt_regs(current). NULL checking is mandatory. >> + */ >> + regs = seccomp_get_regs(regs_tmp, &regs_size); >> + if (!regs) >> + goto out; >> + >> + /* Only allow a system call if it is allowed in all ancestors. */ >> + ret = 0; >> + for ( ; filter != NULL; filter = filter->parent) { >> + /* Allowed if return value is the size of the data supplied. */ >> + if (seccomp_run_filter(regs, regs_size, filter->insns) != >> + regs_size) >> + ret = -EACCES; >> + } >> +out: >> + return ret; >> +} >> + >> +/** >> + * seccomp_attach_filter: Attaches a seccomp filter to current. >> + * @fprog: BPF program to install >> + * >> + * Context: User context only. This function may sleep on allocation and >> + * operates on current. current must be attempting a system call >> + * when this is called (usually prctl). >> + * >> + * This function may be called repeatedly to install additional filters. >> + * Every filter successfully installed will be evaluated (in reverse order) >> + * for each system call the thread makes. >> + * >> + * Returns 0 on success or an errno on failure. >> + */ >> +long seccomp_attach_filter(struct sock_fprog *fprog) >> +{ >> + struct seccomp_filter *filter = NULL; >> + /* Note, len is a short so overflow should be impossible. 
*/ >> + unsigned long fp_size = fprog->len * sizeof(struct sock_filter); >> + long ret = -EPERM; >> + >> + /* Allocate a new seccomp_filter */ >> + filter = seccomp_filter_alloc(fp_size); >> + if (IS_ERR(filter)) { >> + ret = PTR_ERR(filter); >> + goto out; >> + } >> + >> + /* Lock the process personality and calling convention. */ >> +#ifdef CONFIG_COMPAT >> + if (is_compat_task()) >> + filter->flags.compat = 1; >> +#endif >> + filter->personality = current->personality; >> + >> + /* Auditing is not needed since the capability wasn't requested */ >> + if (security_real_capable_noaudit(current, current_user_ns(), >> + CAP_SYS_ADMIN) == 0) >> + filter->flags.admin = 1; >> + >> + /* Copy the instructions from fprog. */ >> + ret = -EFAULT; >> + if (copy_from_user(filter->insns, fprog->filter, fp_size)) >> + goto out; >> + >> + /* Check the fprog */ >> + ret = sk_chk_filter(filter->insns, filter->count); >> + if (ret) >> + goto out; >> + >> + /* If there is an existing filter, make it the parent >> + * and reuse the existing task-based ref. >> + */ >> + filter->parent = current->seccomp.filter; >> + >> + /* Force all filters to use one system call convention. */ >> + ret = -EINVAL; >> + if (filter->parent) { >> + if (filter->parent->flags.compat != filter->flags.compat) >> + goto out; >> + if (filter->parent->personality != filter->personality) >> + goto out; >> + } >> + >> + /* Double claim the new filter so we can release it below simplifying >> + * the error paths earlier. >> + */ >> + ret = 0; >> + get_seccomp_filter(filter); >> + current->seccomp.filter = filter; >> + /* Engage seccomp if it wasn't. This doesn't use PR_SET_SECCOMP. 
*/ >> + if (!current->seccomp.mode) { >> + current->seccomp.mode = 2; >> + set_thread_flag(TIF_SECCOMP); >> + } >> + >> +out: >> + put_seccomp_filter(filter); /* for get or task, on err */ >> + return ret; >> +} >> + >> +long prctl_attach_seccomp_filter(char __user *user_filter) >> +{ >> + struct sock_fprog fprog; >> + long ret = -EINVAL; >> + >> + ret = -EFAULT; >> + if (!user_filter) >> + goto out; >> + >> + if (copy_from_user(&fprog, user_filter, sizeof(fprog))) >> + goto out; >> + >> + ret = seccomp_attach_filter(&fprog); >> +out: >> + return ret; >> +} >> + >> +/** >> + * seccomp_check_exec: determines if exec is allowed for current >> + * Returns 0 if allowed. >> + */ >> +int seccomp_check_exec(void) >> +{ >> + if (current->seccomp.mode != 2) >> + return 0; >> + /* We can rely on the task refcount for the filter. */ >> + if (!current->seccomp.filter) >> + return -EPERM; >> + /* The last attached filter set for the process is checked. It must >> + * have been installed with CAP_SYS_ADMIN capabilities. > > This comment is confusing. By 'It must' you mean that if not, it's > denied. But if I didn't know better I would read that as "we can't > get to this code unless". Can you change it to something like > "Exec is refused unless the filter was installed with CAP_SYS_ADMIN > privilege"? Sounds good! >> + */ >> + if (current->seccomp.filter->flags.admin) >> + return 0; >> + return -EPERM; >> +} >> + >> +/* seccomp_filter_fork: manages inheritance on fork >> + * @child: forkee >> + * @parent: forker >> + * Ensures that @child inherit a seccomp_filter iff seccomp is enabled >> + * and the set of filters is marked as 'enabled'. 
>> + */ >> +void seccomp_filter_fork(struct task_struct *child, >> + struct task_struct *parent) >> +{ >> + if (!parent->seccomp.mode) >> + return; >> + child->seccomp.mode = parent->seccomp.mode; >> + child->seccomp.filter = get_seccomp_filter(parent->seccomp.filter); >> +} >> + >> +/* Returns a pointer to the BPF evaluator after checking the offset and size >> + * boundaries. The signature almost matches the signature from >> + * net/core/filter.c with the hopes of sharing code in the future. >> + */ >> +static const void *load_pointer(const u8 *buf, size_t buflen, >> + int offset, size_t size, >> + void *unused) >> +{ >> + if (offset >= buflen) >> + goto fail; >> + if (offset < 0) >> + goto fail; >> + if (size > buflen - offset) >> + goto fail; >> + return buf + offset; >> +fail: >> + return NULL; >> +} >> + >> +/** >> + * seccomp_run_filter - evaluate BPF (over user_regs_struct) >> + * @buf: buffer to execute the filter over >> + * @buflen: length of the buffer >> + * @fentry: filter to apply >> + * >> + * Decode and apply filter instructions to the buffer. >> + * Return length to keep, 0 for none. @buf is a regset we are >> + * filtering, @filter is the array of filter instructions. >> + * Because all jumps are guaranteed to be before last instruction, >> + * and last instruction guaranteed to be a RET, we dont need to check >> + * flen. >> + * >> + * See core/net/filter.c as this is nearly an exact copy. >> + * At some point, it would be nice to merge them to take advantage of >> + * optimizations (like JIT). >> + * >> + * A successful filter must return the full length of the data. Anything less >> + * will currently result in a seccomp failure. In the future, it may be >> + * possible to use that for hard filtering registers on the fly so it is >> + * ideal for consumers to return 0 on intended failure. 
>> + */ >> +static unsigned int seccomp_run_filter(const u8 *buf, >> + const size_t buflen, >> + const struct sock_filter *fentry) >> +{ >> + const void *ptr; >> + u32 A = 0; /* Accumulator */ >> + u32 X = 0; /* Index Register */ >> + u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */ >> + u32 tmp; >> + int k; >> + >> + /* >> + * Process array of filter instructions. >> + */ >> + for (;; fentry++) { >> +#if defined(CONFIG_X86_32) >> +#define K (fentry->k) >> +#else >> + const u32 K = fentry->k; >> +#endif >> + >> + switch (fentry->code) { >> + case BPF_S_ALU_ADD_X: >> + A += X; >> + continue; >> + case BPF_S_ALU_ADD_K: >> + A += K; >> + continue; >> + case BPF_S_ALU_SUB_X: >> + A -= X; >> + continue; >> + case BPF_S_ALU_SUB_K: >> + A -= K; >> + continue; >> + case BPF_S_ALU_MUL_X: >> + A *= X; >> + continue; >> + case BPF_S_ALU_MUL_K: >> + A *= K; >> + continue; >> + case BPF_S_ALU_DIV_X: >> + if (X == 0) >> + return 0; >> + A /= X; >> + continue; >> + case BPF_S_ALU_DIV_K: >> + A = reciprocal_divide(A, K); >> + continue; >> + case BPF_S_ALU_AND_X: >> + A &= X; >> + continue; >> + case BPF_S_ALU_AND_K: >> + A &= K; >> + continue; >> + case BPF_S_ALU_OR_X: >> + A |= X; >> + continue; >> + case BPF_S_ALU_OR_K: >> + A |= K; >> + continue; >> + case BPF_S_ALU_LSH_X: >> + A <<= X; >> + continue; >> + case BPF_S_ALU_LSH_K: >> + A <<= K; >> + continue; >> + case BPF_S_ALU_RSH_X: >> + A >>= X; >> + continue; >> + case BPF_S_ALU_RSH_K: >> + A >>= K; >> + continue; >> + case BPF_S_ALU_NEG: >> + A = -A; >> + continue; >> + case BPF_S_JMP_JA: >> + fentry += K; >> + continue; >> + case BPF_S_JMP_JGT_K: >> + fentry += (A > K) ? fentry->jt : fentry->jf; >> + continue; >> + case BPF_S_JMP_JGE_K: >> + fentry += (A >= K) ? fentry->jt : fentry->jf; >> + continue; >> + case BPF_S_JMP_JEQ_K: >> + fentry += (A == K) ? fentry->jt : fentry->jf; >> + continue; >> + case BPF_S_JMP_JSET_K: >> + fentry += (A & K) ? 
fentry->jt : fentry->jf; >> + continue; >> + case BPF_S_JMP_JGT_X: >> + fentry += (A > X) ? fentry->jt : fentry->jf; >> + continue; >> + case BPF_S_JMP_JGE_X: >> + fentry += (A >= X) ? fentry->jt : fentry->jf; >> + continue; >> + case BPF_S_JMP_JEQ_X: >> + fentry += (A == X) ? fentry->jt : fentry->jf; >> + continue; >> + case BPF_S_JMP_JSET_X: >> + fentry += (A & X) ? fentry->jt : fentry->jf; >> + continue; >> + case BPF_S_LD_W_ABS: >> + k = K; >> +load_w: >> + ptr = load_pointer(buf, buflen, k, 4, &tmp); >> + if (ptr != NULL) { >> + /* Note, unlike on network data, values are not >> + * byte swapped. >> + */ >> + A = *(const u32 *)ptr; >> + continue; >> + } >> + return 0; >> + case BPF_S_LD_H_ABS: >> + k = K; >> +load_h: >> + ptr = load_pointer(buf, buflen, k, 2, &tmp); >> + if (ptr != NULL) { >> + A = *(const u16 *)ptr; >> + continue; >> + } >> + return 0; >> + case BPF_S_LD_B_ABS: >> + k = K; >> +load_b: >> + ptr = load_pointer(buf, buflen, k, 1, &tmp); >> + if (ptr != NULL) { >> + A = *(const u8 *)ptr; >> + continue; >> + } >> + return 0; >> + case BPF_S_LD_W_LEN: >> + A = buflen; >> + continue; >> + case BPF_S_LDX_W_LEN: >> + X = buflen; >> + continue; >> + case BPF_S_LD_W_IND: >> + k = X + K; >> + goto load_w; >> + case BPF_S_LD_H_IND: >> + k = X + K; >> + goto load_h; >> + case BPF_S_LD_B_IND: >> + k = X + K; >> + goto load_b; >> + case BPF_S_LDX_B_MSH: >> + ptr = load_pointer(buf, buflen, K, 1, &tmp); >> + if (ptr != NULL) { >> + X = (*(u8 *)ptr & 0xf) << 2; >> + continue; >> + } >> + return 0; >> + case BPF_S_LD_IMM: >> + A = K; >> + continue; >> + case BPF_S_LDX_IMM: >> + X = K; >> + continue; >> + case BPF_S_LD_MEM: >> + A = mem[K]; >> + continue; >> + case BPF_S_LDX_MEM: >> + X = mem[K]; >> + continue; >> + case BPF_S_MISC_TAX: >> + X = A; >> + continue; >> + case BPF_S_MISC_TXA: >> + A = X; >> + continue; >> + case BPF_S_RET_K: >> + return K; >> + case BPF_S_RET_A: >> + return A; >> + case BPF_S_ST: >> + mem[K] = A; >> + continue; >> + case BPF_S_STX: 
>> + mem[K] = X; >> + continue; >> + case BPF_S_ANC_PROTOCOL: >> + case BPF_S_ANC_PKTTYPE: >> + case BPF_S_ANC_IFINDEX: >> + case BPF_S_ANC_MARK: >> + case BPF_S_ANC_QUEUE: >> + case BPF_S_ANC_HATYPE: >> + case BPF_S_ANC_RXHASH: >> + case BPF_S_ANC_CPU: >> + case BPF_S_ANC_NLATTR: >> + case BPF_S_ANC_NLATTR_NEST: >> + /* ignored */ >> + continue; >> + default: >> + WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n", >> + fentry->code, fentry->jt, >> + fentry->jf, fentry->k); >> + return 0; >> + } >> + } >> + >> + return 0; >> +} >> diff --git a/kernel/sys.c b/kernel/sys.c >> index 481611f..77f2eda 100644 >> --- a/kernel/sys.c >> +++ b/kernel/sys.c >> @@ -1783,6 +1783,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, >> case PR_SET_SECCOMP: >> error = prctl_set_seccomp(arg2); >> break; >> + case PR_ATTACH_SECCOMP_FILTER: >> + error = prctl_attach_seccomp_filter((char __user *) >> + arg2); >> + break; >> case PR_GET_TSC: >> error = GET_TSC_CTL(arg2); >> break; >> diff --git a/security/Kconfig b/security/Kconfig >> index 51bd5a0..77b1106 100644 >> --- a/security/Kconfig >> +++ b/security/Kconfig >> @@ -84,6 +84,18 @@ config SECURITY_DMESG_RESTRICT >> >> If you are unsure how to answer this question, answer N. >> >> +config SECCOMP_FILTER >> + bool "Enable seccomp-based system call filtering" >> + select SECCOMP >> + depends on EXPERIMENTAL >> + help >> + This kernel feature expands CONFIG_SECCOMP to allow computing >> + in environments with reduced kernel access dictated by a system >> + call filter, expressed in BPF, installed by the application itself >> + through prctl(2). >> + >> + See Documentation/prctl/seccomp_filter.txt for more detail. 
>> + >> config SECURITY >> bool "Enable different security models" >> depends on SYSFS >> -- >> 1.7.5.4 >> -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html