One of the most common pain points with seccomp filters has been dealing with the overhead of processing the filters, especially for "always allow" or "always reject" cases. While BPF is extremely fast[1], it will always have overhead associated with it. Additionally, due to seccomp's design, filters are layered, which means processing time goes up as the number of filters attached goes up. In the past, efforts have been focused on making filter execution complete in a shorter amount of time. For example, filters were rewritten from using linear if/then/else syscall search to using balanced binary trees, or tests for syscalls common to the process's workload were moved to the front of the filter. However, there are limits to this, especially when some processes are dealing with tens of filters[2], or when some architectures have a less efficient BPF engine[3]. The most common use of seccomp, constructing syscall block/allow-lists in which syscalls are always allowed or always rejected (without regard to any arguments), also tends to produce the most pathological runtime problems, in that a large number of syscall checks in the filter need to be performed to come to a determination. In order to optimize these cases from O(n) to O(1), seccomp can use bitmaps to immediately determine the desired action. A critical observation in the prior paragraph bears repeating: the common case for syscall tests does not check arguments. For any given filter, there is a constant mapping from the combination of architecture and syscall to the seccomp action result. (For kernels/architectures without CONFIG_COMPAT, there is a single architecture.) As such, it is possible to construct a mapping of arch/syscall to action, which can be updated as new filters are attached to a process.
In order to build this mapping at filter attach time, each filter is executed for every syscall (under each possible architecture), and checked for any accesses of struct seccomp_data that are neither the "arch" nor the "nr" (syscall) members. If only "arch" and "nr" are examined, then there is a constant mapping for that syscall, and bitmaps can be updated accordingly. If any accesses happen outside of those struct members, seccomp must not bypass filter execution for that syscall, since program state will be used to determine the filter action result. (This logic comes in the next patch.) [1] https://lore.kernel.org/bpf/20200531171915.wsxvdjeetmhpsdv2@xxxxxxxxxxxxxxxxxxxxxxxxxxxx/ [2] https://lore.kernel.org/bpf/20200601101137.GA121847@gardel-login/ [3] https://lore.kernel.org/bpf/717a06e7f35740ccb4c70470ec70fb2f@xxxxxxxxxx/ Signed-off-by: Kees Cook <keescook@xxxxxxxxxxxx> --- include/linux/seccomp.h | 18 ++++ kernel/seccomp.c | 207 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 221 insertions(+), 4 deletions(-) diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 0be20bc81ea9..96df2f899e3d 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -25,6 +25,17 @@ #define SECCOMP_ARCH_IS_MULTIPLEX 3 #define SECCOMP_ARCH_IS_UNKNOWN 0xff +/* When no bits are set for a syscall, filters are run. */ +struct seccomp_bitmaps { +#ifdef SECCOMP_ARCH + /* "allow" are initialized to set and only ever get cleared. */ + DECLARE_BITMAP(allow, NR_syscalls); + /* These are initialized to clear and only ever get set. 
*/ + DECLARE_BITMAP(kill_thread, NR_syscalls); + DECLARE_BITMAP(kill_process, NR_syscalls); +#endif +}; + struct seccomp_filter; /** * struct seccomp - the state of a seccomp'ed process @@ -45,6 +56,13 @@ struct seccomp { #endif atomic_t filter_count; struct seccomp_filter *filter; + struct seccomp_bitmaps native; +#ifdef CONFIG_COMPAT + struct seccomp_bitmaps compat; +#endif +#ifdef SECCOMP_MULTIPLEXED_SYSCALL_TABLE_ARCH + struct seccomp_bitmaps multiplex; +#endif }; #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 0a3ff8eb8aea..111a238bc532 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -318,7 +318,7 @@ static inline u8 seccomp_get_arch(u32 syscall_arch, u32 syscall_nr) #ifdef SECCOMP_MULTIPLEXED_SYSCALL_TABLE_ARCH if (syscall_arch == SECCOMP_MULTIPLEXED_SYSCALL_TABLE_ARCH) { - seccomp_arch |= (sd->nr & SECCOMP_MULTIPLEXED_SYSCALL_TABLE_MASK) >> + seccomp_arch |= (syscall_nr & SECCOMP_MULTIPLEXED_SYSCALL_TABLE_MASK) >> SECCOMP_MULTIPLEXED_SYSCALL_TABLE_SHIFT; } #endif @@ -559,6 +559,21 @@ static inline void seccomp_sync_threads(unsigned long flags) atomic_set(&thread->seccomp.filter_count, atomic_read(&thread->seccomp.filter_count)); + /* Copy syscall filter bitmaps. 
*/ + memcpy(&thread->seccomp.native, + &caller->seccomp.native, + sizeof(caller->seccomp.native)); +#ifdef CONFIG_COMPAT + memcpy(&thread->seccomp.compat, + &caller->seccomp.compat, + sizeof(caller->seccomp.compat)); +#endif +#ifdef SECCOMP_MULTIPLEXED_SYSCALL_TABLE_ARCH + memcpy(&thread->seccomp.multiplex, + &caller->seccomp.multiplex, + sizeof(caller->seccomp.multiplex)); +#endif + /* * Don't let an unprivileged task work around * the no_new_privs restriction by creating @@ -661,6 +676,114 @@ seccomp_prepare_user_filter(const char __user *user_filter) return filter; } +static inline bool sd_touched(pte_t *ptep) +{ + return !!pte_young(*(READ_ONCE(ptep))); +} + +#ifdef SECCOMP_ARCH +/* + * We can build constant-action bitmaps only when an arch/nr combination reads + * nothing more than sd->nr and sd->arch, since those have a constant mapping + * to the syscall. + * + * This approach could also be used to test for access to sd->arch too, + * if we wanted to warn about compat-unsafe filters. + */ +static inline bool seccomp_filter_action_is_constant(struct bpf_prog *prog, + struct seccomp_data *sd, + u32 *action) +{ + /* No evaluation implementation yet. */ + return false; +} + +/* + * Walk every syscall combination for this arch/mask combo and update + * the bitmaps with any results. + */ +static void seccomp_update_bitmap(struct seccomp_filter *filter, + void *pagepair, u32 arch, u32 mask, + struct seccomp_bitmaps *bitmaps) +{ + struct seccomp_data sd = { }; + bool constant; + u32 nr, ret; + + /* Initialize bitmaps for first filter. */ + if (!filter->prev) + bitmap_fill(bitmaps->allow, NR_syscalls); + + /* + * For every syscall, if we don't already know we need to run + * the full filter, simulate the filter with our static values. + */ + for (nr = 0; nr < NR_syscalls; nr++) { + /* Are we already at the maximal rejection state? 
*/ + if (test_bit(nr, bitmaps->kill_process)) + continue; + + sd.nr = nr | mask; + sd.arch = arch; + + /* Evaluate filter for this arch/syscall. */ + constant = seccomp_filter_action_is_constant(filter->prog, &sd, &ret); + + /* + * If this run through the filter didn't access + * beyond "arch", we know the result is a constant + * mapping for arch/nr -> ret. + */ + if (constant) { + /* Constant evaluation. Mark appropriate bitmaps. */ + switch (ret) { + case SECCOMP_RET_KILL_PROCESS: + set_bit(nr, bitmaps->kill_process); + break; + case SECCOMP_RET_KILL_THREAD: + set_bit(nr, bitmaps->kill_thread); + break; + default: + break; + case SECCOMP_RET_ALLOW: + /* + * If we always map to allow, there are + * no changes needed to the bitmaps. + */ + continue; + } + } + + /* + * Dynamic evaluation of syscall, or non-allow constant + * mapping to something other than SECCOMP_RET_ALLOW: we + * must not short-circuit-allow it anymore. + */ + clear_bit(nr, bitmaps->allow); + } +} + +static void seccomp_update_bitmaps(struct seccomp_filter *filter, + void *pagepair) +{ + seccomp_update_bitmap(filter, pagepair, SECCOMP_ARCH, 0, + &current->seccomp.native); +#ifdef CONFIG_COMPAT + seccomp_update_bitmap(filter, pagepair, SECCOMP_ARCH_COMPAT, 0, + &current->seccomp.compat); +#endif +#ifdef SECCOMP_MULTIPLEXED_SYSCALL_TABLE_ARCH + seccomp_update_bitmap(filter, pagepair, SECCOMP_MULTIPLEXED_SYSCALL_TABLE_ARCH, + SECCOMP_MULTIPLEXED_SYSCALL_TABLE_MASK, + &current->seccomp.multiplex); +#endif +} +#else +static void seccomp_update_bitmaps(struct seccomp_filter *filter, + void *pagepair) +{ } +#endif + /** * seccomp_attach_filter: validate and attach filter * @flags: flags to change filter behavior @@ -674,7 +797,8 @@ seccomp_prepare_user_filter(const char __user *user_filter) * - in NEW_LISTENER mode: the fd of the new listener */ static long seccomp_attach_filter(unsigned int flags, - struct seccomp_filter *filter) + struct seccomp_filter *filter, + void *pagepair) { unsigned long total_insns; struct 
seccomp_filter *walker; @@ -713,6 +837,9 @@ static long seccomp_attach_filter(unsigned int flags, current->seccomp.filter = filter; atomic_inc(&current->seccomp.filter_count); + /* Evaluate filter for new known-outcome syscalls */ + seccomp_update_bitmaps(filter, pagepair); + /* Now that the new filter is in place, synchronize to all threads. */ if (flags & SECCOMP_FILTER_FLAG_TSYNC) seccomp_sync_threads(flags); @@ -970,6 +1097,65 @@ static int seccomp_do_user_notification(int this_syscall, return -1; } +#ifdef SECCOMP_ARCH +static inline bool __bypass_filter(struct seccomp_bitmaps *bitmaps, + u32 nr, u32 *filter_ret) +{ + if (nr < NR_syscalls) { + if (test_bit(nr, bitmaps->allow)) { + *filter_ret = SECCOMP_RET_ALLOW; + return true; + } + if (test_bit(nr, bitmaps->kill_process)) { + *filter_ret = SECCOMP_RET_KILL_PROCESS; + return true; + } + if (test_bit(nr, bitmaps->kill_thread)) { + *filter_ret = SECCOMP_RET_KILL_THREAD; + return true; + } + } + return false; +} + +static inline u32 check_syscall(const struct seccomp_data *sd, + struct seccomp_filter **match) +{ + u32 filter_ret = SECCOMP_RET_KILL_PROCESS; + u8 arch = seccomp_get_arch(sd->arch, sd->nr); + + switch (arch) { + case SECCOMP_ARCH_IS_NATIVE: + if (__bypass_filter(&current->seccomp.native, sd->nr, &filter_ret)) + return filter_ret; + break; +#ifdef CONFIG_COMPAT + case SECCOMP_ARCH_IS_COMPAT: + if (__bypass_filter(&current->seccomp.compat, sd->nr, &filter_ret)) + return filter_ret; + break; +#endif +#ifdef SECCOMP_MULTIPLEXED_SYSCALL_TABLE_ARCH + case SECCOMP_ARCH_IS_MULTIPLEX: + if (__bypass_filter(&current->seccomp.multiplex, sd->nr, &filter_ret)) + return filter_ret; + break; +#endif + default: + WARN_ON_ONCE(1); + return filter_ret; + } + + return seccomp_run_filters(sd, match); +} +#else +static inline u32 check_syscall(const struct seccomp_data *sd, + struct seccomp_filter **match) +{ + return seccomp_run_filters(sd, match); +} +#endif + static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, const 
bool recheck_after_trace) { @@ -989,7 +1175,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, sd = &sd_local; } - filter_ret = seccomp_run_filters(sd, &match); + filter_ret = check_syscall(sd, &match); data = filter_ret & SECCOMP_RET_DATA; action = filter_ret & SECCOMP_RET_ACTION_FULL; @@ -1580,6 +1766,7 @@ static long seccomp_set_mode_filter(unsigned int flags, long ret = -EINVAL; int listener = -1; struct file *listener_f = NULL; + void *pagepair; /* Validate flags. */ if (flags & ~SECCOMP_FILTER_FLAG_MASK) @@ -1625,12 +1812,24 @@ static long seccomp_set_mode_filter(unsigned int flags, mutex_lock_killable(&current->signal->cred_guard_mutex)) goto out_put_fd; + /* + * This memory will be needed for bitmap testing, but we'll + * be holding a spinlock at that point. Do the allocation + * (and free) outside of the lock. + * + * Alternative: we could do the bitmap update before attach + * to avoid spending too much time under lock. + */ + pagepair = vzalloc(PAGE_SIZE * 2); + if (!pagepair) + goto out_put_fd; + spin_lock_irq(&current->sighand->siglock); if (!seccomp_may_assign_mode(seccomp_mode)) goto out; - ret = seccomp_attach_filter(flags, prepared); + ret = seccomp_attach_filter(flags, prepared, pagepair); if (ret) goto out; /* Do not free the successfully attached filter. */ -- 2.25.1 _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linuxfoundation.org/mailman/listinfo/containers