On Thu, Sep 24, 2020 at 1:29 AM Kees Cook <keescook@xxxxxxxxxxxx> wrote:
> One of the most common pain points with seccomp filters has been dealing
> with the overhead of processing the filters, especially for "always allow"
> or "always reject" cases.

The "always reject" cases don't need to be fast, in particular not the
kill_thread/kill_process ones. Nobody's going to have "process kills
itself by executing a forbidden syscall" on a critical hot codepath.

> While BPF is extremely fast[1], it will always
> have overhead associated with it. Additionally, due to seccomp's design,
> filters are layered, which means processing time goes up as the number
> of filters attached goes up.
[...]
> In order to build this mapping at filter attach time, each filter is
> executed for every syscall (under each possible architecture), and
> checked for any accesses of struct seccomp_data that are not the "arch"
> nor "nr" (syscall) members. If only "arch" and "nr" are examined, then
> there is a constant mapping for that syscall, and bitmaps can be updated
> accordingly. If any accesses happen outside of those struct members,
> seccomp must not bypass filter execution for that syscall, since program
> state will be used to determine filter action result. (This logic comes
> in the next patch.)
>
> [1] https://lore.kernel.org/bpf/20200531171915.wsxvdjeetmhpsdv2@xxxxxxxxxxxxxxxxxxxxxxxxxxxx/
> [2] https://lore.kernel.org/bpf/20200601101137.GA121847@gardel-login/
> [3] https://lore.kernel.org/bpf/717a06e7f35740ccb4c70470ec70fb2f@xxxxxxxxxx/
>
> Signed-off-by: Kees Cook <keescook@xxxxxxxxxxxx>
> ---
>  include/linux/seccomp.h |  18 ++++
>  kernel/seccomp.c        | 207 +++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 221 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> index 0be20bc81ea9..96df2f899e3d 100644
> --- a/include/linux/seccomp.h
> +++ b/include/linux/seccomp.h
> @@ -25,6 +25,17 @@
>  #define SECCOMP_ARCH_IS_MULTIPLEX   3
>  #define SECCOMP_ARCH_IS_UNKNOWN     0xff
>
> +/* When no bits are set for a syscall, filters are run. */
> +struct seccomp_bitmaps {
> +#ifdef SECCOMP_ARCH
> +        /* "allow" are initialized to set and only ever get cleared. */
> +        DECLARE_BITMAP(allow, NR_syscalls);

This bitmap makes sense. The "NR_syscalls" part assumes that the compat
syscall tables will not be bigger than the native syscall table, right?
I guess that's usually mostly true nowadays, thanks to the syscall table
unification... (might be worth a comment though)

> +        /* These are initialized to clear and only ever get set. */
> +        DECLARE_BITMAP(kill_thread, NR_syscalls);
> +        DECLARE_BITMAP(kill_process, NR_syscalls);

I don't think these bitmaps make sense, this is not part of any fastpath.
(However, a "which syscalls have a fixed result" bitmap might make sense
if we want to export the list of permitted syscalls as a text file in
procfs, as I mentioned over at
<https://lore.kernel.org/lkml/CAG48ez3Ofqp4crXGksLmZY6=fGrF_tWyUCg7PBkAetvbbOPeOA@xxxxxxxxxxxxxx/>.)
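(Purely as an illustration of those two points, not something from the
patch: if the kill_* bitmaps were dropped, but we still wanted enough
information to later export the set of always-permitted syscalls via
procfs, the struct could shrink to something like the sketch below. The
"fixed_result" name is made up here.)

/* When the "allow" bit for a syscall is clear, filters are run. */
struct seccomp_bitmaps {
#ifdef SECCOMP_ARCH
        /*
         * Initialized to all-set, only ever cleared: a set bit means every
         * attached filter returns SECCOMP_RET_ALLOW for this syscall, so
         * filter execution can be skipped on the fastpath.
         */
        DECLARE_BITMAP(allow, NR_syscalls);
        /*
         * Not needed on any fastpath: a set bit means the filters' result
         * for this syscall is constant (whatever that result is), which
         * would be enough for a procfs-style "permitted syscalls" export.
         */
        DECLARE_BITMAP(fixed_result, NR_syscalls);
#endif
};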
> +#endif
> +};
> +
>  struct seccomp_filter;
>  /**
>   * struct seccomp - the state of a seccomp'ed process
> @@ -45,6 +56,13 @@ struct seccomp {
>  #endif
>          atomic_t filter_count;
>          struct seccomp_filter *filter;
> +        struct seccomp_bitmaps native;
> +#ifdef CONFIG_COMPAT
> +        struct seccomp_bitmaps compat;
> +#endif
> +#ifdef SECCOMP_MULTIPLEXED_SYSCALL_TABLE_ARCH
> +        struct seccomp_bitmaps multiplex;
> +#endif

Why do we have one bitmap per thread (in struct seccomp) instead of
putting the bitmap for a given filter and all its ancestors into the
seccomp_filter?

>  };
>
>  #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 0a3ff8eb8aea..111a238bc532 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -318,7 +318,7 @@ static inline u8 seccomp_get_arch(u32 syscall_arch, u32 syscall_nr)
>
>  #ifdef SECCOMP_MULTIPLEXED_SYSCALL_TABLE_ARCH
>          if (syscall_arch == SECCOMP_MULTIPLEXED_SYSCALL_TABLE_ARCH) {
> -                seccomp_arch |= (sd->nr & SECCOMP_MULTIPLEXED_SYSCALL_TABLE_MASK) >>
> +                seccomp_arch |= (syscall_nr & SECCOMP_MULTIPLEXED_SYSCALL_TABLE_MASK) >>
>                                  SECCOMP_MULTIPLEXED_SYSCALL_TABLE_SHIFT;

This belongs over into patch 1.

>          }
>  #endif
> @@ -559,6 +559,21 @@ static inline void seccomp_sync_threads(unsigned long flags)
>                  atomic_set(&thread->seccomp.filter_count,
>                             atomic_read(&thread->seccomp.filter_count));
>
> +                /* Copy syscall filter bitmaps. */
> +                memcpy(&thread->seccomp.native,
> +                       &caller->seccomp.native,
> +                       sizeof(caller->seccomp.native));
> +#ifdef CONFIG_COMPAT
> +                memcpy(&thread->seccomp.compat,
> +                       &caller->seccomp.compat,
> +                       sizeof(caller->seccomp.compat));
> +#endif
> +#ifdef SECCOMP_MULTIPLEXED_SYSCALL_TABLE_ARCH
> +                memcpy(&thread->seccomp.multiplex,
> +                       &caller->seccomp.multiplex,
> +                       sizeof(caller->seccomp.multiplex));
> +#endif

This part wouldn't be necessary if the bitmasks were part of the
seccomp_filter...

>                  /*
>                   * Don't let an unprivileged task work around
>                   * the no_new_privs restriction by creating
> @@ -661,6 +676,114 @@ seccomp_prepare_user_filter(const char __user *user_filter)
>          return filter;
>  }
>
> +static inline bool sd_touched(pte_t *ptep)
> +{
> +        return !!pte_young(*(READ_ONCE(ptep)));
> +}

I think this is left over from the previous version and should've been
removed?

[...]

> +/*
> + * Walk everyone syscall combination for this arch/mask combo and update

nit: "Walk every possible", or something like that

> + * the bitmaps with any results.
> + */
> +static void seccomp_update_bitmap(struct seccomp_filter *filter,
> +                                  void *pagepair, u32 arch, u32 mask,
> +                                  struct seccomp_bitmaps *bitmaps)
[...]
> @@ -970,6 +1097,65 @@ static int seccomp_do_user_notification(int this_syscall,
>          return -1;
>  }
>
> +#ifdef SECCOMP_ARCH
> +static inline bool __bypass_filter(struct seccomp_bitmaps *bitmaps,
> +                                   u32 nr, u32 *filter_ret)
> +{
> +        if (nr < NR_syscalls) {
> +                if (test_bit(nr, bitmaps->allow)) {
> +                        *filter_ret = SECCOMP_RET_ALLOW;
> +                        return true;
> +                }
> +                if (test_bit(nr, bitmaps->kill_process)) {
> +                        *filter_ret = SECCOMP_RET_KILL_PROCESS;
> +                        return true;
> +                }
> +                if (test_bit(nr, bitmaps->kill_thread)) {
> +                        *filter_ret = SECCOMP_RET_KILL_THREAD;
> +                        return true;
> +                }

The checks against ->kill_process and ->kill_thread won't make anything
faster, but since they will run in the fastpath, they'll probably
actually contribute to making things *slower*.
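(Back to the per-thread vs. per-filter question from further up, to make
it concrete: a rough sketch of the alternative layout, where the field
placement and inheritance scheme are my assumptions rather than anything
from the patch. Since all threads end up sharing the same filter chain
after TSYNC, bitmaps hanging off the filter would be synced for free, and
the memcpy()s in seccomp_sync_threads() quoted above would disappear.)

struct seccomp_filter {
        /* ... existing fields ... */
        struct seccomp_filter *prev;
        struct bpf_prog *prog;

        /*
         * Constant-result cache for this filter AND all of its ancestors:
         * seeded from prev's bitmaps at attach time, then bits are cleared
         * for any syscall that this filter does not also always allow.
         */
        struct seccomp_bitmaps native;
#ifdef CONFIG_COMPAT
        struct seccomp_bitmaps compat;
#endif
#ifdef SECCOMP_MULTIPLEXED_SYSCALL_TABLE_ARCH
        struct seccomp_bitmaps multiplex;
#endif
};

The fastpath lookup would then go through current->seccomp.filter->native
(one extra dereference) instead of current->seccomp.native.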
> +        }
> +        return false;
> +}
> +
> +static inline u32 check_syscall(const struct seccomp_data *sd,
> +                                struct seccomp_filter **match)
> +{
> +        u32 filter_ret = SECCOMP_RET_KILL_PROCESS;
> +        u8 arch = seccomp_get_arch(sd->arch, sd->nr);
> +
> +        switch (arch) {
> +        case SECCOMP_ARCH_IS_NATIVE:
> +                if (__bypass_filter(&current->seccomp.native, sd->nr, &filter_ret))
> +                        return filter_ret;
> +                break;
> +#ifdef CONFIG_COMPAT
> +        case SECCOMP_ARCH_IS_COMPAT:
> +                if (__bypass_filter(&current->seccomp.compat, sd->nr, &filter_ret))
> +                        return filter_ret;
> +                break;
> +#endif
> +#ifdef SECCOMP_MULTIPLEXED_SYSCALL_TABLE_ARCH
> +        case SECCOMP_ARCH_IS_MULTIPLEX:
> +                if (__bypass_filter(&current->seccomp.multiplex, sd->nr, &filter_ret))
> +                        return filter_ret;
> +                break;
> +#endif
> +        default:
> +                WARN_ON_ONCE(1);
> +                return filter_ret;
> +        };
> +
> +        return seccomp_run_filters(sd, match);
> +}

You could write this in a less repetitive way, and especially if we get
rid of the kill_* masks, also more compact:

static inline u32 check_syscall(const struct seccomp_data *sd,
                                struct seccomp_filter **match)
{
        struct seccomp_bitmaps *bitmaps;
        u8 arch = seccomp_get_arch(sd->arch, sd->nr);

        switch (arch) {
        case SECCOMP_ARCH_IS_NATIVE:
                bitmaps = &current->seccomp.native;
                break;
#ifdef CONFIG_COMPAT
        case SECCOMP_ARCH_IS_COMPAT:
                bitmaps = &current->seccomp.compat;
                break;
#endif
#ifdef SECCOMP_MULTIPLEXED_SYSCALL_TABLE_ARCH
        case SECCOMP_ARCH_IS_MULTIPLEX:
                bitmaps = &current->seccomp.multiplex;
                break;
#endif
        default:
                WARN_ON_ONCE(1);
                return SECCOMP_RET_KILL_PROCESS;
        }

        if ((unsigned)sd->nr < NR_syscalls && test_bit(sd->nr, bitmaps->allow))
                return SECCOMP_RET_ALLOW;

        return seccomp_run_filters(sd, match);
}

[...]

> @@ -1625,12 +1812,24 @@ static long seccomp_set_mode_filter(unsigned int flags,
>              mutex_lock_killable(&current->signal->cred_guard_mutex))
>                  goto out_put_fd;
>
> +        /*
> +         * This memory will be needed for bitmap testing, but we'll
> +         * be holding a spinlock at that point. Do the allocation
> +         * (and free) outside of the lock.
> +         *
> +         * Alternative: we could do the bitmap update before attach
> +         * to avoid spending too much time under lock.
> +         */
> +        pagepair = vzalloc(PAGE_SIZE * 2);
> +        if (!pagepair)
> +                goto out_put_fd;
> +
[...]
> -        ret = seccomp_attach_filter(flags, prepared);
> +        ret = seccomp_attach_filter(flags, prepared, pagepair);

You probably intended to rip this stuff back out? AFAIU the vzalloc()
stuff is a remnant from the old version that relied on MMU trickery.
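(On that note: if the constant-result detection were done by dry-running
the cBPF program at attach time instead of via MMU access tracking, the
pagepair allocation would not be needed at all. Rough sketch only, per
filter and per arch, using a hypothetical seccomp_filter_allows_const()
helper that would have to run the program against a seccomp_data holding
just this nr/arch pair and report whether it returns SECCOMP_RET_ALLOW
without looking at any other seccomp_data fields:)

static void seccomp_update_allow_bitmap(struct seccomp_filter *filter,
                                        u32 arch,
                                        struct seccomp_bitmaps *bitmaps)
{
        unsigned int nr;

        for (nr = 0; nr < NR_syscalls; nr++) {
                /* Bits already cleared by an ancestor filter stay cleared. */
                if (!test_bit(nr, bitmaps->allow))
                        continue;
                if (!seccomp_filter_allows_const(filter, arch, nr))
                        clear_bit(nr, bitmaps->allow);
        }
}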