From: YiFei Zhu <yifeifz2@xxxxxxxxxxxx> SECCOMP_CACHE_NR_ONLY will only operate on syscalls that do not access any syscall arguments or instruction pointer. To facilitate this we need a static analyser to know whether a filter will return allow regardless of syscall arguments for a given architecture number / syscall number pair. This is implemented here with a pseudo-emulator, and stored in a per-filter bitmap. Each common BPF instruction is emulated. Any weirdness or loading from a syscall argument will cause the emulator to bail. The emulation is also halted if it reaches a return. In that case, if it returns SECCOMP_RET_ALLOW, the syscall is marked as good. Emulator structure and comments are from Kees [1] and Jann [2]. Emulation is done at attach time. If a filter depends on more filters, and if the dependee does not guarantee to allow the syscall, then we skip the emulation of this syscall. [1] https://lore.kernel.org/lkml/20200923232923.3142503-5-keescook@xxxxxxxxxxxx/ [2] https://lore.kernel.org/lkml/CAG48ez1p=dR_2ikKq=xVxkoGg0fYpTBpkhJSv1w-6BG=76PAvw@xxxxxxxxxxxxxx/ Signed-off-by: YiFei Zhu <yifeifz2@xxxxxxxxxxxx> --- arch/Kconfig | 34 ++++++++++ arch/x86/Kconfig | 1 + kernel/seccomp.c | 167 ++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 201 insertions(+), 1 deletion(-) diff --git a/arch/Kconfig b/arch/Kconfig index 21a3675a7a3a..ca867b2a5d71 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -471,6 +471,14 @@ config HAVE_ARCH_SECCOMP_FILTER results in the system call being skipped immediately. 
- seccomp syscall wired up +config HAVE_ARCH_SECCOMP_CACHE_NR_ONLY + bool + help + An arch should select this symbol if it provides all of these things: + - all the requirements for HAVE_ARCH_SECCOMP_FILTER + - SECCOMP_ARCH_DEFAULT + - SECCOMP_ARCH_DEFAULT_NR + config SECCOMP prompt "Enable seccomp to safely execute untrusted bytecode" def_bool y @@ -498,6 +506,32 @@ config SECCOMP_FILTER See Documentation/userspace-api/seccomp_filter.rst for details. +choice + prompt "Seccomp filter cache" + default SECCOMP_CACHE_NONE + depends on SECCOMP_FILTER + depends on HAVE_ARCH_SECCOMP_CACHE_NR_ONLY + help + Seccomp filters can potentially incur large overhead for each + system call. This can alleviate some of the overhead. + + If in doubt, select 'syscall numbers only'. + +config SECCOMP_CACHE_NONE + bool "None" + help + No caching is done. Seccomp filters will be called each time + a system call occurs in a seccomp-guarded task. + +config SECCOMP_CACHE_NR_ONLY + bool "Syscall number only" + depends on HAVE_ARCH_SECCOMP_CACHE_NR_ONLY + help + For each syscall number, if the seccomp filter has a fixed + result, store that result in a bitmap to speed up system calls. 
+ +endchoice + config HAVE_ARCH_STACKLEAK bool help diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1ab22869a765..ff5289228ea5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -150,6 +150,7 @@ config X86 select HAVE_ARCH_COMPAT_MMAP_BASES if MMU && COMPAT select HAVE_ARCH_PREL32_RELOCATIONS select HAVE_ARCH_SECCOMP_FILTER + select HAVE_ARCH_SECCOMP_CACHE_NR_ONLY select HAVE_ARCH_THREAD_STRUCT_WHITELIST select HAVE_ARCH_STACKLEAK select HAVE_ARCH_TRACEHOOK diff --git a/kernel/seccomp.c b/kernel/seccomp.c index ae6b40cc39f4..f09c9e74ae05 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -143,6 +143,37 @@ struct notification { struct list_head notifications; }; +#ifdef CONFIG_SECCOMP_CACHE_NR_ONLY +/** + * struct seccomp_cache_filter_data - container for cache's per-filter data + * + * This struct is ordered to minimize padding holes. + * + * @syscall_allow_default: A bitmap where each bit represents whether the + * filter will always allow the syscall, for the + * default architecture. + * @syscall_allow_compat: A bitmap where each bit represents whether the + * filter will always allow the syscall, for the + * compat architecture. + */ +struct seccomp_cache_filter_data { +#ifdef SECCOMP_ARCH_DEFAULT + DECLARE_BITMAP(syscall_allow_default, SECCOMP_ARCH_DEFAULT_NR); +#endif +#ifdef SECCOMP_ARCH_COMPAT + DECLARE_BITMAP(syscall_allow_compat, SECCOMP_ARCH_COMPAT_NR); +#endif +}; + +#define SECCOMP_EMU_MAX_PENDING_STATES 64 +#else +struct seccomp_cache_filter_data { }; + +static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter) +{ +} +#endif /* CONFIG_SECCOMP_CACHE_NR_ONLY */ + /** * struct seccomp_filter - container for seccomp BPF programs * @@ -159,6 +190,7 @@ struct notification { * this filter after reaching 0. The @users count is always smaller * or equal to @refs. Hence, reaching 0 for @users does not mean * the filter can be freed. + * @cache: container for cache-related data. 
* @log: true if all actions except for SECCOMP_RET_ALLOW should be logged * @prev: points to a previously installed, or inherited, filter * @prog: the BPF program to evaluate @@ -180,6 +212,7 @@ struct seccomp_filter { refcount_t refs; refcount_t users; bool log; + struct seccomp_cache_filter_data cache; struct seccomp_filter *prev; struct bpf_prog *prog; struct notification *notif; @@ -544,7 +577,8 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) { struct seccomp_filter *sfilter; int ret; - const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE); + const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) || + IS_ENABLED(CONFIG_SECCOMP_CACHE_NR_ONLY); if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) return ERR_PTR(-EINVAL); @@ -610,6 +644,136 @@ seccomp_prepare_user_filter(const char __user *user_filter) return filter; } +#ifdef CONFIG_SECCOMP_CACHE_NR_ONLY +/** + * seccomp_emu_is_const_allow - check if filter is constant allow with given data + * @fprog: The BPF programs + * @sd: The seccomp data to check against, only syscall number and arch + * number are considered constant. 
+ */ +static bool seccomp_emu_is_const_allow(struct sock_fprog_kern *fprog, + struct seccomp_data *sd) +{ + unsigned int insns; + unsigned int reg_value = 0; + unsigned int pc; + bool op_res; + + if (WARN_ON_ONCE(!fprog)) + return false; + + insns = bpf_classic_proglen(fprog); + for (pc = 0; pc < insns; pc++) { + struct sock_filter *insn = &fprog->filter[pc]; + u16 code = insn->code; + u32 k = insn->k; + + switch (code) { + case BPF_LD | BPF_W | BPF_ABS: + switch (k) { + case offsetof(struct seccomp_data, nr): + reg_value = sd->nr; + break; + case offsetof(struct seccomp_data, arch): + reg_value = sd->arch; + break; + default: + /* can't optimize (non-constant value load) */ + return false; + } + break; + case BPF_RET | BPF_K: + /* reached return with constant values only, check allow */ + return k == SECCOMP_RET_ALLOW; + case BPF_JMP | BPF_JA: + pc += insn->k; + break; + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JSET | BPF_K: + switch (BPF_OP(code)) { + case BPF_JEQ: + op_res = reg_value == k; + break; + case BPF_JGE: + op_res = reg_value >= k; + break; + case BPF_JGT: + op_res = reg_value > k; + break; + case BPF_JSET: + op_res = !!(reg_value & k); + break; + default: + /* can't optimize (unknown jump) */ + return false; + } + + pc += op_res ? insn->jt : insn->jf; + break; + case BPF_ALU | BPF_AND | BPF_K: + reg_value &= k; + break; + default: + /* can't optimize (unknown insn) */ + return false; + } + } + + /* ran off the end of the filter?! 
*/ + WARN_ON(1); + return false; +} + +static void seccomp_cache_prepare_bitmap(struct seccomp_filter *sfilter, + void *bitmap, const void *bitmap_prev, + size_t bitmap_size, int arch) +{ + struct sock_fprog_kern *fprog = sfilter->prog->orig_prog; + struct seccomp_data sd; + int nr; + + for (nr = 0; nr < bitmap_size; nr++) { + if (bitmap_prev && !test_bit(nr, bitmap_prev)) + continue; + + sd.nr = nr; + sd.arch = arch; + + if (seccomp_emu_is_const_allow(fprog, &sd)) + set_bit(nr, bitmap); + } +} + +/** + * seccomp_cache_prepare - emulate the filter to find cacheable syscalls + * @sfilter: The seccomp filter + * + * Returns nothing; the computed allow-bitmaps are stored in @sfilter->cache. + */ +static void seccomp_cache_prepare(struct seccomp_filter *sfilter) +{ + struct seccomp_cache_filter_data *cache = &sfilter->cache; + const struct seccomp_cache_filter_data *cache_prev = + sfilter->prev ? &sfilter->prev->cache : NULL; + +#ifdef SECCOMP_ARCH_DEFAULT + seccomp_cache_prepare_bitmap(sfilter, cache->syscall_allow_default, + cache_prev ? cache_prev->syscall_allow_default : NULL, + SECCOMP_ARCH_DEFAULT_NR, + SECCOMP_ARCH_DEFAULT); +#endif /* SECCOMP_ARCH_DEFAULT */ + +#ifdef SECCOMP_ARCH_COMPAT + seccomp_cache_prepare_bitmap(sfilter, cache->syscall_allow_compat, + cache_prev ? cache_prev->syscall_allow_compat : NULL, + SECCOMP_ARCH_COMPAT_NR, + SECCOMP_ARCH_COMPAT); +#endif /* SECCOMP_ARCH_COMPAT */ +} +#endif /* CONFIG_SECCOMP_CACHE_NR_ONLY */ + /** * seccomp_attach_filter: validate and attach filter * @flags: flags to change filter behavior @@ -659,6 +823,7 @@ static long seccomp_attach_filter(unsigned int flags, * task reference. */ filter->prev = current->seccomp.filter; + seccomp_cache_prepare(filter); current->seccomp.filter = filter; atomic_inc(&current->seccomp.filter_count); -- 2.28.0