From: YiFei Zhu <yifeifz2@xxxxxxxxxxxx> SECCOMP_CACHE_NR_ONLY will only operate on syscalls that do not access any syscall arguments or instruction pointer. To facilitate this we need a static analyser to know whether a filter will return allow regardless of syscall arguments for a given architecture number / syscall number pair. This is implemented here with a pseudo-emulator, and stored in a per-filter bitmap. Each common BPF instruction (stolen from Kees's list [1]) is emulated. Any weirdness or loading from a syscall argument will cause the emulator to bail. The emulation is also halted if it reaches a return. In that case, if it returns SECCOMP_RET_ALLOW, the syscall is marked as good. Filter dependency is resolved at attach time. If a filter depends on more filters, then we perform a bitwise AND of its bitmap with that of its dependee; if the dependee does not guarantee to allow the syscall, then the depender is also marked not to guarantee to allow the syscall. [1] https://lore.kernel.org/lkml/20200923232923.3142503-5-keescook@xxxxxxxxxxxx/ Signed-off-by: YiFei Zhu <yifeifz2@xxxxxxxxxxxx> --- arch/Kconfig | 25 ++++++ kernel/seccomp.c | 194 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 218 insertions(+), 1 deletion(-) diff --git a/arch/Kconfig b/arch/Kconfig index 6dfc5673215d..8cc3dc87f253 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -489,6 +489,31 @@ config SECCOMP_FILTER See Documentation/userspace-api/seccomp_filter.rst for details. +choice + prompt "Seccomp filter cache" + default SECCOMP_CACHE_NONE + depends on SECCOMP_FILTER + help + Seccomp filters can potentially incur large overhead for each + system call. This can alleviate some of the overhead. + + If in doubt, select 'syscall numbers only'. + +config SECCOMP_CACHE_NONE + bool "None" + help + No caching is done. Seccomp filters will be called each time + a system call occurs in a seccomp-guarded task.
+ +config SECCOMP_CACHE_NR_ONLY + bool "Syscall number only" + depends on !HAVE_SPARSE_SYSCALL_NR + help + For each syscall number, if the seccomp filter has a fixed + result, store that result in a bitmap to speed up system calls. + +endchoice + config HAVE_ARCH_STACKLEAK bool help diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 3ee59ce0a323..20d33378a092 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -143,6 +143,32 @@ struct notification { struct list_head notifications; }; +#ifdef CONFIG_SECCOMP_CACHE_NR_ONLY +/** + * struct seccomp_cache_filter_data - container for cache's per-filter data + * + * @syscall_ok: A bitmap for each architecture number, where each bit + * represents whether the filter will always allow the syscall. + */ +struct seccomp_cache_filter_data { + DECLARE_BITMAP(syscall_ok[ARRAY_SIZE(syscall_arches)], NR_syscalls); +}; + +#define SECCOMP_EMU_MAX_PENDING_STATES 64 +#else +struct seccomp_cache_filter_data { }; + +static inline int seccomp_cache_prepare(struct seccomp_filter *sfilter) +{ + return 0; +} + +static inline void seccomp_cache_inherit(struct seccomp_filter *sfilter, + const struct seccomp_filter *prev) +{ +} +#endif /* CONFIG_SECCOMP_CACHE_NR_ONLY */ + /** * struct seccomp_filter - container for seccomp BPF programs * @@ -185,6 +211,7 @@ struct seccomp_filter { struct notification *notif; struct mutex notify_lock; wait_queue_head_t wqh; + struct seccomp_cache_filter_data cache; }; /* Limit any path through the tree to 256KB worth of instructions. */ @@ -530,6 +557,139 @@ static inline void seccomp_sync_threads(unsigned long flags) } } +#ifdef CONFIG_SECCOMP_CACHE_NR_ONLY +/** + * struct seccomp_emu_env - container for seccomp emulator environment + * + * @filter: The cBPF filter instructions. + * @nr: The syscall number we are emulating. + * @arch: The architecture number we are emulating. + * @syscall_ok: Emulation result, whether it is okay for seccomp to cache the + * syscall. 
+ */ +struct seccomp_emu_env { + struct sock_filter *filter; + int arch; + int nr; + bool syscall_ok; +}; + +/** + * struct seccomp_emu_state - container for seccomp emulator state + * + * @next: The next pending state. This structure is a linked list. + * @pc: The current program counter. + * @areg: The value of the A register. + */ +struct seccomp_emu_state { + struct seccomp_emu_state *next; + int pc; + u32 areg; +}; + +/** + * seccomp_emu_step - step one instruction in the emulator + * @env: The emulator environment + * @state: The emulator state + * + * Returns 1 to halt emulation, 0 to continue, or -errno if error occurred. + */ +static int seccomp_emu_step(struct seccomp_emu_env *env, + struct seccomp_emu_state *state) +{ + struct sock_filter *ftest = &env->filter[state->pc++]; + u16 code = ftest->code; + u32 k = ftest->k; + bool compare; + + switch (code) { + case BPF_LD | BPF_W | BPF_ABS: + if (k == offsetof(struct seccomp_data, nr)) + state->areg = env->nr; + else if (k == offsetof(struct seccomp_data, arch)) + state->areg = env->arch; + else + return 1; + + return 0; + case BPF_JMP | BPF_JA: + state->pc += k; + return 0; + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JSET | BPF_K: + switch (BPF_OP(code)) { + case BPF_JEQ: + compare = state->areg == k; + break; + case BPF_JGT: + compare = state->areg > k; + break; + case BPF_JGE: + compare = state->areg >= k; + break; + case BPF_JSET: + compare = state->areg & k; + break; + default: + WARN_ON(true); + return -EINVAL; + } + + state->pc += compare ? ftest->jt : ftest->jf; + return 0; + case BPF_ALU | BPF_AND | BPF_K: + state->areg &= k; + return 0; + case BPF_RET | BPF_K: + env->syscall_ok = k == SECCOMP_RET_ALLOW; + return 1; + default: + return 1; + } +} + +/** + * seccomp_cache_prepare - emulate the filter to find cachable syscalls + * @sfilter: The seccomp filter + * + * Returns 0 if successful or -errno if error occurred.
+ */ +int seccomp_cache_prepare(struct seccomp_filter *sfilter) +{ + struct sock_fprog_kern *fprog = sfilter->prog->orig_prog; + struct sock_filter *filter = fprog->filter; + int arch, nr, res = 0; + + for (arch = 0; arch < ARRAY_SIZE(syscall_arches); arch++) { + for (nr = 0; nr < NR_syscalls; nr++) { + struct seccomp_emu_env env = {0}; + struct seccomp_emu_state state = {0}; + + env.filter = filter; + env.arch = syscall_arches[arch]; + env.nr = nr; + + while (true) { + res = seccomp_emu_step(&env, &state); + if (res) + break; + } + + if (res < 0) + goto out; + + if (env.syscall_ok) + set_bit(nr, sfilter->cache.syscall_ok[arch]); + } + } + +out: + return res; +} +#endif /* CONFIG_SECCOMP_CACHE_NR_ONLY */ + /** * seccomp_prepare_filter: Prepares a seccomp filter for use. * @fprog: BPF program to install @@ -540,7 +700,8 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) { struct seccomp_filter *sfilter; int ret; - const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE); + const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) || + IS_ENABLED(CONFIG_SECCOMP_CACHE_NR_ONLY); if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) return ERR_PTR(-EINVAL); @@ -571,6 +732,13 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) return ERR_PTR(ret); } + ret = seccomp_cache_prepare(sfilter); + if (ret < 0) { + bpf_prog_destroy(sfilter->prog); + kfree(sfilter); + return ERR_PTR(ret); + } + refcount_set(&sfilter->refs, 1); refcount_set(&sfilter->users, 1); init_waitqueue_head(&sfilter->wqh); @@ -606,6 +774,29 @@ seccomp_prepare_user_filter(const char __user *user_filter) return filter; } +#ifdef CONFIG_SECCOMP_CACHE_NR_ONLY +/** + * seccomp_cache_inherit - mask accept bitmap against previous filter + * @sfilter: The seccomp filter + * @prev: The previous seccomp filter + */ +static void seccomp_cache_inherit(struct seccomp_filter *sfilter, + const struct seccomp_filter *prev) +{ + int arch; + + if (!prev) +
return; + + for (arch = 0; arch < ARRAY_SIZE(syscall_arches); arch++) { + bitmap_and(sfilter->cache.syscall_ok[arch], + sfilter->cache.syscall_ok[arch], + prev->cache.syscall_ok[arch], + NR_syscalls); + } +} +#endif /* CONFIG_SECCOMP_CACHE_NR_ONLY */ + /** * seccomp_attach_filter: validate and attach filter * @flags: flags to change filter behavior @@ -655,6 +846,7 @@ static long seccomp_attach_filter(unsigned int flags, * task reference. */ filter->prev = current->seccomp.filter; + seccomp_cache_inherit(filter, filter->prev); current->seccomp.filter = filter; atomic_inc(&current->seccomp.filter_count); -- 2.28.0 _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linuxfoundation.org/mailman/listinfo/containers