From: YiFei Zhu <yifeifz2@xxxxxxxxxxxx> The fast (common) path for seccomp should be that the filter permits the syscall to pass through, and failing seccomp is expected to be an exceptional case; it is not expected for userspace to call a denylisted syscall over and over. We do this by creating a per-task bitmap of permitted syscalls. If seccomp filter is invoked we check if it is cached and if so directly return allow. Else we call into the cBPF filter, and if the result is an allow then we cache the results. The cache is per-task to minimize thread-synchronization issues in the hot path of cache lookup, and to avoid different architecture numbers sharing the same cache. To account for one thread changing the filter for another thread of the same process, the per-task struct also contains a pointer to the filter the cache is built on. When the cache lookup uses a different filter then the last lookup, the per-task cache bitmap is cleared. Architecture number changes also invokes a clear of the per-task cache, since it should be very unlikely for a given thread to change its architecture. Benchmark results, on qemu-kvm x86_64 VM, on Intel(R) Core(TM) i5-8250U CPU @ 1.60GHz, with seccomp_benchmark: With SECCOMP_CACHE_NONE: Current BPF sysctl settings: net.core.bpf_jit_enable = 1 net.core.bpf_jit_harden = 0 Calibrating sample size for 15 seconds worth of syscalls ... Benchmarking 23486415 syscalls... 16.079642020 - 1.013345439 = 15066296581 (15.1s) getpid native: 641 ns 32.080237410 - 16.080763500 = 15999473910 (16.0s) getpid RET_ALLOW 1 filter: 681 ns 48.609461618 - 32.081296173 = 16528165445 (16.5s) getpid RET_ALLOW 2 filters: 703 ns Estimated total seccomp overhead for 1 filter: 40 ns Estimated total seccomp overhead for 2 filters: 62 ns Estimated seccomp per-filter overhead: 22 ns Estimated seccomp entry overhead: 18 ns With SECCOMP_CACHE_NR_ONLY: Current BPF sysctl settings: net.core.bpf_jit_enable = 1 net.core.bpf_jit_harden = 0 Calibrating sample size for 15 seconds worth of syscalls ... Benchmarking 23486415 syscalls... 16.059512499 - 1.014108434 = 15045404065 (15.0s) getpid native: 640 ns 31.651075934 - 16.060637323 = 15590438611 (15.6s) getpid RET_ALLOW 1 filter: 663 ns 47.367316169 - 31.652302661 = 15715013508 (15.7s) getpid RET_ALLOW 2 filters: 669 ns Estimated total seccomp overhead for 1 filter: 23 ns Estimated total seccomp overhead for 2 filters: 29 ns Estimated seccomp per-filter overhead: 6 ns Estimated seccomp entry overhead: 17 ns Co-developed-by: Dimitrios Skarlatos <dskarlat@xxxxxxxxxx> Signed-off-by: Dimitrios Skarlatos <dskarlat@xxxxxxxxxx> Signed-off-by: YiFei Zhu <yifeifz2@xxxxxxxxxxxx> --- include/linux/seccomp.h | 22 ++++++++++++ kernel/seccomp.c | 77 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 97 insertions(+), 2 deletions(-) diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 02aef2844c38..08ec8b90c99d 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -21,6 +21,27 @@ #include <asm/seccomp.h> struct seccomp_filter; + +#ifdef CONFIG_SECCOMP_CACHE_NR_ONLY +/** + * struct seccomp_cache_task_data - container for seccomp cache's per-task data + * + * @syscall_ok: A bitmap where each bit represents whether the syscall is cached + * and that the filter allowed it. + * @last_filter: If the next cache lookup uses a different filter, the lookup + * will clear cache. + * @last_arch: If the next cache lookup uses a different arch number, the + * lookup will clear cache. + */ +struct seccomp_cache_task_data { + DECLARE_BITMAP(syscall_ok, NR_syscalls); + const struct seccomp_filter *last_filter; + u32 last_arch; +}; +#else +struct seccomp_cache_task_data { }; +#endif /* CONFIG_SECCOMP_CACHE_NR_ONLY */ + /** * struct seccomp - the state of a seccomp'ed process * @@ -36,6 +57,7 @@ struct seccomp { int mode; atomic_t filter_count; struct seccomp_filter *filter; + struct seccomp_cache_task_data cache; }; #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER diff --git a/kernel/seccomp.c b/kernel/seccomp.c index d8c30901face..7096f8c86f71 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -162,6 +162,17 @@ static inline int seccomp_cache_prepare(struct seccomp_filter *sfilter) { return 0; } + +static inline bool seccomp_cache_check(const struct seccomp_filter *sfilter, + const struct seccomp_data *sd) +{ + return 0; +} + +static inline void seccomp_cache_insert(const struct seccomp_filter *sfilter, + const struct seccomp_data *sd) +{ +} #endif /* CONFIG_SECCOMP_CACHE_NR_ONLY */ /** @@ -316,6 +327,59 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) return 0; } +#ifdef CONFIG_SECCOMP_CACHE_NR_ONLY +/** + * seccomp_cache_check - lookup seccomp cache + * @sfilter: The seccomp filter + * @sd: The seccomp data to lookup the cache with + * + * Returns true if the seccomp_data is cached and allowed. + */ +static bool seccomp_cache_check(const struct seccomp_filter *sfilter, + const struct seccomp_data *sd) +{ + struct seccomp_cache_task_data *thread_data; + int syscall_nr = sd->nr; + + if (unlikely(syscall_nr < 0 || syscall_nr >= NR_syscalls)) + return false; + + thread_data = ¤t->seccomp.cache; + if (unlikely(thread_data->last_filter != sfilter || + thread_data->last_arch != sd->arch)) { + thread_data->last_filter = sfilter; + thread_data->last_arch = sd->arch; + + bitmap_zero(thread_data->syscall_ok, NR_syscalls); + return false; + } + + return test_bit(syscall_nr, thread_data->syscall_ok); +} + +/** + * seccomp_cache_insert - insert into seccomp cache + * @sfilter: The seccomp filter + * @sd: The seccomp data to insert into the cache + */ +static void seccomp_cache_insert(const struct seccomp_filter *sfilter, + const struct seccomp_data *sd) +{ + struct seccomp_cache_task_data *thread_data; + int syscall_nr = sd->nr; + + if (unlikely(syscall_nr < 0 || syscall_nr >= NR_syscalls)) + return; + + thread_data = ¤t->seccomp.cache; + + if (!test_bit(syscall_nr, sfilter->cache.syscall_ok)) + return; + + set_bit(syscall_nr, thread_data->syscall_ok); +} +#endif /* CONFIG_SECCOMP_CACHE_NR_ONLY */ + /** * seccomp_run_filters - evaluates all seccomp filters against @sd * @sd: optional seccomp data to be passed to filters @@ -331,13 +395,18 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, { u32 ret = SECCOMP_RET_ALLOW; /* Make sure cross-thread synced filter points somewhere sane. */ - struct seccomp_filter *f = - READ_ONCE(current->seccomp.filter); + struct seccomp_filter *f, *f_head; + + f = READ_ONCE(current->seccomp.filter); + f_head = f; /* Ensure unexpected behavior doesn't result in failing open. */ if (WARN_ON(f == NULL)) return SECCOMP_RET_KILL_PROCESS; + if (seccomp_cache_check(f_head, sd)) + return SECCOMP_RET_ALLOW; + /* * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA). @@ -350,6 +419,10 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, *match = f; } } + + if (ret == SECCOMP_RET_ALLOW) + seccomp_cache_insert(f_head, sd); + return ret; } #endif /* CONFIG_SECCOMP_FILTER */ -- 2.28.0 _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linuxfoundation.org/mailman/listinfo/containers