On my VM, getpid takes about 70ns. Before this patch, adding a
single-instruction always-accept seccomp filter added about 134ns of
overhead to getpid. With this patch, the overhead is down to about
13ns.

I'm not really thrilled by this patch. It has two main issues:

1. Calling into code in kernel/seccomp.c from assembly feels ugly.

2. The x86 64-bit syscall entry now has four separate code paths:
fast, seccomp only, audit only, and slow. This kind of sucks.
Would it be worth trying to rewrite the whole thing in C with a
two-phase slow path approach like I'm using here for seccomp?

Signed-off-by: Andy Lutomirski <luto@xxxxxxxxxxxxxx>
---
 arch/x86/kernel/entry_64.S | 45 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/seccomp.h    |  4 ++--
 2 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index f9e713a..feb32b2 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -683,6 +683,45 @@ sysret_signal:
 	FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
 	jmp int_check_syscall_exit_work
 
+#ifdef CONFIG_SECCOMP
+	/*
+	 * Fast path for seccomp without any other slow path triggers.
+	 */
+seccomp_fastpath:
+	/* Build seccomp_data */
+	pushq %r9 /* args[5] */
+	pushq %r8 /* args[4] */
+	pushq %r10 /* args[3] */
+	pushq %rdx /* args[2] */
+	pushq %rsi /* args[1] */
+	pushq %rdi /* args[0] */
+	pushq RIP-ARGOFFSET+6*8(%rsp) /* rip */
+	pushq %rax /* nr and junk */
+	movl $AUDIT_ARCH_X86_64, 4(%rsp) /* arch */
+	movq %rsp, %rdi
+	call seccomp_phase1
+	addq $8*8, %rsp
+	cmpq $1, %rax
+	ja seccomp_invoke_phase2
+	LOAD_ARGS 0 /* restore clobbered regs */
+	jb system_call_fastpath
+	jmp ret_from_sys_call
+
+seccomp_invoke_phase2:
+	SAVE_REST
+	FIXUP_TOP_OF_STACK %rdi
+	movq %rax,%rdi
+	call seccomp_phase2
+
+	/* if seccomp says to skip, then set orig_ax to -1 and skip */
+	test %eax,%eax
+	jz 1f
+	movq $-1, ORIG_RAX(%rsp)
+1:
+	mov ORIG_RAX(%rsp), %rax /* reload rax */
+	jmp system_call_post_trace /* and maybe do the syscall */
+#endif
+
 #ifdef CONFIG_AUDITSYSCALL
 	/*
 	 * Fast path for syscall audit without full syscall trace.
@@ -717,6 +756,10 @@ sysret_audit:
 
 	/* Do syscall tracing */
 tracesys:
+#ifdef CONFIG_SECCOMP
+	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SECCOMP),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	jz seccomp_fastpath
+#endif
 #ifdef CONFIG_AUDITSYSCALL
 	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
 	jz auditsys
@@ -725,6 +768,8 @@ tracesys:
 	FIXUP_TOP_OF_STACK %rdi
 	movq %rsp,%rdi
 	call syscall_trace_enter
+
+system_call_post_trace:
 	/*
 	 * Reload arg registers from stack in case ptrace changed them.
 	 * We don't reload %rax because syscall_trace_enter() returned
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 4fc7a84..d3d4c52 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -37,8 +37,8 @@ static inline int secure_computing(void)
 #define SECCOMP_PHASE1_OK 0
 #define SECCOMP_PHASE1_SKIP 1
 
-extern u32 seccomp_phase1(struct seccomp_data *sd);
-int seccomp_phase2(u32 phase1_result);
+asmlinkage __visible extern u32 seccomp_phase1(struct seccomp_data *sd);
+asmlinkage __visible int seccomp_phase2(u32 phase1_result);
 #else
 extern void secure_computing_strict(int this_syscall);
 #endif
-- 
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-arch" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html