CPU mitigations are deployed system-wide, but usually not all of userspace is malicious. Yet the non-malicious part of userspace still suffers from the performance impact of the mitigations. This all-or-nothing approach is due to the lack of a way for the kernel to know which userspace can be trusted and which cannot. For scenarios where an admin can decide which processes to trust, an interface to tell the kernel to possibly skip the mitigations would be useful. In preparation for the kernel being able to selectively apply mitigations per process, add a separate kernel entry/exit path that skips the mitigations. Originally-by: Josh Poimboeuf <jpoimboe@xxxxxxxxxx> Signed-off-by: Pawan Gupta <pawan.kumar.gupta@xxxxxxxxxxxxxxx> --- arch/x86/entry/entry_64.S | 66 +++++++++++++++++++++++++++++++++++-------- arch/x86/include/asm/proto.h | 15 +++++++--- arch/x86/include/asm/ptrace.h | 15 +++++++--- arch/x86/kernel/cpu/common.c | 2 +- 4 files changed, 78 insertions(+), 20 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 1b5be07f8669..eeaf4226d09c 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -84,7 +84,7 @@ * with them due to bugs in both AMD and Intel CPUs. 
*/ -SYM_CODE_START(entry_SYSCALL_64) +.macro __entry_SYSCALL_64 mitigated=0 UNWIND_HINT_ENTRY ENDBR @@ -94,7 +94,12 @@ SYM_CODE_START(entry_SYSCALL_64) SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp -SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL) +.if \mitigated +SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack_mitigated, SYM_L_GLOBAL) +.else +SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack_unmitigated, SYM_L_GLOBAL) +.endif + ANNOTATE_NOENDBR /* Construct struct pt_regs on stack */ @@ -103,7 +108,11 @@ SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL) pushq %r11 /* pt_regs->flags */ pushq $__USER_CS /* pt_regs->cs */ pushq %rcx /* pt_regs->ip */ + +.if \mitigated SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) +.endif + pushq %rax /* pt_regs->orig_ax */ PUSH_AND_CLEAR_REGS rax=$-ENOSYS @@ -113,10 +122,12 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) /* Sign extend the lower 32bit as syscall numbers are treated as int */ movslq %eax, %rsi +.if \mitigated /* clobbers %rax, make sure it is after saving the syscall nr */ IBRS_ENTER UNTRAIN_RET CLEAR_BRANCH_HISTORY +.endif call do_syscall_64 /* returns with IRQs disabled */ @@ -127,15 +138,26 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) * In the Xen PV case we must use iret anyway. */ - ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \ - "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV +.if \mitigated + push %rax + IBRS_EXIT + CLEAR_CPU_BUFFERS + pop %rax +.endif + + ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode_from_syscall", \ + "jmp swapgs_restore_regs_and_return_to_usermode_from_syscall", X86_FEATURE_XENPV /* * We win! This label is here just for ease of understanding * perf profiles. Nothing jumps here. 
*/ -syscall_return_via_sysret: - IBRS_EXIT +.if \mitigated +syscall_return_via_sysret_mitigated: +.else +syscall_return_via_sysret_unmitigated: +.endif + POP_REGS pop_rdi=0 /* @@ -159,15 +181,36 @@ syscall_return_via_sysret: popq %rdi popq %rsp -SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL) + +.if \mitigated +SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack_mitigated, SYM_L_GLOBAL) +.else +SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack_unmitigated, SYM_L_GLOBAL) +.endif + ANNOTATE_NOENDBR swapgs - CLEAR_CPU_BUFFERS + +.if \mitigated +SYM_INNER_LABEL(entry_SYSRETQ_end_mitigated, SYM_L_GLOBAL) +.else +SYM_INNER_LABEL(entry_SYSRETQ_end_unmitigated, SYM_L_GLOBAL) +.endif sysretq -SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL) + +.endm /* __entry_SYSCALL_64 */ + +SYM_CODE_START(entry_SYSCALL_64_unmitigated) + __entry_SYSCALL_64 mitigated=0 ANNOTATE_NOENDBR int3 -SYM_CODE_END(entry_SYSCALL_64) +SYM_CODE_END(entry_SYSCALL_64_unmitigated) + +SYM_CODE_START(entry_SYSCALL_64_mitigated) + __entry_SYSCALL_64 mitigated=1 + ANNOTATE_NOENDBR + int3 +SYM_CODE_END(entry_SYSCALL_64_mitigated) /* * %rdi: prev task @@ -559,6 +602,8 @@ __irqentry_text_end: SYM_CODE_START_LOCAL(common_interrupt_return) SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) IBRS_EXIT + CLEAR_CPU_BUFFERS +SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode_from_syscall, SYM_L_GLOBAL) #ifdef CONFIG_XEN_PV ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV #endif @@ -573,7 +618,6 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) .Lswapgs_and_iret: swapgs - CLEAR_CPU_BUFFERS /* Assert that the IRET frame indicates user mode. 
*/ testb $3, 8(%rsp) jnz .Lnative_iret diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 484f4f0131a5..0936e0e70659 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -11,10 +11,17 @@ struct task_struct; void syscall_init(void); #ifdef CONFIG_X86_64 -void entry_SYSCALL_64(void); -void entry_SYSCALL_64_safe_stack(void); -void entry_SYSRETQ_unsafe_stack(void); -void entry_SYSRETQ_end(void); + +void entry_SYSCALL_64_unmitigated(void); +void entry_SYSCALL_64_safe_stack_unmitigated(void); +void entry_SYSRETQ_unsafe_stack_unmitigated(void); +void entry_SYSRETQ_end_unmitigated(void); + +void entry_SYSCALL_64_mitigated(void); +void entry_SYSCALL_64_safe_stack_mitigated(void); +void entry_SYSRETQ_unsafe_stack_mitigated(void); +void entry_SYSRETQ_end_mitigated(void); + long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2); #endif diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 5a83fbd9bc0b..74a13c76d241 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -261,11 +261,18 @@ static inline bool any_64bit_mode(struct pt_regs *regs) static __always_inline bool ip_within_syscall_gap(struct pt_regs *regs) { - bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64 && - regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack); + bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64_unmitigated && + regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack_unmitigated); + + ret = ret || (regs->ip >= (unsigned long)entry_SYSRETQ_unsafe_stack_unmitigated && + regs->ip < (unsigned long)entry_SYSRETQ_end_unmitigated); + + ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_64_mitigated && + regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack_mitigated); + + ret = ret || (regs->ip >= (unsigned long)entry_SYSRETQ_unsafe_stack_mitigated && + regs->ip < (unsigned long)entry_SYSRETQ_end_mitigated); - ret = ret || (regs->ip >= (unsigned 
long)entry_SYSRETQ_unsafe_stack && - regs->ip < (unsigned long)entry_SYSRETQ_end); #ifdef CONFIG_IA32_EMULATION ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_compat && regs->ip < (unsigned long)entry_SYSCALL_compat_safe_stack); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index d4e539d4e158..e72c37f3a437 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -2026,7 +2026,7 @@ static void wrmsrl_cstar(unsigned long val) static inline void idt_syscall_init(void) { - wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); + wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64_unmitigated); if (ia32_enabled()) { wrmsrl_cstar((unsigned long)entry_SYSCALL_compat); -- 2.34.1