From: Lai Jiangshan <jiangshan.ljs@xxxxxxxxxxxx>

Signed-off-by: Lai Jiangshan <jiangshan.ljs@xxxxxxxxxxxx>
---
 arch/x86/entry/entry_64.S       | 373 --------------------------------
 arch/x86/entry/ist_entry.c      |   2 +-
 arch/x86/include/asm/idtentry.h |   9 +-
 3 files changed, 7 insertions(+), 377 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 50a24cc83581..2bb7ab8512dc 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1341,379 +1341,6 @@ SYM_CODE_START_LOCAL(error_return)
 	jmp	swapgs_restore_regs_and_return_to_usermode
 SYM_CODE_END(error_return)
 
-/*
- * Runs on exception stack.  Xen PV does not go through this path at all,
- * so we can use real assembly here.
- *
- * Registers:
- *	%r14: Used to save/restore the CR3 of the interrupted context
- *	      when PAGE_TABLE_ISOLATION is in use.  Do not clobber.
- */
-SYM_CODE_START(asm_exc_nmi)
-	UNWIND_HINT_IRET_REGS
-	ENDBR
-
-	/*
-	 * We allow breakpoints in NMIs. If a breakpoint occurs, then
-	 * the iretq it performs will take us out of NMI context.
-	 * This means that we can have nested NMIs where the next
-	 * NMI is using the top of the stack of the previous NMI. We
-	 * can't let it execute because the nested NMI will corrupt the
-	 * stack of the previous NMI. NMI handlers are not re-entrant
-	 * anyway.
-	 *
-	 * To handle this case we do the following:
-	 *  Check the a special location on the stack that contains
-	 *  a variable that is set when NMIs are executing.
-	 *  The interrupted task's stack is also checked to see if it
-	 *  is an NMI stack.
-	 *  If the variable is not set and the stack is not the NMI
-	 *  stack then:
-	 *    o Set the special variable on the stack
-	 *    o Copy the interrupt frame into an "outermost" location on the
-	 *      stack
-	 *    o Copy the interrupt frame into an "iret" location on the stack
-	 *    o Continue processing the NMI
-	 *  If the variable is set or the previous stack is the NMI stack:
-	 *    o Modify the "iret" location to jump to the repeat_nmi
-	 *    o return back to the first NMI
-	 *
-	 * Now on exit of the first NMI, we first clear the stack variable
-	 * The NMI stack will tell any nested NMIs at that point that it is
-	 * nested. Then we pop the stack normally with iret, and if there was
-	 * a nested NMI that updated the copy interrupt stack frame, a
-	 * jump will be made to the repeat_nmi code that will handle the second
-	 * NMI.
-	 *
-	 * However, espfix prevents us from directly returning to userspace
-	 * with a single IRET instruction.  Similarly, IRET to user mode
-	 * can fault.  We therefore handle NMIs from user space like
-	 * other IST entries.
-	 */
-
-	ASM_CLAC
-	cld
-
-	/* Use %rdx as our temp variable throughout */
-	pushq	%rdx
-
-	testb	$3, CS-RIP+8(%rsp)
-	jz	.Lnmi_from_kernel
-
-	/*
-	 * NMI from user mode.  We need to run on the thread stack, but we
-	 * can't go through the normal entry paths: NMIs are masked, and
-	 * we don't want to enable interrupts, because then we'll end
-	 * up in an awkward situation in which IRQs are on but NMIs
-	 * are off.
-	 *
-	 * We also must not push anything to the stack before switching
-	 * stacks lest we corrupt the "NMI executing" variable.
-	 */
-
-	swapgs
-	FENCE_SWAPGS_USER_ENTRY
-	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
-	movq	%rsp, %rdx
-	movq	PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
-	UNWIND_HINT_IRET_REGS base=%rdx offset=8
-	pushq	5*8(%rdx)	/* pt_regs->ss */
-	pushq	4*8(%rdx)	/* pt_regs->rsp */
-	pushq	3*8(%rdx)	/* pt_regs->flags */
-	pushq	2*8(%rdx)	/* pt_regs->cs */
-	pushq	1*8(%rdx)	/* pt_regs->rip */
-	UNWIND_HINT_IRET_REGS
-	pushq	$-1		/* pt_regs->orig_ax */
-	PUSH_AND_CLEAR_REGS rdx=(%rdx)
-	ENCODE_FRAME_POINTER
-
-	IBRS_ENTER
-	UNTRAIN_RET
-
-	/*
-	 * At this point we no longer need to worry about stack damage
-	 * due to nesting -- we're on the normal thread stack and we're
-	 * done with the NMI stack.
-	 */
-
-	movq	%rsp, %rdi
-	movq	$-1, %rsi
-	call	exc_nmi
-
-	/*
-	 * Return back to user mode.  We must *not* do the normal exit
-	 * work, because we don't want to enable interrupts.
-	 */
-	jmp	swapgs_restore_regs_and_return_to_usermode
-
-.Lnmi_from_kernel:
-	/*
-	 * Here's what our stack frame will look like:
-	 * +---------------------------------------------------------+
-	 * | original SS                                             |
-	 * | original Return RSP                                     |
-	 * | original RFLAGS                                         |
-	 * | original CS                                             |
-	 * | original RIP                                            |
-	 * +---------------------------------------------------------+
-	 * | temp storage for rdx                                    |
-	 * +---------------------------------------------------------+
-	 * | "NMI executing" variable                                |
-	 * +---------------------------------------------------------+
-	 * | iret SS          } Copied from "outermost" frame        |
-	 * | iret Return RSP  } on each loop iteration; overwritten  |
-	 * | iret RFLAGS      } by a nested NMI to force another     |
-	 * | iret CS          } iteration if needed.                 |
-	 * | iret RIP         }                                      |
-	 * +---------------------------------------------------------+
-	 * | outermost SS          } initialized in first_nmi;       |
-	 * | outermost Return RSP  } will not be changed before      |
-	 * | outermost RFLAGS      } NMI processing is done.         |
-	 * | outermost CS          } Copied to "iret" frame on each  |
-	 * | outermost RIP         } iteration.                      |
-	 * +---------------------------------------------------------+
-	 * | pt_regs                                                 |
-	 * +---------------------------------------------------------+
-	 *
-	 * The "original" frame is used by hardware.  Before re-enabling
-	 * NMIs, we need to be done with it, and we need to leave enough
-	 * space for the asm code here.
-	 *
-	 * We return by executing IRET while RSP points to the "iret" frame.
-	 * That will either return for real or it will loop back into NMI
-	 * processing.
-	 *
-	 * The "outermost" frame is copied to the "iret" frame on each
-	 * iteration of the loop, so each iteration starts with the "iret"
-	 * frame pointing to the final return target.
-	 */
-
-	/*
-	 * Determine whether we're a nested NMI.
-	 *
-	 * If we interrupted kernel code between repeat_nmi and
-	 * end_repeat_nmi, then we are a nested NMI.  We must not
-	 * modify the "iret" frame because it's being written by
-	 * the outer NMI.  That's okay; the outer NMI handler is
-	 * about to about to call exc_nmi() anyway, so we can just
-	 * resume the outer NMI.
-	 */
-
-	movq	$repeat_nmi, %rdx
-	cmpq	8(%rsp), %rdx
-	ja	1f
-	movq	$end_repeat_nmi, %rdx
-	cmpq	8(%rsp), %rdx
-	ja	nested_nmi_out
-1:
-
-	/*
-	 * Now check "NMI executing".  If it's set, then we're nested.
-	 * This will not detect if we interrupted an outer NMI just
-	 * before IRET.
-	 */
-	cmpl	$1, -8(%rsp)
-	je	nested_nmi
-
-	/*
-	 * Now test if the previous stack was an NMI stack.  This covers
-	 * the case where we interrupt an outer NMI after it clears
-	 * "NMI executing" but before IRET.  We need to be careful, though:
-	 * there is one case in which RSP could point to the NMI stack
-	 * despite there being no NMI active: naughty userspace controls
-	 * RSP at the very beginning of the SYSCALL targets.  We can
-	 * pull a fast one on naughty userspace, though: we program
-	 * SYSCALL to mask DF, so userspace cannot cause DF to be set
-	 * if it controls the kernel's RSP.  We set DF before we clear
-	 * "NMI executing".
-	 */
-	lea	6*8(%rsp), %rdx
-	/* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
-	cmpq	%rdx, 4*8(%rsp)
-	/* If the stack pointer is above the NMI stack, this is a normal NMI */
-	ja	first_nmi
-
-	subq	$EXCEPTION_STKSZ, %rdx
-	cmpq	%rdx, 4*8(%rsp)
-	/* If it is below the NMI stack, it is a normal NMI */
-	jb	first_nmi
-
-	/* Ah, it is within the NMI stack. */
-
-	testb	$(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
-	jz	first_nmi	/* RSP was user controlled. */
-
-	/* This is a nested NMI. */
-
-nested_nmi:
-	/*
-	 * Modify the "iret" frame to point to repeat_nmi, forcing another
-	 * iteration of NMI handling.
-	 */
-	subq	$8, %rsp
-	leaq	-10*8(%rsp), %rdx
-	pushq	$__KERNEL_DS
-	pushq	%rdx
-	pushfq
-	pushq	$__KERNEL_CS
-	pushq	$repeat_nmi
-
-	/* Put stack back */
-	addq	$(6*8), %rsp
-
-nested_nmi_out:
-	popq	%rdx
-
-	/* We are returning to kernel mode, so this cannot result in a fault. */
-	iretq
-
-first_nmi:
-	/* Restore rdx. */
-	movq	(%rsp), %rdx
-
-	/* Make room for "NMI executing". */
-	pushq	$0
-
-	/* Leave room for the "iret" frame */
-	subq	$(5*8), %rsp
-
-	/* Copy the "original" frame to the "outermost" frame */
-	.rept 5
-	pushq	11*8(%rsp)
-	.endr
-	UNWIND_HINT_IRET_REGS
-
-	/* Everything up to here is safe from nested NMIs */
-
-#ifdef CONFIG_DEBUG_ENTRY
-	/*
-	 * For ease of testing, unmask NMIs right away.  Disabled by
-	 * default because IRET is very expensive.
-	 */
-	pushq	$0		/* SS */
-	pushq	%rsp		/* RSP (minus 8 because of the previous push) */
-	addq	$8, (%rsp)	/* Fix up RSP */
-	pushfq			/* RFLAGS */
-	pushq	$__KERNEL_CS	/* CS */
-	pushq	$1f		/* RIP */
-	iretq			/* continues at repeat_nmi below */
-	UNWIND_HINT_IRET_REGS
-1:
-#endif
-
-repeat_nmi:
-	ANNOTATE_NOENDBR // this code
-	/*
-	 * If there was a nested NMI, the first NMI's iret will return
-	 * here. But NMIs are still enabled and we can take another
-	 * nested NMI. The nested NMI checks the interrupted RIP to see
-	 * if it is between repeat_nmi and end_repeat_nmi, and if so
-	 * it will just return, as we are about to repeat an NMI anyway.
-	 * This makes it safe to copy to the stack frame that a nested
-	 * NMI will update.
-	 *
-	 * RSP is pointing to "outermost RIP". gsbase is unknown, but, if
-	 * we're repeating an NMI, gsbase has the same value that it had on
-	 * the first iteration.  paranoid_entry will load the kernel
-	 * gsbase if needed before we call exc_nmi().  "NMI executing"
-	 * is zero.
-	 */
-	movq	$1, 10*8(%rsp)		/* Set "NMI executing". */
-
-	/*
-	 * Copy the "outermost" frame to the "iret" frame.  NMIs that nest
-	 * here must not modify the "iret" frame while we're writing to
-	 * it or it will end up containing garbage.
-	 */
-	addq	$(10*8), %rsp
-	.rept 5
-	pushq	-6*8(%rsp)
-	.endr
-	subq	$(5*8), %rsp
-end_repeat_nmi:
-	ANNOTATE_NOENDBR // this code
-
-	/*
-	 * Everything below this point can be preempted by a nested NMI.
-	 * If this happens, then the inner NMI will change the "iret"
-	 * frame to point back to repeat_nmi.
-	 */
-	pushq	$-1				/* ORIG_RAX: no syscall to restart */
-
-	PUSH_AND_CLEAR_REGS
-	UNWIND_HINT_REGS
-	ENCODE_FRAME_POINTER
-
-	/*
-	 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
-	 * as we should not be calling schedule in NMI context.
-	 * Even with normal interrupts enabled. An NMI should not be
-	 * setting NEED_RESCHED or anything that normal interrupts and
-	 * exceptions might do.
-	 */
-	call	paranoid_entry
-
-	movq	%rsp, %rdi
-	movq	$-1, %rsi
-	call	exc_nmi
-
-	/* Always restore stashed SPEC_CTRL value (see paranoid_entry) */
-	IBRS_EXIT save_reg=%r15
-
-	/* Always restore stashed CR3 value (see paranoid_entry) */
-	RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
-
-	/*
-	 * The above invocation of paranoid_entry stored the GSBASE
-	 * related information in R/EBX depending on the availability
-	 * of FSGSBASE.
-	 *
-	 * If FSGSBASE is enabled, restore the saved GSBASE value
-	 * unconditionally, otherwise take the conditional SWAPGS path.
-	 */
-	ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE
-
-	wrgsbase	%rbx
-	jmp	nmi_restore
-
-nmi_no_fsgsbase:
-	/* EBX == 0 -> invoke SWAPGS */
-	testl	%ebx, %ebx
-	jnz	nmi_restore
-
-nmi_swapgs:
-	swapgs
-
-nmi_restore:
-	POP_REGS
-
-	/*
-	 * Skip orig_ax and the "outermost" frame to point RSP at the "iret"
-	 * at the "iret" frame.
-	 */
-	addq	$6*8, %rsp
-
-	/*
-	 * Clear "NMI executing".  Set DF first so that we can easily
-	 * distinguish the remaining code between here and IRET from
-	 * the SYSCALL entry and exit paths.
-	 *
-	 * We arguably should just inspect RIP instead, but I (Andy) wrote
-	 * this code when I had the misapprehension that Xen PV supported
-	 * NMIs, and Xen PV would break that approach.
-	 */
-	std
-	movq	$0, 5*8(%rsp)		/* clear "NMI executing" */
-
-	/*
-	 * iretq reads the "iret" frame and exits the NMI stack in a
-	 * single instruction.  We are returning to kernel mode, so this
-	 * cannot result in a fault.  Similarly, we don't need to worry
-	 * about espfix64 on the way back to kernel mode.
-	 */
-	iretq
-SYM_CODE_END(asm_exc_nmi)
-
 #ifndef CONFIG_IA32_EMULATION
 /*
  * This handles SYSCALL from 32-bit code.  There is no way to program
diff --git a/arch/x86/entry/ist_entry.c b/arch/x86/entry/ist_entry.c
index e1b06306ac51..407571cc4a8c 100644
--- a/arch/x86/entry/ist_entry.c
+++ b/arch/x86/entry/ist_entry.c
@@ -41,7 +41,7 @@ static __always_inline bool identify_ist_##sym_name(		\
 	return true;							\
 }
 
-DEFINE_IDENTIFY_IST(NMI, nmi, false)
+DEFINE_IDENTIFY_IST(NMI, nmi, true)
 DEFINE_IDENTIFY_IST(DB, debug, false)
 DEFINE_IDENTIFY_IST(MCE, machine_check, false)
 DEFINE_IDENTIFY_IST(VC, vmm_communication, false)
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index b241af4ce9b4..b568f1de6da6 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -450,6 +450,9 @@ __visible noinstr void func(struct pt_regs *regs,			\
 	idtentry_sysvec vector func
 
 #ifdef CONFIG_X86_64
+# define DECLARE_IDTENTRY_NMI(vector, func)				\
+	idtentry_ist vector asm_##func func func has_error_code=0 stack_offset=CEA_stacks_NMI
+
 # define DECLARE_IDTENTRY_MCE(vector, func)				\
 	idtentry_mce_db vector asm_##func func
 
@@ -475,11 +478,11 @@ __visible noinstr void func(struct pt_regs *regs,			\
 /* No ASM emitted for XEN hypervisor callback */
 # define DECLARE_IDTENTRY_XENCB(vector, func)
 
-#endif
-
-/* No ASM code emitted for NMI */
+/* No ASM code emitted for NMI for X86_32 */
 #define DECLARE_IDTENTRY_NMI(vector, func)
 
+#endif
+
 /*
  * ASM code to emit the common vector entry stubs where each stub is
  * packed into IDT_ALIGN bytes.
-- 
2.19.1.6.gb485710b
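
The idtentry.h hunk above is what replaces the deleted asm_exc_nmi: on 64-bit,
DECLARE_IDTENTRY_NMI() now emits the NMI stub through the generic IST entry
macro, and the ist_entry.c hunk lets the common IST entry code identify the
NMI vector. As a rough sketch only (idtentry_ist, CEA_stacks_NMI and the
identify_ist_* helpers are introduced by earlier patches in this series and
are not shown here), the existing DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi)
declaration in idtentry.h would now expand to something along the lines of:

	/* sketch of the 64-bit asm-side expansion; X86_32 still emits no stub */
	idtentry_ist X86_TRAP_NMI asm_exc_nmi exc_nmi exc_nmi has_error_code=0 stack_offset=CEA_stacks_NMI

so the asm_exc_nmi symbol is still provided, just generated by the shared IST
entry code rather than by the open-coded implementation removed from
entry_64.S.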