From: Lai Jiangshan <jiangshan.ljs@xxxxxxxxxxxx>

Signed-off-by: Lai Jiangshan <jiangshan.ljs@xxxxxxxxxxxx>
---
 arch/x86/entry/entry_64.S       | 373 --------------------------------
 arch/x86/entry/ist_entry.c      |   2 +-
 arch/x86/include/asm/idtentry.h |   9 +-
 3 files changed, 7 insertions(+), 377 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 50a24cc83581..2bb7ab8512dc 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1341,379 +1341,6 @@ SYM_CODE_START_LOCAL(error_return)
 	jmp	swapgs_restore_regs_and_return_to_usermode
 SYM_CODE_END(error_return)
 
-/*
- * Runs on exception stack.  Xen PV does not go through this path at all,
- * so we can use real assembly here.
- *
- * Registers:
- *	%r14: Used to save/restore the CR3 of the interrupted context
- *	      when PAGE_TABLE_ISOLATION is in use.  Do not clobber.
- */
-SYM_CODE_START(asm_exc_nmi)
-	UNWIND_HINT_IRET_REGS
-	ENDBR
-
-	/*
-	 * We allow breakpoints in NMIs. If a breakpoint occurs, then
-	 * the iretq it performs will take us out of NMI context.
-	 * This means that we can have nested NMIs where the next
-	 * NMI is using the top of the stack of the previous NMI. We
-	 * can't let it execute because the nested NMI will corrupt the
-	 * stack of the previous NMI. NMI handlers are not re-entrant
-	 * anyway.
-	 *
-	 * To handle this case we do the following:
-	 *  Check the a special location on the stack that contains
-	 *  a variable that is set when NMIs are executing.
-	 *  The interrupted task's stack is also checked to see if it
-	 *  is an NMI stack.
-	 *  If the variable is not set and the stack is not the NMI
-	 *  stack then:
-	 *    o Set the special variable on the stack
-	 *    o Copy the interrupt frame into an "outermost" location on the
-	 *      stack
-	 *    o Copy the interrupt frame into an "iret" location on the stack
-	 *    o Continue processing the NMI
-	 *  If the variable is set or the previous stack is the NMI stack:
-	 *    o Modify the "iret" location to jump to the repeat_nmi
-	 *    o return back to the first NMI
-	 *
-	 * Now on exit of the first NMI, we first clear the stack variable
-	 * The NMI stack will tell any nested NMIs at that point that it is
-	 * nested. Then we pop the stack normally with iret, and if there was
-	 * a nested NMI that updated the copy interrupt stack frame, a
-	 * jump will be made to the repeat_nmi code that will handle the second
-	 * NMI.
-	 *
-	 * However, espfix prevents us from directly returning to userspace
-	 * with a single IRET instruction.  Similarly, IRET to user mode
-	 * can fault.  We therefore handle NMIs from user space like
-	 * other IST entries.
-	 */
-
-	ASM_CLAC
-	cld
-
-	/* Use %rdx as our temp variable throughout */
-	pushq	%rdx
-
-	testb	$3, CS-RIP+8(%rsp)
-	jz	.Lnmi_from_kernel
-
-	/*
-	 * NMI from user mode.  We need to run on the thread stack, but we
-	 * can't go through the normal entry paths: NMIs are masked, and
-	 * we don't want to enable interrupts, because then we'll end
-	 * up in an awkward situation in which IRQs are on but NMIs
-	 * are off.
-	 *
-	 * We also must not push anything to the stack before switching
-	 * stacks lest we corrupt the "NMI executing" variable.
-	 */
-
-	swapgs
-	FENCE_SWAPGS_USER_ENTRY
-	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
-	movq	%rsp, %rdx
-	movq	PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
-	UNWIND_HINT_IRET_REGS base=%rdx offset=8
-	pushq	5*8(%rdx)	/* pt_regs->ss */
-	pushq	4*8(%rdx)	/* pt_regs->rsp */
-	pushq	3*8(%rdx)	/* pt_regs->flags */
-	pushq	2*8(%rdx)	/* pt_regs->cs */
-	pushq	1*8(%rdx)	/* pt_regs->rip */
-	UNWIND_HINT_IRET_REGS
-	pushq	$-1		/* pt_regs->orig_ax */
-	PUSH_AND_CLEAR_REGS rdx=(%rdx)
-	ENCODE_FRAME_POINTER
-
-	IBRS_ENTER
-	UNTRAIN_RET
-
-	/*
-	 * At this point we no longer need to worry about stack damage
-	 * due to nesting -- we're on the normal thread stack and we're
-	 * done with the NMI stack.
-	 */
-
-	movq	%rsp, %rdi
-	movq	$-1, %rsi
-	call	exc_nmi
-
-	/*
-	 * Return back to user mode.  We must *not* do the normal exit
-	 * work, because we don't want to enable interrupts.
-	 */
-	jmp	swapgs_restore_regs_and_return_to_usermode
-
-.Lnmi_from_kernel:
-	/*
-	 * Here's what our stack frame will look like:
-	 * +---------------------------------------------------------+
-	 * | original SS                                             |
-	 * | original Return RSP                                     |
-	 * | original RFLAGS                                         |
-	 * | original CS                                             |
-	 * | original RIP                                            |
-	 * +---------------------------------------------------------+
-	 * | temp storage for rdx                                    |
-	 * +---------------------------------------------------------+
-	 * | "NMI executing" variable                                |
-	 * +---------------------------------------------------------+
-	 * | iret SS          } Copied from "outermost" frame        |
-	 * | iret Return RSP  } on each loop iteration; overwritten  |
-	 * | iret RFLAGS      } by a nested NMI to force another     |
-	 * | iret CS          } iteration if needed.                 |
-	 * | iret RIP         }                                      |
-	 * +---------------------------------------------------------+
-	 * | outermost SS          } initialized in first_nmi;       |
-	 * | outermost Return RSP  } will not be changed before      |
-	 * | outermost RFLAGS      } NMI processing is done.         |
-	 * | outermost CS          } Copied to "iret" frame on each  |
-	 * | outermost RIP         } iteration.                      |
-	 * +---------------------------------------------------------+
-	 * | pt_regs                                                 |
-	 * +---------------------------------------------------------+
-	 *
-	 * The "original" frame is used by hardware.  Before re-enabling
-	 * NMIs, we need to be done with it, and we need to leave enough
-	 * space for the asm code here.
-	 *
-	 * We return by executing IRET while RSP points to the "iret" frame.
-	 * That will either return for real or it will loop back into NMI
-	 * processing.
-	 *
-	 * The "outermost" frame is copied to the "iret" frame on each
-	 * iteration of the loop, so each iteration starts with the "iret"
-	 * frame pointing to the final return target.
-	 */
-
-	/*
-	 * Determine whether we're a nested NMI.
-	 *
-	 * If we interrupted kernel code between repeat_nmi and
-	 * end_repeat_nmi, then we are a nested NMI.  We must not
-	 * modify the "iret" frame because it's being written by
-	 * the outer NMI.  That's okay; the outer NMI handler is
-	 * about to about to call exc_nmi() anyway, so we can just
-	 * resume the outer NMI.
-	 */
-
-	movq	$repeat_nmi, %rdx
-	cmpq	8(%rsp), %rdx
-	ja	1f
-	movq	$end_repeat_nmi, %rdx
-	cmpq	8(%rsp), %rdx
-	ja	nested_nmi_out
-1:
-
-	/*
-	 * Now check "NMI executing".  If it's set, then we're nested.
-	 * This will not detect if we interrupted an outer NMI just
-	 * before IRET.
-	 */
-	cmpl	$1, -8(%rsp)
-	je	nested_nmi
-
-	/*
-	 * Now test if the previous stack was an NMI stack.  This covers
-	 * the case where we interrupt an outer NMI after it clears
-	 * "NMI executing" but before IRET.  We need to be careful, though:
-	 * there is one case in which RSP could point to the NMI stack
-	 * despite there being no NMI active: naughty userspace controls
-	 * RSP at the very beginning of the SYSCALL targets.  We can
-	 * pull a fast one on naughty userspace, though: we program
-	 * SYSCALL to mask DF, so userspace cannot cause DF to be set
-	 * if it controls the kernel's RSP.  We set DF before we clear
-	 * "NMI executing".
-	 */
-	lea	6*8(%rsp), %rdx
-	/* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
-	cmpq	%rdx, 4*8(%rsp)
-	/* If the stack pointer is above the NMI stack, this is a normal NMI */
-	ja	first_nmi
-
-	subq	$EXCEPTION_STKSZ, %rdx
-	cmpq	%rdx, 4*8(%rsp)
-	/* If it is below the NMI stack, it is a normal NMI */
-	jb	first_nmi
-
-	/* Ah, it is within the NMI stack. */
-
-	testb	$(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
-	jz	first_nmi	/* RSP was user controlled. */
-
-	/* This is a nested NMI. */
-
-nested_nmi:
-	/*
-	 * Modify the "iret" frame to point to repeat_nmi, forcing another
-	 * iteration of NMI handling.
-	 */
-	subq	$8, %rsp
-	leaq	-10*8(%rsp), %rdx
-	pushq	$__KERNEL_DS
-	pushq	%rdx
-	pushfq
-	pushq	$__KERNEL_CS
-	pushq	$repeat_nmi
-
-	/* Put stack back */
-	addq	$(6*8), %rsp
-
-nested_nmi_out:
-	popq	%rdx
-
-	/* We are returning to kernel mode, so this cannot result in a fault. */
-	iretq
-
-first_nmi:
-	/* Restore rdx. */
-	movq	(%rsp), %rdx
-
-	/* Make room for "NMI executing". */
-	pushq	$0
-
-	/* Leave room for the "iret" frame */
-	subq	$(5*8), %rsp
-
-	/* Copy the "original" frame to the "outermost" frame */
-	.rept 5
-	pushq	11*8(%rsp)
-	.endr
-	UNWIND_HINT_IRET_REGS
-
-	/* Everything up to here is safe from nested NMIs */
-
-#ifdef CONFIG_DEBUG_ENTRY
-	/*
-	 * For ease of testing, unmask NMIs right away.  Disabled by
-	 * default because IRET is very expensive.
-	 */
-	pushq	$0		/* SS */
-	pushq	%rsp		/* RSP (minus 8 because of the previous push) */
-	addq	$8, (%rsp)	/* Fix up RSP */
-	pushfq			/* RFLAGS */
-	pushq	$__KERNEL_CS	/* CS */
-	pushq	$1f		/* RIP */
-	iretq			/* continues at repeat_nmi below */
-	UNWIND_HINT_IRET_REGS
-1:
-#endif
-
-repeat_nmi:
-	ANNOTATE_NOENDBR // this code
-	/*
-	 * If there was a nested NMI, the first NMI's iret will return
-	 * here. But NMIs are still enabled and we can take another
-	 * nested NMI. The nested NMI checks the interrupted RIP to see
-	 * if it is between repeat_nmi and end_repeat_nmi, and if so
-	 * it will just return, as we are about to repeat an NMI anyway.
-	 * This makes it safe to copy to the stack frame that a nested
-	 * NMI will update.
-	 *
-	 * RSP is pointing to "outermost RIP". gsbase is unknown, but, if
-	 * we're repeating an NMI, gsbase has the same value that it had on
-	 * the first iteration.  paranoid_entry will load the kernel
-	 * gsbase if needed before we call exc_nmi().  "NMI executing"
-	 * is zero.
-	 */
-	movq	$1, 10*8(%rsp)		/* Set "NMI executing". */
-
-	/*
-	 * Copy the "outermost" frame to the "iret" frame.  NMIs that nest
-	 * here must not modify the "iret" frame while we're writing to
-	 * it or it will end up containing garbage.
-	 */
-	addq	$(10*8), %rsp
-	.rept 5
-	pushq	-6*8(%rsp)
-	.endr
-	subq	$(5*8), %rsp
-end_repeat_nmi:
-	ANNOTATE_NOENDBR // this code
-
-	/*
-	 * Everything below this point can be preempted by a nested NMI.
-	 * If this happens, then the inner NMI will change the "iret"
-	 * frame to point back to repeat_nmi.
-	 */
-	pushq	$-1				/* ORIG_RAX: no syscall to restart */
-
-	PUSH_AND_CLEAR_REGS
-	UNWIND_HINT_REGS
-	ENCODE_FRAME_POINTER
-
-	/*
-	 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
-	 * as we should not be calling schedule in NMI context.
-	 * Even with normal interrupts enabled. An NMI should not be
-	 * setting NEED_RESCHED or anything that normal interrupts and
-	 * exceptions might do.
-	 */
-	call	paranoid_entry
-
-	movq	%rsp, %rdi
-	movq	$-1, %rsi
-	call	exc_nmi
-
-	/* Always restore stashed SPEC_CTRL value (see paranoid_entry) */
-	IBRS_EXIT save_reg=%r15
-
-	/* Always restore stashed CR3 value (see paranoid_entry) */
-	RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
-
-	/*
-	 * The above invocation of paranoid_entry stored the GSBASE
-	 * related information in R/EBX depending on the availability
-	 * of FSGSBASE.
-	 *
-	 * If FSGSBASE is enabled, restore the saved GSBASE value
-	 * unconditionally, otherwise take the conditional SWAPGS path.
-	 */
-	ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE
-
-	wrgsbase	%rbx
-	jmp	nmi_restore
-
-nmi_no_fsgsbase:
-	/* EBX == 0 -> invoke SWAPGS */
-	testl	%ebx, %ebx
-	jnz	nmi_restore
-
-nmi_swapgs:
-	swapgs
-
-nmi_restore:
-	POP_REGS
-
-	/*
-	 * Skip orig_ax and the "outermost" frame to point RSP at the "iret"
-	 * at the "iret" frame.
-	 */
-	addq	$6*8, %rsp
-
-	/*
-	 * Clear "NMI executing".  Set DF first so that we can easily
-	 * distinguish the remaining code between here and IRET from
-	 * the SYSCALL entry and exit paths.
-	 *
-	 * We arguably should just inspect RIP instead, but I (Andy) wrote
-	 * this code when I had the misapprehension that Xen PV supported
-	 * NMIs, and Xen PV would break that approach.
-	 */
-	std
-	movq	$0, 5*8(%rsp)		/* clear "NMI executing" */
-
-	/*
-	 * iretq reads the "iret" frame and exits the NMI stack in a
-	 * single instruction.  We are returning to kernel mode, so this
-	 * cannot result in a fault.  Similarly, we don't need to worry
-	 * about espfix64 on the way back to kernel mode.
-	 */
-	iretq
-SYM_CODE_END(asm_exc_nmi)
-
 #ifndef CONFIG_IA32_EMULATION
 /*
  * This handles SYSCALL from 32-bit code.  There is no way to program
diff --git a/arch/x86/entry/ist_entry.c b/arch/x86/entry/ist_entry.c
index e1b06306ac51..407571cc4a8c 100644
--- a/arch/x86/entry/ist_entry.c
+++ b/arch/x86/entry/ist_entry.c
@@ -41,7 +41,7 @@ static __always_inline bool identify_ist_##sym_name(		\
 	return true;							\
 }
 
-DEFINE_IDENTIFY_IST(NMI, nmi, false)
+DEFINE_IDENTIFY_IST(NMI, nmi, true)
 DEFINE_IDENTIFY_IST(DB, debug, false)
 DEFINE_IDENTIFY_IST(MCE, machine_check, false)
 DEFINE_IDENTIFY_IST(VC, vmm_communication, false)
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index b241af4ce9b4..b568f1de6da6 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -450,6 +450,9 @@ __visible noinstr void func(struct pt_regs *regs,			\
 	idtentry_sysvec vector func
 
 #ifdef CONFIG_X86_64
+# define DECLARE_IDTENTRY_NMI(vector, func)				\
+	idtentry_ist vector asm_##func func func has_error_code=0 stack_offset=CEA_stacks_NMI
+
 # define DECLARE_IDTENTRY_MCE(vector, func)				\
 	idtentry_mce_db vector asm_##func func
 
@@ -475,11 +478,11 @@ __visible noinstr void func(struct pt_regs *regs,			\
 /* No ASM emitted for XEN hypervisor callback */
 # define DECLARE_IDTENTRY_XENCB(vector, func)
 
-#endif
-
-/* No ASM code emitted for NMI */
+/* No ASM code emitted for NMI for X86_32 */
 #define DECLARE_IDTENTRY_NMI(vector, func)
 
+#endif
+
 /*
  * ASM code to emit the common vector entry stubs where each stub is
  * packed into IDT_ALIGN bytes.
-- 
2.19.1.6.gb485710b
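
The idtentry.h hunk above is what replaces the deleted asm_exc_nmi: on 64-bit,
DECLARE_IDTENTRY_NMI() now emits the NMI stub through the generic IST entry
macro, and the ist_entry.c hunk lets the common IST entry code identify the
NMI vector. As a rough sketch only (idtentry_ist, CEA_stacks_NMI and the
identify_ist_* helpers are introduced by earlier patches in this series and
are not shown here), the existing DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi)
declaration in idtentry.h would now expand to something along the lines of:

	/* sketch of the 64-bit asm-side expansion; X86_32 still emits no stub */
	idtentry_ist X86_TRAP_NMI asm_exc_nmi exc_nmi exc_nmi has_error_code=0 stack_offset=CEA_stacks_NMI

so the asm_exc_nmi symbol is still provided, just generated by the shared IST
entry code rather than by the open-coded implementation removed from
entry_64.S.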