"KVM: VMX:" for the shortlog please. On Mon, Apr 10, 2023, Xin Li wrote: > -.macro VMX_DO_EVENT_IRQOFF call_insn call_target > +.macro VMX_DO_EVENT_IRQOFF call_insn call_target fred=0 nmi=0 > /* > * Unconditionally create a stack frame, getting the correct RSP on the > * stack (for x86-64) would take two instructions anyways, and RBP can > @@ -41,16 +43,55 @@ > mov %_ASM_SP, %_ASM_BP > > #ifdef CONFIG_X86_64 > +#ifdef CONFIG_X86_FRED > + /* > + * It's not necessary to change current stack level for handling IRQ/NMI > + * because the state of the kernel stack is well defined in this place > + * in the code, and it is known not to be deep in a bunch of nested I/O > + * layer handlers that eat up the stack. > + * > + * Before starting to push a FRED stack frame, FRED reserves a redzone > + * (for CALL emulation) and aligns RSP to a 64-byte boundary. > + */ > + sub $(FRED_CONFIG_REDZONE_AMOUNT << 6), %rsp > + and $FRED_STACK_FRAME_RSP_MASK, %rsp > + > + /* > + * A FRED stack frame has extra 16 bytes of information pushed at the > + * regular stack top comparing to an IDT stack frame. > + */ > + push $0 /* Reserved by FRED, must be 0 */ > + push $0 /* FRED event data, 0 for NMI and external interrupts */ > +#else > /* > * Align RSP to a 16-byte boundary (to emulate CPU behavior) before > * creating the synthetic interrupt stack frame for the IRQ/NMI. > */ > and $-16, %rsp > - push $__KERNEL_DS > +#endif > + > + .if \fred > + .if \nmi > + mov $(2 << 32 | 2 << 48), %rax /* NMI event type and vector */ > + .else > + mov %rdi, %rax > + shl $32, %rax /* External interrupt vector */ > + .endif > + add $__KERNEL_DS, %rax > + bts $57, %rax /* Set 64-bit mode */ > + .else > + mov $__KERNEL_DS, %rax > + .endif > + push %rax This is painfully difficult to read, and the trampolines only add to that pain. Using macros instead of magic numbers would alleviate a small amount of pain, but but the #ifdefs and .if \fred/\nmi are the real culprits. > static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) > @@ -6916,14 +6916,20 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) > { > u32 intr_info = vmx_get_intr_info(vcpu); > unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; > - gate_desc *desc = (gate_desc *)host_idt_base + vector; > + unsigned long entry_or_vector; > + > +#ifdef CONFIG_X86_64 > + entry_or_vector = vector; > +#else > + entry_or_vector = gate_offset((gate_desc *)host_idt_base + vector); > +#endif And then this is equally gross. Rather than funnel FRED+legacy into a single function only to split them back out, just route FRED into its own asm subroutine. The common bits are basically the creation/destruction of the stack frame and the CALL itself, i.e. the truly interesting bits are what's different. Pretty much all of the #ifdeffery goes away, the helpers just need #ifdefs to play nice with CONFIG_X86_FRED=n. E.g. something like the below as a starting point (it most definitely doesn't compile, and most definitely isn't 100% correct). 
E.g. something like the below as a starting point (it most definitely doesn't
compile, and most definitely isn't 100% correct).

---
 arch/x86/kvm/vmx/vmenter.S | 72 ++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/vmx/vmx.c     | 19 ++++++++--
 2 files changed, 88 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 631fd7da2bc3..a6929c78e038 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -2,12 +2,14 @@
 #include <linux/linkage.h>
 #include <asm/asm.h>
 #include <asm/bitsperlong.h>
+#include <asm/fred.h>
 #include <asm/kvm_vcpu_regs.h>
 #include <asm/nospec-branch.h>
 #include <asm/percpu.h>
 #include <asm/segment.h>
 
 #include "kvm-asm-offsets.h"
 #include "run_flags.h"
+#include "../../entry/calling.h"
 
 #define WORD_SIZE (BITS_PER_LONG / 8)
 
@@ -31,6 +33,62 @@
 #define VCPU_R15	__VCPU_REGS_R15 * WORD_SIZE
 #endif
 
+#ifdef CONFIG_X86_FRED
+.macro VMX_DO_FRED_EVENT_IRQOFF call_target cs_val
+	/*
+	 * Unconditionally create a stack frame, getting the correct RSP on the
+	 * stack (for x86-64) would take two instructions anyways, and RBP can
+	 * be used to restore RSP to make objtool happy (see below).
+	 */
+	push %_ASM_BP
+	mov %_ASM_SP, %_ASM_BP
+
+	/*
+	 * Don't check the FRED stack level, the call stack leading to this
+	 * helper is effectively constant and shallow (relatively speaking).
+	 *
+	 * Emulate the FRED-defined redzone and stack alignment (128 bytes and
+	 * 64 bytes respectively).
+	 */
+	sub $(FRED_CONFIG_REDZONE_AMOUNT << 6), %rsp
+	and $FRED_STACK_FRAME_RSP_MASK, %rsp
+
+	/*
+	 * A FRED stack frame has extra 16 bytes of information pushed at the
+	 * regular stack top compared to an IDT stack frame.
+	 */
+	push $0		/* Reserved by FRED, must be 0 */
+	push $0		/* FRED event data, 0 for NMI and external interrupts */
+	shl $32, %rax
+	orq $(__KERNEL_DS | FRED_64_BIT_MODE), %rax
+	push %rax	/* Vector (from the "caller") and DS */
+
+	push %rbp
+	pushf
+	push \cs_val
+
+	push $0		/* FRED error code, 0 for NMI and external interrupts */
+	PUSH_REGS
+
+	/* Load @pt_regs */
+	movq %rsp, %_ASM_ARG1
+
+	call \call_target
+
+	POP_REGS
+
+	/*
+	 * "Restore" RSP from RBP, even though IRET has already unwound RSP to
+	 * the correct value. objtool doesn't know the callee will IRET and,
+	 * without the explicit restore, thinks the stack is getting walloped.
+	 * Using an unwind hint is problematic due to x86-64's dynamic alignment.
+	 */
+	mov %_ASM_BP, %_ASM_SP
+	pop %_ASM_BP
+	RET
+.endm
+#endif
+
 .macro VMX_DO_EVENT_IRQOFF call_insn call_target
 	/*
 	 * Unconditionally create a stack frame, getting the correct RSP on the
@@ -299,6 +357,14 @@ SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL)
 
 SYM_FUNC_END(__vmx_vcpu_run)
 
+#ifdef CONFIG_X86_FRED
+SYM_FUNC_START(vmx_do_fred_nmi_irqoff)
+	push $FRED_NMI_ERROR_CODE
+	mov $(NMI_VECTOR | FRED_NMI_SOMETHING), %eax
+	VMX_DO_FRED_EVENT_IRQOFF fred_entrypoint_kernel $FRED_NMI_CS_VAL
+SYM_FUNC_END(vmx_do_fred_nmi_irqoff)
+#endif
+
 SYM_FUNC_START(vmx_do_nmi_irqoff)
 	VMX_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx
 SYM_FUNC_END(vmx_do_nmi_irqoff)
@@ -357,6 +423,12 @@ SYM_FUNC_START(vmread_error_trampoline)
 SYM_FUNC_END(vmread_error_trampoline)
 #endif
 
+#ifdef CONFIG_X86_FRED
+SYM_FUNC_START(vmx_do_fred_interrupt_irqoff)
+	mov %_ASM_ARG1, %rax
+	VMX_DO_FRED_EVENT_IRQOFF external_interrupt $__KERNEL_CS
+SYM_FUNC_END(vmx_do_fred_interrupt_irqoff)
+#endif
+
 SYM_FUNC_START(vmx_do_interrupt_irqoff)
 	VMX_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1
 SYM_FUNC_END(vmx_do_interrupt_irqoff)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 11080a649f60..42f50b0cc125 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6891,6 +6891,14 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
 	memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
 }
 
+#ifdef CONFIG_X86_FRED
+void vmx_do_fred_interrupt_irqoff(unsigned int vector);
+void vmx_do_fred_nmi_irqoff(void);
+#else
+#define vmx_do_fred_interrupt_irqoff(x) BUG();
+#define vmx_do_fred_nmi_irqoff() BUG();
+#endif
+
 void vmx_do_interrupt_irqoff(unsigned long entry);
 void vmx_do_nmi_irqoff(void);
 
@@ -6933,14 +6941,16 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
 {
 	u32 intr_info = vmx_get_intr_info(vcpu);
 	unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
-	gate_desc *desc = (gate_desc *)host_idt_base + vector;
 
 	if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
 		    "unexpected VM-Exit interrupt info: 0x%x", intr_info))
 		return;
 
 	kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
-	vmx_do_interrupt_irqoff(gate_offset(desc));
+	if (cpu_feature_enabled(X86_FEATURE_FRED))
+		vmx_do_fred_interrupt_irqoff(vector);
+	else
+		vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector));
 	kvm_after_interrupt(vcpu);
 
 	vcpu->arch.at_instruction_boundary = true;
@@ -7226,7 +7236,10 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
 	if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI &&
 	    is_nmi(vmx_get_intr_info(vcpu))) {
 		kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
-		vmx_do_nmi_irqoff();
+		if (cpu_feature_enabled(X86_FEATURE_FRED))
+			vmx_do_fred_nmi_irqoff();
+		else
+			vmx_do_nmi_irqoff();
 		kvm_after_interrupt(vcpu);
 	}
 
base-commit: 33d1a64081c98e390e064db18738428d6fb96f95
--