Split out the very last steps of guest enter and the early guest exit code
and mark them .noinstr.text, along with the ASM code invoked from there.
The few functions which are invoked from there are either made
__always_inline or marked with noinstr, which moves them into the
.noinstr.text section.

Use native_wrmsrl() in the L1D flush code to prevent a tracepoint from
being inserted.

Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Paolo Bonzini <pbonzini@xxxxxxxxxx>
Cc: kvm@xxxxxxxxxxxxxxx
---
 arch/x86/include/asm/hardirq.h  |    4 -
 arch/x86/include/asm/kvm_host.h |    8 +++
 arch/x86/kvm/vmx/ops.h          |    4 +
 arch/x86/kvm/vmx/vmenter.S      |    2
 arch/x86/kvm/vmx/vmx.c          |  105 ++++++++++++++++++++++------------------
 arch/x86/kvm/x86.c              |    2
 6 files changed, 76 insertions(+), 49 deletions(-)
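
For readers who have not seen the .noinstr.text machinery: the hunks below
all follow one pattern. Code placed in .noinstr.text may only call other
noinstr or __always_inline code; where a known-safe call into
instrumentable code is unavoidable, it is bracketed with
instr_begin()/instr_end() so the section can be validated by objtool. A
minimal sketch (the function names here are invented for illustration,
they are not part of this patch):

static noinstr void example_noinstr_path(void)
{
	do_raw_work();		/* callee must be noinstr or __always_inline */

	instr_begin();		/* instrumentation is allowed from here ... */
	do_traced_work();	/* ... e.g. tracepoints, lockdep, debug code */
	instr_end();		/* ... to here; noinstr rules apply again */
}
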
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -67,12 +67,12 @@ static inline void kvm_set_cpu_l1tf_flus
 	__this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1);
 }
 
-static inline void kvm_clear_cpu_l1tf_flush_l1d(void)
+static __always_inline void kvm_clear_cpu_l1tf_flush_l1d(void)
 {
 	__this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 0);
 }
 
-static inline bool kvm_get_cpu_l1tf_flush_l1d(void)
+static __always_inline bool kvm_get_cpu_l1tf_flush_l1d(void)
 {
 	return __this_cpu_read(irq_stat.kvm_cpu_l1tf_flush_l1d);
 }
--- a/arch/x86/kvm/vmx/ops.h
+++ b/arch/x86/kvm/vmx/ops.h
@@ -131,7 +131,9 @@ do { \
 	    : : op1 : "cc" : error, fault); \
 	return; \
 error: \
+	instr_begin(); \
 	insn##_error(error_args); \
+	instr_end(); \
 	return; \
 fault: \
 	kvm_spurious_fault(); \
@@ -146,7 +148,9 @@ do { \
 	    : : op1, op2 : "cc" : error, fault); \
 	return; \
 error: \
+	instr_begin(); \
 	insn##_error(error_args); \
+	instr_end(); \
 	return; \
 fault: \
 	kvm_spurious_fault(); \
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -27,7 +27,7 @@
 #define VCPU_R15	__VCPU_REGS_R15 * WORD_SIZE
 #endif
 
-	.text
+.section .noinstr.text, "ax"
 
 /**
  * vmx_vmenter - VM-Enter the current loaded VMCS
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5931,7 +5931,7 @@ static int vmx_handle_exit(struct kvm_vc
  * information but as all relevant affected CPUs have 32KiB L1D cache size
  * there is no point in doing so.
  */
-static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
+static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
 {
 	int size = PAGE_SIZE << L1D_CACHE_ORDER;
 
@@ -5964,7 +5964,7 @@ static void vmx_l1d_flush(struct kvm_vcp
 	vcpu->stat.l1d_flush++;
 
 	if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
-		wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+		native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
 		return;
 	}
 
@@ -6452,7 +6452,7 @@ static void vmx_update_hv_timer(struct k
 	}
 }
 
-void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
+void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
 {
 	if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
 		vmx->loaded_vmcs->host_state.rsp = host_rsp;
@@ -6462,6 +6462,61 @@ void vmx_update_host_rsp(struct vcpu_vmx
 
 bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
 
+static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
+					struct vcpu_vmx *vmx)
+{
+	instr_begin();
+	/*
+	 * VMENTER enables interrupts (host state), but the kernel state is
+	 * interrupts disabled when this is invoked. Also tell RCU about
+	 * it. This is the same logic as for exit_to_user_mode().
+	 *
+	 * 1) Trace interrupts on state
+	 * 2) Prepare lockdep with RCU on
+	 * 3) Invoke context tracking if enabled to adjust RCU state
+	 * 4) Tell lockdep that interrupts are enabled
+	 */
+	__trace_hardirqs_on();
+	lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+	instr_end();
+
+	guest_enter_irqoff();
+	lockdep_hardirqs_on(CALLER_ADDR0);
+
+	/* L1D Flush includes CPU buffer clear to mitigate MDS */
+	if (static_branch_unlikely(&vmx_l1d_should_flush))
+		vmx_l1d_flush(vcpu);
+	else if (static_branch_unlikely(&mds_user_clear))
+		mds_clear_cpu_buffers();
+
+	if (vcpu->arch.cr2 != read_cr2())
+		write_cr2(vcpu->arch.cr2);
+
+	vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
+				   vmx->loaded_vmcs->launched);
+
+	vcpu->arch.cr2 = read_cr2();
+
+	/*
+	 * VMEXIT disables interrupts (host state), but tracing and lockdep
+	 * have them in state 'on'. Same as enter_from_user_mode().
+	 *
+	 * 1) Tell lockdep that interrupts are disabled
+	 * 2) Invoke context tracking if enabled to reactivate RCU
+	 * 3) Trace interrupts off state
+	 *
+	 * This needs to be done before the below as native_read_msr()
+	 * contains a tracepoint and x86_spec_ctrl_restore_host() calls
	 * into world and some more.
+	 */
+	lockdep_hardirqs_off(CALLER_ADDR0);
+	guest_exit_irqoff();
+
+	instr_begin();
+	__trace_hardirqs_off();
+	instr_end();
+}
+
 static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6538,49 +6593,9 @@ static void vmx_vcpu_run(struct kvm_vcpu
 	x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
 
 	/*
-	 * VMENTER enables interrupts (host state), but the kernel state is
-	 * interrupts disabled when this is invoked. Also tell RCU about
-	 * it. This is the same logic as for exit_to_user_mode().
-	 *
-	 * 1) Trace interrupts on state
-	 * 2) Prepare lockdep with RCU on
-	 * 3) Invoke context tracking if enabled to adjust RCU state
-	 * 4) Tell lockdep that interrupts are enabled
+	 * The actual VMENTER/EXIT is in the .noinstr.text section.
 	 */
-	__trace_hardirqs_on();
-	lockdep_hardirqs_on_prepare(CALLER_ADDR0);
-	guest_enter_irqoff();
-	lockdep_hardirqs_on(CALLER_ADDR0);
-
-	/* L1D Flush includes CPU buffer clear to mitigate MDS */
-	if (static_branch_unlikely(&vmx_l1d_should_flush))
-		vmx_l1d_flush(vcpu);
-	else if (static_branch_unlikely(&mds_user_clear))
-		mds_clear_cpu_buffers();
-
-	if (vcpu->arch.cr2 != read_cr2())
-		write_cr2(vcpu->arch.cr2);
-
-	vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
-				   vmx->loaded_vmcs->launched);
-
-	vcpu->arch.cr2 = read_cr2();
-
-	/*
-	 * VMEXIT disables interrupts (host state), but tracing and lockdep
-	 * have them in state 'on'. Same as enter_from_user_mode().
-	 *
-	 * 1) Tell lockdep that interrupts are disabled
-	 * 2) Invoke context tracking if enabled to reactivate RCU
-	 * 3) Trace interrupts off state
-	 *
-	 * This needs to be done before the below as native_read_msr()
-	 * contains a tracepoint and x86_spec_ctrl_restore_host() calls
-	 * into world and some more.
-	 */
-	lockdep_hardirqs_off(CALLER_ADDR0);
-	guest_exit_irqoff();
-	__trace_hardirqs_off();
+	vmx_vcpu_enter_exit(vcpu, vmx);
 
 	/*
 	 * We do not use IBRS in the kernel. If this vCPU has used the
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -354,7 +354,7 @@ int kvm_set_apic_base(struct kvm_vcpu *v
 }
 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
 
-asmlinkage __visible void kvm_spurious_fault(void)
+asmlinkage __visible noinstr void kvm_spurious_fault(void)
 {
 	/* Fault while not rebooting. We want the trace. */
 	BUG_ON(!kvm_rebooting);
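
A note on the wrmsrl() -> native_wrmsrl() change in vmx_l1d_flush():
wrmsrl() goes through native_write_msr() (or its paravirt counterpart),
which fires the write_msr tracepoint when it is active - instrumentation
which is not allowed inside the noinstr region. native_wrmsrl() boils
down to the bare WRMSR instruction. A simplified stand-in (raw_wrmsrl()
is an invented name; u32/u64 come from the usual kernel headers):

static __always_inline void raw_wrmsrl(unsigned int msr, u64 val)
{
	/* WRMSR: ECX selects the MSR, EDX:EAX holds the 64-bit value */
	asm volatile("wrmsr"
		     : /* no outputs */
		     : "c" (msr), "a" ((u32)val), "d" ((u32)(val >> 32))
		     : "memory");
}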