For performance reasons, KVM defers many consistency checks to the CPU,
including checks that result in VMFail (as opposed to VMExit).  This
behavior may be undesirable for some users since KVM detects certain
classes of VMFail only after it has processed guest state, e.g. emulated
MSR load-on-entry.

Because there is a strict ordering between checks that cause VMFail and
those that cause VMExit, i.e. all VMFail checks are performed before any
checks that cause VMExit, we can detect all VMFail conditions via a dry
run of sorts.  After preparing vmcs02 with all state needed to pass the
VMFail consistency checks, optionally do a "test" VMEnter with an invalid
GUEST_RIP.  If the VMEnter results in a VMExit (due to bad guest state),
then we can safely say that the nested VMEnter should not VMFail, i.e.
any VMFail encountered in nested_vmx_vmexit() must be due to an L0 bug.

Add a module param, early_consistency_checks, to provide control over
whether or not VMX performs the early consistency checks.  In addition
to standard on/off behavior, the param accepts a value of -1, which is
essentially an "auto" setting whereby KVM does the early checks only
when it thinks it's running on bare metal.  When running nested, doing
early checks is of dubious value since the resulting behavior is heavily
dependent on L0.

Unfortunately, since the "passing" case causes a VMExit, KVM must be
extra diligent to ensure that host state is restored, e.g. DR7 and
RFLAGS are reset on VMExit.  Failure to restore RFLAGS.IF is
particularly fatal.

And of course the extra VMEnter and VMExit impact performance.  The raw
overhead of the early consistency checks is ~6% on modern hardware
(though this could easily vary based on configuration), while the added
latency observed from the L1 VMM is ~10%.  The early consistency checks
do not occur in a vacuum, e.g. spending more time in L0 can lead to more
interrupts being serviced while emulating VMEnter, thereby increasing
the latency observed by L1.

Signed-off-by: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>
---
 arch/x86/kvm/vmx.c | 152 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 146 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index af65477fe317..92baa2573825 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -53,6 +53,7 @@
 #include <asm/mmu_context.h>
 #include <asm/spec-ctrl.h>
 #include <asm/mshyperv.h>
+#include <asm/hypervisor.h>
 
 #include "trace.h"
 #include "pmu.h"
@@ -109,6 +110,13 @@ module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
 static bool __read_mostly nested = 0;
 module_param(nested, bool, S_IRUGO);
 
+/*
+ * early_consistency_checks is a tri-state: -1 == "native only",
+ * 0 == "never" and 1 == "always".
+ */
+static int __read_mostly early_consistency_checks = -1;
+module_param(early_consistency_checks, int, S_IRUGO);
+
 static u64 __read_mostly host_xss;
 
 static bool __read_mostly enable_pml = 1;
@@ -187,6 +195,7 @@ static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
 module_param(ple_window_max, uint, 0444);
 
 extern const ulong vmx_return;
+extern const ulong vmx_early_consistency_check_return;
 
 struct kvm_vmx {
 	struct kvm kvm;
@@ -7527,6 +7536,10 @@ static __init int hardware_setup(void)
 	if (!cpu_has_virtual_nmis())
 		enable_vnmi = 0;
 
+	if (early_consistency_checks < 0 &&
+	    !hypervisor_is_type(X86_HYPER_NATIVE))
+		early_consistency_checks = 0;
+
 	/*
 	 * set_apic_access_page_addr() is used to reload apic access
 	 * page upon invalidation.  No need to do anything if not
@@ -11169,6 +11182,14 @@ static void prepare_vmcs02_first_run(struct vcpu_vmx *vmx)
 	if (vmx->nested.vmcs02.launched)
 		return;
 
+	/*
+	 * We don't care what the EPTP value is, we just need to guarantee
+	 * it's valid so we don't get a false positive when doing early
+	 * consistency checks.
+	 */
+	if (enable_ept && early_consistency_checks)
+		vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0));
+
 	/* All VMFUNCs are currently emulated through L0 vmexits.  */
 	if (cpu_has_vmx_vmfunc())
 		vmcs_write64(VM_FUNCTION_CONTROL, 0);
@@ -11222,7 +11243,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
 	 * entry, but only if the current (host) sp changed from the value
 	 * we wrote last (vmx->host_rsp).  This cache is no longer relevant
 	 * if we switch vmcs, and rather than hold a separate cache per vmcs,
-	 * here we just force the write to happen on entry.
+	 * here we just force the write to happen on entry.  host_rsp will
+	 * also be written unconditionally by nested_vmx_check_vmentry_hw()
+	 * if we are doing early consistency checks via hardware.
 	 */
 	vmx->host_rsp = 0;
 
@@ -11808,9 +11831,120 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	return 0;
 }
 
+static int __noclone nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned long cr3, cr4;
+
+	if (!early_consistency_checks)
+		return 0;
+
+	if (vmx->msr_autoload.nr) {
+		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
+		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
+	}
+
+	preempt_disable();
+
+	vmx_save_host_state(vcpu);
+
+	/*
+	 * prepare_vmcs02() writes GUEST_RIP unconditionally, no need
+	 * to save/restore or set dirty bits.
+	 */
+	vmcs_writel(GUEST_RIP, 0xf0f0ULL << 48);
+
+	vmcs_writel(HOST_RIP, vmx_early_consistency_check_return);
+
+	cr3 = __get_current_cr3_fast();
+	if (unlikely(cr3 != vmx->loaded_vmcs->vmcs_host_cr3)) {
+		vmcs_writel(HOST_CR3, cr3);
+		vmx->loaded_vmcs->vmcs_host_cr3 = cr3;
+	}
+
+	cr4 = cr4_read_shadow();
+	if (unlikely(cr4 != vmx->loaded_vmcs->vmcs_host_cr4)) {
+		vmcs_writel(HOST_CR4, cr4);
+		vmx->loaded_vmcs->vmcs_host_cr4 = cr4;
+	}
+
+	vmx->__launched = vmx->loaded_vmcs->launched;
+
+	asm(
+		/* Set HOST_RSP */
+		__ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
+		"mov %%" _ASM_SP ", %c[host_rsp](%0)\n\t"
+
+		/* Check if vmlaunch or vmresume is needed */
+		"cmpl $0, %c[launched](%0)\n\t"
+		"je 1f\n\t"
+		__ex(ASM_VMX_VMRESUME) "\n\t"
+		"jmp 2f\n\t"
+		"1: " __ex(ASM_VMX_VMLAUNCH) "\n\t"
+		"jmp 2f\n\t"
+		"2: "
+
+		/* Set vmx->fail accordingly */
+		"setbe %c[fail](%0)\n\t"
+
+		".pushsection .rodata\n\t"
+		".global vmx_early_consistency_check_return\n\t"
+		"vmx_early_consistency_check_return: " _ASM_PTR " 2b\n\t"
+		".popsection"
+	      :
+	      : "c"(vmx), "d"((unsigned long)HOST_RSP),
+		[launched]"i"(offsetof(struct vcpu_vmx, __launched)),
+		[fail]"i"(offsetof(struct vcpu_vmx, fail)),
+		[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp))
+	      : "rax", "cc", "memory"
+	);
+
+	vmcs_writel(HOST_RIP, vmx_return);
+
+	preempt_enable();
+
+	if (vmx->msr_autoload.nr) {
+		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
+		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
+	}
+
+	if (vmx->fail) {
+		WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
+			     VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+		vmx->fail = 0;
+		return 1;
+	}
+
+	/*
+	 * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
+	 */
+	local_irq_enable();
+	if (hw_breakpoint_active())
+		set_debugreg(__this_cpu_read(cpu_dr7), 7);
+
+	/*
+	 * A non-failing VMEntry means we somehow entered guest mode with
+	 * an illegal RIP, and that's just the tip of the iceberg.  There
+	 * is no telling what memory has been modified or what state has
+	 * been exposed to unknown code.  Hitting this all but guarantees
+	 * a (very critical) hardware issue.
+	 */
+	BUG_ON(!(vmcs_read32(VM_EXIT_REASON) &
+		 VMX_EXIT_REASONS_FAILED_VMENTRY));
+
+	return 0;
+}
+STACK_FRAME_NON_STANDARD(nested_vmx_check_vmentry_hw);
+
 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 				   struct vmcs12 *vmcs12);
 
+/*
+ * Return values:
+ *   0 - success, i.e. proceed with actual VMEnter
+ *   1 - consistency check VMExit
+ *  -1 - consistency check VMFail
+ */
 static int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -11830,6 +11964,11 @@ static int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu)
 	if (!is_vmentry)
 		goto enter_non_root_mode;
 
+	if (nested_vmx_check_vmentry_hw(vcpu)) {
+		vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+		return -1;
+	}
+
 	if (check_vmentry_postreqs(vcpu, vmcs12, &exit_reason, &exit_qual))
 		goto consistency_check_vmexit;
 
@@ -11931,13 +12070,14 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	 * We're finally done with prerequisite checking, and can start with
 	 * the nested entry.
 	 */
-	vmx->nested.nested_run_pending = 1;
 
 	ret = nested_vmx_enter_non_root_mode(vcpu);
-	if (ret) {
-		vmx->nested.nested_run_pending = 0;
-		return ret;
-	}
+	vmx->nested.nested_run_pending = !ret;
+	if (ret < 0) {
+		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+		goto out;
+	} else if (ret)
+		return 1;
 
 	/*
 	 * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
-- 
2.18.0
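
Usage note (not part of the patch): since the knob is a plain module_param() with
S_IRUGO permissions, it is set at module load time, e.g.
"modprobe kvm_intel early_consistency_checks=1", and can only be read back at
runtime through sysfs.  The snippet below is an illustrative userspace sketch for
decoding the tri-state; it assumes this patch is applied and that vmx.c is built
as the kvm_intel module, so the parameter shows up under the standard
/sys/module/kvm_intel/parameters/ path.

/* Illustrative only: decode the early_consistency_checks tri-state. */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	/* Assumes kvm_intel is loaded with this patch applied. */
	const char *path =
		"/sys/module/kvm_intel/parameters/early_consistency_checks";
	FILE *f = fopen(path, "r");
	int val;

	if (!f) {
		perror(path);
		return EXIT_FAILURE;
	}
	if (fscanf(f, "%d", &val) != 1) {
		fprintf(stderr, "could not parse %s\n", path);
		fclose(f);
		return EXIT_FAILURE;
	}
	fclose(f);

	if (val < 0)
		puts("auto: early checks only when KVM thinks it is on bare metal");
	else if (val == 0)
		puts("off: VMFail checks are deferred to the CPU at VMEnter");
	else
		puts("on: dry-run VMEnter catches VMFail before any VMExit checks");

	return EXIT_SUCCESS;
}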