It is possible for the instruction emulator to decode a different
instruction than the one implied by the VM-exit information provided by
hardware in vmcs02. This happens when the TLB entry for the guest's RIP
is out of sync with the corresponding page-table mapping, i.e. when
installing a new page isn't followed by a TLB flush. Currently, KVM
refuses to emulate in these scenarios and instead injects a #UD into L2.
While this addresses the security risk of CVE-2020-2732, it can result
in spurious #UDs in the L2 guest. Fix this by instead flushing the TLB
and resuming L2, allowing hardware to generate the appropriate VM-exit
to be reflected into L1.

Exceptional handling is required for the RSM and RDTSCP instructions.
RDTSCP may be emulated on hardware that doesn't support it, in which
case hardware will not generate a RDTSCP VM-exit on L2 resume. The
dual-monitor treatment of SMM is not supported in nVMX, which implies
that L0 should never handle a RSM instruction; resuming the guest would
only result in another #UD. Avoid getting stuck in a loop on these
instructions by injecting a #UD for RSM and synthesizing the appropriate
VM-exit for RDTSCP.

Fixes: 07721feee46b ("KVM: nVMX: Don't emulate instructions in guest mode")
Cc: Paolo Bonzini <pbonzini@xxxxxxxxxx>
Cc: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>
Signed-off-by: Oliver Upton <oupton@xxxxxxxxxx>
Reviewed-by: Jim Mattson <jmattson@xxxxxxxxxx>
Reviewed-by: Peter Shier <pshier@xxxxxxxxxx>
---
 arch/x86/kvm/emulate.c     |  2 ++
 arch/x86/kvm/kvm_emulate.h |  1 +
 arch/x86/kvm/vmx/vmx.c     | 68 ++++++++++++++++++++++++++++----------
 arch/x86/kvm/x86.c         |  2 +-
 4 files changed, 55 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index d0e2825ae617..6e56e7a29ba1 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -5812,6 +5812,8 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 	}
 	if (rc == X86EMUL_INTERCEPTED)
 		return EMULATION_INTERCEPTED;
+	if (rc == X86EMUL_RETRY_INSTR)
+		return EMULATION_RETRY_INSTR;
 
 	if (rc == X86EMUL_CONTINUE)
 		writeback_registers(ctxt);
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 43c93ffa76ed..5bfab8d65cd1 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -496,6 +496,7 @@ bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt);
 #define EMULATION_OK 0
 #define EMULATION_RESTART 1
 #define EMULATION_INTERCEPTED 2
+#define EMULATION_RETRY_INSTR 3
 void init_decode_cache(struct x86_emulate_ctxt *ctxt);
 int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
 int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 08e26a9518c2..ebfafd7837ba 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7329,12 +7329,11 @@ static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
 	to_vmx(vcpu)->req_immediate_exit = true;
 }
 
-static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
-				  struct x86_instruction_info *info)
+static bool vmx_check_intercept_io(struct kvm_vcpu *vcpu,
+				   struct x86_instruction_info *info)
 {
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 	unsigned short port;
-	bool intercept;
 	int size;
 
 	if (info->intercept == x86_intercept_in ||
@@ -7354,13 +7353,10 @@ static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
 	 * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
 	 */
 	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
-		intercept = nested_cpu_has(vmcs12,
-					   CPU_BASED_UNCOND_IO_EXITING);
-	else
-		intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
+		return nested_cpu_has(vmcs12,
+				      CPU_BASED_UNCOND_IO_EXITING);
 
-	/* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
-	return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
+	return nested_vmx_check_io_bitmaps(vcpu, port, size);
 }
 
 static int vmx_check_intercept(struct kvm_vcpu *vcpu,
@@ -7369,6 +7365,7 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
 			       struct x86_exception *exception)
 {
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+	bool intercepted;
 
 	switch (info->intercept) {
 	/*
@@ -7381,13 +7378,27 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
 			exception->error_code_valid = false;
 			return X86EMUL_PROPAGATE_FAULT;
 		}
+
+		intercepted = nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
+
+		/*
+		 * RDTSCP could be emulated on a CPU which doesn't support it.
+		 * As such, flushing the TLB and resuming L2 will result in
+		 * another #UD rather than a VM-exit to reflect into L1.
+		 * Instead, synthesize the VM-exit here.
+		 */
+		if (intercepted) {
+			nested_vmx_vmexit(vcpu, EXIT_REASON_RDTSCP, 0, 0);
+			return X86EMUL_INTERCEPTED;
+		}
 		break;
 
 	case x86_intercept_in:
 	case x86_intercept_ins:
 	case x86_intercept_out:
 	case x86_intercept_outs:
-		return vmx_check_intercept_io(vcpu, info);
+		intercepted = vmx_check_intercept_io(vcpu, info);
+		break;
 
 	case x86_intercept_lgdt:
 	case x86_intercept_lidt:
@@ -7397,18 +7408,41 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
 	case x86_intercept_sidt:
 	case x86_intercept_sldt:
 	case x86_intercept_str:
-		if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
-			return X86EMUL_CONTINUE;
-
-		/* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
+		intercepted = nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
 		break;
 
-	/* TODO: check more intercepts... */
+	/*
+	 * The dual-monitor treatment of SMM is not supported in nVMX. As such,
+	 * L0 will never handle the RSM instruction nor should it retry
+	 * instruction execution. Instead, a #UD should be injected into the
+	 * guest for the execution of RSM outside of SMM.
+	 */
+	case x86_intercept_rsm:
+		exception->vector = UD_VECTOR;
+		exception->error_code_valid = false;
+		return X86EMUL_PROPAGATE_FAULT;
+
 	default:
-		break;
+		intercepted = true;
 	}
 
-	return X86EMUL_UNHANDLEABLE;
+	if (!intercepted)
+		return X86EMUL_CONTINUE;
+
+	/*
+	 * The only uses of the emulator in VMX for instructions which may be
+	 * intercepted are port IO instructions, descriptor-table accesses, and
+	 * the RDTSCP instruction. As such, if the emulator has decoded an
+	 * instruction that is different from the VM-exit provided by hardware
+	 * it is likely that the TLB entry and page-table mapping for the
+	 * guest's RIP are out of sync.
+	 *
+	 * Rather than synthesizing a VM-exit into L1 for every possible
+	 * instruction just flush the TLB, resume L2, and let hardware generate
+	 * the appropriate VM-exit.
+	 */
+	vmx_flush_tlb_gva(vcpu, kvm_rip_read(vcpu));
+	return X86EMUL_RETRY_INSTR;
 }
 
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 00c88c2f34e4..2ab47485100f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6967,7 +6967,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 
 	r = x86_emulate_insn(ctxt);
 
-	if (r == EMULATION_INTERCEPTED)
+	if (r == EMULATION_INTERCEPTED || r == EMULATION_RETRY_INSTR)
 		return 1;
 
 	if (r == EMULATION_FAILED) {
-- 
2.27.0.290.gba653c62da-goog
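
For a condensed view of the decision flow the vmx_check_intercept() changes
implement, here is a stand-alone sketch. It is not kernel code: the enum
values, struct l1_ctls, and the check_intercept() helper are illustrative
stand-ins for the real vmcs12 controls and X86EMUL_* return codes, collapsing
the four outcomes (continue emulating, synthesize a VM-exit into L1, inject a
#UD, flush the TLB and retry) into one function.

/*
 * Stand-alone sketch (NOT kernel code) of the intercept-check flow above.
 * All names below are illustrative stand-ins for vmcs12 fields and the
 * X86EMUL_* return codes.
 */
#include <stdbool.h>
#include <stdio.h>

enum emul_action {
	EMUL_CONTINUE,     /* L1 does not intercept: keep emulating        */
	EMUL_INTERCEPTED,  /* VM-exit synthesized into L1 (RDTSCP case)    */
	EMUL_PROPAGATE_UD, /* inject #UD into the guest (RSM outside SMM)  */
	EMUL_RETRY_INSTR,  /* flush TLB for RIP, resume L2, let HW VM-exit */
};

enum insn { INSN_IO, INSN_DESC_TABLE, INSN_RDTSCP, INSN_RSM, INSN_OTHER };

struct l1_ctls {             /* stand-in for the relevant vmcs12 controls */
	bool rdtsc_exiting;  /* CPU_BASED_RDTSC_EXITING                   */
	bool io_intercepted; /* unconditional IO exiting or IO-bitmap hit */
	bool desc_exiting;   /* SECONDARY_EXEC_DESC                       */
};

static enum emul_action check_intercept(enum insn insn,
					const struct l1_ctls *c)
{
	bool intercepted;

	switch (insn) {
	case INSN_RDTSCP:
		/* Retrying would #UD again on CPUs without RDTSCP, so the
		 * VM-exit is synthesized into L1 directly. */
		return c->rdtsc_exiting ? EMUL_INTERCEPTED : EMUL_CONTINUE;
	case INSN_RSM:
		/* Dual-monitor SMM is unsupported; L0 never emulates RSM. */
		return EMUL_PROPAGATE_UD;
	case INSN_IO:
		intercepted = c->io_intercepted;
		break;
	case INSN_DESC_TABLE:
		intercepted = c->desc_exiting;
		break;
	default:
		/* Any other decode here implies a stale TLB entry. */
		intercepted = true;
		break;
	}

	/* Flush the TLB and resume L2 so hardware generates the right exit. */
	return intercepted ? EMUL_RETRY_INSTR : EMUL_CONTINUE;
}

int main(void)
{
	struct l1_ctls c = { .rdtsc_exiting = true, .io_intercepted = false };

	printf("RDTSCP -> %d\n", check_intercept(INSN_RDTSCP, &c));
	printf("IO     -> %d\n", check_intercept(INSN_IO, &c));
	printf("RSM    -> %d\n", check_intercept(INSN_RSM, &c));
	printf("other  -> %d\n", check_intercept(INSN_OTHER, &c));
	return 0;
}

Running the sketch prints the action taken for a few representative
instructions, mirroring the switch statement in the vmx.c hunk above.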