This patch contains the logic for deciding whether an L2 exit should be
handled by L0, with L2 then resumed, or whether L1 should be run to handle
this exit (using the nested_vmx_vmexit() function of the previous patch).

The basic idea is to let L1 handle the exit only if it actually asked to
trap this sort of event. For example, when L2 exits on a change to CR0, we
check L1's CR0_GUEST_HOST_MASK to see if L1 expressed interest in any bit
which changed; if it did, we exit to L1. If it did not, it means that L0
itself wished to trap this event, so we handle it ourselves.

The next two patches add the additional logic of deciding what to do when
an interrupt or exception is injected: does L0 need to do the injection,
should we exit to L1 to do it, or should we resume L2 and keep the
exception pending to be injected later.

We keep a new flag, "nested_run_pending", which can override the decision
of which should run next, L1 or L2. nested_run_pending=1 means that we
*must* run L2 next, not L1. This is necessary in several situations where,
had L1 been running on bare metal, it would not have expected to be resumed
at this stage. One example is when L1 did a VMLAUNCH of L2 and therefore
expects L2 to be run. Another example is when L2 exits on an #NM exception
that L0 asked for (because of lazy FPU loading), and L0 must handle the
exception and resume L2, which was in the middle of an instruction, rather
than resume L1, which does not expect to see an exit from L2 at this point.
nested_run_pending is especially intended to avoid switching to L1 at the
injection decision point described above.

Signed-off-by: Nadav Har'El <nyh@xxxxxxxxxx>
---
--- .before/arch/x86/kvm/vmx.c	2010-06-13 15:01:30.000000000 +0300
+++ .after/arch/x86/kvm/vmx.c	2010-06-13 15:01:30.000000000 +0300
@@ -318,6 +318,8 @@ struct nested_vmx {
 	struct shadow_vmcs *l1_shadow_vmcs;
 	/* Level 1 vmcs loaded into the processor */
 	struct vmcs *l1_vmcs;
+	/* L2 must run next, and mustn't decide to exit to L1. */
+	bool nested_run_pending;
 };
 
 enum vmcs_field_type {
@@ -900,6 +902,24 @@ static inline bool nested_cpu_has_vmx_ep
 }
 
+static inline bool nested_cpu_has_vmx_msr_bitmap(struct kvm_vcpu *vcpu)
+{
+	return get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control &
+		CPU_BASED_USE_MSR_BITMAPS;
+}
+
+static inline bool is_exception(u32 intr_info)
+{
+	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
+		== (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
+}
+
+static inline bool is_nmi(u32 intr_info)
+{
+	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
+		== (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
+}
+
 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 {
 	int i;
@@ -3694,6 +3714,8 @@ static void vmx_set_nmi_mask(struct kvm_
 	}
 }
 
+static int nested_vmx_vmexit(struct kvm_vcpu *vcpu, bool is_interrupt);
+
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
 	return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
@@ -3819,6 +3841,8 @@ static int handle_exception(struct kvm_v
 
 	if (is_no_device(intr_info)) {
 		vmx_fpu_activate(vcpu);
+		if (vmx->nested.nested_mode)
+			vmx->nested.nested_run_pending = 1;
 		return 1;
 	}
 
@@ -4989,6 +5013,202 @@ static int (*kvm_vmx_exit_handlers[])(st
 static const int kvm_vmx_max_exit_handlers =
 	ARRAY_SIZE(kvm_vmx_exit_handlers);
 
+/* Return 1 if we should exit from L2 to L1 to handle an MSR access exit,
+ * rather than handle it ourselves in L0. I.e., check L1's MSR bitmap to see
+ * whether it expressed interest in the current event (a read or write of a
+ * specific MSR).
+ */
+static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
+	struct shadow_vmcs *l2svmcs, u32 exit_code)
+{
+	u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
+	struct page *msr_bitmap_page;
+	void *va;
+	bool ret;
+
+	if (!cpu_has_vmx_msr_bitmap() || !nested_cpu_has_vmx_msr_bitmap(vcpu))
+		return 1;
+
+	msr_bitmap_page = nested_get_page(vcpu, l2svmcs->msr_bitmap);
+	if (!msr_bitmap_page) {
+		printk(KERN_INFO "%s error in nested_get_page\n", __func__);
+		return 0;
+	}
+
+	va = kmap_atomic(msr_bitmap_page, KM_USER1);
+	if (exit_code == EXIT_REASON_MSR_WRITE)
+		va += 0x800;
+	if (msr_index >= 0xc0000000) {
+		msr_index -= 0xc0000000;
+		va += 0x400;
+	}
+	/* An out-of-range index is not covered by L1's bitmap; don't exit to
+	 * L1 for it, but make sure we still undo the atomic mapping. */
+	if (msr_index > 0x1fff)
+		ret = 0;
+	else
+		ret = test_bit(msr_index, va);
+	kunmap_atomic(va, KM_USER1);
+	return ret;
+}
+
+/* Return 1 if we should exit from L2 to L1 to handle a CR access exit,
+ * rather than handle it ourselves in L0. I.e., check if L1 wanted to
+ * intercept (via guest_host_mask etc.) the current event.
+ */
+static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
+	struct shadow_vmcs *l2svmcs)
+{
+	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	int cr = exit_qualification & 15;
+	int reg = (exit_qualification >> 8) & 15;
+	unsigned long val = kvm_register_read(vcpu, reg);
+
+	switch ((exit_qualification >> 4) & 3) {
+	case 0: /* mov to cr */
+		switch (cr) {
+		case 0:
+			if (l2svmcs->cr0_guest_host_mask &
+			    (val ^ l2svmcs->cr0_read_shadow))
+				return 1;
+			break;
+		case 3:
+			if (l2svmcs->cpu_based_vm_exec_control &
+			    CPU_BASED_CR3_LOAD_EXITING)
+				return 1;
+			break;
+		case 4:
+			if (l2svmcs->cr4_guest_host_mask &
+			    (l2svmcs->cr4_read_shadow ^ val))
+				return 1;
+			break;
+		case 8:
+			if (l2svmcs->cpu_based_vm_exec_control &
+			    CPU_BASED_CR8_LOAD_EXITING)
+				return 1;
+			break;
+		}
+		break;
+	case 2: /* clts */
+		if (l2svmcs->cr0_guest_host_mask & X86_CR0_TS)
+			return 1;
+		break;
+	case 1: /* mov from cr */
+		switch (cr) {
+		case 0:
+			return 1;
+		case 3:
+			if (l2svmcs->cpu_based_vm_exec_control &
+			    CPU_BASED_CR3_STORE_EXITING)
+				return 1;
+			break;
+		case 4:
+			return 1;
+		case 8:
+			if (l2svmcs->cpu_based_vm_exec_control &
+			    CPU_BASED_CR8_STORE_EXITING)
+				return 1;
+			break;
+		}
+		break;
+	case 3: /* lmsw */
+		if (l2svmcs->cr0_guest_host_mask &
+		    (val ^ l2svmcs->cr0_read_shadow))
+			return 1;
+		break;
+	}
+	return 0;
+}
+
+/* Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
+ * should handle it ourselves in L0. Only call this when in nested_mode (L2).
+ */
+static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu, bool afterexit)
+{
+	u32 exit_code = vmcs_read32(VM_EXIT_REASON);
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	struct shadow_vmcs *l2svmcs;
+	int r = 0;
+
+	if (vmx->nested.nested_run_pending)
+		return 0;
+
+	if (unlikely(vmx->fail)) {
+		printk(KERN_INFO "%s failed vm entry %x\n",
+		       __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
+		return 1;
+	}
+
+	if (afterexit) {
+		/* There are some cases where we should let L1 handle certain
+		 * events when these are injected (afterexit==0), but we should
+		 * handle them in L0 on an exit (afterexit==1).
+ */
+		switch (exit_code) {
+		case EXIT_REASON_EXTERNAL_INTERRUPT:
+			return 0;
+		case EXIT_REASON_EXCEPTION_NMI:
+			if (!is_exception(intr_info))
+				return 0;
+			if (is_page_fault(intr_info) && (!enable_ept))
+				return 0;
+			break;
+		case EXIT_REASON_EPT_VIOLATION:
+			if (enable_ept)
+				return 0;
+			break;
+		}
+	}
+
+	if (!nested_map_current(vcpu))
+		return 0;
+	l2svmcs = get_shadow_vmcs(vcpu);
+
+	switch (exit_code) {
+	case EXIT_REASON_INVLPG:
+		if (l2svmcs->cpu_based_vm_exec_control &
+		    CPU_BASED_INVLPG_EXITING)
+			r = 1;
+		break;
+	case EXIT_REASON_MSR_READ:
+	case EXIT_REASON_MSR_WRITE:
+		r = nested_vmx_exit_handled_msr(vcpu, l2svmcs, exit_code);
+		break;
+	case EXIT_REASON_CR_ACCESS:
+		r = nested_vmx_exit_handled_cr(vcpu, l2svmcs);
+		break;
+	case EXIT_REASON_DR_ACCESS:
+		if (l2svmcs->cpu_based_vm_exec_control &
+		    CPU_BASED_MOV_DR_EXITING)
+			r = 1;
+		break;
+	case EXIT_REASON_EXCEPTION_NMI:
+		if (is_external_interrupt(intr_info) &&
+		    (l2svmcs->pin_based_vm_exec_control &
+		     PIN_BASED_EXT_INTR_MASK))
+			r = 1;
+		else if (is_nmi(intr_info) &&
+		    (l2svmcs->pin_based_vm_exec_control &
+		     PIN_BASED_NMI_EXITING))
+			r = 1;
+		else if (is_exception(intr_info) &&
+		    (l2svmcs->exception_bitmap &
+		     (1u << (intr_info & INTR_INFO_VECTOR_MASK))))
+			r = 1;
+		else if (is_page_fault(intr_info))
+			r = 1;
+		break;
+	case EXIT_REASON_EXTERNAL_INTERRUPT:
+		if (l2svmcs->pin_based_vm_exec_control &
+		    PIN_BASED_EXT_INTR_MASK)
+			r = 1;
+		break;
+	default:
+		r = 1;
+	}
+	nested_unmap_current(vcpu);
+
+	return r;
+}
+
 /*
  * The guest has exited. See if we can fix it or if we need userspace
  * assistance.
  */
@@ -5005,6 +5225,17 @@ static int vmx_handle_exit(struct kvm_vc
 	if (vmx->emulation_required && emulate_invalid_guest_state)
 		return handle_invalid_guest_state(vcpu);
 
+	if (exit_reason == EXIT_REASON_VMLAUNCH ||
+	    exit_reason == EXIT_REASON_VMRESUME)
+		vmx->nested.nested_run_pending = 1;
+	else
+		vmx->nested.nested_run_pending = 0;
+
+	if (vmx->nested.nested_mode && nested_vmx_exit_handled(vcpu, true)) {
+		nested_vmx_vmexit(vcpu, false);
+		return 1;
+	}
+
 	/* Access CR3 don't cause VMExit in paging mode, so we need
 	 * to sync with guest real CR3. */
 	if (enable_ept && is_paging(vcpu))
@@ -5956,6 +6187,7 @@ static int nested_vmx_run(struct kvm_vcp
 	r = kvm_mmu_load(vcpu);
 	if (unlikely(r)) {
 		printk(KERN_ERR "Error in kvm_mmu_load r %d\n", r);
+		nested_vmx_vmexit(vcpu, false);
 		set_rflags_to_vmx_fail_valid(vcpu);
 		/* switch back to L1 */
 		vmx->nested.nested_mode = 0;
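
As an aside, here is a minimal, standalone userspace sketch of the decision
rule described in the commit message above: a CR0 exit is forwarded to L1
only if a bit that L1 placed in its CR0_GUEST_HOST_MASK actually changed
relative to L1's read shadow. This is not part of the patch; the helper name
l1_wants_cr0_exit and the sample mask/shadow values are invented for the
example, and only the mask-and-xor test mirrors what
nested_vmx_exit_handled_cr() does above.

#include <stdbool.h>
#include <stdio.h>

#define X86_CR0_TS	(1UL << 3)	/* CR0.TS, trapped by L1 in this example */
#define X86_CR0_PG	(1UL << 31)	/* CR0.PG, not trapped by L1 here */

/*
 * L1 asked to trap a CR0 bit iff that bit is set in its guest/host mask.
 * An exit on "mov to cr0" is forwarded to L1 only if one of those bits
 * differs between the value L2 tries to write and L1's read shadow.
 */
static bool l1_wants_cr0_exit(unsigned long guest_host_mask,
			      unsigned long read_shadow,
			      unsigned long new_val)
{
	return (guest_host_mask & (new_val ^ read_shadow)) != 0;
}

int main(void)
{
	unsigned long mask = X86_CR0_TS;	/* L1 traps changes to TS only */
	unsigned long shadow = X86_CR0_PG;	/* L1 last saw PG=1, TS=0 */

	/* L2 sets TS: a trapped bit changed, so exit to L1 (prints 1). */
	printf("set TS:   exit to L1? %d\n",
	       (int)l1_wants_cr0_exit(mask, shadow, shadow | X86_CR0_TS));

	/* L2 clears PG: no trapped bit changed, L0 handles it (prints 0). */
	printf("clear PG: exit to L1? %d\n",
	       (int)l1_wants_cr0_exit(mask, shadow, shadow & ~X86_CR0_PG));
	return 0;
}

The same kind of test, applied per control register, is what lets L0 keep
handling events that L1 never asked to intercept, such as the lazy-FPU #NM
case mentioned in the commit message.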