> From: Nadav Har'El
> Sent: Tuesday, May 17, 2011 3:55 AM
>
> This patch contains the logic of whether an L2 exit should be handled by L0
> and then L2 should be resumed, or whether L1 should be run to handle this
> exit (using the nested_vmx_vmexit() function of the previous patch).
>
> The basic idea is to let L1 handle the exit only if it actually asked to
> trap this sort of event. For example, when L2 exits on a change to CR0,
> we check L1's CR0_GUEST_HOST_MASK to see if L1 expressed interest in any
> bit which changed; If it did, we exit to L1. But if it didn't it means that
> it is we (L0) that wished to trap this event, so we handle it ourselves.
>
> The next two patches add additional logic of what to do when an interrupt or
> exception is injected: Does L0 need to do it, should we exit to L1 to do it,
> or should we resume L2 and keep the exception to be injected later.
>
> We keep a new flag, "nested_run_pending", which can override the decision of
> which should run next, L1 or L2. nested_run_pending=1 means that we *must*
> run L2 next, not L1. This is necessary in particular when L1 did a VMLAUNCH
> of L2 and therefore expects L2 to be run (and perhaps be injected with an
> event it specified, etc.). nested_run_pending is especially intended to avoid
> switching to L1 in the injection decision-point described above.
>
> Signed-off-by: Nadav Har'El <nyh@xxxxxxxxxx>
> ---
>  arch/x86/kvm/vmx.c | 256 ++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 255 insertions(+), 1 deletion(-)
>
> --- .before/arch/x86/kvm/vmx.c  2011-05-16 22:36:49.000000000 +0300
> +++ .after/arch/x86/kvm/vmx.c   2011-05-16 22:36:49.000000000 +0300
> @@ -351,6 +351,8 @@ struct nested_vmx {
>         /* Saving the VMCS that we used for running L1 */
>         struct saved_vmcs saved_vmcs01;
>         u64 vmcs01_tsc_offset;
> +       /* L2 must run next, and mustn't decide to exit to L1. */
> +       bool nested_run_pending;
>         /*
>          * Guest pages referred to in vmcs02 with host-physical pointers, so
>          * we must keep them pinned while L2 runs.
> @@ -870,6 +872,20 @@ static inline bool nested_cpu_has2(struc
>                 (vmcs12->secondary_vm_exec_control & bit);
>  }
>
> +static inline bool nested_cpu_has_virtual_nmis(struct kvm_vcpu *vcpu)
> +{
> +       return is_guest_mode(vcpu) &&
> +               (get_vmcs12(vcpu)->pin_based_vm_exec_control &
> +                       PIN_BASED_VIRTUAL_NMIS);
> +}

Any reason to add a guest mode check here? I didn't see such a check in your
earlier nested_cpu_has_xxx helpers. It would be clearer to use the existing
nested_cpu_has_xxx along with an explicit is_guest_mode check, which would
make such usage consistent.
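
For what it's worth, a rough sketch of the shape I mean (my own suggestion,
not code from this patch, and untested): keep the helper a pure vmcs12 test
like the other nested_cpu_has_xxx helpers, and spell out is_guest_mode() at
the call site:

        /* hypothetical variant: the helper looks only at vmcs12, like
         * nested_cpu_has()/nested_cpu_has2(); the caller decides whether
         * it is actually running in guest mode */
        static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
        {
                return vmcs12->pin_based_vm_exec_control &
                        PIN_BASED_VIRTUAL_NMIS;
        }

        /* ... and at the call site: */
        if (is_guest_mode(vcpu) &&
            nested_cpu_has_virtual_nmis(get_vmcs12(vcpu)))
                ...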

> +
> +static inline bool is_exception(u32 intr_info)
> +{
> +       return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
> +               == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
> +}
> +
> +static void nested_vmx_vmexit(struct kvm_vcpu *vcpu);
>  static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
>                         struct vmcs12 *vmcs12,
>                         u32 reason, unsigned long qualification);
> @@ -5281,6 +5297,232 @@ static int (*kvm_vmx_exit_handlers[])(st
>  static const int kvm_vmx_max_exit_handlers =
>         ARRAY_SIZE(kvm_vmx_exit_handlers);
>
> +/*
> + * Return 1 if we should exit from L2 to L1 to handle an MSR access access,
> + * rather than handle it ourselves in L0. I.e., check whether L1 expressed
> + * disinterest in the current event (read or write a specific MSR) by using an
> + * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
> + */
> +static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
> +       struct vmcs12 *vmcs12, u32 exit_reason)
> +{
> +       u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
> +       gpa_t bitmap;
> +
> +       if (!nested_cpu_has(get_vmcs12(vcpu), CPU_BASED_USE_MSR_BITMAPS))
> +               return 1;
> +
> +       /*
> +        * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
> +        * for the four combinations of read/write and low/high MSR numbers.
> +        * First we need to figure out which of the four to use:
> +        */
> +       bitmap = vmcs12->msr_bitmap;
> +       if (exit_reason == EXIT_REASON_MSR_WRITE)
> +               bitmap += 2048;
> +       if (msr_index >= 0xc0000000) {
> +               msr_index -= 0xc0000000;
> +               bitmap += 1024;
> +       }
> +
> +       /* Then read the msr_index'th bit from this bitmap: */
> +       if (msr_index < 1024*8) {
> +               unsigned char b;
> +               kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1);
> +               return 1 & (b >> (msr_index & 7));
> +       } else
> +               return 1; /* let L1 handle the wrong parameter */
> +}
> +
> +/*
> + * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
> + * rather than handle it ourselves in L0. I.e., check if L1 wanted to
> + * intercept (via guest_host_mask etc.) the current event.
> + */
> +static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
> +       struct vmcs12 *vmcs12)
> +{
> +       unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> +       int cr = exit_qualification & 15;
> +       int reg = (exit_qualification >> 8) & 15;
> +       unsigned long val = kvm_register_read(vcpu, reg);
> +
> +       switch ((exit_qualification >> 4) & 3) {
> +       case 0: /* mov to cr */
> +               switch (cr) {
> +               case 0:
> +                       if (vmcs12->cr0_guest_host_mask &
> +                           (val ^ vmcs12->cr0_read_shadow))
> +                               return 1;
> +                       break;
> +               case 3:
> +                       if ((vmcs12->cr3_target_count >= 1 &&
> +                                       vmcs12->cr3_target_value0 == val) ||
> +                               (vmcs12->cr3_target_count >= 2 &&
> +                                       vmcs12->cr3_target_value1 == val) ||
> +                               (vmcs12->cr3_target_count >= 3 &&
> +                                       vmcs12->cr3_target_value2 == val) ||
> +                               (vmcs12->cr3_target_count >= 4 &&
> +                                       vmcs12->cr3_target_value3 == val))
> +                               return 0;
> +                       if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
> +                               return 1;
> +                       break;
> +               case 4:
> +                       if (vmcs12->cr4_guest_host_mask &
> +                           (vmcs12->cr4_read_shadow ^ val))
> +                               return 1;
> +                       break;
> +               case 8:
> +                       if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
> +                               return 1;
> +                       break;
> +               }
> +               break;
> +       case 2: /* clts */
> +               if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
> +                   (vmcs12->cr0_read_shadow & X86_CR0_TS))
> +                       return 1;
> +               break;
> +       case 1: /* mov from cr */
> +               switch (cr) {
> +               case 3:
> +                       if (vmcs12->cpu_based_vm_exec_control &
> +                           CPU_BASED_CR3_STORE_EXITING)
> +                               return 1;
> +                       break;
> +               case 8:
> +                       if (vmcs12->cpu_based_vm_exec_control &
> +                           CPU_BASED_CR8_STORE_EXITING)
> +                               return 1;
> +                       break;
> +               }
> +               break;
> +       case 3: /* lmsw */
> +               /*
> +                * lmsw can change bits 1..3 of cr0, and only set bit 0 of
> +                * cr0. Other attempted changes are ignored, with no exit.
> +                */
> +               if (vmcs12->cr0_guest_host_mask & 0xe &
> +                   (val ^ vmcs12->cr0_read_shadow))
> +                       return 1;
> +               if ((vmcs12->cr0_guest_host_mask & 0x1) &&
> +                   !(vmcs12->cr0_read_shadow & 0x1) &&
> +                   (val & 0x1))
> +                       return 1;
> +               break;
> +       }
> +       return 0;
> +}
> +
> +/*
> + * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
> + * should handle it ourselves in L0 (and then continue L2). Only call this
> + * when in is_guest_mode (L2).
> + */
> +static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
> +{
> +       u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
> +       u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> +       struct vcpu_vmx *vmx = to_vmx(vcpu);
> +       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> +
> +       if (vmx->nested.nested_run_pending)
> +               return 0;
> +
> +       if (unlikely(vmx->fail)) {
> +               printk(KERN_INFO "%s failed vm entry %x\n",
> +                      __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
> +               return 1;
> +       }
> +
> +       switch (exit_reason) {
> +       case EXIT_REASON_EXCEPTION_NMI:
> +               if (!is_exception(intr_info))
> +                       return 0;
> +               else if (is_page_fault(intr_info))
> +                       return enable_ept;
> +               return vmcs12->exception_bitmap &
> +                               (1u << (intr_info & INTR_INFO_VECTOR_MASK));
> +       case EXIT_REASON_EXTERNAL_INTERRUPT:
> +               return 0;
> +       case EXIT_REASON_TRIPLE_FAULT:
> +               return 1;
> +       case EXIT_REASON_PENDING_INTERRUPT:
> +       case EXIT_REASON_NMI_WINDOW:
> +               /*
> +                * prepare_vmcs02() set the CPU_BASED_VIRTUAL_INTR_PENDING bit
> +                * (aka Interrupt Window Exiting) only when L1 turned it on,
> +                * so if we got a PENDING_INTERRUPT exit, this must be for L1.
> +                * Same for NMI Window Exiting.
> +                */
> +               return 1;
> +       case EXIT_REASON_TASK_SWITCH:
> +               return 1;
> +       case EXIT_REASON_CPUID:
> +               return 1;
> +       case EXIT_REASON_HLT:
> +               return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
> +       case EXIT_REASON_INVD:
> +               return 1;
> +       case EXIT_REASON_INVLPG:
> +               return vmcs12->cpu_based_vm_exec_control &
> +                               CPU_BASED_INVLPG_EXITING;

use nested_cpu_has.

> +       case EXIT_REASON_RDPMC:
> +               return vmcs12->cpu_based_vm_exec_control &
> +                               CPU_BASED_RDPMC_EXITING;
> +       case EXIT_REASON_RDTSC:
> +               return vmcs12->cpu_based_vm_exec_control &
> +                               CPU_BASED_RDTSC_EXITING;

ditto
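
i.e., just a sketch of what I mean for these three cases (assuming
nested_cpu_has(vmcs12, bit) tests cpu_based_vm_exec_control, as in your
earlier patch and as already done for HLT/MWAIT/MONITOR below):

        case EXIT_REASON_INVLPG:
                return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
        case EXIT_REASON_RDPMC:
                return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
        case EXIT_REASON_RDTSC:
                return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);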

> +       case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
> +       case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
> +       case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
> +       case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
> +       case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
> +               /*
> +                * VMX instructions trap unconditionally. This allows L1 to
> +                * emulate them for its L2 guest, i.e., allows 3-level nesting!
> +                */
> +               return 1;
> +       case EXIT_REASON_CR_ACCESS:
> +               return nested_vmx_exit_handled_cr(vcpu, vmcs12);
> +       case EXIT_REASON_DR_ACCESS:
> +               return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
> +       case EXIT_REASON_IO_INSTRUCTION:
> +               /* TODO: support IO bitmaps */
> +               return 1;
> +       case EXIT_REASON_MSR_READ:
> +       case EXIT_REASON_MSR_WRITE:
> +               return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
> +       case EXIT_REASON_INVALID_STATE:
> +               return 1;
> +       case EXIT_REASON_MWAIT_INSTRUCTION:
> +               return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
> +       case EXIT_REASON_MONITOR_INSTRUCTION:
> +               return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
> +       case EXIT_REASON_PAUSE_INSTRUCTION:
> +               return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
> +                       nested_cpu_has2(vmcs12,
> +                               SECONDARY_EXEC_PAUSE_LOOP_EXITING);
> +       case EXIT_REASON_MCE_DURING_VMENTRY:
> +               return 0;
> +       case EXIT_REASON_TPR_BELOW_THRESHOLD:
> +               return 1;
> +       case EXIT_REASON_APIC_ACCESS:
> +               return nested_cpu_has2(vmcs12,
> +                       SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
> +       case EXIT_REASON_EPT_VIOLATION:
> +       case EXIT_REASON_EPT_MISCONFIG:
> +               return 0;
> +       case EXIT_REASON_WBINVD:
> +               return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
> +       case EXIT_REASON_XSETBV:
> +               return 1;
> +       default:
> +               return 1;
> +       }
> +}
> +
>  static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
>  {
>         *info1 = vmcs_readl(EXIT_QUALIFICATION);
> @@ -5303,6 +5545,17 @@ static int vmx_handle_exit(struct kvm_vc
>         if (vmx->emulation_required && emulate_invalid_guest_state)
>                 return handle_invalid_guest_state(vcpu);
>
> +       if (exit_reason == EXIT_REASON_VMLAUNCH ||
> +           exit_reason == EXIT_REASON_VMRESUME)
> +               vmx->nested.nested_run_pending = 1;
> +       else
> +               vmx->nested.nested_run_pending = 0;

What about a VMLAUNCH invoked from L2? In that case I think you expect L1 to
run next instead of L2. On the other hand, isn't a guest mode check alone
enough to differentiate a pending nested run? When L1 invokes
VMLAUNCH/VMRESUME, guest mode hasn't been set yet, so the check below will
fail, and all other operations will then be filtered by
nested_vmx_exit_handled()... Am I missing something here?
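
For the first point, an untested sketch just to illustrate what I have in
mind: only arm nested_run_pending when the VMLAUNCH/VMRESUME exit came from
L1, and let one taken while already in guest mode go through the normal
nested_vmx_exit_handled() path so it gets reflected to L1:

        /* untested sketch: treat VMLAUNCH/VMRESUME from L2 like any other
         * exit L1 wants to intercept, instead of forcing L2 to run next */
        if (!is_guest_mode(vcpu) &&
            (exit_reason == EXIT_REASON_VMLAUNCH ||
             exit_reason == EXIT_REASON_VMRESUME))
                vmx->nested.nested_run_pending = 1;
        else
                vmx->nested.nested_run_pending = 0;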

> +
> +       if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
> +               nested_vmx_vmexit(vcpu);
> +               return 1;
> +       }
> +
>         if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
>                 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
>                 vcpu->run->fail_entry.hardware_entry_failure_reason
> @@ -5325,7 +5578,8 @@ static int vmx_handle_exit(struct kvm_vc
>                        "(0x%x) and exit reason is 0x%x\n",
>                        __func__, vectoring_info, exit_reason);
>
> -       if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
> +       if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
> +                                       !nested_cpu_has_virtual_nmis(vcpu))) {

Would L0 want to control vNMI for the L2 guest? Otherwise, could we just use
is_guest_mode here for the condition check?

>                 if (vmx_interrupt_allowed(vcpu)) {
>                         vmx->soft_vnmi_blocked = 0;
>                 } else if (vmx->vnmi_blocked_time > 1000000000LL &&

Thanks,
Kevin
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html