From: Orit Wasserman <oritw@xxxxxxxxxx>

---
 arch/x86/kvm/vmx.c |  521 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 515 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 0d36b49..203f016 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -262,6 +262,10 @@ struct nested_vmx {
 	gpa_t current_vmptr;
 	/* Are we running nested guest */
 	bool nested_mode;
+	/* L1 requested VMLAUNCH or VMRESUME but we didn't run L2 yet */
+	bool nested_run_pending;
+	/* set if there was valid IDT vectoring info after exiting from l2 */
+	bool valid_idt_vectoring_info;
 	/* Level 1 state for switching to level 2 and back */
 	struct level_state *l1_state;
 	/* Level 1 shadow vmcs for switching to level 2 and back */
@@ -908,9 +912,16 @@ static u64 host_efer;
 static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
 
 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu);
+static int nested_vmx_check_exception(struct vcpu_vmx *vmx, unsigned nr,
+				      bool has_error_code, u32 error_code);
+static int nested_vmx_intr(struct kvm_vcpu *vcpu);
 static int create_l1_state(struct kvm_vcpu *vcpu);
 static int create_l2_state(struct kvm_vcpu *vcpu);
 static int launch_guest(struct kvm_vcpu *vcpu, bool launch);
+static int nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu);
+static int nested_vmx_exit_handled(struct kvm_vcpu *vcpu, bool kvm_override);
+static int nested_vmx_vmexit(struct kvm_vcpu *vcpu,
+			     bool is_interrupt);
 
 /*
  * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
@@ -1467,6 +1478,8 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
 
 static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu);
 
+int load_vmcs_host_state(struct shadow_vmcs *src);
+
 /*
  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
  * vcpu mutex is already taken.
@@ -1503,6 +1516,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	if (vcpu->cpu != cpu) {
 		struct descriptor_table dt;
 		unsigned long sysenter_esp;
+		struct shadow_vmcs *l1_shadow_vmcs = vmx->nested.l1_shadow_vmcs;
 
 		vcpu->cpu = cpu;
 		/*
@@ -1525,6 +1539,22 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 			new_offset = vmcs_read64(TSC_OFFSET) + delta;
 			vmcs_write64(TSC_OFFSET, new_offset);
 		}
+
+		if (l1_shadow_vmcs != NULL) {
+			l1_shadow_vmcs->host_tr_base =
+				vmcs_readl(HOST_TR_BASE);
+			l1_shadow_vmcs->host_gdtr_base =
+				vmcs_readl(HOST_GDTR_BASE);
+			l1_shadow_vmcs->host_ia32_sysenter_esp =
+				vmcs_readl(HOST_IA32_SYSENTER_ESP);
+
+			if (tsc_this < vcpu->arch.host_tsc)
+				l1_shadow_vmcs->tsc_offset =
+					vmcs_read64(TSC_OFFSET);
+
+			if (vmx->nested.nested_mode)
+				load_vmcs_host_state(l1_shadow_vmcs);
+		}
 	}
 }
 
@@ -1611,6 +1641,9 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 intr_info = nr | INTR_INFO_VALID_MASK;
 
+	if (nested_vmx_check_exception(vmx, nr, has_error_code, error_code))
+		return;
+
 	if (has_error_code) {
 		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
 		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
@@ -2185,9 +2218,6 @@ int load_vmcs_common(struct shadow_vmcs *src)
 	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
 		vmcs_write64(GUEST_IA32_PAT, src->guest_ia32_pat);
 
-	if (src->vm_entry_msr_load_count < 512)
-		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->vm_entry_msr_load_count);
-
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, src->vm_entry_intr_info_field);
 	vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
 		     src->vm_entry_exception_error_code);
@@ -3794,6 +3824,11 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
 	u32 cpu_based_vm_exec_control;
 
+	if (to_vmx(vcpu)->nested.nested_mode) {
+		nested_vmx_intr(vcpu);
+		return;
+	}
+
 	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
 	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
@@ -3922,6 +3957,11 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
+	if (to_vmx(vcpu)->nested.nested_mode) {
+		if (!nested_vmx_intr(vcpu))
+			return 0;
+	}
+
 	return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
 		!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
 			(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
@@ -4042,6 +4082,10 @@ static int handle_exception(struct kvm_vcpu *vcpu)
 		   not interested (exception bitmap 12 does not include NM_VECTOR)
 		   enable fpu and resume l2 (avoid switching to l1)
 		*/
+
+		if (vmx->nested.nested_mode)
+			vmx->nested.nested_run_pending = 1; /* removing this line causes L2 to hang on boot */
+
 		vmx_fpu_activate(vcpu);
 		return 1;
 	}
@@ -4169,7 +4213,33 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 		trace_kvm_cr_write(cr, val);
 		switch (cr) {
 		case 0:
-			kvm_set_cr0(vcpu, val);
+			if (to_vmx(vcpu)->nested.nested_mode) {
+				/* assume only X86_CR0_TS is handled by l0 */
+				long new_cr0 = vmcs_readl(GUEST_CR0);
+				long new_cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);
+
+				vmx_fpu_deactivate(vcpu);
+
+				if (val & X86_CR0_TS) {
+					new_cr0 |= X86_CR0_TS;
+					new_cr0_read_shadow |= X86_CR0_TS;
+					vcpu->arch.cr0 |= X86_CR0_TS;
+				} else {
+					new_cr0 &= ~X86_CR0_TS;
+					new_cr0_read_shadow &= ~X86_CR0_TS;
+					vcpu->arch.cr0 &= ~X86_CR0_TS;
+				}
+
+				vmcs_writel(GUEST_CR0, new_cr0);
+				vmcs_writel(CR0_READ_SHADOW, new_cr0_read_shadow);
+
+				if (!(val & X86_CR0_TS) || !(val & X86_CR0_PE))
+					vmx_fpu_activate(vcpu);
+
+				to_vmx(vcpu)->nested.nested_run_pending = 1;
+			} else
+				kvm_set_cr0(vcpu, val);
+
 			skip_emulated_instruction(vcpu);
 			return 1;
 		case 3:
@@ -4196,8 +4266,15 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 		break;
 	case 2: /* clts */
 		vmx_fpu_deactivate(vcpu);
-		vcpu->arch.cr0 &= ~X86_CR0_TS;
-		vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
+		if (to_vmx(vcpu)->nested.nested_mode) {
+			vmcs_writel(GUEST_CR0, vmcs_readl(GUEST_CR0) & ~X86_CR0_TS);
+			vmcs_writel(CR0_READ_SHADOW, vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS);
+			vcpu->arch.cr0 &= ~X86_CR0_TS;
+			to_vmx(vcpu)->nested.nested_run_pending = 1;
+		} else {
+			vcpu->arch.cr0 &= ~X86_CR0_TS;
+			vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
+		}
 		vmx_fpu_activate(vcpu);
 		skip_emulated_instruction(vcpu);
 		return 1;
@@ -5173,6 +5250,17 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 	if (vmx->emulation_required && emulate_invalid_guest_state)
 		return handle_invalid_guest_state(vcpu);
 
+	if (exit_reason == EXIT_REASON_VMLAUNCH ||
+	    exit_reason == EXIT_REASON_VMRESUME)
+		vmx->nested.nested_run_pending = 1;
+	else
+		vmx->nested.nested_run_pending = 0;
+
+	if (vmx->nested.nested_mode && nested_vmx_exit_handled(vcpu, true)) {
+		nested_vmx_vmexit(vcpu, false);
+		return 1;
+	}
+
 	/* Access CR3 don't cause VMExit in paging mode, so we need
 	 * to sync with guest real CR3. */
 	if (enable_ept && is_paging(vcpu))
@@ -5347,6 +5435,60 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
 		| vmx->rmode.irq.vector;
 }
 
+static int nested_handle_valid_idt(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int irq;
+	int type;
+	int error_code_valid;
+	u32 idt_vectoring_info;
+	u32 guest_intr;
+	bool nmi_window_open;
+	bool interrupt_window_open;
+
+	if (vmx->nested.valid_idt_vectoring_info) {
+		idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+		irq = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
+		type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
+		error_code_valid = idt_vectoring_info &
+			VECTORING_INFO_DELIVER_CODE_MASK;
+
+		guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+		nmi_window_open =
+			!(guest_intr & (GUEST_INTR_STATE_STI |
+					GUEST_INTR_STATE_MOV_SS |
+					GUEST_INTR_STATE_NMI));
+
+		interrupt_window_open =
+			((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+			 !(guest_intr & (GUEST_INTR_STATE_STI |
+					 GUEST_INTR_STATE_MOV_SS)));
+
+		if (type == INTR_TYPE_EXT_INTR && !interrupt_window_open) {
+			printk(KERN_INFO "IDT ignored, l2 interrupt window closed!\n");
+			return 0;
+		}
+
+		if (type == INTR_TYPE_NMI_INTR && !nmi_window_open) {
+			printk(KERN_INFO "IDT ignored, l2 nmi window closed!\n");
+			return 0;
+		}
+
+		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+			irq | type | INTR_INFO_VALID_MASK | error_code_valid);
+
+
+		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+			vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
+
+		if (error_code_valid)
+			vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+				vmcs_read32(IDT_VECTORING_ERROR_CODE));
+	}
+
+	return 1;
+}
+
 #ifdef CONFIG_X86_64
 #define R "r"
 #define Q "q"
@@ -5358,8 +5500,17 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
 static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int r;
 	u32 nested_exception_bitmap = 0;
 
+	if (vmx->nested.nested_mode) {
+		r = nested_handle_valid_idt(vcpu);
+		if (!r) {
+			vmx->fail = 1;
+			return;
+		}
+	}
+
 	/* Record the guest's net vcpu time for enforced NMI injections. */
 	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
 		vmx->entry_time = ktime_get();
@@ -5539,6 +5690,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 
 	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 
+	vmx->nested.valid_idt_vectoring_info = vmx->nested.nested_mode &&
+		(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
+
 	if (vmx->rmode.irq.pending)
 		fixup_rmode_irq(vmx);
 
@@ -6191,6 +6345,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu)
 	r = kvm_mmu_load(vcpu);
 	if (unlikely(r)) {
 		printk(KERN_ERR "Error in kvm_mmu_load r %d\n", r);
+		nested_vmx_vmexit(vcpu, false);
 		set_rflags_to_vmx_fail_valid(vcpu);
 		/* switch back to L1 */
 		vmx->nested.nested_mode = 0;
@@ -6244,6 +6399,360 @@ static int launch_guest(struct kvm_vcpu *vcpu, bool launch)
 	return 1;
 }
 
+static int nested_vmx_vmexit(struct kvm_vcpu *vcpu,
+			     bool is_interrupt)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct level_state *l2_state;
+	int efer_offset;
+
+	if (!vmx->nested.nested_mode) {
+		printk(KERN_INFO "WARNING: %s called but not in nested mode\n",
+		       __func__);
+		return 0;
+	}
+
+	sync_cached_regs_to_vmcs(vcpu);
+
+	if (!nested_map_current(vcpu)) {
+		printk(KERN_INFO "Error mapping shadow vmcs\n");
+		set_rflags_to_vmx_fail_valid(vcpu);
+		return 1;
+	}
+
+	l2_state = &(vmx->nested.current_l2_page->l2_state);
+	prepare_vmcs_12(vcpu);
+	if (is_interrupt)
+		get_shadow_vmcs(vcpu)->vm_exit_reason =
+			EXIT_REASON_EXTERNAL_INTERRUPT;
+
+	l2_state->launched = vmx->launched;
+	l2_state->cpu = vcpu->cpu;
+
+	nested_unmap_current(vcpu);
+
+	vmx->vmcs = vmx->nested.l1_vmcs;
+	vcpu->cpu = vmx->nested.l1_state->cpu;
+	vmx->launched = vmx->nested.l1_state->launched;
+
+	vmx_vcpu_load(vcpu, get_cpu());
+	put_cpu();
+
+	vcpu->arch.shadow_efer = vmx->nested.l1_state->shadow_efer;
+	if ((vcpu->arch.shadow_efer & EFER_LMA) &&
+	    !(vcpu->arch.shadow_efer & EFER_SCE))
+		vcpu->arch.shadow_efer |= EFER_SCE;
+
+	efer_offset = __find_msr_index(vmx, MSR_EFER);
+	if (update_transition_efer(vmx, efer_offset))
+		wrmsrl(MSR_EFER, vmx->guest_msrs[efer_offset].data);
+
+	vmx_set_cr0(vcpu, vmx->nested.l1_shadow_vmcs->cr0_read_shadow);
+	vmx_set_cr4(vcpu, vmx->nested.l1_state->cr4);
+
+	if (enable_ept) {
+		vcpu->arch.cr3 = vmx->nested.l1_shadow_vmcs->guest_cr3;
+		vmcs_writel(GUEST_CR3, vmx->nested.l1_shadow_vmcs->guest_cr3);
+	} else {
+		kvm_set_cr3(vcpu, vmx->nested.l1_state->cr3);
+	}
+
+	if (!nested_map_current(vcpu)) {
+		printk(KERN_INFO "Error mapping shadow vmcs\n");
+		set_rflags_to_vmx_fail_valid(vcpu);
+		return 1;
+	}
+
+	switch_back_vmcs(vcpu);
+
+	vmx->nested.l1_shadow_vmcs->guest_cr0 = get_shadow_vmcs(vcpu)->host_cr0;
+
+	nested_unmap_current(vcpu);
+
+	kvm_register_write(vcpu, VCPU_REGS_RSP,
+			   vmx->nested.l1_shadow_vmcs->guest_rsp);
+	kvm_register_write(vcpu, VCPU_REGS_RIP,
+			   vmx->nested.l1_shadow_vmcs->guest_rip);
+
+	vmx->nested.nested_mode = 0;
+
+	kvm_mmu_reset_context(vcpu);
+	kvm_mmu_load(vcpu);
+
+	if (unlikely(vmx->fail)) {
+		vmx->fail = 0;
+		set_rflags_to_vmx_fail_valid(vcpu);
+	} else
+		clear_rflags_cf_zf(vcpu);
+
+	return 0;
+}
+
+static int nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu)
+{
+	if (to_vmx(vcpu)->nested.nested_mode) {
+		struct page *msr_page = NULL;
+		u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
+		u32 exit_code = vmcs_read32(VM_EXIT_REASON);
+		struct shadow_vmcs *l2svmcs = get_shadow_vmcs(vcpu);
+
+		if (!cpu_has_vmx_msr_bitmap()
+		    || !nested_cpu_has_vmx_msr_bitmap(vcpu))
+			return 1;
+
+		msr_page = nested_get_page(vcpu,
+					   l2svmcs->msr_bitmap);
+
+		if (!msr_page) {
+			printk(KERN_INFO
"%s error in nested_get_page\n", + __func__); + return 0; + } + + switch (exit_code) { + case EXIT_REASON_MSR_READ: + if (msr_index <= 0x1fff) { + if (test_bit(msr_index, + (unsigned long *)(msr_page + + 0x000))) + return 1; + } else if ((msr_index >= 0xc0000000) && + (msr_index <= 0xc0001fff)) { + msr_index &= 0x1fff; + if (test_bit(msr_index, + (unsigned long *)(msr_page + + 0x400))) + return 1; + } + break; + case EXIT_REASON_MSR_WRITE: + if (msr_index <= 0x1fff) { + if (test_bit(msr_index, + (unsigned long *)(msr_page + + 0x800))) + return 1; + } else if ((msr_index >= 0xc0000000) && + (msr_index <= 0xc0001fff)) { + msr_index &= 0x1fff; + if (test_bit(msr_index, + (unsigned long *)(msr_page + + 0xc00))) + return 1; + } + break; + } + } + + return 0; +} + +static int nested_vmx_exit_handled(struct kvm_vcpu *vcpu, bool kvm_override) +{ + u32 exit_code = vmcs_read32(VM_EXIT_REASON); + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + struct shadow_vmcs *l2svmcs; + + int r = 0; + + if (vmx->nested.nested_run_pending) + return 0; + + if (unlikely(vmx->fail)) { + printk(KERN_INFO "%s failed vm entry %x\n", + __func__, vmcs_read32(VM_INSTRUCTION_ERROR)); + return 1; + } + + if (kvm_override) { + switch (exit_code) { + case EXIT_REASON_EXTERNAL_INTERRUPT: + return 0; + case EXIT_REASON_EXCEPTION_NMI: + if (!is_exception(intr_info)) + return 0; + + if (is_page_fault(intr_info) && (!enable_ept)) + return 0; + + break; + case EXIT_REASON_EPT_VIOLATION: + if (enable_ept) + return 0; + + break; + } + } + + + if (!nested_map_current(vcpu)) + return 0; + + l2svmcs = get_shadow_vmcs(vcpu); + + switch (exit_code) { + case EXIT_REASON_INVLPG: + if (l2svmcs->cpu_based_vm_exec_control & + CPU_BASED_INVLPG_EXITING) + r = 1; + break; + case EXIT_REASON_MSR_READ: + case EXIT_REASON_MSR_WRITE: + r = nested_vmx_exit_handled_msr(vcpu); + break; + case EXIT_REASON_CR_ACCESS: { + unsigned long exit_qualification = + vmcs_readl(EXIT_QUALIFICATION); + int cr = exit_qualification & 15; + int reg = (exit_qualification >> 8) & 15; + unsigned long val = kvm_register_read(vcpu, reg); + + switch ((exit_qualification >> 4) & 3) { + case 0: /* mov to cr */ + switch (cr) { + case 0: + if (l2svmcs->cr0_guest_host_mask & + (val ^ l2svmcs->cr0_read_shadow)) + r = 1; + break; + case 3: + if (l2svmcs->cpu_based_vm_exec_control & + CPU_BASED_CR3_LOAD_EXITING) + r = 1; + break; + case 4: + if (l2svmcs->cr4_guest_host_mask & + (l2svmcs->cr4_read_shadow ^ val)) + r = 1; + break; + case 8: + if (l2svmcs->cpu_based_vm_exec_control & + CPU_BASED_CR8_LOAD_EXITING) + r = 1; + break; + } + break; + case 2: /* clts */ + if (l2svmcs->cr0_guest_host_mask & X86_CR0_TS) + r = 1; + break; + case 1: /*mov from cr*/ + switch (cr) { + case 0: + r = 1; + case 3: + if (l2svmcs->cpu_based_vm_exec_control & + CPU_BASED_CR3_STORE_EXITING) + r = 1; + break; + case 4: + r = 1; + break; + case 8: + if (l2svmcs->cpu_based_vm_exec_control & + CPU_BASED_CR8_STORE_EXITING) + r = 1; + break; + } + break; + case 3: /* lmsw */ + if (l2svmcs->cr0_guest_host_mask & + (val ^ l2svmcs->cr0_read_shadow)) + r = 1; + break; + } + break; + } + case EXIT_REASON_DR_ACCESS: { + if (l2svmcs->cpu_based_vm_exec_control & + CPU_BASED_MOV_DR_EXITING) + r = 1; + break; + } + + case EXIT_REASON_EXCEPTION_NMI: { + + if (is_external_interrupt(intr_info) && + (l2svmcs->pin_based_vm_exec_control & + PIN_BASED_EXT_INTR_MASK)) + r = 1; + else if (is_nmi(intr_info) && + (l2svmcs->pin_based_vm_exec_control & + PIN_BASED_NMI_EXITING)) + r = 1; + else 
+		else if (is_exception(intr_info) &&
+			 (l2svmcs->exception_bitmap &
+			  (1u << (intr_info & INTR_INFO_VECTOR_MASK))))
+			r = 1;
+		else if (is_page_fault(intr_info))
+			r = 1;
+		break;
+	}
+
+	case EXIT_REASON_EXTERNAL_INTERRUPT:
+		if (l2svmcs->pin_based_vm_exec_control &
+		    PIN_BASED_EXT_INTR_MASK)
+			r = 1;
+		break;
+	default:
+		r = 1;
+	}
+	nested_unmap_current(vcpu);
+
+	return r;
+}
+
+static int nested_vmx_check_exception(struct vcpu_vmx *vmx, unsigned nr,
+				      bool has_error_code, u32 error_code)
+{
+	if (vmx->nested.nested_mode) {
+		if (nested_vmx_exit_handled(&vmx->vcpu, false)) {
+			nested_vmx_vmexit(&vmx->vcpu, false);
+			if (!nested_map_current(&vmx->vcpu))
+				return 1;
+			get_shadow_vmcs(&vmx->vcpu)->vm_exit_reason =
+				EXIT_REASON_EXCEPTION_NMI;
+			get_shadow_vmcs(&vmx->vcpu)->vm_exit_intr_info =
+				(nr | INTR_TYPE_HARD_EXCEPTION
+				 | (has_error_code ?
+				    INTR_INFO_DELIVER_CODE_MASK : 0)
+				 | INTR_INFO_VALID_MASK);
+
+			if (has_error_code)
+				get_shadow_vmcs(&vmx->vcpu)->
+					vm_exit_intr_error_code = error_code;
+			nested_unmap_current(&vmx->vcpu);
+			return 1;
+		}
+	}
+	return 0;
+}
+
+static int nested_vmx_intr(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!nested_map_current(vcpu))
+		return 0;
+
+	if (get_shadow_vmcs(vcpu)->pin_based_vm_exec_control &
+	    PIN_BASED_EXT_INTR_MASK) {
+
+		if (vmx->nested.nested_run_pending) {
+			nested_unmap_current(vcpu);
+			return 0;
+		}
+
+		nested_unmap_current(vcpu);
+		nested_vmx_vmexit(vcpu, true);
+		return 1;
+	}
+
+	nested_unmap_current(vcpu);
+
+	return 0;
+}
+
 static struct kvm_x86_ops vmx_x86_ops = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
 	.disabled_by_bios = vmx_disabled_by_bios,
-- 
1.6.0.4