From: Orit Wasserman <oritw@xxxxxxxxxx>

---
 arch/x86/kvm/svm.c |    3 -
 arch/x86/kvm/vmx.c |  217 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/x86.c |    6 +-
 arch/x86/kvm/x86.h |    2 +
 4 files changed, 222 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 2df9b45..3c1f22a 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -124,9 +124,6 @@ static int npt = 1;
 
 module_param(npt, int, S_IRUGO);
 
-static int nested = 1;
-module_param(nested, int, S_IRUGO);
-
 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
 static void svm_complete_interrupts(struct vcpu_svm *svm);
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 78101dd..71bd91a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -67,6 +67,11 @@ struct vmcs {
 	char data[0];
 };
 
+struct nested_vmx {
+	/* Has the level1 guest done vmxon? */
+	bool vmxon;
+};
+
 struct vcpu_vmx {
 	struct kvm_vcpu vcpu;
 	struct list_head local_vcpus_link;
@@ -114,6 +119,9 @@ struct vcpu_vmx {
 	ktime_t entry_time;
 	s64 vnmi_blocked_time;
 	u32 exit_reason;
+
+	/* Nested vmx */
+	struct nested_vmx nested;
 };
 
 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -967,6 +975,95 @@ static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
 }
 
 /*
+ * Handles msr read for nested virtualization
+ */
+static int nested_vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index,
+			      u64 *pdata)
+{
+	u64 vmx_msr = 0;
+
+	switch (msr_index) {
+	case MSR_IA32_FEATURE_CONTROL:
+		*pdata = 0;
+		break;
+	case MSR_IA32_VMX_BASIC:
+		*pdata = 0;
+		rdmsrl(MSR_IA32_VMX_BASIC, vmx_msr);
+		*pdata = (vmx_msr & 0x00ffffcfffffffff);
+		break;
+	case MSR_IA32_VMX_PINBASED_CTLS:
+		rdmsrl(MSR_IA32_VMX_PINBASED_CTLS, vmx_msr);
+		*pdata = (PIN_BASED_EXT_INTR_MASK & vmcs_config.pin_based_exec_ctrl) |
+			 (PIN_BASED_NMI_EXITING & vmcs_config.pin_based_exec_ctrl) |
+			 (PIN_BASED_VIRTUAL_NMIS & vmcs_config.pin_based_exec_ctrl);
+		break;
+	case MSR_IA32_VMX_PROCBASED_CTLS:
+	{
+		u32 vmx_msr_high, vmx_msr_low;
+		u32 control = CPU_BASED_HLT_EXITING |
+#ifdef CONFIG_X86_64
+			CPU_BASED_CR8_LOAD_EXITING |
+			CPU_BASED_CR8_STORE_EXITING |
+#endif
+			CPU_BASED_CR3_LOAD_EXITING |
+			CPU_BASED_CR3_STORE_EXITING |
+			CPU_BASED_USE_IO_BITMAPS |
+			CPU_BASED_MOV_DR_EXITING |
+			CPU_BASED_USE_TSC_OFFSETING |
+			CPU_BASED_INVLPG_EXITING |
+			CPU_BASED_TPR_SHADOW |
+			CPU_BASED_USE_MSR_BITMAPS |
+			CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+
+		rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
+
+		control &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
+		control |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
+
+		*pdata = (CPU_BASED_HLT_EXITING & control) |
+#ifdef CONFIG_X86_64
+			 (CPU_BASED_CR8_LOAD_EXITING & control) |
+			 (CPU_BASED_CR8_STORE_EXITING & control) |
+#endif
+			 (CPU_BASED_CR3_LOAD_EXITING & control) |
+			 (CPU_BASED_CR3_STORE_EXITING & control) |
+			 (CPU_BASED_USE_IO_BITMAPS & control) |
+			 (CPU_BASED_MOV_DR_EXITING & control) |
+			 (CPU_BASED_USE_TSC_OFFSETING & control) |
+			 (CPU_BASED_INVLPG_EXITING & control);
+
+		if (cpu_has_secondary_exec_ctrls())
+			*pdata |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+
+		if (vm_need_tpr_shadow(vcpu->kvm))
+			*pdata |= CPU_BASED_TPR_SHADOW;
+		break;
+	}
+	case MSR_IA32_VMX_EXIT_CTLS:
+		*pdata = 0;
+#ifdef CONFIG_X86_64
+		*pdata |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
+#endif
+		break;
+	case MSR_IA32_VMX_ENTRY_CTLS:
+		*pdata = 0;
+		break;
+	case MSR_IA32_VMX_PROCBASED_CTLS2:
+		*pdata = 0;
+		if (vm_need_virtualize_apic_accesses(vcpu->kvm))
+			*pdata |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+		break;
+	case MSR_IA32_VMX_EPT_VPID_CAP:
+		*pdata = 0;
+		break;
+	default:
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
  * Reads an msr value (of 'msr_index') into 'pdata'.
  * Returns 0 on success, non-0 otherwise.
  * Assumes vcpu_load() was already called.
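For reference, the MSR_IA32_VMX_PROCBASED_CTLS case above relies on the VMX
capability MSR encoding: each control MSR reports two 32-bit halves, where a
bit set in the low half means the corresponding control must be 1, and a bit
clear in the high half means it must be 0. The open-coded mask/or pair is
equivalent to a helper along these lines (the name is illustrative and not
part of this patch; vmx.c already uses the same trick in adjust_vmx_controls()
when probing real hardware):

	/*
	 * Clamp a desired control word against what a VMX capability MSR
	 * allows: high half = allowed-1 bits, low half = required-1 bits.
	 */
	static u32 clamp_vmx_controls(u32 want, u32 capability_msr)
	{
		u32 required_one, allowed_one;

		rdmsr(capability_msr, required_one, allowed_one);
		want &= allowed_one;  /* bit == 0 in high word ==> must be zero */
		want |= required_one; /* bit == 1 in low word  ==> must be one  */
		return want;
	}
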
@@ -1005,6 +1102,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 		data = vmcs_readl(GUEST_SYSENTER_ESP);
 		break;
 	default:
+		if (nested &&
+		    !nested_vmx_get_msr(vcpu, msr_index, &data))
+			break;
 		vmx_load_host_state(to_vmx(vcpu));
 		msr = find_msr_entry(to_vmx(vcpu), msr_index);
 		if (msr) {
@@ -1019,6 +1119,27 @@
 }
 
 /*
+ * Writes msr value for nested virtualization
+ * Returns 0 on success, non-0 otherwise.
+ */
+static int nested_vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+{
+	switch (msr_index) {
+	case MSR_IA32_FEATURE_CONTROL:
+		if ((data & (FEATURE_CONTROL_LOCKED |
+			     FEATURE_CONTROL_VMXON_ENABLED))
+		    != (FEATURE_CONTROL_LOCKED |
+			FEATURE_CONTROL_VMXON_ENABLED))
+			return 1;
+		break;
+	default:
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
  * Writes msr value into the appropriate "register".
  * Returns 0 on success, non-0 otherwise.
  * Assumes vcpu_load() was already called.
@@ -1064,6 +1185,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 		}
 		/* Otherwise falls through to kvm_set_msr_common */
 	default:
+		if (nested &&
+		    !nested_vmx_set_msr(vcpu, msr_index, data))
+			break;
 		vmx_load_host_state(vmx);
 		msr = find_msr_entry(vmx, msr_index);
 		if (msr) {
@@ -3095,12 +3219,101 @@ static int handle_vmcall(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+/*
+ * Check whether the vcpu is allowed to execute a vmx instruction;
+ * inject the corresponding exception otherwise.
+ */
+static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment cs;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct kvm_msr_entry *msr;
+
+	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+
+	if (!vmx->nested.vmxon) {
+		printk(KERN_DEBUG "%s: vmx not on\n", __func__);
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 0;
+	}
+
+	msr = find_msr_entry(vmx, MSR_EFER);
+
+	if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) ||
+	    ((msr->data & EFER_LMA) && !cs.l)) {
+		printk(KERN_DEBUG "%s: invalid mode cs.l %d lma %llu\n",
+		       __func__, cs.l, msr->data & EFER_LMA);
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 0;
+	}
+
+	if (vmx_get_cpl(vcpu)) {
+		kvm_inject_gp(vcpu, 0);
+		return 0;
+	}
+
+	return 1;
+}
+
 static int handle_vmx_insn(struct kvm_vcpu *vcpu)
 {
 	kvm_queue_exception(vcpu, UD_VECTOR);
 	return 1;
 }
 
+static int handle_vmoff(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	vmx->nested.vmxon = 0;
+
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
+static int handle_vmon(struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment cs;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!nested) {
+		printk(KERN_DEBUG "%s: nested vmx not enabled\n", __func__);
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+
+	if (!(vcpu->arch.cr4 & X86_CR4_VMXE) ||
+	    !(vcpu->arch.cr0 & X86_CR0_PE) ||
+	    (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		printk(KERN_INFO "%s: invalid register state\n", __func__);
+		return 1;
+	}
+#ifdef CONFIG_X86_64
+	if (((find_msr_entry(to_vmx(vcpu),
+			     MSR_EFER)->data & EFER_LMA) && !cs.l)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		printk(KERN_INFO "%s: invalid register state\n", __func__);
+		return 1;
+	}
+#endif
+	if (vmx_get_cpl(vcpu)) {
+		printk(KERN_INFO "%s: no permission\n", __func__);
+		kvm_inject_gp(vcpu, 0);
+		return 1;
+	}
+
+	vmx->nested.vmxon = 1;
+
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
 static int handle_invlpg(struct kvm_vcpu *vcpu)
 {
 	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
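The checks in handle_vmon mirror what real hardware demands before VMXON will
execute: protected mode, no virtual-8086 mode, CR4.VMXE set, CPL 0, and, if
EFER.LMA is set, a 64-bit code segment. For context, here is a minimal sketch
of the guest-side sequence that reaches this handler, assuming a Linux L1
kernel module with era-appropriate cr4 helpers (the function name is
illustrative, and RFLAGS error checking after VMXON is omitted):

	#include <linux/mm.h>
	#include <asm/msr.h>
	#include <asm/processor-flags.h>

	/* Sketch: an L1 hypervisor entering VMX operation. */
	static int l1_vmxon_sketch(void)
	{
		u64 basic, pa;
		u32 *region;

		/*
		 * VMXON raises #UD unless CR4.VMXE is set -- the same state
		 * handle_vmon verifies above. IA32_FEATURE_CONTROL must also
		 * be locked with VMXON enabled (normally done by firmware).
		 */
		write_cr4(read_cr4() | X86_CR4_VMXE);

		/*
		 * The 4K VMXON region must begin with the VMCS revision
		 * identifier, read from the low bits of IA32_VMX_BASIC.
		 */
		region = (u32 *)get_zeroed_page(GFP_KERNEL);
		if (!region)
			return -ENOMEM;
		rdmsrl(MSR_IA32_VMX_BASIC, basic);
		region[0] = (u32)basic;

		pa = __pa(region);
		asm volatile("vmxon %0" : : "m"(pa) : "cc", "memory");
		return 0;
	}
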
@@ -3376,8 +3589,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_VMREAD]                  = handle_vmx_insn,
 	[EXIT_REASON_VMRESUME]                = handle_vmx_insn,
 	[EXIT_REASON_VMWRITE]                 = handle_vmx_insn,
-	[EXIT_REASON_VMOFF]                   = handle_vmx_insn,
-	[EXIT_REASON_VMON]                    = handle_vmx_insn,
+	[EXIT_REASON_VMOFF]                   = handle_vmoff,
+	[EXIT_REASON_VMON]                    = handle_vmon,
 	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
 	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
 	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8b3a169..9c39092 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -87,6 +87,10 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
 int ignore_msrs = 0;
 module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
 
+int nested = 1;
+EXPORT_SYMBOL_GPL(nested);
+module_param(nested, int, S_IRUGO);
+
 struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "pf_fixed", VCPU_STAT(pf_fixed) },
 	{ "pf_guest", VCPU_STAT(pf_guest) },
@@ -373,7 +377,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 		return;
 	}
 
-	if (cr4 & X86_CR4_VMXE) {
+	if (cr4 & X86_CR4_VMXE && !nested) {
 		printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
 		kvm_inject_gp(vcpu, 0);
 		return;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 5eadea5..57204cb 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -35,4 +35,6 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
 					      u32 function, u32 index);
 
+extern int nested;
+
 #endif
-- 
1.6.0.4
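With the nested flag moved from kvm-amd into kvm.ko, the feature is toggled
via the kvm module parameter (nested=1 is the default here; nested=0 restores
the old #GP-on-CR4.VMXE behavior), and since the parameter is S_IRUGO it can
be read back from /sys/module/kvm/parameters/nested. Once an L1 Linux guest
runs with nesting enabled, the MSR reads emulated by nested_vmx_get_msr
become visible inside the guest; a minimal guest-side probe, assuming guest
kernel context (the function name is illustrative, not part of the patch):

	#include <linux/kernel.h>
	#include <asm/msr.h>

	/* Sketch: read two of the capability MSRs KVM now emulates. */
	static void probe_nested_vmx_msrs(void)
	{
		u64 basic, pinbased;

		/*
		 * Each rdmsr exits to the L0 hypervisor and lands in
		 * vmx_get_msr(), which defers to nested_vmx_get_msr().
		 */
		rdmsrl(MSR_IA32_VMX_BASIC, basic);
		rdmsrl(MSR_IA32_VMX_PINBASED_CTLS, pinbased);

		/* Bits 30:0 of IA32_VMX_BASIC hold the VMCS revision id. */
		printk(KERN_INFO "vmx basic %llx (revision %u), pinbased %llx\n",
		       basic, (u32)basic & 0x7fffffff, pinbased);
	}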