From: Orit Wasserman <oritw@xxxxxxxxxx> --- arch/x86/kvm/svm.c | 3 - arch/x86/kvm/vmx.c | 187 +++++++++++++++++++++++++++++++++++++++++++++++++++- arch/x86/kvm/x86.c | 6 ++- arch/x86/kvm/x86.h | 2 + 4 files changed, 192 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 2df9b45..3c1f22a 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -124,9 +124,6 @@ static int npt = 1; module_param(npt, int, S_IRUGO); -static int nested = 1; -module_param(nested, int, S_IRUGO); - static void svm_flush_tlb(struct kvm_vcpu *vcpu); static void svm_complete_interrupts(struct vcpu_svm *svm); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 78101dd..abba325 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -67,6 +67,11 @@ struct vmcs { char data[0]; }; +struct nested_vmx { + /* Has the level1 guest done vmon? */ + bool vmon; +}; + struct vcpu_vmx { struct kvm_vcpu vcpu; struct list_head local_vcpus_link; @@ -114,6 +119,9 @@ struct vcpu_vmx { ktime_t entry_time; s64 vnmi_blocked_time; u32 exit_reason; + + /* Nested vmx */ + struct nested_vmx nested; }; static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) @@ -967,6 +975,69 @@ static void guest_write_tsc(u64 guest_tsc, u64 host_tsc) } /* + * Handles msr read for nested virtualization + */ +static int nested_vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, + u64 *pdata) +{ + u32 vmx_msr_low = 0, vmx_msr_high = 0; + + switch (msr_index) { + case MSR_IA32_FEATURE_CONTROL: + *pdata = 0; + break; + case MSR_IA32_VMX_BASIC: + rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); + *pdata = vmx_msr_low | ((u64)vmx_msr_high << 32); + break; + case MSR_IA32_VMX_PINBASED_CTLS: + *pdata = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING | + PIN_BASED_VIRTUAL_NMIS; + break; + case MSR_IA32_VMX_PROCBASED_CTLS: + *pdata = CPU_BASED_HLT_EXITING | +#ifdef CONFIG_X86_64 + CPU_BASED_CR8_LOAD_EXITING | + CPU_BASED_CR8_STORE_EXITING | +#endif + CPU_BASED_CR3_LOAD_EXITING | + 
CPU_BASED_CR3_STORE_EXITING | + CPU_BASED_USE_IO_BITMAPS | + CPU_BASED_MOV_DR_EXITING | + CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_INVLPG_EXITING; + + if (cpu_has_secondary_exec_ctrls()) + *pdata |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; + + if (vm_need_tpr_shadow(vcpu->kvm)) + *pdata |= CPU_BASED_TPR_SHADOW; + break; + case MSR_IA32_VMX_EXIT_CTLS: + *pdata = 0; +#ifdef CONFIG_X86_64 + *pdata |= VM_EXIT_HOST_ADDR_SPACE_SIZE; +#endif + break; + case MSR_IA32_VMX_ENTRY_CTLS: + *pdata = 0; + break; + case MSR_IA32_VMX_PROCBASED_CTLS2: + *pdata = 0; + if (vm_need_virtualize_apic_accesses(vcpu->kvm)) + *pdata |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + break; + case MSR_IA32_VMX_EPT_VPID_CAP: + *pdata = 0; + break; + default: + return 1; + } + + return 0; +} + +/* * Reads an msr value (of 'msr_index') into 'pdata'. * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. @@ -1005,6 +1076,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) data = vmcs_readl(GUEST_SYSENTER_ESP); break; default: + if (nested && + !nested_vmx_get_msr(vcpu, msr_index, &data)) + break; vmx_load_host_state(to_vmx(vcpu)); msr = find_msr_entry(to_vmx(vcpu), msr_index); if (msr) { @@ -1019,6 +1093,27 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) } /* + * Writes msr value for nested virtualization + * Returns 0 on success, non-0 otherwise. + */ +static int nested_vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +{ + switch (msr_index) { + case MSR_IA32_FEATURE_CONTROL: + if ((data & (FEATURE_CONTROL_LOCKED | + FEATURE_CONTROL_VMXON_ENABLED)) + != (FEATURE_CONTROL_LOCKED | + FEATURE_CONTROL_VMXON_ENABLED)) + return 1; + break; + default: + return 1; + } + + return 0; +} + +/* * Writes msr value into into the appropriate "register". * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. 
@@ -1064,6 +1159,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) } /* Otherwise falls through to kvm_set_msr_common */ default: + if (nested && + !nested_vmx_set_msr(vcpu, msr_index, data)) + break; vmx_load_host_state(vmx); msr = find_msr_entry(vmx, msr_index); if (msr) { @@ -3095,12 +3193,97 @@ static int handle_vmcall(struct kvm_vcpu *vcpu) return 1; } +/* + * Check to see if vcpu can execute vmx command + * Inject the corresponding exception + */ +static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) +{ + struct kvm_segment cs; + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_msr_entry *msr; + + vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); + + if (!vmx->nested.vmon) { + printk(KERN_DEBUG "%s: vmx not on\n", __func__); + kvm_queue_exception(vcpu, UD_VECTOR); + return 0; + } + + msr = find_msr_entry(vmx, MSR_EFER); + + if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) || + ((msr->data & EFER_LMA) && !cs.l)) { + printk(KERN_DEBUG "%s: invalid mode cs.l %d lma %llu\n", + __func__, cs.l, msr->data & EFER_LMA); + kvm_queue_exception(vcpu, UD_VECTOR); + return 0; + } + + if (vmx_get_cpl(vcpu)) { + kvm_inject_gp(vcpu, 0); + return 0; + } + + return 1; +} + + static int handle_vmx_insn(struct kvm_vcpu *vcpu) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } +static int handle_vmoff(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + vmx->nested.vmon = 0; + + skip_emulated_instruction(vcpu); + return 1; +} + +static int handle_vmon(struct kvm_vcpu *vcpu) +{ + struct kvm_segment cs; + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!nested) { + printk(KERN_DEBUG "%s: nested vmx not enabled\n", __func__); + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); + + if (!(vcpu->arch.cr4 & X86_CR4_VMXE) || + !(vcpu->arch.cr0 & X86_CR0_PE) || + (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) || + ((find_msr_entry(to_vmx(vcpu), + 
MSR_EFER)->data & EFER_LMA) && !cs.l)) { + kvm_queue_exception(vcpu, UD_VECTOR); + printk(KERN_INFO "%s invalid register state\n", __func__); + return 1; + } + + if (vmx_get_cpl(vcpu)) { + printk(KERN_INFO "%s no permission\n", __func__); + kvm_inject_gp(vcpu, 0); + return 1; + } + + vmx->nested.vmon = 1; + + skip_emulated_instruction(vcpu); + return 1; +} + static int handle_invlpg(struct kvm_vcpu *vcpu) { unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); @@ -3376,8 +3559,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_VMREAD] = handle_vmx_insn, [EXIT_REASON_VMRESUME] = handle_vmx_insn, [EXIT_REASON_VMWRITE] = handle_vmx_insn, - [EXIT_REASON_VMOFF] = handle_vmx_insn, - [EXIT_REASON_VMON] = handle_vmx_insn, + [EXIT_REASON_VMOFF] = handle_vmoff, + [EXIT_REASON_VMON] = handle_vmon, [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, [EXIT_REASON_APIC_ACCESS] = handle_apic_access, [EXIT_REASON_WBINVD] = handle_wbinvd, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8b3a169..9c39092 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -87,6 +87,10 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops); int ignore_msrs = 0; module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); +int nested = 1; +EXPORT_SYMBOL_GPL(nested); +module_param(nested, int, S_IRUGO); + struct kvm_stats_debugfs_item debugfs_entries[] = { { "pf_fixed", VCPU_STAT(pf_fixed) }, { "pf_guest", VCPU_STAT(pf_guest) }, @@ -373,7 +377,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return; } - if (cr4 & X86_CR4_VMXE) { + if (cr4 & X86_CR4_VMXE && !nested) { printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); kvm_inject_gp(vcpu, 0); return; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 5eadea5..57204cb 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -35,4 +35,6 @@ static inline bool kvm_exception_is_soft(unsigned int nr) struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 
function, u32 index); +extern int nested; + #endif -- 1.6.0.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html