When the guest can use VMX instructions (when the "nested" module option is
on), it should also be able to read and write VMX MSRs, e.g., to query VMX
capabilities. This patch adds this support.

Signed-off-by: Nadav Har'El <nyh@xxxxxxxxxx>
---
 arch/x86/include/asm/msr-index.h |   12 ++
 arch/x86/kvm/vmx.c               |  174 +++++++++++++++++++++++++++++
 2 files changed, 186 insertions(+)

--- .before/arch/x86/kvm/vmx.c	2011-05-08 10:43:18.000000000 +0300
+++ .after/arch/x86/kvm/vmx.c	2011-05-08 10:43:18.000000000 +0300
@@ -1365,6 +1365,176 @@ static inline bool nested_vmx_allowed(st
 }
 
 /*
+ * nested_vmx_pinbased_ctls() returns the value which is to be returned
+ * for MSR_IA32_VMX_PINBASED_CTLS, and also determines the legal setting of
+ * vmcs12->pin_based_vm_exec_control. See the spec and vmx_control_verify() for
+ * the meaning of the low and high halves of this MSR.
+ * TODO: allow the return value to be modified (downgraded) by module options
+ * or other means.
+ */
+static inline void nested_vmx_pinbased_ctls(u32 *low, u32 *high)
+{
+	/*
+	 * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is
+	 * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR.
+	 */
+	*low = 0x16;
+	/* Allow only these bits to be 1 */
+	*high = 0x16 | PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING
+		| PIN_BASED_VIRTUAL_NMIS;
+}
+
+static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
+{
+	/*
+	 * Bits 0 in high must be 0, and bits 1 in low must be 1.
+	 */
+	return ((control & high) | low) == control;
+}
+
+/*
+ * If we allow our guest to use VMX instructions (i.e., nested VMX), we should
+ * also let it use VMX-specific MSRs.
+ * vmx_get_vmx_msr() and vmx_set_vmx_msr() return 1 when we handled a
+ * VMX-specific MSR, or 0 when we haven't (and the caller should handle it
+ * like all other MSRs).
+ */
+static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+{
+	u32 vmx_msr_high, vmx_msr_low;
+
+	if (!nested_vmx_allowed(vcpu) && msr_index >= MSR_IA32_VMX_BASIC &&
+	    msr_index <= MSR_IA32_VMX_TRUE_ENTRY_CTLS) {
+		/*
+		 * According to the spec, processors which do not support VMX
+		 * should throw a #GP(0) when VMX capability MSRs are read.
+		 */
+		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+		return 1;
+	}
+
+	switch (msr_index) {
+	case MSR_IA32_FEATURE_CONTROL:
+		*pdata = 0;
+		break;
+	case MSR_IA32_VMX_BASIC:
+		/*
+		 * This MSR reports some information about VMX support. We
+		 * should return information about the VMX we emulate for the
+		 * guest, and the VMCS structure we give it - not about the
+		 * VMX support of the underlying hardware.
+		 */
+		*pdata = VMCS12_REVISION |
+			((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
+			(VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
+		break;
+	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+	case MSR_IA32_VMX_PINBASED_CTLS:
+		nested_vmx_pinbased_ctls(&vmx_msr_low, &vmx_msr_high);
+		*pdata = vmx_msr_low | ((u64)vmx_msr_high << 32);
+		break;
+	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+	case MSR_IA32_VMX_PROCBASED_CTLS:
+		/* This MSR determines which vm-execution controls the L1
+		 * hypervisor may ask, or may not ask, to enable. Normally we
+		 * can only allow enabling features which the hardware can
+		 * support, but we limit ourselves to allowing only known
+		 * features that were tested nested. We can allow disabling any
+		 * feature (even if the hardware can't disable it) - we just
+		 * need to enable this feature and hide the extra exits from L1
+		 */
+		rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
+		vmx_msr_low = 0; /* allow disabling any feature */
+		vmx_msr_high &= /* do not expose new untested features */
+			CPU_BASED_HLT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
+			CPU_BASED_CR3_STORE_EXITING | CPU_BASED_USE_IO_BITMAPS |
+			CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING |
+			CPU_BASED_MWAIT_EXITING | CPU_BASED_MONITOR_EXITING |
+			CPU_BASED_INVLPG_EXITING |
+#ifdef CONFIG_X86_64
+			CPU_BASED_CR8_LOAD_EXITING |
+			CPU_BASED_CR8_STORE_EXITING |
+#endif
+			CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+		/*
+		 * We can allow some features even when not supported by the
+		 * hardware. For example, L1 can specify an MSR bitmap - and we
+		 * can use it to avoid exits to L1 - even when L0 runs L2
+		 * without MSR bitmaps.
+		 */
+		vmx_msr_high |= CPU_BASED_USE_MSR_BITMAPS;
+		*pdata = vmx_msr_low | ((u64)vmx_msr_high << 32);
+		break;
+	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+	case MSR_IA32_VMX_EXIT_CTLS:
+		*pdata = 0;
+#ifdef CONFIG_X86_64
+		*pdata |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
+#endif
+		break;
+	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+	case MSR_IA32_VMX_ENTRY_CTLS:
+		*pdata = 0;
+		break;
+	case MSR_IA32_VMX_MISC:
+		*pdata = 0;
+		break;
+	/*
+	 * These MSRs specify bits which the guest must keep fixed (on or off)
+	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
+	 * We picked the standard core2 setting.
+	 */
+#define VMXON_CR0_ALWAYSON	(X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
+#define VMXON_CR4_ALWAYSON	X86_CR4_VMXE
+	case MSR_IA32_VMX_CR0_FIXED0:
+		*pdata = VMXON_CR0_ALWAYSON;
+		break;
+	case MSR_IA32_VMX_CR0_FIXED1:
+		*pdata = -1ULL;
+		break;
+	case MSR_IA32_VMX_CR4_FIXED0:
+		*pdata = VMXON_CR4_ALWAYSON;
+		break;
+	case MSR_IA32_VMX_CR4_FIXED1:
+		*pdata = -1ULL;
+		break;
+	case MSR_IA32_VMX_VMCS_ENUM:
+		*pdata = 0x1f;
+		break;
+	case MSR_IA32_VMX_PROCBASED_CTLS2:
+		rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, vmx_msr_low, vmx_msr_high);
+		vmx_msr_low = 0; /* allow disabling any feature */
+		vmx_msr_high &= /* do not expose new untested features */
+			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+		*pdata = vmx_msr_low | ((u64)vmx_msr_high << 32);
+		break;
+	case MSR_IA32_VMX_EPT_VPID_CAP:
+		/* Currently, no nested ept or nested vpid */
+		*pdata = 0;
+		break;
+	default:
+		return 0;
+	}
+
+	return 1;
+}
+
+static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+{
+	if (!nested_vmx_allowed(vcpu))
+		return 0;
+
+	if (msr_index == MSR_IA32_FEATURE_CONTROL)
+		/* TODO: the right thing. */
+		return 1;
+	/*
+	 * No need to treat VMX capability MSRs specially: If we don't handle
+	 * them, handle_wrmsr will #GP(0), which is correct (they are readonly)
+	 */
+	return 0;
+}
+
+/*
  * Reads an msr value (of 'msr_index') into 'pdata'.
  * Returns 0 on success, non-0 otherwise.
  * Assumes vcpu_load() was already called.
@@ -1412,6 +1582,8 @@ static int vmx_get_msr(struct kvm_vcpu *
 		/* Otherwise falls through */
 	default:
 		vmx_load_host_state(to_vmx(vcpu));
+		if (vmx_get_vmx_msr(vcpu, msr_index, pdata))
+			return 0;
 		msr = find_msr_entry(to_vmx(vcpu), msr_index);
 		if (msr) {
 			vmx_load_host_state(to_vmx(vcpu));
@@ -1483,6 +1655,8 @@ static int vmx_set_msr(struct kvm_vcpu *
 			return 1;
 		/* Otherwise falls through */
 	default:
+		if (vmx_set_vmx_msr(vcpu, msr_index, data))
+			break;
 		msr = find_msr_entry(vmx, msr_index);
 		if (msr) {
 			vmx_load_host_state(vmx);
--- .before/arch/x86/include/asm/msr-index.h	2011-05-08 10:43:18.000000000 +0300
+++ .after/arch/x86/include/asm/msr-index.h	2011-05-08 10:43:18.000000000 +0300
@@ -434,6 +434,18 @@
 #define MSR_IA32_VMX_VMCS_ENUM          0x0000048a
 #define MSR_IA32_VMX_PROCBASED_CTLS2    0x0000048b
 #define MSR_IA32_VMX_EPT_VPID_CAP       0x0000048c
+#define MSR_IA32_VMX_TRUE_PINBASED_CTLS  0x0000048d
+#define MSR_IA32_VMX_TRUE_PROCBASED_CTLS 0x0000048e
+#define MSR_IA32_VMX_TRUE_EXIT_CTLS      0x0000048f
+#define MSR_IA32_VMX_TRUE_ENTRY_CTLS     0x00000490
+
+/* VMX_BASIC bits and bitmasks */
+#define VMX_BASIC_VMCS_SIZE_SHIFT	32
+#define VMX_BASIC_64		0x0001000000000000LLU
+#define VMX_BASIC_MEM_TYPE_SHIFT	50
+#define VMX_BASIC_MEM_TYPE_MASK	0x003c000000000000LLU
+#define VMX_BASIC_MEM_TYPE_WB	6LLU
+#define VMX_BASIC_INOUT		0x0040000000000000LLU
 
 /* AMD-V MSRs */
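
Reviewer note (not part of the patch): below is a minimal sketch of how an L1
hypervisor is expected to consume one of these capability MSRs. The low 32
bits report the "allowed-0" settings (bits that must be 1) and the high 32
bits the "allowed-1" settings (bits that may be 1), which is the same
convention vmx_control_verify() checks above. The helper names and the
standalone build here are illustrative assumptions, not KVM code.

#include <stdbool.h>
#include <stdint.h>

/* Clamp a desired control word to what the capability MSR allows. */
static uint32_t adjust_vmx_controls(uint64_t cap_msr, uint32_t wanted)
{
	uint32_t low  = (uint32_t)cap_msr;		/* bits that must be 1 */
	uint32_t high = (uint32_t)(cap_msr >> 32);	/* bits that may be 1 */

	wanted |= low;		/* turn on the mandatory bits */
	wanted &= high;		/* drop bits the (emulated) CPU cannot set */
	return wanted;
}

/* The same check that vmx_control_verify() performs on vmcs12 fields. */
static bool vmx_control_ok(uint64_t cap_msr, uint32_t control)
{
	uint32_t low  = (uint32_t)cap_msr;
	uint32_t high = (uint32_t)(cap_msr >> 32);

	return ((control & high) | low) == control;
}

For example, an L1 that wants PIN_BASED_EXT_INTR_MASK would rdmsr
MSR_IA32_VMX_PINBASED_CTLS (0x481), pass the value and the desired bit
through adjust_vmx_controls(), and store the result in
vmcs12->pin_based_vm_exec_control; with the values this patch returns, the
result always satisfies vmx_control_ok().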