On Fri, Sep 2, 2016 at 11:28 AM, David Matlack <dmatlack@xxxxxxxxxx> wrote: > The VMX capability MSRs advertise the set of features the KVM virtual > CPU can support. This set of features varies across different host CPUs > and KVM versions. This patch aims to address both sources of > differences, allowing VMs to be migrated across CPUs and KVM versions > without guest-visible changes to these MSRs. Note that cross-KVM- > version migration is only supported from this point forward. > > In order to support cross CPU migration, the VMX capability MSRs that > depend on host CPU features (all of the VM-Execution Control capability > MSRs, IA32_VMX_MISC, and IA32_VMX_EPT_VPID_CAP) must be audited on > restore to check that the set of features advertised on restore is > a subset of what KVM and the CPU support. > > MSRs that do not depend on host features (MSR_IA32_VMX_BASIC and the > cr0/cr4 fixed0/1 MSRs), and instead are entirely dependent on KVM > software support, can only be restored if they match the default value > in KVM. Any future changes to these MSRs will need to be backward > compatible with the existing value of these MSRs and will need to > update the checks done in vmx_set_vmx_msr(). > > The VMX capability MSRs are read-only, so they do not need to be on > the default MSR save/restore lists. The userspace hypervisor can set > the values of these MSRs or read them from KVM at VCPU creation time, > and restore the same value after every save/restore. 
> > Signed-off-by: David Matlack <dmatlack@xxxxxxxxxx> > --- > arch/x86/include/asm/vmx.h | 21 +++++ > arch/x86/kvm/vmx.c | 231 ++++++++++++++++++++++++++++++++++++++++----- > 2 files changed, 230 insertions(+), 22 deletions(-) > > diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h > index a002b07..e57f389 100644 > --- a/arch/x86/include/asm/vmx.h > +++ b/arch/x86/include/asm/vmx.h > @@ -25,6 +25,7 @@ > #define VMX_H > > > +#include <linux/bitops.h> > #include <linux/types.h> > #include <uapi/asm/vmx.h> > > @@ -110,6 +111,26 @@ > #define VMX_MISC_SAVE_EFER_LMA 0x00000020 > #define VMX_MISC_ACTIVITY_HLT 0x00000040 > > +static inline int vmx_misc_preemption_timer_rate(u64 vmx_misc) > +{ > + return vmx_misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; > +} > + > +static inline int vmx_misc_cr3_count(u64 vmx_misc) > +{ > + return (vmx_misc & GENMASK_ULL(24, 16)) >> 16; > +} > + > +static inline int vmx_misc_max_msr(u64 vmx_misc) > +{ > + return (vmx_misc & GENMASK_ULL(27, 25)) >> 25; > +} > + > +static inline int vmx_misc_mseg_revid(u64 vmx_misc) > +{ > + return (vmx_misc & GENMASK_ULL(63, 32)) >> 32; > +} > + > /* VMCS Encodings */ > enum vmcs_field { > VIRTUAL_PROCESSOR_ID = 0x00000000, > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > index 5cede40..1e88cc8 100644 > --- a/arch/x86/kvm/vmx.c > +++ b/arch/x86/kvm/vmx.c > @@ -464,6 +464,12 @@ struct nested_vmx { > u32 nested_vmx_misc_high; > u32 nested_vmx_ept_caps; > u32 nested_vmx_vpid_caps; > + u64 nested_vmx_basic; > + u64 nested_vmx_cr0_fixed0; > + u64 nested_vmx_cr0_fixed1; > + u64 nested_vmx_cr4_fixed0; > + u64 nested_vmx_cr4_fixed1; > + u64 nested_vmx_vmcs_enum; > }; > > #define POSTED_INTR_ON 0 > @@ -2846,6 +2852,33 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) > VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | > VMX_MISC_ACTIVITY_HLT; > vmx->nested.nested_vmx_misc_high = 0; > + > + /* > + * This MSR reports some information about VMX support. 
We > + * should return information about the VMX we emulate for the > + * guest, and the VMCS structure we give it - not about the > + * VMX support of the underlying hardware. > + */ > + vmx->nested.nested_vmx_basic = > + VMCS12_REVISION | > + VMX_BASIC_TRUE_CTLS | > + ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | > + (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); I'll need to send out a v2 to account for "KVM: nVMX: expose INS/OUTS information support" (https://git.kernel.org/cgit/virt/kvm/kvm.git/commit/?h=next&id=9ac7e3e815060efdc86b6d12448200e3c3597e01). It will be a relatively small modification: restore of MSR_IA32_VMX_BASIC will need to support the 0-setting of VMX_BASIC_INOUT. Also, as an FYI to reviewers, I will be on vacation the next two weeks, so expect high latency from me until late September. Thanks! > + > + /* > + * These MSRs specify bits which the guest must keep fixed (on or off) > + * while L1 is in VMXON mode (in L1's root mode, or running an L2). > + * We picked the standard core2 setting. 
> + */ > +#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) > +#define VMXON_CR4_ALWAYSON X86_CR4_VMXE > + vmx->nested.nested_vmx_cr0_fixed0 = VMXON_CR0_ALWAYSON; > + vmx->nested.nested_vmx_cr0_fixed1 = -1ULL; > + vmx->nested.nested_vmx_cr4_fixed0 = VMXON_CR4_ALWAYSON; > + vmx->nested.nested_vmx_cr4_fixed1 = -1ULL; > + > + /* highest index: VMX_PREEMPTION_TIMER_VALUE */ > + vmx->nested.nested_vmx_vmcs_enum = 0x2e; > } > > static inline bool vmx_control_verify(u32 control, u32 low, u32 high) > @@ -2861,6 +2894,171 @@ static inline u64 vmx_control_msr(u32 low, u32 high) > return low | ((u64)high << 32); > } > > +static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) > +{ > + superset &= mask; > + subset &= mask; > + > + return (superset | subset) == superset; > +} > + > +static int > +vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) > +{ > + u64 supported; > + u32 *lowp, *highp; > + > + switch (msr_index) { > + case MSR_IA32_VMX_TRUE_PINBASED_CTLS: > + case MSR_IA32_VMX_PINBASED_CTLS: > + lowp = &vmx->nested.nested_vmx_pinbased_ctls_low; > + highp = &vmx->nested.nested_vmx_pinbased_ctls_high; > + break; > + case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: > + lowp = &vmx->nested.nested_vmx_true_procbased_ctls_low; > + highp = &vmx->nested.nested_vmx_procbased_ctls_high; > + break; > + case MSR_IA32_VMX_PROCBASED_CTLS: > + lowp = &vmx->nested.nested_vmx_procbased_ctls_low; > + highp = &vmx->nested.nested_vmx_procbased_ctls_high; > + break; > + case MSR_IA32_VMX_TRUE_EXIT_CTLS: > + lowp = &vmx->nested.nested_vmx_true_exit_ctls_low; > + highp = &vmx->nested.nested_vmx_exit_ctls_high; > + break; > + case MSR_IA32_VMX_EXIT_CTLS: > + lowp = &vmx->nested.nested_vmx_exit_ctls_low, > + highp = &vmx->nested.nested_vmx_exit_ctls_high; > + break; > + case MSR_IA32_VMX_TRUE_ENTRY_CTLS: > + lowp = &vmx->nested.nested_vmx_true_entry_ctls_low; > + highp = &vmx->nested.nested_vmx_entry_ctls_high; > + break; > + case MSR_IA32_VMX_ENTRY_CTLS: > + 
lowp = &vmx->nested.nested_vmx_entry_ctls_low; > + highp = &vmx->nested.nested_vmx_entry_ctls_high; > + break; > + case MSR_IA32_VMX_PROCBASED_CTLS2: > + lowp = &vmx->nested.nested_vmx_secondary_ctls_low; > + highp = &vmx->nested.nested_vmx_secondary_ctls_high; > + break; > + default: > + BUG(); > + } > + > + supported = vmx_control_msr(*lowp, *highp); > + > + /* Check must-be-1 bits are still 1. */ > + if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0))) > + return -EINVAL; > + > + /* Check must-be-0 bits are still 0. */ > + if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) > + return -EINVAL; > + > + *lowp = data; > + *highp = data >> 32; > + return 0; > +} > + > +static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) > +{ > + const u64 feature_and_reserved_bits = > + /* feature */ > + BIT(5) | GENMASK(8, 6) | BIT(15) | BIT(28) | BIT(29) | > + /* reserved */ > + GENMASK(14, 9) | GENMASK(31, 30); > + u64 vmx_misc; > + > + vmx_misc = vmx_control_msr(vmx->nested.nested_vmx_misc_low, > + vmx->nested.nested_vmx_misc_high); > + > + if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) > + return -EINVAL; > + > + > + if ((vmx->nested.nested_vmx_pinbased_ctls_high & > + PIN_BASED_VMX_PREEMPTION_TIMER) && > + vmx_misc_preemption_timer_rate(data) != > + vmx_misc_preemption_timer_rate(vmx_misc)) > + return -EINVAL; > + > + if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc)) > + return -EINVAL; > + > + if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc)) > + return -EINVAL; > + > + if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) > + return -EINVAL; > + > + vmx->nested.nested_vmx_misc_low = data; > + vmx->nested.nested_vmx_misc_high = data >> 32; > + return 0; > +} > + > +static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) > +{ > + u64 vmx_ept_vpid_cap; > + > + vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.nested_vmx_ept_caps, > + vmx->nested.nested_vmx_vpid_caps); > + > + /* Every bit 
is either reserved or a feature bit. */ > + if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) > + return -EINVAL; > + > + vmx->nested.nested_vmx_ept_caps = data; > + vmx->nested.nested_vmx_vpid_caps = data >> 32; > + return 0; > +} > + > +/* > + * Called when userspace is restoring VMX MSRs. > + * > + * Returns 0 on success, non-0 otherwise. > + */ > +static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) > +{ > + struct vcpu_vmx *vmx = to_vmx(vcpu); > + > + switch (msr_index) { > + case MSR_IA32_VMX_BASIC: > + return vmx->nested.nested_vmx_basic != data; > + case MSR_IA32_VMX_TRUE_PINBASED_CTLS: > + case MSR_IA32_VMX_PINBASED_CTLS: > + case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: > + case MSR_IA32_VMX_PROCBASED_CTLS: > + case MSR_IA32_VMX_TRUE_EXIT_CTLS: > + case MSR_IA32_VMX_EXIT_CTLS: > + case MSR_IA32_VMX_TRUE_ENTRY_CTLS: > + case MSR_IA32_VMX_ENTRY_CTLS: > + case MSR_IA32_VMX_PROCBASED_CTLS2: > + return vmx_restore_control_msr(vmx, msr_index, data); > + case MSR_IA32_VMX_MISC: > + return vmx_restore_vmx_misc(vmx, data); > + case MSR_IA32_VMX_CR0_FIXED0: > + return vmx->nested.nested_vmx_cr0_fixed0 != data; > + case MSR_IA32_VMX_CR0_FIXED1: > + return vmx->nested.nested_vmx_cr0_fixed1 != data; > + case MSR_IA32_VMX_CR4_FIXED0: > + return vmx->nested.nested_vmx_cr4_fixed0 != data; > + case MSR_IA32_VMX_CR4_FIXED1: > + return vmx->nested.nested_vmx_cr4_fixed1 != data; > + case MSR_IA32_VMX_EPT_VPID_CAP: > + return vmx_restore_vmx_ept_vpid_cap(vmx, data); > + case MSR_IA32_VMX_VMCS_ENUM: > + return vmx->nested.nested_vmx_vmcs_enum != data; > + default: > + /* > + * The rest of the VMX capability MSRs do not support restore. > + */ > + return -EINVAL; > + } > + > + return 0; > +} > + > /* Returns 0 on success, non-0 otherwise. 
*/ > static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) > { > @@ -2868,15 +3066,7 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) > > switch (msr_index) { > case MSR_IA32_VMX_BASIC: > - /* > - * This MSR reports some information about VMX support. We > - * should return information about the VMX we emulate for the > - * guest, and the VMCS structure we give it - not about the > - * VMX support of the underlying hardware. > - */ > - *pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS | > - ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | > - (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); > + *pdata = vmx->nested.nested_vmx_basic; > break; > case MSR_IA32_VMX_TRUE_PINBASED_CTLS: > case MSR_IA32_VMX_PINBASED_CTLS: > @@ -2919,27 +3109,20 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) > vmx->nested.nested_vmx_misc_low, > vmx->nested.nested_vmx_misc_high); > break; > - /* > - * These MSRs specify bits which the guest must keep fixed (on or off) > - * while L1 is in VMXON mode (in L1's root mode, or running an L2). > - * We picked the standard core2 setting. 
> - */ > -#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) > -#define VMXON_CR4_ALWAYSON X86_CR4_VMXE > case MSR_IA32_VMX_CR0_FIXED0: > - *pdata = VMXON_CR0_ALWAYSON; > + *pdata = vmx->nested.nested_vmx_cr0_fixed0; > break; > case MSR_IA32_VMX_CR0_FIXED1: > - *pdata = -1ULL; > + *pdata = vmx->nested.nested_vmx_cr0_fixed1; > break; > case MSR_IA32_VMX_CR4_FIXED0: > - *pdata = VMXON_CR4_ALWAYSON; > + *pdata = vmx->nested.nested_vmx_cr4_fixed0; > break; > case MSR_IA32_VMX_CR4_FIXED1: > - *pdata = -1ULL; > + *pdata = vmx->nested.nested_vmx_cr4_fixed1; > break; > case MSR_IA32_VMX_VMCS_ENUM: > - *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */ > + *pdata = vmx->nested.nested_vmx_vmcs_enum; > break; > case MSR_IA32_VMX_PROCBASED_CTLS2: > *pdata = vmx_control_msr( > @@ -3122,7 +3305,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) > vmx_leave_nested(vcpu); > break; > case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: > - return 1; /* they are read-only */ > + if (!msr_info->host_initiated) > + return 1; /* they are read-only */ > + if (!nested_vmx_allowed(vcpu)) > + return 1; > + return vmx_set_vmx_msr(vcpu, msr_index, data); > case MSR_IA32_XSS: > if (!vmx_xsaves_supported()) > return 1; > -- > 2.8.0.rc3.226.g39d4020 > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html