Maxim Levitsky <mlevitsk@xxxxxxxxxx> writes:

> Currently to support Intel->AMD migration, if CPU vendor is GenuineIntel,
> we emulate the full 64 value for MSR_IA32_SYSENTER_{EIP|ESP}
> msrs, and we also emulate the sysenter/sysexit instruction in long mode.
>
> (Emulator does still refuse to emulate sysenter in 64 bit mode, on the
> ground that the code for that wasn't tested and likely has no users)
>
> However when virtual vmload/vmsave is enabled, the vmload instruction will
> update these 32 bit msrs without triggering their msr intercept,
> which will lead to having stale values in kvm's shadow copy of these msrs,
> which relies on the intercept to be up to date.
>
> Fix/optimize this by doing the following:
>
> 1. Enable the MSR intercepts for SYSENTER MSRs iff vendor=GenuineIntel
>    (This is both a tiny optimization and also ensures that in case
>    the guest cpu vendor is AMD, the msrs will be 32 bit wide as
>    AMD defined).
>
> 2. Store only high 32 bit part of these msrs on interception and combine
>    it with hardware msr value on intercepted read/writes
>    iff vendor=GenuineIntel.
>
> 3. Disable vmload/vmsave virtualization if vendor=GenuineIntel.
>    (It is somewhat insane to set vendor=GenuineIntel and still enable
>    SVM for the guest but well whatever).
>    Then zero the high 32 bit parts when kvm intercepts and emulates vmload.
>
> Thanks a lot to Paulo Bonzini for helping me with fixing this in the most

s/Paulo/Paolo/ :-)

> correct way.
>
> This patch fixes nested migration of 32 bit nested guests, that was
> broken because incorrect cached values of SYSENTER msrs were stored in
> the migration stream if L1 changed these msrs with
> vmload prior to L2 entry.
>
> Signed-off-by: Maxim Levitsky <mlevitsk@xxxxxxxxxx>
> ---
>  arch/x86/kvm/svm/svm.c | 99 +++++++++++++++++++++++++++---------------
>  arch/x86/kvm/svm/svm.h |  6 +--
>  2 files changed, 68 insertions(+), 37 deletions(-)
>
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 271196400495..6c39b0cd6ec6 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -95,6 +95,8 @@ static const struct svm_direct_access_msrs {
>  } direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
>  	{ .index = MSR_STAR,			.always = true  },
>  	{ .index = MSR_IA32_SYSENTER_CS,	.always = true  },
> +	{ .index = MSR_IA32_SYSENTER_EIP,	.always = false },
> +	{ .index = MSR_IA32_SYSENTER_ESP,	.always = false },
>  #ifdef CONFIG_X86_64
>  	{ .index = MSR_GS_BASE,			.always = true  },
>  	{ .index = MSR_FS_BASE,			.always = true  },
> @@ -1258,16 +1260,6 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
>  	if (kvm_vcpu_apicv_active(vcpu))
>  		avic_init_vmcb(svm);
>
> -	/*
> -	 * If hardware supports Virtual VMLOAD VMSAVE then enable it
> -	 * in VMCB and clear intercepts to avoid #VMEXIT.
> -	 */
> -	if (vls) {
> -		svm_clr_intercept(svm, INTERCEPT_VMLOAD);
> -		svm_clr_intercept(svm, INTERCEPT_VMSAVE);
> -		svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
> -	}
> -
>  	if (vgif) {
>  		svm_clr_intercept(svm, INTERCEPT_STGI);
>  		svm_clr_intercept(svm, INTERCEPT_CLGI);
> @@ -2133,9 +2125,11 @@ static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
>
>  	ret = kvm_skip_emulated_instruction(vcpu);
>
> -	if (vmload)
> +	if (vmload) {
>  		nested_svm_vmloadsave(vmcb12, svm->vmcb);
> -	else
> +		svm->sysenter_eip_hi = 0;
> +		svm->sysenter_esp_hi = 0;
> +	} else
>  		nested_svm_vmloadsave(svm->vmcb, vmcb12);

Nitpicking: {} are now needed for both branches here.
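I.e. something along these lines (untested, just to spell out the bracing;
per the kernel coding style, when one branch of a conditional needs braces,
the other branch should get them too):

	if (vmload) {
		nested_svm_vmloadsave(vmcb12, svm->vmcb);
		svm->sysenter_eip_hi = 0;
		svm->sysenter_esp_hi = 0;
	} else {
		nested_svm_vmloadsave(svm->vmcb, vmcb12);
	}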

>
>  	kvm_vcpu_unmap(vcpu, &map, true);
> @@ -2677,10 +2671,14 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>  		msr_info->data = svm->vmcb01.ptr->save.sysenter_cs;
>  		break;
>  	case MSR_IA32_SYSENTER_EIP:
> -		msr_info->data = svm->sysenter_eip;
> +		msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
> +		if (guest_cpuid_is_intel(vcpu))
> +			msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
>  		break;
>  	case MSR_IA32_SYSENTER_ESP:
> -		msr_info->data = svm->sysenter_esp;
> +		msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
> +		if (guest_cpuid_is_intel(vcpu))
> +			msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
>  		break;
>  	case MSR_TSC_AUX:
>  		if (!boot_cpu_has(X86_FEATURE_RDTSCP))
> @@ -2885,12 +2883,19 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
>  		svm->vmcb01.ptr->save.sysenter_cs = data;
>  		break;
>  	case MSR_IA32_SYSENTER_EIP:
> -		svm->sysenter_eip = data;
> -		svm->vmcb01.ptr->save.sysenter_eip = data;
> +		svm->vmcb01.ptr->save.sysenter_eip = (u32)data;
> +		/*
> +		 * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} msrs
> +		 * when we spoof an Intel vendor ID (for cross vendor migration).
> +		 * In this case we use this intercept to track the high
> +		 * 32 bit part of these msrs to support Intel's
> +		 * implementation of SYSENTER/SYSEXIT.
> +		 */
> +		svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;

(Personal taste) I'd suggest we keep the whole 'sysenter_eip'/'sysenter_esp'
fields even if we only use their upper 32 bits. That would reduce the code
churn a little bit (no need to change 'struct vcpu_svm'); there is a rough
sketch of what I mean at the very end of this mail.

>  		break;
>  	case MSR_IA32_SYSENTER_ESP:
> -		svm->sysenter_esp = data;
> -		svm->vmcb01.ptr->save.sysenter_esp = data;
> +		svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
> +		svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
>  		break;
>  	case MSR_TSC_AUX:
>  		if (!boot_cpu_has(X86_FEATURE_RDTSCP))
> @@ -4009,24 +4014,50 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
>  		vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
>  	}
>
> -	if (!kvm_vcpu_apicv_active(vcpu))
> -		return;
> +	if (kvm_vcpu_apicv_active(vcpu)) {
> +		/*
> +		 * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
> +		 * is exposed to the guest, disable AVIC.
> +		 */
> +		if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
> +			kvm_request_apicv_update(vcpu->kvm, false,
> +						 APICV_INHIBIT_REASON_X2APIC);
>
> -	/*
> -	 * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
> -	 * is exposed to the guest, disable AVIC.
> -	 */
> -	if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
> -		kvm_request_apicv_update(vcpu->kvm, false,
> -					 APICV_INHIBIT_REASON_X2APIC);
> +		/*
> +		 * Currently, AVIC does not work with nested virtualization.
> +		 * So, we disable AVIC when cpuid for SVM is set in the L1 guest.
> +		 */
> +		if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
> +			kvm_request_apicv_update(vcpu->kvm, false,
> +						 APICV_INHIBIT_REASON_NESTED);
> +	}
>
> -	/*
> -	 * Currently, AVIC does not work with nested virtualization.
> -	 * So, we disable AVIC when cpuid for SVM is set in the L1 guest.
> -	 */
> -	if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
> -		kvm_request_apicv_update(vcpu->kvm, false,
> -					 APICV_INHIBIT_REASON_NESTED);
> +	if (guest_cpuid_is_intel(vcpu)) {
> +		/*
> +		 * We must intercept SYSENTER_EIP and SYSENTER_ESP
> +		 * accesses because the processor only stores 32 bits.
> +		 * For the same reason we cannot use virtual VMLOAD/VMSAVE.
> +		 */
> +		svm_set_intercept(svm, INTERCEPT_VMLOAD);
> +		svm_set_intercept(svm, INTERCEPT_VMSAVE);
> +		svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
> +
> +		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
> +		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
> +	} else {
> +		/*
> +		 * If hardware supports Virtual VMLOAD VMSAVE then enable it
> +		 * in VMCB and clear intercepts to avoid #VMEXIT.
> +		 */
> +		if (vls) {
> +			svm_clr_intercept(svm, INTERCEPT_VMLOAD);
> +			svm_clr_intercept(svm, INTERCEPT_VMSAVE);
> +			svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
> +		}
> +		/* No need to intercept these MSRs */
> +		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
> +		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
> +	}
>  }
>
>  static bool svm_has_wbinvd_exit(void)
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index 8e276c4fb33d..fffdd5fb446d 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -28,7 +28,7 @@ static const u32 host_save_user_msrs[] = {
>  };
>  #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
>
> -#define MAX_DIRECT_ACCESS_MSRS	18
> +#define MAX_DIRECT_ACCESS_MSRS	20
>  #define MSRPM_OFFSETS	16
>  extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
>  extern bool npt_enabled;
> @@ -116,8 +116,8 @@ struct vcpu_svm {
>  	struct kvm_vmcb_info *current_vmcb;
>  	struct svm_cpu_data *svm_data;
>  	u32 asid;
> -	uint64_t sysenter_esp;
> -	uint64_t sysenter_eip;
> +	u32 sysenter_esp_hi;
> +	u32 sysenter_eip_hi;
>  	uint64_t tsc_aux;
>
>  	u64 msr_decfg;

--
Vitaly
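P.S. Just to illustrate the "keep the u64 fields" suggestion above: a rough,
completely untested sketch that reuses the existing 'sysenter_eip' field to
hold only the high half of the MSR (the ESP side would look the same):

	case MSR_IA32_SYSENTER_EIP:			/* svm_set_msr() */
		svm->vmcb01.ptr->save.sysenter_eip = (u32)data;
		/* keep only the high 32 bits, and only when spoofing Intel */
		svm->sysenter_eip = guest_cpuid_is_intel(vcpu) ?
					(data & ~0xffffffffULL) : 0;
		break;

	case MSR_IA32_SYSENTER_EIP:			/* svm_get_msr() */
		msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
		if (guest_cpuid_is_intel(vcpu))
			msr_info->data |= svm->sysenter_eip;
		break;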