On Tue, Aug 9, 2016 at 5:32 PM, Yang Zhang <yang.zhang.wz@xxxxxxxxx> wrote:
> On 2016/8/9 2:16, Radim Krčmář wrote:
>>
>> msr bitmap can be used to avoid a VM exit (interception) on guest MSR
>> accesses. In some configurations of VMX controls, the guest can even
>> directly access host's x2APIC MSRs. See SDM 29.5 VIRTUALIZING MSR-BASED
>> APIC ACCESSES.
>>
>> L2 could read all L0's x2APIC MSRs and write TPR, EOI, and SELF_IPI.
>> To do so, L1 would first trick KVM to disable all possible interceptions
>> by enabling APICv features and then would turn those features off;
>> nested_vmx_merge_msr_bitmap() only disabled interceptions, so VMX would
>> not intercept previously enabled MSRs even though they were not safe
>> with the new configuration.
>>
>> Correctly re-enabling interceptions is not enough as a second bug would
>> still allow L1+L2 to access host's MSRs: msr bitmap was shared for all
>> VMCSs, so L1 could trigger a race to get the desired combination of msr
>> bitmap and VMX controls.
>>
>> This fix allocates a msr bitmap for every L1 VCPU, allows only safe
>> x2APIC MSRs from L1's msr bitmap, and disables msr bitmaps if they would
>> have to intercept everything anyway.
>>
>> Fixes: 3af18d9c5fe9 ("KVM: nVMX: Prepare for using hardware MSR bitmap")
>> Reported-by: Jim Mattson <jmattson@xxxxxxxxxx>
>> Suggested-by: Wincy Van <fanwenyi0529@xxxxxxxxx>
>> Signed-off-by: Radim Krčmář <rkrcmar@xxxxxxxxxx>
>> ---
>>  arch/x86/kvm/vmx.c | 107 ++++++++++++++++++++++-------------------------
>>  1 file changed, 44 insertions(+), 63 deletions(-)
>>
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index a45d8580f91e..c66ac2c70d22 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -435,6 +435,8 @@ struct nested_vmx {
>>         bool pi_pending;
>>         u16 posted_intr_nv;
>>
>> +       unsigned long *msr_bitmap;
>> +
>>         struct hrtimer preemption_timer;
>>         bool preemption_timer_expired;
>>
>> @@ -924,7 +926,6 @@ static unsigned long *vmx_msr_bitmap_legacy;
>>  static unsigned long *vmx_msr_bitmap_longmode;
>>  static unsigned long *vmx_msr_bitmap_legacy_x2apic;
>>  static unsigned long *vmx_msr_bitmap_longmode_x2apic;
>> -static unsigned long *vmx_msr_bitmap_nested;
>>  static unsigned long *vmx_vmread_bitmap;
>>  static unsigned long *vmx_vmwrite_bitmap;
>>
>> @@ -2508,7 +2509,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
>>         unsigned long *msr_bitmap;
>>
>>         if (is_guest_mode(vcpu))
>> -               msr_bitmap = vmx_msr_bitmap_nested;
>> +               msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
>>         else if (cpu_has_secondary_exec_ctrls() &&
>>                  (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
>>                   SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
>> @@ -6363,13 +6364,6 @@ static __init int hardware_setup(void)
>>         if (!vmx_msr_bitmap_longmode_x2apic)
>>                 goto out4;
>>
>> -       if (nested) {
>> -               vmx_msr_bitmap_nested =
>> -                       (unsigned long *)__get_free_page(GFP_KERNEL);
>> -               if (!vmx_msr_bitmap_nested)
>> -                       goto out5;
>> -       }
>> -
>>         vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
>>         if (!vmx_vmread_bitmap)
>>                 goto out6;
>> @@ -6392,8 +6386,6 @@ static __init int hardware_setup(void)
>>
>>         memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
>>         memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
>> -       if (nested)
>> -               memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
>>
>>         if (setup_vmcs_config(&vmcs_config) < 0) {
>>                 r = -EIO;
>> @@ -6529,9 +6521,6 @@ out8:
>>  out7:
>>         free_page((unsigned long)vmx_vmread_bitmap);
>>  out6:
>> -       if (nested)
>> -               free_page((unsigned long)vmx_msr_bitmap_nested);
>> -out5:
>>         free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
>>  out4:
>>         free_page((unsigned long)vmx_msr_bitmap_longmode);
>> @@ -6557,8 +6546,6 @@ static __exit void hardware_unsetup(void)
>>         free_page((unsigned long)vmx_io_bitmap_a);
>>         free_page((unsigned long)vmx_vmwrite_bitmap);
>>         free_page((unsigned long)vmx_vmread_bitmap);
>> -       if (nested)
>> -               free_page((unsigned long)vmx_msr_bitmap_nested);
>>
>>         free_kvm_area();
>>  }
>> @@ -6995,16 +6982,21 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
>>                 return 1;
>>         }
>>
>> +       if (cpu_has_vmx_msr_bitmap()) {
>> +               vmx->nested.msr_bitmap =
>> +                       (unsigned long *)__get_free_page(GFP_KERNEL);
>> +               if (!vmx->nested.msr_bitmap)
>> +                       goto out_msr_bitmap;
>> +       }
>> +
>
>
> We export msr_bitmap to the guest even if it is not supported by hardware.
> So we need to allocate msr_bitmap for L1 unconditionally.
>

Nice to see you, Yang :-)

I think vmx->nested.msr_bitmap is filled from both L0's and L1's
settings, and it is the bitmap actually used by the hardware. L1's own
msr_bitmap is managed by L1 itself: L1 writes the address of its bitmap
into the MSR_BITMAP field (vmcs12->msr_bitmap), so L0 can read L1's
settings when emulating MSR_BITMAP.
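In other words, the merge goes roughly like this (a simplified
userspace sketch; the helper names and the "safe_slots" parameter are
illustrative, not the actual KVM code): the bitmap used while running
L2 starts out intercepting everything, and a bit is cleared only for an
MSR slot that L0 considers safe and that L1's own bitmap also leaves
unintercepted.

#include <limits.h>
#include <string.h>

#define PAGE_SIZE      4096
#define BITS_PER_LONG  (CHAR_BIT * sizeof(unsigned long))

static int test_msr_bit(const unsigned long *bmp, unsigned int slot)
{
        return (bmp[slot / BITS_PER_LONG] >> (slot % BITS_PER_LONG)) & 1;
}

static void clear_msr_bit(unsigned long *bmp, unsigned int slot)
{
        bmp[slot / BITS_PER_LONG] &= ~(1UL << (slot % BITS_PER_LONG));
}

/*
 * l02: the bitmap the CPU uses while running L2 (vmx->nested.msr_bitmap).
 * l1:  the bitmap L1 published through vmcs12->msr_bitmap.
 * Start from "intercept everything" and punch holes only for MSR slots
 * that L0 knows are safe AND that L1 itself does not intercept.
 */
static void merge_msr_bitmaps(unsigned long *l02, const unsigned long *l1,
                              const unsigned int *safe_slots, unsigned int n)
{
        unsigned int i;

        memset(l02, 0xff, PAGE_SIZE);   /* default: intercept every access */

        for (i = 0; i < n; i++)
                if (!test_msr_bit(l1, safe_slots[i]))   /* L1 lets it through */
                        clear_msr_bit(l02, safe_slots[i]);
}

The memset(msr_bitmap_l0, 0xff, PAGE_SIZE) plus the
nested_vmx_disable_intercept_for_msr() calls in the patch below follow
the same fail-closed direction: anything not explicitly allowed stays
intercepted.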
Am I missing something?

Thanks,
Wincy

>>         vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
>>         if (!vmx->nested.cached_vmcs12)
>> -               return -ENOMEM;
>> +               goto out_cached_vmcs12;
>>
>>         if (enable_shadow_vmcs) {
>>                 shadow_vmcs = alloc_vmcs();
>> -               if (!shadow_vmcs) {
>> -                       kfree(vmx->nested.cached_vmcs12);
>> -                       return -ENOMEM;
>> -               }
>> +               if (!shadow_vmcs)
>> +                       goto out_shadow_vmcs;
>>                 /* mark vmcs as shadow */
>>                 shadow_vmcs->revision_id |= (1u << 31);
>>                 /* init shadow vmcs */
>> @@ -7024,6 +7016,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
>>         skip_emulated_instruction(vcpu);
>>         nested_vmx_succeed(vcpu);
>>         return 1;
>> +
>> +out_shadow_vmcs:
>> +       kfree(vmx->nested.cached_vmcs12);
>> +
>> +out_cached_vmcs12:
>> +       free_page((unsigned long)vmx->nested.msr_bitmap);
>> +
>> +out_msr_bitmap:
>> +       return -ENOMEM;
>>  }
>>
>>  /*
>> @@ -7098,6 +7099,10 @@ static void free_nested(struct vcpu_vmx *vmx)
>>         vmx->nested.vmxon = false;
>>         free_vpid(vmx->nested.vpid02);
>>         nested_release_vmcs12(vmx);
>> +       if (vmx->nested.msr_bitmap) {
>> +               free_page((unsigned long)vmx->nested.msr_bitmap);
>> +               vmx->nested.msr_bitmap = NULL;
>> +       }
>>         if (enable_shadow_vmcs)
>>                 free_vmcs(vmx->nested.current_shadow_vmcs);
>>         kfree(vmx->nested.cached_vmcs12);
>> @@ -9472,8 +9477,10 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
>>  {
>>         int msr;
>>         struct page *page;
>> -       unsigned long *msr_bitmap;
>> +       unsigned long *msr_bitmap_l1;
>> +       unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap;
>>
>> +       /* This shortcut is ok because we support only x2APIC MSRs so far. */
>>         if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
>>                 return false;
>>
>> @@ -9482,63 +9489,37 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
>>                 WARN_ON(1);
>>                 return false;
>>         }
>> -       msr_bitmap = (unsigned long *)kmap(page);
>> -       if (!msr_bitmap) {
>> +       msr_bitmap_l1 = (unsigned long *)kmap(page);
>> +       if (!msr_bitmap_l1) {
>>                 nested_release_page_clean(page);
>>                 WARN_ON(1);
>>                 return false;
>>         }
>>
>> +       memset(msr_bitmap_l0, 0xff, PAGE_SIZE);
>> +
>>         if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
>>                 if (nested_cpu_has_apic_reg_virt(vmcs12))
>>                         for (msr = 0x800; msr <= 0x8ff; msr++)
>>                                 nested_vmx_disable_intercept_for_msr(
>> -                                       msr_bitmap,
>> -                                       vmx_msr_bitmap_nested,
>> +                                       msr_bitmap_l1, msr_bitmap_l0,
>>                                         msr, MSR_TYPE_R);
>> -               /* TPR is allowed */
>> -               nested_vmx_disable_intercept_for_msr(msr_bitmap,
>> -                               vmx_msr_bitmap_nested,
>> +
>> +               nested_vmx_disable_intercept_for_msr(
>> +                               msr_bitmap_l1, msr_bitmap_l0,
>>                                 APIC_BASE_MSR + (APIC_TASKPRI >> 4),
>>                                 MSR_TYPE_R | MSR_TYPE_W);
>> +
>>                 if (nested_cpu_has_vid(vmcs12)) {
>> -                       /* EOI and self-IPI are allowed */
>>                         nested_vmx_disable_intercept_for_msr(
>> -                               msr_bitmap,
>> -                               vmx_msr_bitmap_nested,
>> +                               msr_bitmap_l1, msr_bitmap_l0,
>>                                 APIC_BASE_MSR + (APIC_EOI >> 4),
>>                                 MSR_TYPE_W);
>>                         nested_vmx_disable_intercept_for_msr(
>> -                               msr_bitmap,
>> -                               vmx_msr_bitmap_nested,
>> +                               msr_bitmap_l1, msr_bitmap_l0,
>>                                 APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
>>                                 MSR_TYPE_W);
>>                 }
>> -       } else {
>> -               /*
>> -                * Enable reading intercept of all the x2apic
>> -                * MSRs. We should not rely on vmcs12 to do any
>> -                * optimizations here, it may have been modified
>> -                * by L1.
>> -                */
>> -               for (msr = 0x800; msr <= 0x8ff; msr++)
>> -                       __vmx_enable_intercept_for_msr(
>> -                                       vmx_msr_bitmap_nested,
>> -                                       msr,
>> -                                       MSR_TYPE_R);
>> -
>> -               __vmx_enable_intercept_for_msr(
>> -                               vmx_msr_bitmap_nested,
>> -                               APIC_BASE_MSR + (APIC_TASKPRI >> 4),
>> -                               MSR_TYPE_W);
>> -               __vmx_enable_intercept_for_msr(
>> -                               vmx_msr_bitmap_nested,
>> -                               APIC_BASE_MSR + (APIC_EOI >> 4),
>> -                               MSR_TYPE_W);
>> -               __vmx_enable_intercept_for_msr(
>> -                               vmx_msr_bitmap_nested,
>> -                               APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
>> -                               MSR_TYPE_W);
>>         }
>>         kunmap(page);
>>         nested_release_page_clean(page);
>> @@ -9957,10 +9938,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>>         }
>>
>>         if (cpu_has_vmx_msr_bitmap() &&
>> -           exec_control & CPU_BASED_USE_MSR_BITMAPS) {
>> -               nested_vmx_merge_msr_bitmap(vcpu, vmcs12);
>> -               /* MSR_BITMAP will be set by following vmx_set_efer. */
>> -       } else
>> +           exec_control & CPU_BASED_USE_MSR_BITMAPS &&
>> +           nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
>> +               ; /* MSR_BITMAP will be set by following vmx_set_efer. */
>> +       else
>>                 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
>>
>>         /*
>>
>
>
> --
> Yang
> Alibaba Cloud Computing