4.4-stable review patch. If anyone has any objections, please let me know. ------------------ From: "Radim Krčmář" <rkrcmar@xxxxxxxxxx> commit d048c098218e91ed0e10dfa1f0f80e2567fe4ef7 upstream. msr bitmap can be used to avoid a VM exit (interception) on guest MSR accesses. In some configurations of VMX controls, the guest can even directly access host's x2APIC MSRs. See SDM 29.5 VIRTUALIZING MSR-BASED APIC ACCESSES. L2 could read all L0's x2APIC MSRs and write TPR, EOI, and SELF_IPI. To do so, L1 would first trick KVM to disable all possible interceptions by enabling APICv features and then would turn those features off; nested_vmx_merge_msr_bitmap() only disabled interceptions, so VMX would not intercept previously enabled MSRs even though they were not safe with the new configuration. Correctly re-enabling interceptions is not enough as a second bug would still allow L1+L2 to access host's MSRs: msr bitmap was shared for all VMCSs, so L1 could trigger a race to get the desired combination of msr bitmap and VMX controls. This fix allocates a msr bitmap for every L1 VCPU, allows only safe x2APIC MSRs from L1's msr bitmap, and disables msr bitmaps if they would have to intercept everything anyway. Fixes: 3af18d9c5fe9 ("KVM: nVMX: Prepare for using hardware MSR bitmap") Reported-by: Jim Mattson <jmattson@xxxxxxxxxx> Suggested-by: Wincy Van <fanwenyi0529@xxxxxxxxx> Reviewed-by: Wanpeng Li <wanpeng.li@xxxxxxxxxxx> Signed-off-by: Radim Krčmář <rkrcmar@xxxxxxxxxx> [bwh: Backported to 4.4: - handle_vmon() doesn't allocate a cached vmcs12 - Adjust context] Signed-off-by: Ben Hutchings <ben.hutchings@xxxxxxxxxxxxxxx> Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx> --- arch/x86/kvm/vmx.c | 96 ++++++++++++++++++++--------------------------------- 1 file changed, 38 insertions(+), 58 deletions(-) --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -431,6 +431,8 @@ struct nested_vmx { u16 posted_intr_nv; u64 msr_ia32_feature_control; + unsigned long *msr_bitmap; + struct hrtimer preemption_timer; bool preemption_timer_expired; @@ -912,7 +914,6 @@ static unsigned long *vmx_msr_bitmap_leg static unsigned long *vmx_msr_bitmap_longmode; static unsigned long *vmx_msr_bitmap_legacy_x2apic; static unsigned long *vmx_msr_bitmap_longmode_x2apic; -static unsigned long *vmx_msr_bitmap_nested; static unsigned long *vmx_vmread_bitmap; static unsigned long *vmx_vmwrite_bitmap; @@ -2358,7 +2359,7 @@ static void vmx_set_msr_bitmap(struct kv unsigned long *msr_bitmap; if (is_guest_mode(vcpu)) - msr_bitmap = vmx_msr_bitmap_nested; + msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap; else if (vcpu->arch.apic_base & X2APIC_ENABLE) { if (is_long_mode(vcpu)) msr_bitmap = vmx_msr_bitmap_longmode_x2apic; @@ -6192,13 +6193,6 @@ static __init int hardware_setup(void) if (!vmx_msr_bitmap_longmode_x2apic) goto out4; - if (nested) { - vmx_msr_bitmap_nested = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_nested) - goto out5; - } - vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_vmread_bitmap) goto out6; @@ -6216,8 +6210,6 @@ static __init int hardware_setup(void) memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); - if (nested) - memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE); if (setup_vmcs_config(&vmcs_config) < 0) { r = -EIO; @@ -6354,9 +6346,6 @@ out8: out7: free_page((unsigned long)vmx_vmread_bitmap); out6: - if (nested) - free_page((unsigned long)vmx_msr_bitmap_nested); -out5: free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); out4: free_page((unsigned long)vmx_msr_bitmap_longmode); @@ -6382,8 +6371,6 @@ static __exit void hardware_unsetup(void free_page((unsigned long)vmx_io_bitmap_a); free_page((unsigned long)vmx_vmwrite_bitmap); free_page((unsigned long)vmx_vmread_bitmap); - if (nested) - free_page((unsigned long)vmx_msr_bitmap_nested); free_kvm_area(); } @@ -6825,10 +6812,17 @@ static int handle_vmon(struct kvm_vcpu * return 1; } + if (cpu_has_vmx_msr_bitmap()) { + vmx->nested.msr_bitmap = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx->nested.msr_bitmap) + goto out_msr_bitmap; + } + if (enable_shadow_vmcs) { shadow_vmcs = alloc_vmcs(); if (!shadow_vmcs) - return -ENOMEM; + goto out_shadow_vmcs; /* mark vmcs as shadow */ shadow_vmcs->revision_id |= (1u << 31); /* init shadow vmcs */ @@ -6850,6 +6844,12 @@ static int handle_vmon(struct kvm_vcpu * skip_emulated_instruction(vcpu); nested_vmx_succeed(vcpu); return 1; + +out_shadow_vmcs: + free_page((unsigned long)vmx->nested.msr_bitmap); + +out_msr_bitmap: + return -ENOMEM; } /* @@ -6919,6 +6919,10 @@ static void free_nested(struct vcpu_vmx vmx->nested.vmxon = false; free_vpid(vmx->nested.vpid02); nested_release_vmcs12(vmx); + if (vmx->nested.msr_bitmap) { + free_page((unsigned long)vmx->nested.msr_bitmap); + vmx->nested.msr_bitmap = NULL; + } if (enable_shadow_vmcs) free_vmcs(vmx->nested.current_shadow_vmcs); /* Unpin physical memory we referred to in current vmcs02 */ @@ -9248,8 +9252,10 @@ static inline bool nested_vmx_merge_msr_ { int msr; struct page *page; - unsigned long *msr_bitmap; + unsigned long *msr_bitmap_l1; + unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap; + /* This shortcut is ok because we support only x2APIC MSRs so far. */ if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) return false; @@ -9258,58 +9264,32 @@ static inline bool nested_vmx_merge_msr_ WARN_ON(1); return false; } - msr_bitmap = (unsigned long *)kmap(page); + msr_bitmap_l1 = (unsigned long *)kmap(page); + + memset(msr_bitmap_l0, 0xff, PAGE_SIZE); if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { if (nested_cpu_has_apic_reg_virt(vmcs12)) for (msr = 0x800; msr <= 0x8ff; msr++) nested_vmx_disable_intercept_for_msr( - msr_bitmap, - vmx_msr_bitmap_nested, + msr_bitmap_l1, msr_bitmap_l0, msr, MSR_TYPE_R); - /* TPR is allowed */ - nested_vmx_disable_intercept_for_msr(msr_bitmap, - vmx_msr_bitmap_nested, + + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, APIC_BASE_MSR + (APIC_TASKPRI >> 4), MSR_TYPE_R | MSR_TYPE_W); + if (nested_cpu_has_vid(vmcs12)) { - /* EOI and self-IPI are allowed */ nested_vmx_disable_intercept_for_msr( - msr_bitmap, - vmx_msr_bitmap_nested, + msr_bitmap_l1, msr_bitmap_l0, APIC_BASE_MSR + (APIC_EOI >> 4), MSR_TYPE_W); nested_vmx_disable_intercept_for_msr( - msr_bitmap, - vmx_msr_bitmap_nested, + msr_bitmap_l1, msr_bitmap_l0, APIC_BASE_MSR + (APIC_SELF_IPI >> 4), MSR_TYPE_W); } - } else { - /* - * Enable reading intercept of all the x2apic - * MSRs. We should not rely on vmcs12 to do any - * optimizations here, it may have been modified - * by L1. - */ - for (msr = 0x800; msr <= 0x8ff; msr++) - __vmx_enable_intercept_for_msr( - vmx_msr_bitmap_nested, - msr, - MSR_TYPE_R); - - __vmx_enable_intercept_for_msr( - vmx_msr_bitmap_nested, - APIC_BASE_MSR + (APIC_TASKPRI >> 4), - MSR_TYPE_W); - __vmx_enable_intercept_for_msr( - vmx_msr_bitmap_nested, - APIC_BASE_MSR + (APIC_EOI >> 4), - MSR_TYPE_W); - __vmx_enable_intercept_for_msr( - vmx_msr_bitmap_nested, - APIC_BASE_MSR + (APIC_SELF_IPI >> 4), - MSR_TYPE_W); } kunmap(page); nested_release_page_clean(page); @@ -9729,10 +9709,10 @@ static void prepare_vmcs02(struct kvm_vc } if (cpu_has_vmx_msr_bitmap() && - exec_control & CPU_BASED_USE_MSR_BITMAPS) { - nested_vmx_merge_msr_bitmap(vcpu, vmcs12); - /* MSR_BITMAP will be set by following vmx_set_efer. */ - } else + exec_control & CPU_BASED_USE_MSR_BITMAPS && + nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) + ; /* MSR_BITMAP will be set by following vmx_set_efer. */ + else exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; /*