Sean Christopherson <seanjc@xxxxxxxxxx> writes:

> Always check vmcs01's MSR bitmap when merging L0 and L1 bitmaps for L2,
> and always update the relevant bits in vmcs02.  This fixes two distinct
> but intertwined bugs related to dynamic MSR bitmap modifications.
>
> The first issue is that KVM fails to enable MSR interception in vmcs02
> for the FS/GS base MSRs if L1 first runs L2 with interception disabled,
> and later enables interception.
>
> The second issue is that KVM fails to honor userspace MSR filtering when
> preparing vmcs02.
>
> Fix both issues simultaneously, as fixing only one of the issues (doesn't
> matter which) would create a mess that no one should have to bisect.
> Fixing only the first bug would exacerbate the MSR filtering issue as
> userspace would see inconsistent behavior depending on the whims of L1.
> Fixing only the second bug (MSR filtering) effectively requires fixing
> the first, as the nVMX code only knows how to transition vmcs02's
> bitmap from 1->0.
>
> Move the various accessors/mutators that are currently buried in vmx.c
> into vmx.h so that they can be shared by the nested code.
>
> Fixes: 1a155254ff93 ("KVM: x86: Introduce MSR filtering")
> Fixes: d69129b4e46a ("KVM: nVMX: Disable intercept for FS/GS base MSRs in vmcs02 when possible")
> Cc: stable@xxxxxxxxxxxxxxx
> Cc: Alexander Graf <graf@xxxxxxxxxx>
> Signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx>
> ---
>  arch/x86/kvm/vmx/nested.c | 111 +++++++++++++++++---------------------
>  arch/x86/kvm/vmx/vmx.c    |  67 ++---------------------
>  arch/x86/kvm/vmx/vmx.h    |  63 ++++++++++++++++++++++
>  3 files changed, 116 insertions(+), 125 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index eedcebf58004..3c9657f6923e 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -523,29 +523,6 @@ static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
>  	return 0;
>  }
>
> -/*
> - * Check if MSR is intercepted for L01 MSR bitmap.
> - */
> -static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
> -{
> -	unsigned long *msr_bitmap;
> -	int f = sizeof(unsigned long);
> -
> -	if (!cpu_has_vmx_msr_bitmap())
> -		return true;
> -
> -	msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
> -
> -	if (msr <= 0x1fff) {
> -		return !!test_bit(msr, msr_bitmap + 0x800 / f);
> -	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
> -		msr &= 0x1fff;
> -		return !!test_bit(msr, msr_bitmap + 0xc00 / f);
> -	}
> -
> -	return true;
> -}
> -
>  /*
>   * If a msr is allowed by L0, we should check whether it is allowed by L1.
>   * The corresponding bit will be cleared unless both of L0 and L1 allow it.
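
(An aside for anyone following along: the 0x800/0xc00 offsets in the helper
being deleted above are the write-low/write-high quadrants of the 4K VMX MSR
bitmap page; the read quadrants sit at 0x000/0x400, and MSRs outside
0x0-0x1fff / 0xc0000000-0xc0001fff are always intercepted. Below is a
stand-alone sketch of that layout -- purely illustrative, not part of the
patch, and msr_bitmap_byte_bit() is a made-up name:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Map an MSR index to its byte/bit position within the 4K bitmap page. */
static bool msr_bitmap_byte_bit(uint32_t msr, bool write,
                                unsigned int *byte, unsigned int *bit)
{
        unsigned int base = write ? 0x800 : 0x000;  /* read vs. write half */

        if (msr <= 0x1fff)
                ;                       /* "low" MSR quadrant */
        else if (msr >= 0xc0000000 && msr <= 0xc0001fff)
                base += 0x400;          /* "high" MSR quadrant */
        else
                return false;           /* not covered: always intercepted */

        *byte = base + (msr & 0x1fff) / 8;
        *bit  = (msr & 0x1fff) % 8;
        return true;
}

int main(void)
{
        unsigned int byte, bit;

        if (msr_bitmap_byte_bit(0x48 /* MSR_IA32_SPEC_CTRL */, true, &byte, &bit))
                printf("SPEC_CTRL write intercept: byte 0x%x, bit %u\n", byte, bit);
        return 0;
}

For 0x48 that lands at byte 0x809, bit 0, i.e. the same bit the deleted
test_bit(msr, msr_bitmap + 0x800 / f) checks, just spelled with byte math.)
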
> @@ -599,6 +576,34 @@ static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
>  	}
>  }
>
> +#define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw)                                \
> +static inline                                                              \
> +void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx,             \
> +                                         unsigned long *msr_bitmap_l1,     \
> +                                         unsigned long *msr_bitmap_l0, u32 msr) \
> +{                                                                          \
> +        if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) ||       \
> +            vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr))                  \
> +                vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr);               \
> +        else                                                               \
> +                vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr);             \
> +}
> +BUILD_NVMX_MSR_INTERCEPT_HELPER(read)
> +BUILD_NVMX_MSR_INTERCEPT_HELPER(write)
> +
> +static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx,
> +                                                    unsigned long *msr_bitmap_l1,
> +                                                    unsigned long *msr_bitmap_l0,
> +                                                    u32 msr, int types)
> +{
> +        if (types & MSR_TYPE_R)
> +                nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1,
> +                                                  msr_bitmap_l0, msr);
> +        if (types & MSR_TYPE_W)
> +                nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1,
> +                                                   msr_bitmap_l0, msr);
> +}
> +
>  /*
>   * Merge L0's and L1's MSR bitmap, return false to indicate that
>   * we do not use the hardware.
> @@ -606,10 +611,11 @@ static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
>  static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
>                                                   struct vmcs12 *vmcs12)
>  {
> +        struct vcpu_vmx *vmx = to_vmx(vcpu);
>          int msr;
>          unsigned long *msr_bitmap_l1;
> -        unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
> -        struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;
> +        unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
> +        struct kvm_host_map *map = &vmx->nested.msr_bitmap_map;
>
>          /* Nothing to do if the MSR bitmap is not in use. */
>          if (!cpu_has_vmx_msr_bitmap() ||
> @@ -660,44 +666,27 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
>                  }
>          }
>
> -        /* KVM unconditionally exposes the FS/GS base MSRs to L1. */
> -#ifdef CONFIG_X86_64
> -        nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
> -                                             MSR_FS_BASE, MSR_TYPE_RW);
> -
> -        nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
> -                                             MSR_GS_BASE, MSR_TYPE_RW);
> -
> -        nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
> -                                             MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
> -#endif
> -
>          /*
> -         * Checking the L0->L1 bitmap is trying to verify two things:
> -         *
> -         * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
> -         *    ensures that we do not accidentally generate an L02 MSR bitmap
> -         *    from the L12 MSR bitmap that is too permissive.
> -         * 2. That L1 or L2s have actually used the MSR. This avoids
> -         *    unnecessarily merging of the bitmap if the MSR is unused. This
> -         *    works properly because we only update the L01 MSR bitmap lazily.
> -         *    So even if L0 should pass L1 these MSRs, the L01 bitmap is only
> -         *    updated to reflect this when L1 (or its L2s) actually write to
> -         *    the MSR.
> +         * Always check vmcs01's bitmap to honor userspace MSR filters and any
> +         * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through.
>           */
> -        if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL))
> -                nested_vmx_disable_intercept_for_msr(
> -                                        msr_bitmap_l1, msr_bitmap_l0,
> -                                        MSR_IA32_SPEC_CTRL,
> -                                        MSR_TYPE_R | MSR_TYPE_W);
> -
> -        if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD))
> -                nested_vmx_disable_intercept_for_msr(
> -                                        msr_bitmap_l1, msr_bitmap_l0,
> -                                        MSR_IA32_PRED_CMD,
> -                                        MSR_TYPE_W);
> -
> -        kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);
> +#ifdef CONFIG_X86_64
> +        nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
> +                                         MSR_FS_BASE, MSR_TYPE_RW);
> +
> +        nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
> +                                         MSR_GS_BASE, MSR_TYPE_RW);
> +
> +        nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
> +                                         MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
> +#endif
> +        nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
> +                                         MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);
> +
> +        nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
> +                                         MSR_IA32_PRED_CMD, MSR_TYPE_W);
> +
> +        kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false);
>
>          return true;
>  }
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index d118daed0530..86a8c2713039 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -766,29 +766,6 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
>          vmcs_write32(EXCEPTION_BITMAP, eb);
>  }
>
> -/*
> - * Check if MSR is intercepted for currently loaded MSR bitmap.
> - */
> -static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
> -{
> -        unsigned long *msr_bitmap;
> -        int f = sizeof(unsigned long);
> -
> -        if (!cpu_has_vmx_msr_bitmap())
> -                return true;
> -
> -        msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
> -
> -        if (msr <= 0x1fff) {
> -                return !!test_bit(msr, msr_bitmap + 0x800 / f);
> -        } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
> -                msr &= 0x1fff;
> -                return !!test_bit(msr, msr_bitmap + 0xc00 / f);
> -        }
> -
> -        return true;
> -}
> -
>  static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
>                                              unsigned long entry, unsigned long exit)
>  {
> @@ -3695,46 +3672,6 @@ void free_vpid(int vpid)
>          spin_unlock(&vmx_vpid_lock);
>  }
>
> -static void vmx_clear_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
> -{
> -        int f = sizeof(unsigned long);
> -
> -        if (msr <= 0x1fff)
> -                __clear_bit(msr, msr_bitmap + 0x000 / f);
> -        else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
> -                __clear_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
> -}
> -
> -static void vmx_clear_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
> -{
> -        int f = sizeof(unsigned long);
> -
> -        if (msr <= 0x1fff)
> -                __clear_bit(msr, msr_bitmap + 0x800 / f);
> -        else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
> -                __clear_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
> -}
> -
> -static void vmx_set_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
> -{
> -        int f = sizeof(unsigned long);
> -
> -        if (msr <= 0x1fff)
> -                __set_bit(msr, msr_bitmap + 0x000 / f);
> -        else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
> -                __set_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
> -}
> -
> -static void vmx_set_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
> -{
> -        int f = sizeof(unsigned long);
> -
> -        if (msr <= 0x1fff)
> -                __set_bit(msr, msr_bitmap + 0x800 / f);
> -        else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
> -                __set_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
> -}
> -
>  void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
>  {
>          struct vcpu_vmx *vmx = to_vmx(vcpu);
> @@ -6749,7 +6686,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
>           * If the L02 MSR bitmap does not intercept the MSR, then we need to
>           * save it.
>           */
> -        if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
> +        if (unlikely(cpu_has_vmx_msr_bitmap() &&
> +                     vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap,
> +                                               MSR_IA32_SPEC_CTRL)))
>                  vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);

I smoke-tested this patch by running (unrelated) selftests when I tried to
put it into my 'Enlightened MSR Bitmap v4' series, and my dmesg got flooded
with:

[   87.210214] unchecked MSR access error: RDMSR from 0x48 at rIP: 0xffffffffc04e0284 (native_read_msr+0x4/0x30 [kvm_intel])
[   87.210325] Call Trace:
[   87.210355]  vmx_vcpu_run+0xcc7/0x12b0 [kvm_intel]
[   87.210405]  ? vmx_prepare_switch_to_guest+0x138/0x1f0 [kvm_intel]
[   87.210466]  vcpu_enter_guest+0x98c/0x1380 [kvm]
[   87.210631]  ? vmx_vcpu_put+0x2e/0x1f0 [kvm_intel]
[   87.210678]  ? vmx_vcpu_load+0x21/0x60 [kvm_intel]
[   87.210729]  kvm_arch_vcpu_ioctl_run+0xdf/0x580 [kvm]
[   87.210844]  kvm_vcpu_ioctl+0x274/0x660 [kvm]
[   87.210950]  __x64_sys_ioctl+0x83/0xb0
[   87.210996]  do_syscall_64+0x3b/0x90
[   87.211039]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[   87.211093] RIP: 0033:0x7f6ef7f9a307
[   87.211134] Code: 44 00 00 48 8b 05 69 1b 2d 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 39 1b 2d 00 f7 d8 64 89 01 48
[   87.211293] RSP: 002b:00007ffcacfb3b18 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[   87.211367] RAX: ffffffffffffffda RBX: 0000000000a2f300 RCX: 00007f6ef7f9a307
[   87.211434] RDX: 0000000000000000 RSI: 000000000000ae80 RDI: 0000000000000007
[   87.211500] RBP: 0000000000000000 R08: 000000000040e769 R09: 0000000000000000
[   87.211559] R10: 0000000000a2f001 R11: 0000000000000246 R12: 0000000000a2d010
[   87.211622] R13: 0000000000a2d010 R14: 0000000000402a15 R15: 00000000ffff0ff0
[   87.212520] Call Trace:
[   87.212597]  vmx_vcpu_run+0xcc7/0x12b0 [kvm_intel]
[   87.212683]  ? vmx_prepare_switch_to_guest+0x138/0x1f0 [kvm_intel]
[   87.212789]  vcpu_enter_guest+0x98c/0x1380 [kvm]
[   87.213059]  ? vmx_vcpu_put+0x2e/0x1f0 [kvm_intel]
[   87.213141]  ? schedule+0x44/0xa0
[   87.213200]  kvm_arch_vcpu_ioctl_run+0xdf/0x580 [kvm]
[   87.213428]  kvm_vcpu_ioctl+0x274/0x660 [kvm]
[   87.213633]  __x64_sys_ioctl+0x83/0xb0
[   87.213705]  do_syscall_64+0x3b/0x90
[   87.213766]  entry_SYSCALL_64_after_hwframe+0x44/0xae

... this was on an old 'E5-2603 v3' CPU. Any idea what's wrong?
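
FWIW, 0x48 is MSR_IA32_SPEC_CTRL, and the fact that the RDMSR itself faults
suggests the MSR simply isn't implemented on this host (no IBRS microcode on
that old Haswell), i.e. we are now reaching the native_read_msr() on a path
we previously never took. If I'm reading the hunk above right, the new
condition looks inverted relative to the old one. Side by side (names as in
the patch, comments are mine, and this is just my guess):

        /*
         * Before: save the guest value only when the WRMSR is *not*
         * intercepted, i.e. only when SPEC_CTRL is passed through to
         * the guest.
         */
        if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
                vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);

        /*
         * After: vmx_test_msr_bitmap_write() returns true when the write
         * *is* intercepted, so the RDMSR now runs exactly when SPEC_CTRL
         * was never passed through -- including on hosts where the MSR
         * doesn't exist at all.
         */
        if (unlikely(cpu_has_vmx_msr_bitmap() &&
                     vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap,
                                               MSR_IA32_SPEC_CTRL)))
                vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);

Shouldn't the new check be '!vmx_test_msr_bitmap_write(...)'?
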
>
>          x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
> diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
> index 592217fd7d92..3f9c8548625d 100644
> --- a/arch/x86/kvm/vmx/vmx.h
> +++ b/arch/x86/kvm/vmx/vmx.h
> @@ -400,6 +400,69 @@ static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr,
>
>  void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
>
> +static inline bool vmx_test_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
> +{
> +        int f = sizeof(unsigned long);
> +
> +        if (msr <= 0x1fff)
> +                return test_bit(msr, msr_bitmap + 0x000 / f);
> +        else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
> +                return test_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
> +        return true;
> +}
> +
> +static inline bool vmx_test_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
> +{
> +        int f = sizeof(unsigned long);
> +
> +        if (msr <= 0x1fff)
> +                return test_bit(msr, msr_bitmap + 0x800 / f);
> +        else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
> +                return test_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
> +        return true;
> +}
> +
> +static inline void vmx_clear_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
> +{
> +        int f = sizeof(unsigned long);
> +
> +        if (msr <= 0x1fff)
> +                __clear_bit(msr, msr_bitmap + 0x000 / f);
> +        else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
> +                __clear_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
> +}
> +
> +static inline void vmx_clear_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
> +{
> +        int f = sizeof(unsigned long);
> +
> +        if (msr <= 0x1fff)
> +                __clear_bit(msr, msr_bitmap + 0x800 / f);
> +        else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
> +                __clear_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
> +}
> +
> +static inline void vmx_set_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
> +{
> +        int f = sizeof(unsigned long);
> +
> +        if (msr <= 0x1fff)
> +                __set_bit(msr, msr_bitmap + 0x000 / f);
> +        else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
> +                __set_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
> +}
> +
> +static inline void vmx_set_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
> +{
> +        int f = sizeof(unsigned long);
> +
> +        if (msr <= 0x1fff)
> +                __set_bit(msr, msr_bitmap + 0x800 / f);
> +        else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
> +                __set_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
> +}
> +
> +
>  static inline u8 vmx_get_rvi(void)
>  {
>          return vmcs_read16(GUEST_INTR_STATUS) & 0xff;

-- 
Vitaly