Re: [PATCH v3 1/3] KVM: nVMX: Handle dynamic MSR intercept toggling

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Sean Christopherson <seanjc@xxxxxxxxxx> writes:

> Always check vmcs01's MSR bitmap when merging L0 and L1 bitmaps for L2,
> and always update the relevant bits in vmcs02.  This fixes two distinct,
> but intertwined bugs related to dynamic MSR bitmap modifications.
>
> The first issue is that KVM fails to enable MSR interception in vmcs02
> for the FS/GS base MSRs if L1 first runs L2 with interception disabled,
> and later enables interception.
>
> The second issue is that KVM fails to honor userspace MSR filtering when
> preparing vmcs02.
>
> Fix both issues simultaneous as fixing only one of the issues (doesn't
> matter which) would create a mess that no one should have to bisect.
> Fixing only the first bug would exacerbate the MSR filtering issue as
> userspace would see inconsistent behavior depending on the whims of L1.
> Fixing only the second bug (MSR filtering) effectively requires fixing
> the first, as the nVMX code only knows how to transition vmcs02's
> bitmap from 1->0.
>
> Move the various accessor/mutators that are currently buried in vmx.c
> into vmx.h so that they can be shared by the nested code.
>
> Fixes: 1a155254ff93 ("KVM: x86: Introduce MSR filtering")
> Fixes: d69129b4e46a ("KVM: nVMX: Disable intercept for FS/GS base MSRs in vmcs02 when possible")
> Cc: stable@xxxxxxxxxxxxxxx
> Cc: Alexander Graf <graf@xxxxxxxxxx>
> Signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx>
> ---
>  arch/x86/kvm/vmx/nested.c | 111 +++++++++++++++++---------------------
>  arch/x86/kvm/vmx/vmx.c    |  67 ++---------------------
>  arch/x86/kvm/vmx/vmx.h    |  63 ++++++++++++++++++++++
>  3 files changed, 116 insertions(+), 125 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index eedcebf58004..3c9657f6923e 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -523,29 +523,6 @@ static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
>  	return 0;
>  }
>  
> -/*
> - * Check if MSR is intercepted for L01 MSR bitmap.
> - */
> -static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
> -{
> -	unsigned long *msr_bitmap;
> -	int f = sizeof(unsigned long);
> -
> -	if (!cpu_has_vmx_msr_bitmap())
> -		return true;
> -
> -	msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
> -
> -	if (msr <= 0x1fff) {
> -		return !!test_bit(msr, msr_bitmap + 0x800 / f);
> -	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
> -		msr &= 0x1fff;
> -		return !!test_bit(msr, msr_bitmap + 0xc00 / f);
> -	}
> -
> -	return true;
> -}
> -
>  /*
>   * If a msr is allowed by L0, we should check whether it is allowed by L1.
>   * The corresponding bit will be cleared unless both of L0 and L1 allow it.
> @@ -599,6 +576,34 @@ static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
>  	}
>  }
>  
> +#define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw)					\
> +static inline									\
> +void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx,			\
> +					 unsigned long *msr_bitmap_l1,		\
> +					 unsigned long *msr_bitmap_l0, u32 msr)	\
> +{										\
> +	if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) ||		\
> +	    vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr))			\
> +		vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr);			\
> +	else									\
> +		vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr);			\
> +}
> +BUILD_NVMX_MSR_INTERCEPT_HELPER(read)
> +BUILD_NVMX_MSR_INTERCEPT_HELPER(write)
> +
> +static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx,
> +						    unsigned long *msr_bitmap_l1,
> +						    unsigned long *msr_bitmap_l0,
> +						    u32 msr, int types)
> +{
> +	if (types & MSR_TYPE_R)
> +		nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1,
> +						  msr_bitmap_l0, msr);
> +	if (types & MSR_TYPE_W)
> +		nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1,
> +						   msr_bitmap_l0, msr);
> +}
> +
>  /*
>   * Merge L0's and L1's MSR bitmap, return false to indicate that
>   * we do not use the hardware.
> @@ -606,10 +611,11 @@ static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
>  static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
>  						 struct vmcs12 *vmcs12)
>  {
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
>  	int msr;
>  	unsigned long *msr_bitmap_l1;
> -	unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
> -	struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;
> +	unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
> +	struct kvm_host_map *map = &vmx->nested.msr_bitmap_map;
>  
>  	/* Nothing to do if the MSR bitmap is not in use.  */
>  	if (!cpu_has_vmx_msr_bitmap() ||
> @@ -660,44 +666,27 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
>  		}
>  	}
>  
> -	/* KVM unconditionally exposes the FS/GS base MSRs to L1. */
> -#ifdef CONFIG_X86_64
> -	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
> -					     MSR_FS_BASE, MSR_TYPE_RW);
> -
> -	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
> -					     MSR_GS_BASE, MSR_TYPE_RW);
> -
> -	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
> -					     MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
> -#endif
> -
>  	/*
> -	 * Checking the L0->L1 bitmap is trying to verify two things:
> -	 *
> -	 * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
> -	 *    ensures that we do not accidentally generate an L02 MSR bitmap
> -	 *    from the L12 MSR bitmap that is too permissive.
> -	 * 2. That L1 or L2s have actually used the MSR. This avoids
> -	 *    unnecessarily merging of the bitmap if the MSR is unused. This
> -	 *    works properly because we only update the L01 MSR bitmap lazily.
> -	 *    So even if L0 should pass L1 these MSRs, the L01 bitmap is only
> -	 *    updated to reflect this when L1 (or its L2s) actually write to
> -	 *    the MSR.
> +	 * Always check vmcs01's bitmap to honor userspace MSR filters and any
> +	 * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through.
>  	 */
> -	if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL))
> -		nested_vmx_disable_intercept_for_msr(
> -					msr_bitmap_l1, msr_bitmap_l0,
> -					MSR_IA32_SPEC_CTRL,
> -					MSR_TYPE_R | MSR_TYPE_W);
> -
> -	if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD))
> -		nested_vmx_disable_intercept_for_msr(
> -					msr_bitmap_l1, msr_bitmap_l0,
> -					MSR_IA32_PRED_CMD,
> -					MSR_TYPE_W);
> -
> -	kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);
> +#ifdef CONFIG_X86_64
> +	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
> +					 MSR_FS_BASE, MSR_TYPE_RW);
> +
> +	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
> +					 MSR_GS_BASE, MSR_TYPE_RW);
> +
> +	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
> +					 MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
> +#endif
> +	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
> +					 MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);
> +
> +	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
> +					 MSR_IA32_PRED_CMD, MSR_TYPE_W);
> +
> +	kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false);
>  
>  	return true;
>  }
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index d118daed0530..86a8c2713039 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -766,29 +766,6 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
>  	vmcs_write32(EXCEPTION_BITMAP, eb);
>  }
>  
> -/*
> - * Check if MSR is intercepted for currently loaded MSR bitmap.
> - */
> -static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
> -{
> -	unsigned long *msr_bitmap;
> -	int f = sizeof(unsigned long);
> -
> -	if (!cpu_has_vmx_msr_bitmap())
> -		return true;
> -
> -	msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
> -
> -	if (msr <= 0x1fff) {
> -		return !!test_bit(msr, msr_bitmap + 0x800 / f);
> -	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
> -		msr &= 0x1fff;
> -		return !!test_bit(msr, msr_bitmap + 0xc00 / f);
> -	}
> -
> -	return true;
> -}
> -
>  static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
>  		unsigned long entry, unsigned long exit)
>  {
> @@ -3695,46 +3672,6 @@ void free_vpid(int vpid)
>  	spin_unlock(&vmx_vpid_lock);
>  }
>  
> -static void vmx_clear_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
> -{
> -	int f = sizeof(unsigned long);
> -
> -	if (msr <= 0x1fff)
> -		__clear_bit(msr, msr_bitmap + 0x000 / f);
> -	else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
> -		__clear_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
> -}
> -
> -static void vmx_clear_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
> -{
> -	int f = sizeof(unsigned long);
> -
> -	if (msr <= 0x1fff)
> -		__clear_bit(msr, msr_bitmap + 0x800 / f);
> -	else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
> -		__clear_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
> -}
> -
> -static void vmx_set_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
> -{
> -	int f = sizeof(unsigned long);
> -
> -	if (msr <= 0x1fff)
> -		__set_bit(msr, msr_bitmap + 0x000 / f);
> -	else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
> -		__set_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
> -}
> -
> -static void vmx_set_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
> -{
> -	int f = sizeof(unsigned long);
> -
> -	if (msr <= 0x1fff)
> -		__set_bit(msr, msr_bitmap + 0x800 / f);
> -	else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
> -		__set_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
> -}
> -
>  void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
>  {
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
> @@ -6749,7 +6686,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
>  	 * If the L02 MSR bitmap does not intercept the MSR, then we need to
>  	 * save it.
>  	 */
> -	if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
> +	if (unlikely(cpu_has_vmx_msr_bitmap() &&
> +		     vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap,
> +					       MSR_IA32_SPEC_CTRL)))
>  		vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);

I smoke-tested this patch by running (unrelated) selftests when I tried
to put it into my 'Enlightened MSR Bitmap v4' series, and my dmesg got
flooded with:

[   87.210214] unchecked MSR access error: RDMSR from 0x48 at rIP: 0xffffffffc04e0284 (native_read_msr+0x4/0x30 [kvm_intel])
[   87.210325] Call Trace:
[   87.210355]  vmx_vcpu_run+0xcc7/0x12b0 [kvm_intel]
[   87.210405]  ? vmx_prepare_switch_to_guest+0x138/0x1f0 [kvm_intel]
[   87.210466]  vcpu_enter_guest+0x98c/0x1380 [kvm]
[   87.210631]  ? vmx_vcpu_put+0x2e/0x1f0 [kvm_intel]
[   87.210678]  ? vmx_vcpu_load+0x21/0x60 [kvm_intel]
[   87.210729]  kvm_arch_vcpu_ioctl_run+0xdf/0x580 [kvm]
[   87.210844]  kvm_vcpu_ioctl+0x274/0x660 [kvm]
[   87.210950]  __x64_sys_ioctl+0x83/0xb0
[   87.210996]  do_syscall_64+0x3b/0x90
[   87.211039]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[   87.211093] RIP: 0033:0x7f6ef7f9a307
[   87.211134] Code: 44 00 00 48 8b 05 69 1b 2d 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 39 1b 2d 00 f7 d8 64 89 01 48
[   87.211293] RSP: 002b:00007ffcacfb3b18 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[   87.211367] RAX: ffffffffffffffda RBX: 0000000000a2f300 RCX: 00007f6ef7f9a307
[   87.211434] RDX: 0000000000000000 RSI: 000000000000ae80 RDI: 0000000000000007
[   87.211500] RBP: 0000000000000000 R08: 000000000040e769 R09: 0000000000000000
[   87.211559] R10: 0000000000a2f001 R11: 0000000000000246 R12: 0000000000a2d010
[   87.211622] R13: 0000000000a2d010 R14: 0000000000402a15 R15: 00000000ffff0ff0
[   87.212520] Call Trace:
[   87.212597]  vmx_vcpu_run+0xcc7/0x12b0 [kvm_intel]
[   87.212683]  ? vmx_prepare_switch_to_guest+0x138/0x1f0 [kvm_intel]
[   87.212789]  vcpu_enter_guest+0x98c/0x1380 [kvm]
[   87.213059]  ? vmx_vcpu_put+0x2e/0x1f0 [kvm_intel]
[   87.213141]  ? schedule+0x44/0xa0
[   87.213200]  kvm_arch_vcpu_ioctl_run+0xdf/0x580 [kvm]
[   87.213428]  kvm_vcpu_ioctl+0x274/0x660 [kvm]
[   87.213633]  __x64_sys_ioctl+0x83/0xb0
[   87.213705]  do_syscall_64+0x3b/0x90
[   87.213766]  entry_SYSCALL_64_after_hwframe+0x44/0xae
...

This was on an old 'E5-2603 v3' CPU. Any idea what's wrong?

>  
>  	x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
> diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
> index 592217fd7d92..3f9c8548625d 100644
> --- a/arch/x86/kvm/vmx/vmx.h
> +++ b/arch/x86/kvm/vmx/vmx.h
> @@ -400,6 +400,69 @@ static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr,
>  
>  void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
>  
> +static inline bool vmx_test_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
> +{
> +	int f = sizeof(unsigned long);
> +
> +	if (msr <= 0x1fff)
> +		return test_bit(msr, msr_bitmap + 0x000 / f);
> +	else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
> +		return test_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
> +	return true;
> +}
> +
> +static inline bool vmx_test_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
> +{
> +	int f = sizeof(unsigned long);
> +
> +	if (msr <= 0x1fff)
> +		return test_bit(msr, msr_bitmap + 0x800 / f);
> +	else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
> +		return test_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
> +	return true;
> +}
> +
> +static inline void vmx_clear_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
> +{
> +	int f = sizeof(unsigned long);
> +
> +	if (msr <= 0x1fff)
> +		__clear_bit(msr, msr_bitmap + 0x000 / f);
> +	else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
> +		__clear_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
> +}
> +
> +static inline void vmx_clear_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
> +{
> +	int f = sizeof(unsigned long);
> +
> +	if (msr <= 0x1fff)
> +		__clear_bit(msr, msr_bitmap + 0x800 / f);
> +	else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
> +		__clear_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
> +}
> +
> +static inline void vmx_set_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
> +{
> +	int f = sizeof(unsigned long);
> +
> +	if (msr <= 0x1fff)
> +		__set_bit(msr, msr_bitmap + 0x000 / f);
> +	else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
> +		__set_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
> +}
> +
> +static inline void vmx_set_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
> +{
> +	int f = sizeof(unsigned long);
> +
> +	if (msr <= 0x1fff)
> +		__set_bit(msr, msr_bitmap + 0x800 / f);
> +	else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
> +		__set_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
> +}
> +
> +
>  static inline u8 vmx_get_rvi(void)
>  {
>  	return vmcs_read16(GUEST_INTR_STATUS) & 0xff;

-- 
Vitaly




[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux