Re: [RFC PATCH] KVM: x86: Fix APIC page invalidation race

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hello

The race window I mentioned in the commit message is pretty small, so it is difficult to reproduce.
With the following ‘delay’ patch, however, it becomes very easy to reproduce.

```
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c17e6eb9ad43..b6728bf80a7d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -55,6 +55,7 @@
 #include <linux/sched/stat.h>
 #include <linux/sched/isolation.h>
 #include <linux/mem_encrypt.h>
+#include <linux/delay.h>
 
 #include <trace/events/kvm.h>
 
@@ -8161,8 +8162,10 @@ int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
         * Update it when it becomes invalid.
         */
        apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
-       if (start <= apic_address && apic_address < end)
+       if (start <= apic_address && apic_address < end) {
                kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
+               mdelay(1000);
+       }
 
        return 0;
 }
```

Steps to Reproduce:
- start a Windows VM (e.g. Windows Server 2016) and watch a YouTube video to trigger frequent VM_ENTER/EXIT
- run 'stress --vm X --vm-bytes Y' on the host to get the APIC page swapped out
- Windows OS will crash with BugCheck 0x109

Thanks,

Eiichi

> On Jun 6, 2020, at 13:26, Eiichi Tsukata <eiichi.tsukata@xxxxxxxxxxx> wrote:
> 
> Commit b1394e745b94 ("KVM: x86: fix APIC page invalidation") tried to
> fix inappropriate APIC page invalidation by re-introducing arch specific
> kvm_arch_mmu_notifier_invalidate_range() and calling it from
> kvm_mmu_notifier_invalidate_range_start. But there could be the
> following race because VMCS APIC address cache can be updated
> *before* it is unmapped.
> 
> Race:
>  (Invalidator) kvm_mmu_notifier_invalidate_range_start()
>  (Invalidator) kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD)
>  (KVM VCPU) vcpu_enter_guest()
>  (KVM VCPU) kvm_vcpu_reload_apic_access_page()
>  (Invalidator) actually unmap page
> 
> Symptom:
>  The above race can let the guest OS access an already freed page, so the
> guest will see corrupted APIC register values. In particular, Windows checks
> for LAPIC modification, so this can cause a BSOD crash with BugCheck
> CRITICAL_STRUCTURE_CORRUPTION (109). These symptoms are the same as we
> previously saw in https://bugzilla.kernel.org/show_bug.cgi?id=197951 and
> we are currently seeing in
> https://bugzilla.redhat.com/show_bug.cgi?id=1751017 .
> 
> To prevent Guest OS from accessing already freed page, this patch calls
> kvm_arch_mmu_notifier_invalidate_range() from
> kvm_mmu_notifier_invalidate_range() instead of ..._range_start().
> 
> Fixes: b1394e745b94 ("KVM: x86: fix APIC page invalidation")
> Signed-off-by: Eiichi Tsukata <eiichi.tsukata@xxxxxxxxxxx>
> ---
> arch/x86/kvm/x86.c       |  7 ++-----
> include/linux/kvm_host.h |  4 ++--
> virt/kvm/kvm_main.c      | 26 ++++++++++++++++----------
> 3 files changed, 20 insertions(+), 17 deletions(-)
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index c17e6eb9ad43..1700aade39d1 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -8150,9 +8150,8 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
> 	kvm_x86_ops.load_eoi_exitmap(vcpu, eoi_exit_bitmap);
> }
> 
> -int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
> -		unsigned long start, unsigned long end,
> -		bool blockable)
> +void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
> +					    unsigned long start, unsigned long end)
> {
> 	unsigned long apic_address;
> 
> @@ -8163,8 +8162,6 @@ int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
> 	apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
> 	if (start <= apic_address && apic_address < end)
> 		kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
> -
> -	return 0;
> }
> 
> void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 131cc1527d68..92efa39ea3d7 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1406,8 +1406,8 @@ static inline long kvm_arch_vcpu_async_ioctl(struct file *filp,
> }
> #endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */
> 
> -int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
> -		unsigned long start, unsigned long end, bool blockable);
> +void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
> +					    unsigned long start, unsigned long end);
> 
> #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE
> int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu);
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 731c1e517716..77aa91fb08d2 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -155,10 +155,9 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
> static unsigned long long kvm_createvm_count;
> static unsigned long long kvm_active_vms;
> 
> -__weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
> -		unsigned long start, unsigned long end, bool blockable)
> +__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
> +						   unsigned long start, unsigned long end)
> {
> -	return 0;
> }
> 
> bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
> @@ -384,6 +383,18 @@ static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
> 	return container_of(mn, struct kvm, mmu_notifier);
> }
> 
> +static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
> +					      struct mm_struct *mm,
> +					      unsigned long start, unsigned long end)
> +{
> +	struct kvm *kvm = mmu_notifier_to_kvm(mn);
> +	int idx;
> +
> +	idx = srcu_read_lock(&kvm->srcu);
> +	kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
> +	srcu_read_unlock(&kvm->srcu, idx);
> +}
> +
> static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
> 					struct mm_struct *mm,
> 					unsigned long address,
> @@ -408,7 +419,6 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
> {
> 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
> 	int need_tlb_flush = 0, idx;
> -	int ret;
> 
> 	idx = srcu_read_lock(&kvm->srcu);
> 	spin_lock(&kvm->mmu_lock);
> @@ -425,14 +435,9 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
> 		kvm_flush_remote_tlbs(kvm);
> 
> 	spin_unlock(&kvm->mmu_lock);
> -
> -	ret = kvm_arch_mmu_notifier_invalidate_range(kvm, range->start,
> -					range->end,
> -					mmu_notifier_range_blockable(range));
> -
> 	srcu_read_unlock(&kvm->srcu, idx);
> 
> -	return ret;
> +	return 0;
> }
> 
> static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
> @@ -538,6 +543,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
> }
> 
> static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
> +	.invalidate_range	= kvm_mmu_notifier_invalidate_range,
> 	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
> 	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
> 	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
> -- 
> 2.21.3
> 





[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux