Hello. The race window I mentioned in the commit message is pretty small. So it’s difficult to reproduce it. But with the following ‘delay’ patch, it can be very easy to reproduce. ``` diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c17e6eb9ad43..b6728bf80a7d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -55,6 +55,7 @@ #include <linux/sched/stat.h> #include <linux/sched/isolation.h> #include <linux/mem_encrypt.h> +#include <linux/delay.h> #include <trace/events/kvm.h> @@ -8161,8 +8162,10 @@ int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, * Update it when it becomes invalid. */ apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); - if (start <= apic_address && apic_address < end) + if (start <= apic_address && apic_address < end) { kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); + mdelay(1000); + } return 0; } ``` Steps to Reproduce: - start Windows VM (ex: Windows Server 2016) and watch YouTube video to stimulate VM_ENTER/EXIT - 'stress --vm X --vm-bytes Y' to make the APIC page swapped out - Windows OS will crash with BugCheck 0x109 Thanks, Eiichi > On Jun 6, 2020, at 13:26, Eiichi Tsukata <eiichi.tsukata@xxxxxxxxxxx> wrote: > > Commit b1394e745b94 ("KVM: x86: fix APIC page invalidation") tried to > fix inappropriate APIC page invalidation by re-introducing arch specific > kvm_arch_mmu_notifier_invalidate_range() and calling it from > kvm_mmu_notifier_invalidate_range_start. But there could be the > following race because VMCS APIC address cache can be updated > *before* it is unmapped. > > Race: > (Invalidator) kvm_mmu_notifier_invalidate_range_start() > (Invalidator) kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD) > (KVM VCPU) vcpu_enter_guest() > (KVM VCPU) kvm_vcpu_reload_apic_access_page() > (Invalidator) actually unmap page > > Symptom: > The above race can make Guest OS see already freed page and Guest OS > will see broken APIC register values. 
Especially, Windows OS checks > LAPIC modification so it can cause BSOD crash with BugCheck > CRITICAL_STRUCTURE_CORRUPTION (109). These symptoms are the same as we > previously saw in https://bugzilla.kernel.org/show_bug.cgi?id=197951 and > we are currently seeing in > https://bugzilla.redhat.com/show_bug.cgi?id=1751017 . > > To prevent Guest OS from accessing already freed page, this patch calls > kvm_arch_mmu_notifier_invalidate_range() from > kvm_mmu_notifier_invalidate_range() instead of ..._range_start(). > > Fixes: b1394e745b94 ("KVM: x86: fix APIC page invalidation") > Signed-off-by: Eiichi Tsukata <eiichi.tsukata@xxxxxxxxxxx> > --- > arch/x86/kvm/x86.c | 7 ++----- > include/linux/kvm_host.h | 4 ++-- > virt/kvm/kvm_main.c | 26 ++++++++++++++++---------- > 3 files changed, 20 insertions(+), 17 deletions(-) > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index c17e6eb9ad43..1700aade39d1 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -8150,9 +8150,8 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu) > kvm_x86_ops.load_eoi_exitmap(vcpu, eoi_exit_bitmap); > } > > -int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, > - unsigned long start, unsigned long end, > - bool blockable) > +void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, > + unsigned long start, unsigned long end) > { > unsigned long apic_address; > > @@ -8163,8 +8162,6 @@ int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, > apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); 
> if (start <= apic_address && apic_address < end) > kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); > - > - return 0; > } > > void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index 131cc1527d68..92efa39ea3d7 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -1406,8 +1406,8 @@ static inline long kvm_arch_vcpu_async_ioctl(struct file *filp, > } > #endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */ > > -int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, > - unsigned long start, unsigned long end, bool blockable); > +void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, > + unsigned long start, unsigned long end); > > #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE > int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu); > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index 731c1e517716..77aa91fb08d2 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -155,10 +155,9 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm); > static unsigned long long kvm_createvm_count; > static unsigned long long kvm_active_vms; > > -__weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, > - unsigned long start, unsigned long end, bool blockable) > +__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, > + unsigned long start, unsigned long end) > { > - return 0; > } > > bool kvm_is_zone_device_pfn(kvm_pfn_t pfn) > @@ -384,6 +383,18 @@ static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) > return container_of(mn, struct kvm, mmu_notifier); > } > > +static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn, > + struct mm_struct *mm, > + unsigned long start, unsigned long end) > +{ > + struct kvm *kvm = mmu_notifier_to_kvm(mn); > + int idx; > + > + idx = srcu_read_lock(&kvm->srcu); > + kvm_arch_mmu_notifier_invalidate_range(kvm, start, end); > + 
srcu_read_unlock(&kvm->srcu, idx); > +} > + > static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, > struct mm_struct *mm, > unsigned long address, > @@ -408,7 +419,6 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, > { > struct kvm *kvm = mmu_notifier_to_kvm(mn); > int need_tlb_flush = 0, idx; > - int ret; > > idx = srcu_read_lock(&kvm->srcu); > spin_lock(&kvm->mmu_lock); > @@ -425,14 +435,9 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, > kvm_flush_remote_tlbs(kvm); > > spin_unlock(&kvm->mmu_lock); > - > - ret = kvm_arch_mmu_notifier_invalidate_range(kvm, range->start, > - range->end, > - mmu_notifier_range_blockable(range)); > - > srcu_read_unlock(&kvm->srcu, idx); > > - return ret; > + return 0; > } > > static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, > @@ -538,6 +543,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn, > } > > static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { > + .invalidate_range = kvm_mmu_notifier_invalidate_range, > .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, > .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, > .clear_flush_young = kvm_mmu_notifier_clear_flush_young, > -- > 2.21.3 >