From: Lai Jiangshan <laijs@xxxxxxxxxxxxxxxxx>

If a gpte is changed from non-present to present, the guest doesn't
need to flush the TLB per the SDM.  So the host must synchronize the
sp before linking it.  Otherwise the guest might use a wrong mapping.

For example: the guest first changes a level-1 pagetable, and then
links its parent to a new place where the original gpte is
non-present.  Finally the guest can access the remapped area without
flushing the TLB.  The guest's behavior should be allowed per the
SDM, but the host KVM MMU gets it wrong.

Fixes: 4731d4c7a077 ("KVM: MMU: out of sync shadow core")
Signed-off-by: Lai Jiangshan <laijs@xxxxxxxxxxxxxxxxx>
---
Changed from V1:
	Don't loop, but just return when it needs to break.
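To make the scenario concrete, here is a rough guest-side sketch of the
sequence (illustrative pseudo-C, not part of this patch; the table
names and the make_pte()/make_pde() helpers are made up):

	/*
	 * Step 1: rewrite a gpte in a level-1 pagetable that KVM
	 * already shadows; KVM marks the shadow page unsync.
	 */
	l1_table[i] = make_pte(new_pfn, PTE_PRESENT | PTE_WRITE);

	/*
	 * Step 2: link that level-1 table from a level-2 entry that
	 * was previously non-present, remapping it to a new range.
	 */
	l2_table[j] = make_pde(l1_table_pfn, PDE_PRESENT | PDE_WRITE);

	/*
	 * Step 3: per the SDM, a non-present -> present change needs
	 * no TLB flush, so the guest may access the remapped range
	 * immediately.  Without this patch, kvm_mmu_get_page() links
	 * the still-unsync shadow page and this access can hit the
	 * stale, pre-step-1 mapping.
	 */
	val = *(volatile u64 *)remapped_va;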
 arch/x86/kvm/mmu/mmu.c         | 15 ++++++++-------
 arch/x86/kvm/mmu/paging_tmpl.h | 22 ++++++++++++++++++++++
 2 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 26f6bd238a77..3c1b069a7bcf 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2024,8 +2024,8 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents)
 	} while (!sp->unsync_children);
 }
 
-static void mmu_sync_children(struct kvm_vcpu *vcpu,
-			      struct kvm_mmu_page *parent)
+static int mmu_sync_children(struct kvm_vcpu *vcpu,
+			     struct kvm_mmu_page *parent, bool can_yield)
 {
 	int i;
 	struct kvm_mmu_page *sp;
@@ -2052,12 +2052,16 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
 		}
 		if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
 			kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
+			if (!can_yield)
+				return -EINTR;
+
 			cond_resched_rwlock_write(&vcpu->kvm->mmu_lock);
 			flush = false;
 		}
 	}
 
 	kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
+	return 0;
 }
 
 static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
@@ -2143,9 +2147,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 			kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
 		}
 
-		if (sp->unsync_children)
-			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
-
 		__clear_sp_write_flooding_count(sp);
 
 trace_get_page:
@@ -3642,7 +3643,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 		write_lock(&vcpu->kvm->mmu_lock);
 		kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
 
-		mmu_sync_children(vcpu, sp);
+		mmu_sync_children(vcpu, sp, true);
 
 		kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
 		write_unlock(&vcpu->kvm->mmu_lock);
@@ -3658,7 +3659,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 		if (IS_VALID_PAE_ROOT(root)) {
 			root &= PT64_BASE_ADDR_MASK;
 			sp = to_shadow_page(root);
-			mmu_sync_children(vcpu, sp);
+			mmu_sync_children(vcpu, sp, true);
 		}
 	}
 
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 5962d4f8a72e..87374cfd82be 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -704,6 +704,28 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
 		access = gw->pt_access[it.level - 2];
 		sp = kvm_mmu_get_page(vcpu, table_gfn, fault->addr,
 				      it.level-1, false, access);
+		/*
+		 * We must synchronize the pagetable before linking it
+		 * because the guest doesn't need to flush the TLB when
+		 * a gpte is changed from non-present to present.
+		 * Otherwise, the guest may use a wrong mapping.
+		 *
+		 * For PG_LEVEL_4K, kvm_mmu_get_page() has already
+		 * synchronized it transiently via kvm_sync_page().
+		 *
+		 * For a higher-level pagetable, we synchronize it via
+		 * the slower mmu_sync_children().  If it needs to
+		 * break, it returns RET_PF_RETRY and will retry on
+		 * the next #PF; it has already made some progress.
+		 *
+		 * It also makes a KVM_REQ_MMU_SYNC request if the @sp
+		 * is linked at a different address to expedite the sync.
+		 */
+		if (sp->unsync_children &&
+		    mmu_sync_children(vcpu, sp, false)) {
+			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+			return RET_PF_RETRY;
+		}
 	}
 
 	/*
-- 
2.19.1.6.gb485710b