When using shadow paging, a CR3 switch in the guest results in a VM
exit. In the common case, that VM exit doesn't require much processing
by KVM. However, it does acquire the MMU lock, which can start showing
signs of contention under some workloads even on a 2-VCPU VM when the
guest is using KPTI. Therefore, we add a fast path that avoids
acquiring the MMU lock in the most common cases, e.g. when switching
back and forth between the kernel and user mode CR3s used by KPTI with
no guest page table changes in between.

For now, this fast path is implemented only for 64-bit guests and
hosts, but it can be extended later to 32-bit guests and/or hosts as
well.

Signed-off-by: Junaid Shahid <junaids@xxxxxxxxxx>
---
 arch/x86/include/asm/kvm_host.h |  10 ++-
 arch/x86/kvm/mmu.c              | 147 ++++++++++++++++++++++++++++----
 arch/x86/kvm/x86.c              |   7 +-
 3 files changed, 145 insertions(+), 19 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 46b68fa1f599..e9c56f21ac3c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -352,6 +352,8 @@ struct kvm_mmu {
 	u8 shadow_root_level;
 	u8 ept_ad;
 	bool direct_map;
+	hpa_t prev_root_hpa;
+	gpa_t prev_cr3;
 
 	/*
 	 * Bitmap; bit set = permission fault
@@ -1265,6 +1267,10 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state,
 	return !!(*irq_state);
 }
 
+#define KVM_MMU_ROOT_CURRENT	BIT(0)
+#define KVM_MMU_ROOT_PREVIOUS	BIT(1)
+#define KVM_MMU_ROOTS_ALL	(KVM_MMU_ROOT_CURRENT | KVM_MMU_ROOT_PREVIOUS)
+
 int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id,
 		    int level);
 void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
@@ -1276,7 +1282,7 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
 int kvm_mmu_load(struct kvm_vcpu *vcpu);
 void kvm_mmu_unload(struct kvm_vcpu *vcpu);
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
-void kvm_mmu_free_roots(struct kvm_vcpu *vcpu);
+void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, uint roots_to_free);
 gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
 			   struct x86_exception *exception);
 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
@@ -1295,7 +1301,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code,
 		       void *insn, int insn_len);
 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
-void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu);
+void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t old_cr3);
 void kvm_enable_tdp(void);
 void kvm_disable_tdp(void);
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9f8c693f2d2f..0107129d8ed9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2702,6 +2702,37 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 		kvm_unsync_page(vcpu, sp);
 	}
 
+	/*
+	 * We need to ensure that the marking of unsync pages is visible
+	 * before the PTE is updated to reflect write-protection because
+	 * kvm_mmu_sync_roots() checks the unsync flags without holding
+	 * the MMU lock. If the PTE was updated before the page has been
+	 * marked as unsync-ed, something like the following could
+	 * happen:
+	 *
+	 * CPU 1                     CPU 2                    CPU 3
+	 * -------------------------------------------------------------
+	 *                           Host updates SPTE
+	 *                           to be writable
+	 *                                                    Guest writes to
+	 *                                                    its page table
+	 *
+	 *                                                    Guest signals CPU
+	 *                                                    1 to flush TLB
+	 * Guest issues TLB flush
+	 * triggering VM Exit
+	 *
+	 * Host checks sp->unsync
+	 * and skips sync
+	 *                           Marks SP as unsync
+	 *
+	 * Guest wrongly accesses
+	 * page using old shadow
+	 * mapping
+	 *
+	 */
+	smp_wmb();
+
 	return false;
 }
 
@@ -3365,21 +3396,30 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
 	*root_hpa = INVALID_PAGE;
 }
 
-void kvm_mmu_free_roots(struct kvm_vcpu *vcpu)
+/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
+void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, uint roots_to_free)
 {
 	int i;
 	LIST_HEAD(invalid_list);
 	struct kvm_mmu *mmu = &vcpu->arch.mmu;
+	bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
+	bool free_prev_root = roots_to_free & KVM_MMU_ROOT_PREVIOUS;
 
-	if (!VALID_PAGE(mmu->root_hpa))
+	if ((!VALID_PAGE(mmu->root_hpa) || !free_active_root) &&
+	    (!VALID_PAGE(mmu->prev_root_hpa) || !free_prev_root))
 		return;
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 
 	if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
 	    (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
-		mmu_free_root_page(vcpu->kvm, &mmu->root_hpa, &invalid_list);
-	} else {
+		if (free_prev_root)
+			mmu_free_root_page(vcpu->kvm, &mmu->prev_root_hpa,
+					   &invalid_list);
+		if (free_active_root)
+			mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
+					   &invalid_list);
+	} else if (free_active_root) {
 		for (i = 0; i < 4; ++i)
 			if (mmu->pae_root[i] != 0)
 				mmu_free_root_page(vcpu->kvm, &mmu->pae_root[i],
@@ -3553,7 +3593,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 		return mmu_alloc_shadow_roots(vcpu);
 }
 
-static void mmu_sync_roots(struct kvm_vcpu *vcpu)
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 {
 	int i;
 	struct kvm_mmu_page *sp;
@@ -3565,14 +3605,39 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 		return;
 
 	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
-	kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
+
 	if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
 		hpa_t root = vcpu->arch.mmu.root_hpa;
+
 		sp = page_header(root);
+
+		/*
+		 * Even if another VCPU was marking the SP as unsync-ed
+		 * simultaneously, any guest page table changes are not
+		 * guaranteed to be visible anyway until this VCPU issues a TLB
+		 * flush strictly after those changes are made. We only need to
+		 * ensure that the other CPU sets these flags before any actual
+		 * changes to the page tables are made. The comments in
+		 * mmu_need_write_protect() describe what could go wrong if this
+		 * requirement isn't satisfied.
+		 */
+		smp_rmb();
+		if (!sp->unsync && !sp->unsync_children)
+			return;
+
+		spin_lock(&vcpu->kvm->mmu_lock);
+		kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
+
 		mmu_sync_children(vcpu, sp);
+
 		kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
+		spin_unlock(&vcpu->kvm->mmu_lock);
 		return;
 	}
+
+	spin_lock(&vcpu->kvm->mmu_lock);
+	kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
+
 	for (i = 0; i < 4; ++i) {
 		hpa_t root = vcpu->arch.mmu.pae_root[i];
 
@@ -3582,13 +3647,8 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 			mmu_sync_children(vcpu, sp);
 		}
 	}
-	kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
-}
 
-void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
-{
-	spin_lock(&vcpu->kvm->mmu_lock);
-	mmu_sync_roots(vcpu);
+	kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
@@ -3948,13 +4008,60 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu,
 	context->root_level = 0;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->root_hpa = INVALID_PAGE;
+	context->prev_root_hpa = INVALID_PAGE;
+	context->prev_cr3 = INVALID_PAGE;
 	context->direct_map = true;
 	context->nx = false;
 }
 
-void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu)
+static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t old_cr3)
 {
-	kvm_mmu_free_roots(vcpu);
+	struct kvm_mmu *mmu = &vcpu->arch.mmu;
+	gpa_t new_cr3 = kvm_read_cr3(vcpu);
+
+	/*
+	 * For now, limit the fast switch to 64-bit hosts+VMs. We may add
+	 * support for 32-bit VMs later.
+	 */
+	if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL &&
+	    vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
+		if (old_cr3 == new_cr3) {
+			kvm_err("Not expected to reach %s if old and new CR3 are the same\n",
+				__func__);
+			return false;
+		}
+
+		if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT))
+			return false;
+
+		swap(mmu->root_hpa, mmu->prev_root_hpa);
+
+		if (new_cr3 == mmu->prev_cr3 && VALID_PAGE(mmu->root_hpa)) {
+			/*
+			 * It is possible that the cached previous root page is
+			 * obsolete because of a change in the MMU
+			 * generation number. However, that is accompanied by
+			 * KVM_REQ_MMU_RELOAD, which will free the root that we
+			 * have set here and allocate a new one.
+			 */
+
+			kvm_mmu_sync_roots(vcpu);
+			__clear_sp_write_flooding_count(
+						page_header(mmu->root_hpa));
+			mmu->set_cr3(vcpu, mmu->root_hpa);
+			return true;
+		}
+	}
+
+	return false;
+}
+
+void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t old_cr3)
+{
+	if (!fast_cr3_switch(vcpu, old_cr3))
+		kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_CURRENT);
+
+	vcpu->arch.mmu.prev_cr3 = old_cr3;
 }
 
 static unsigned long get_cr3(struct kvm_vcpu *vcpu)
@@ -4432,6 +4539,8 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
 	context->update_pte = paging64_update_pte;
 	context->shadow_root_level = level;
 	context->root_hpa = INVALID_PAGE;
+	context->prev_root_hpa = INVALID_PAGE;
+	context->prev_cr3 = INVALID_PAGE;
 	context->direct_map = false;
 }
 
@@ -4462,6 +4571,8 @@ static void paging32_init_context(struct kvm_vcpu *vcpu,
 	context->update_pte = paging32_update_pte;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->root_hpa = INVALID_PAGE;
+	context->prev_root_hpa = INVALID_PAGE;
+	context->prev_cr3 = INVALID_PAGE;
 	context->direct_map = false;
 }
 
@@ -4484,6 +4595,8 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->update_pte = nonpaging_update_pte;
 	context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
 	context->root_hpa = INVALID_PAGE;
+	context->prev_root_hpa = INVALID_PAGE;
+	context->prev_cr3 = INVALID_PAGE;
 	context->direct_map = true;
 	context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
 	context->get_cr3 = get_cr3;
@@ -4565,6 +4678,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 	context->update_pte = ept_update_pte;
 	context->root_level = PT64_ROOT_4LEVEL;
 	context->root_hpa = INVALID_PAGE;
+	context->prev_root_hpa = INVALID_PAGE;
+	context->prev_cr3 = INVALID_PAGE;
 	context->direct_map = false;
 	context->base_role.ad_disabled = !accessed_dirty;
 
@@ -4667,7 +4782,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load);
 
 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
 {
-	kvm_mmu_free_roots(vcpu);
+	kvm_mmu_free_roots(vcpu, KVM_MMU_ROOTS_ALL);
 	WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unload);
@@ -5046,6 +5161,8 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
 {
 	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
 	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+	vcpu->arch.mmu.prev_root_hpa = INVALID_PAGE;
+	vcpu->arch.mmu.prev_cr3 = INVALID_PAGE;
 	vcpu->arch.mmu.translate_gpa = translate_gpa;
 	vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b2ff74b12ec4..847ce7f0a2c8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -842,11 +842,13 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
 
 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
+	unsigned long old_cr3 = kvm_read_cr3(vcpu);
+
 #ifdef CONFIG_X86_64
 	cr3 &= ~CR3_PCID_INVD;
 #endif
 
-	if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
+	if (cr3 == old_cr3 && !pdptrs_changed(vcpu)) {
 		kvm_mmu_sync_roots(vcpu);
 		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 		return 0;
@@ -861,7 +863,8 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 
 	vcpu->arch.cr3 = cr3;
 	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
-	kvm_mmu_new_cr3(vcpu);
+	kvm_mmu_new_cr3(vcpu, old_cr3);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr3);
-- 
2.17.0.441.gb46fe60e1d-goog
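
Not part of the patch: a minimal, standalone C sketch of the ordering argument behind the smp_wmb()/smp_rmb() pair added above, in case the barrier pairing is easier to follow in isolation. It uses C11 atomic fences in place of the kernel primitives, and the names writer()/reader_consistent() and the variables page_unsync/spte_writable are illustrative stand-ins for sp->unsync and the writable SPTE, not KVM symbols.

/*
 * Publish/observe pattern: the writer marks the page unsync *before*
 * making the SPTE writable; the reader checks the unsync flag only
 * after (implicitly) observing the writable SPTE, so the lockless
 * check in the sync path cannot miss a needed sync.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static _Atomic bool page_unsync;    /* stands in for sp->unsync        */
static _Atomic bool spte_writable;  /* stands in for the writable SPTE */

/* Roughly the mmu_need_write_protect() side: publish the unsync flag first. */
static void writer(void)
{
	atomic_store_explicit(&page_unsync, true, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);      /* plays the role of smp_wmb() */
	atomic_store_explicit(&spte_writable, true, memory_order_relaxed);
}

/*
 * Roughly the kvm_mmu_sync_roots() side: if the writable SPTE was observed
 * (i.e. the guest could have modified its page table), the unsync flag must
 * be visible too, so skipping the sync based on the flag is safe.
 */
static bool reader_consistent(void)
{
	bool writable = atomic_load_explicit(&spte_writable, memory_order_relaxed);

	atomic_thread_fence(memory_order_acquire);      /* plays the role of smp_rmb() */
	return !writable ||
	       atomic_load_explicit(&page_unsync, memory_order_relaxed);
}

int main(void)
{
	writer();
	printf("consistent: %d\n", reader_consistent());
	return 0;
}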