On 04/05/2018 20:37, Junaid Shahid wrote:
> When using shadow paging, a CR3 switch in the guest results in a VM Exit.
> In the common case, that VM exit doesn't require much processing by KVM.
> However, it does acquire the MMU lock, which can start showing signs of
> contention under some workloads even on a 2 VCPU VM when the guest is
> using KPTI. Therefore, we add a fast path that avoids acquiring the MMU
> lock in the most common cases e.g. when switching back and forth between
> the kernel and user mode CR3s used by KPTI with no guest page table
> changes in between.
>
> For now, this fast path is implemented only for 64-bit guests and hosts,
> but it can be extended later to 32-bit guests and/or hosts as well.
>
> Signed-off-by: Junaid Shahid <junaids@xxxxxxxxxx>

Not having KPTI slow down to a crawl without EPT is surely useful, but I
wonder if this could be generalized a bit, so that for example we don't
acquire the MMU lock on nested vmentry/vmexit...
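Something along these lines, maybe -- a very rough and completely untested
sketch, and all of the names below are made up rather than taken from the
patch: instead of a single prev_root_hpa/prev_cr3 pair, keep a tiny per-vCPU
cache of previously used roots keyed by the CR3 (or EPTP) they were built
for, and probe it on every root switch:

    /* hypothetical; would replace prev_root_hpa/prev_cr3 in struct kvm_mmu */
    struct kvm_mmu_cached_root {
            hpa_t root_hpa;
            gpa_t cr3;
    };

    #define KVM_MMU_NUM_CACHED_ROOTS 3

    /* hypothetical new kvm_mmu members:
     *      gpa_t root_cr3;                     CR3/EPTP of the active root
     *      struct kvm_mmu_cached_root cached_roots[KVM_MMU_NUM_CACHED_ROOTS];
     */

    /*
     * Swap in a cached root for new_cr3, if we have one.  Returns true on a
     * hit; the displaced root stays in the cache for later reuse.
     */
    static bool mmu_probe_cached_root(struct kvm_mmu *mmu, gpa_t new_cr3)
    {
            int i;

            for (i = 0; i < KVM_MMU_NUM_CACHED_ROOTS; i++) {
                    struct kvm_mmu_cached_root *c = &mmu->cached_roots[i];

                    if (c->cr3 == new_cr3 && VALID_PAGE(c->root_hpa)) {
                            swap(mmu->root_hpa, c->root_hpa);
                            swap(mmu->root_cr3, c->cr3);
                            return true;
                    }
            }

            return false;
    }

Then the KPTI user/kernel toggle and a nested vmentry/vmexit would just be
two users of the same cache.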
Paolo

> ---
>  arch/x86/include/asm/kvm_host.h |  10 ++-
>  arch/x86/kvm/mmu.c              | 147 ++++++++++++++++++++++++++++----
>  arch/x86/kvm/x86.c              |   7 +-
>  3 files changed, 145 insertions(+), 19 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 46b68fa1f599..e9c56f21ac3c 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -352,6 +352,8 @@ struct kvm_mmu {
>          u8 shadow_root_level;
>          u8 ept_ad;
>          bool direct_map;
> +        hpa_t prev_root_hpa;
> +        gpa_t prev_cr3;
>
>          /*
>           * Bitmap; bit set = permission fault
> @@ -1265,6 +1267,10 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state,
>          return !!(*irq_state);
>  }
>
> +#define KVM_MMU_ROOT_CURRENT    BIT(0)
> +#define KVM_MMU_ROOT_PREVIOUS   BIT(1)
> +#define KVM_MMU_ROOTS_ALL       (KVM_MMU_ROOT_CURRENT | KVM_MMU_ROOT_PREVIOUS)
> +
>  int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level);
>  void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
>
> @@ -1276,7 +1282,7 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
>  int kvm_mmu_load(struct kvm_vcpu *vcpu);
>  void kvm_mmu_unload(struct kvm_vcpu *vcpu);
>  void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
> -void kvm_mmu_free_roots(struct kvm_vcpu *vcpu);
> +void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, uint roots_to_free);
>  gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
>                             struct x86_exception *exception);
>  gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
> @@ -1295,7 +1301,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
>  int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code,
>                         void *insn, int insn_len);
>  void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
> -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu);
> +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t old_cr3);
>
>  void kvm_enable_tdp(void);
>  void kvm_disable_tdp(void);
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 9f8c693f2d2f..0107129d8ed9 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -2702,6 +2702,37 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
>                  kvm_unsync_page(vcpu, sp);
>          }
>
> +        /*
> +         * We need to ensure that the marking of unsync pages is visible
> +         * before the PTE is updated to reflect write-protection because
> +         * kvm_mmu_sync_roots() checks the unsync flags without holding
> +         * the MMU lock. If the PTE was updated before the page has been
> +         * marked as unsync-ed, something like the following could
> +         * happen:
> +         *
> +         * CPU 1                  CPU 2                     CPU 3
> +         * -------------------------------------------------------------
> +         *                        Host updates SPTE
> +         *                        to be writable
> +         *                                                  Guest writes to
> +         *                                                  its page table
> +         *
> +         *                                                  Guest signals CPU
> +         *                                                  1 to flush TLB
> +         * Guest issues TLB flush
> +         * triggering VM Exit
> +         *
> +         * Host checks sp->unsync
> +         * and skips sync
> +         *                        Marks SP as unsync
> +         *
> +         * Guest wrongly accesses
> +         * page using old shadow
> +         * mapping
> +         *
> +         */
> +        smp_wmb();
> +
>          return false;
>  }
>
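Just to spell out the pairing for anyone else reading along -- this is my
paraphrase with simplified and partly made-up helpers, not the literal code
in the patch:

    /* vCPU A, in mmu_need_write_protect(): making the gfn writable */
    sp->unsync = true;                  /* mark the shadow page unsync */
    smp_wmb();                          /* flag visible before ... */
    mmu_spte_update(sptep, new_spte);   /* ... the now-writable SPTE */

    /* vCPU B, in kvm_mmu_sync_roots(), on the guest's TLB flush exit */
    smp_rmb();                          /* pairs with the smp_wmb() above */
    if (!sp->unsync && !sp->unsync_children)
            return;                     /* only then is skipping mmu_lock safe */

i.e. if vCPU B can observe the guest PTE write that went through the newly
writable SPTE, it must also observe sp->unsync == true.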
> @@ -3365,21 +3396,30 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
>          *root_hpa = INVALID_PAGE;
>  }
>
> -void kvm_mmu_free_roots(struct kvm_vcpu *vcpu)
> +/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
> +void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, uint roots_to_free)
>  {
>          int i;
>          LIST_HEAD(invalid_list);
>          struct kvm_mmu *mmu = &vcpu->arch.mmu;
> +        bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
> +        bool free_prev_root = roots_to_free & KVM_MMU_ROOT_PREVIOUS;
>
> -        if (!VALID_PAGE(mmu->root_hpa))
> +        if ((!VALID_PAGE(mmu->root_hpa) || !free_active_root) &&
> +            (!VALID_PAGE(mmu->prev_root_hpa) || !free_prev_root))
>                  return;
>
>          spin_lock(&vcpu->kvm->mmu_lock);
>
>          if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
>              (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
> -                mmu_free_root_page(vcpu->kvm, &mmu->root_hpa, &invalid_list);
> -        } else {
> +                if (free_prev_root)
> +                        mmu_free_root_page(vcpu->kvm, &mmu->prev_root_hpa,
> +                                           &invalid_list);
> +                if (free_active_root)
> +                        mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
> +                                           &invalid_list);
> +        } else if (free_active_root) {
>                  for (i = 0; i < 4; ++i)
>                          if (mmu->pae_root[i] != 0)
>                                  mmu_free_root_page(vcpu->kvm, &mmu->pae_root[i],
> @@ -3553,7 +3593,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
>          return mmu_alloc_shadow_roots(vcpu);
>  }
>
> -static void mmu_sync_roots(struct kvm_vcpu *vcpu)
> +void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
>  {
>          int i;
>          struct kvm_mmu_page *sp;
> @@ -3565,14 +3605,39 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
>                  return;
>
>          vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
> -        kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
> +
>          if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
>                  hpa_t root = vcpu->arch.mmu.root_hpa;
> +
>                  sp = page_header(root);
> +
> +                /*
> +                 * Even if another VCPU was marking the SP as unsync-ed
> +                 * simultaneously, any guest page table changes are not
> +                 * guaranteed to be visible anyway until this VCPU issues a TLB
> +                 * flush strictly after those changes are made. We only need to
> +                 * ensure that the other CPU sets these flags before any actual
> +                 * changes to the page tables are made. The comments in
> +                 * kvm_unsync_pages() describe what could go wrong if this
> +                 * requirement isn't satisfied.
> +                 */
> +                smp_rmb();
> +                if (!sp->unsync && !sp->unsync_children)
> +                        return;
> +
> +                spin_lock(&vcpu->kvm->mmu_lock);
> +                kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
> +
>                  mmu_sync_children(vcpu, sp);
> +
>                  kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
> +                spin_unlock(&vcpu->kvm->mmu_lock);
>                  return;
>          }
> +
> +        spin_lock(&vcpu->kvm->mmu_lock);
> +        kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
> +
>          for (i = 0; i < 4; ++i) {
>                  hpa_t root = vcpu->arch.mmu.pae_root[i];
>
> @@ -3582,13 +3647,8 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
>                          mmu_sync_children(vcpu, sp);
>                  }
>          }
> -        kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
> -}
>
> -void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
> -{
> -        spin_lock(&vcpu->kvm->mmu_lock);
> -        mmu_sync_roots(vcpu);
> +        kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
>          spin_unlock(&vcpu->kvm->mmu_lock);
>  }
>  EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
> @@ -3948,13 +4008,60 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu,
>          context->root_level = 0;
>          context->shadow_root_level = PT32E_ROOT_LEVEL;
>          context->root_hpa = INVALID_PAGE;
> +        context->prev_root_hpa = INVALID_PAGE;
> +        context->prev_cr3 = INVALID_PAGE;
>          context->direct_map = true;
>          context->nx = false;
>  }
>
> -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu)
> +static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t old_cr3)
>  {
> -        kvm_mmu_free_roots(vcpu);
> +        struct kvm_mmu *mmu = &vcpu->arch.mmu;
> +        gpa_t new_cr3 = kvm_read_cr3(vcpu);
> +
> +        /*
> +         * For now, limit the fast switch to 64-bit hosts+VMs. We may add
> +         * support for 32-bit VMs later.
> +         */
> +        if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL &&
> +            vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
> +                if (old_cr3 == new_cr3) {
> +                        kvm_err("Not expected to reach %s if old and new CR3 are the same\n",
> +                                __func__);
> +                        return false;
> +                }
> +
> +                if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT))
> +                        return false;
> +
> +                swap(mmu->root_hpa, mmu->prev_root_hpa);
> +
> +                if (new_cr3 == mmu->prev_cr3 && VALID_PAGE(mmu->root_hpa)) {
> +                        /*
> +                         * It is possible that the cached previous root page is
> +                         * obsolete because of a change in the MMU
> +                         * generation number. However, that is accompanied by
> +                         * KVM_REQ_MMU_RELOAD, which will free the root that we
> +                         * have set here and allocate a new one.
> +                         */
> +
> +                        kvm_mmu_sync_roots(vcpu);
> +                        __clear_sp_write_flooding_count(
> +                                page_header(mmu->root_hpa));
> +                        mmu->set_cr3(vcpu, mmu->root_hpa);
> +                        return true;
> +                }
> +        }
> +
> +        return false;
> +}
> +
> +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t old_cr3)
> +{
> +        if (!fast_cr3_switch(vcpu, old_cr3))
> +                kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_CURRENT);
> +
> +        vcpu->arch.mmu.prev_cr3 = old_cr3;
>  }
>
>  static unsigned long get_cr3(struct kvm_vcpu *vcpu)
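If I'm following the state machine correctly, for a KPTI guest bouncing
between a kernel CR3 K and a user CR3 U this behaves as follows (my own
walk-through, with root(X) meaning the shadow root built for X):

    initially        root_hpa = root(K), prev_root_hpa = INVALID,
                     prev_cr3 = INVALID

    mov U to %cr3    swap() leaves root_hpa = INVALID and prev_root_hpa =
                     root(K); U != prev_cr3, so the fast path misses and
                     root(U) is built under mmu_lock as before;
                     prev_cr3 becomes K

    mov K to %cr3    swap() brings back root_hpa = root(K); K == prev_cr3
                     and the root is valid, so we only sync roots (now
                     usually without mmu_lock) and call ->set_cr3();
                     prev_root_hpa = root(U), prev_cr3 = U

    mov U to %cr3    symmetric hit in the other direction

So after the first round trip both directions stay off mmu_lock as long as
the guest page tables don't change, which looks right to me.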
> @@ -4432,6 +4539,8 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
>          context->update_pte = paging64_update_pte;
>          context->shadow_root_level = level;
>          context->root_hpa = INVALID_PAGE;
> +        context->prev_root_hpa = INVALID_PAGE;
> +        context->prev_cr3 = INVALID_PAGE;
>          context->direct_map = false;
>  }
>
> @@ -4462,6 +4571,8 @@ static void paging32_init_context(struct kvm_vcpu *vcpu,
>          context->update_pte = paging32_update_pte;
>          context->shadow_root_level = PT32E_ROOT_LEVEL;
>          context->root_hpa = INVALID_PAGE;
> +        context->prev_root_hpa = INVALID_PAGE;
> +        context->prev_cr3 = INVALID_PAGE;
>          context->direct_map = false;
>  }
>
> @@ -4484,6 +4595,8 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
>          context->update_pte = nonpaging_update_pte;
>          context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
>          context->root_hpa = INVALID_PAGE;
> +        context->prev_root_hpa = INVALID_PAGE;
> +        context->prev_cr3 = INVALID_PAGE;
>          context->direct_map = true;
>          context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
>          context->get_cr3 = get_cr3;
> @@ -4565,6 +4678,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
>          context->update_pte = ept_update_pte;
>          context->root_level = PT64_ROOT_4LEVEL;
>          context->root_hpa = INVALID_PAGE;
> +        context->prev_root_hpa = INVALID_PAGE;
> +        context->prev_cr3 = INVALID_PAGE;
>          context->direct_map = false;
>          context->base_role.ad_disabled = !accessed_dirty;
>
> @@ -4667,7 +4782,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load);
>
>  void kvm_mmu_unload(struct kvm_vcpu *vcpu)
>  {
> -        kvm_mmu_free_roots(vcpu);
> +        kvm_mmu_free_roots(vcpu, KVM_MMU_ROOTS_ALL);
>          WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
>  }
>  EXPORT_SYMBOL_GPL(kvm_mmu_unload);
> @@ -5046,6 +5161,8 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
>  {
>          vcpu->arch.walk_mmu = &vcpu->arch.mmu;
>          vcpu->arch.mmu.root_hpa = INVALID_PAGE;
> +        vcpu->arch.mmu.prev_root_hpa = INVALID_PAGE;
> +        vcpu->arch.mmu.prev_cr3 = INVALID_PAGE;
>          vcpu->arch.mmu.translate_gpa = translate_gpa;
>          vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index b2ff74b12ec4..847ce7f0a2c8 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -842,11 +842,13 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
>
>  int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
>  {
> +        unsigned long old_cr3 = kvm_read_cr3(vcpu);
> +
>  #ifdef CONFIG_X86_64
>          cr3 &= ~CR3_PCID_INVD;
>  #endif
>
> -        if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
> +        if (cr3 == old_cr3 && !pdptrs_changed(vcpu)) {
>                  kvm_mmu_sync_roots(vcpu);
>                  kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
>                  return 0;
> @@ -861,7 +863,8 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
>
>          vcpu->arch.cr3 = cr3;
>          __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
> -        kvm_mmu_new_cr3(vcpu);
> +        kvm_mmu_new_cr3(vcpu, old_cr3);
> +
>          return 0;
>  }
>  EXPORT_SYMBOL_GPL(kvm_set_cr3);
>
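And the kvm_set_cr3() hook-up above is, I think, roughly the shape that a
nested vmentry/vmexit user of the same mechanism would take.  Purely
hypothetical, nothing like this is in the patch:

    /* somewhere on the nested transition path, shadow-paging case */
    unsigned long old_cr3 = kvm_read_cr3(vcpu);

    vcpu->arch.cr3 = vmcs12->guest_cr3;     /* or ->host_cr3 on vmexit */
    __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
    kvm_mmu_new_cr3(vcpu, old_cr3);         /* reuse a cached root, if any */

which would be one more argument for making the cache a bit more general
than a single previous root.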