----- liran.alon@xxxxxxxxxx wrote:
> ----- junaids@xxxxxxxxxx wrote:
> >
> > When using shadow paging, a CR3 switch in the guest results in a VM Exit.
> > In the common case, that VM exit doesn't require much processing by KVM.
> > However, it does acquire the MMU lock, which can start showing signs of
> > contention under some workloads even on a 2 VCPU VM when the guest is
> > using KPTI. Therefore, we add a fast path that avoids acquiring the MMU
> > lock in the most common cases e.g. when switching back and forth between
> > the kernel and user mode CR3s used by KPTI with no guest page table
> > changes in between.
> >
> > For now, this fast path is implemented only for 64-bit guests and hosts,
> > but it can be extended later to 32-bit guests and/or hosts as well.
>
> It is unclear to me from the commit message and the patch itself why this
> decision was made.
> It is best if you clarify in the commit message and comments what the
> complexity of 32-bit guests/hosts is.
>
> >
> > Signed-off-by: Junaid Shahid <junaids@xxxxxxxxxx>
> > ---
> >  arch/x86/include/asm/kvm_host.h |  10 ++-
> >  arch/x86/kvm/mmu.c              | 147 ++++++++++++++++++++++++++++----
> >  arch/x86/kvm/x86.c              |   7 +-
> >  3 files changed, 145 insertions(+), 19 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 46b68fa1f599..e9c56f21ac3c 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -352,6 +352,8 @@ struct kvm_mmu {
> >  	u8 shadow_root_level;
> >  	u8 ept_ad;
> >  	bool direct_map;
> > +	hpa_t prev_root_hpa;
> > +	gpa_t prev_cr3;
> >
> >  	/*
> >  	 * Bitmap; bit set = permission fault
> > @@ -1265,6 +1267,10 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state,
> >  	return !!(*irq_state);
> >  }
> >
> > +#define KVM_MMU_ROOT_CURRENT	BIT(0)
> > +#define KVM_MMU_ROOT_PREVIOUS	BIT(1)
> > +#define KVM_MMU_ROOTS_ALL	(KVM_MMU_ROOT_CURRENT | KVM_MMU_ROOT_PREVIOUS)
> > +
>
> I think it is a bit weird that this patch introduces "uint roots_to_free"
> as flags while it is really used as a bool: either you free both the prev
> & active roots or you free just the active root.
> Therefore it makes more sense at this point for the parameter to be
> "bool free_prev_root".
> Another weird effect of this is: what does it mean to call
> kvm_mmu_free_roots() with roots_to_free=0?
> The interface seems a bit strange.
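To illustrate what I had in mind with the bool parameter (just a sketch
against the names used in this patch, not tested):

	/* Free the active root and, optionally, the cached previous root. */
	void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, bool free_prev_root);

	/* kvm_mmu_unload() would then free everything: */
	kvm_mmu_free_roots(vcpu, true);

	/* while kvm_mmu_new_cr3(), when the fast switch fails, would keep
	 * the previous root cached: */
	kvm_mmu_free_roots(vcpu, false);

That way there is no roots_to_free=0 case to reason about.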
> >  int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level);
> >  void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
> >
> > @@ -1276,7 +1282,7 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
> >  int kvm_mmu_load(struct kvm_vcpu *vcpu);
> >  void kvm_mmu_unload(struct kvm_vcpu *vcpu);
> >  void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
> > -void kvm_mmu_free_roots(struct kvm_vcpu *vcpu);
> > +void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, uint roots_to_free);
> >  gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
> >  			   struct x86_exception *exception);
> >  gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
> > @@ -1295,7 +1301,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
> >  int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code,
> >  		       void *insn, int insn_len);
> >  void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
> > -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu);
> > +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t old_cr3);
> >
> >  void kvm_enable_tdp(void);
> >  void kvm_disable_tdp(void);
> > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > index 9f8c693f2d2f..0107129d8ed9 100644
> > --- a/arch/x86/kvm/mmu.c
> > +++ b/arch/x86/kvm/mmu.c
> > @@ -2702,6 +2702,37 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
> >  		kvm_unsync_page(vcpu, sp);
> >  	}
> >
> > +	/*
> > +	 * We need to ensure that the marking of unsync pages is visible
> > +	 * before the PTE is updated to reflect write-protection because
> > +	 * kvm_mmu_sync_roots() checks the unsync flags without holding
> > +	 * the MMU lock. If the PTE was updated before the page has been
> > +	 * marked as unsync-ed, something like the following could
> > +	 * happen:
> > +	 *
> > +	 * CPU 1                    CPU 2                    CPU 3
> > +	 * -------------------------------------------------------------
> > +	 *                          Host updates SPTE
> > +	 *                          to be writable
> > +	 *                                                   Guest writes to
> > +	 *                                                   its page table
> > +	 *
> > +	 *                                                   Guest signals CPU
> > +	 *                                                   1 to flush TLB
> > +	 * Guest issues TLB flush
> > +	 * triggering VM Exit
> > +	 *
> > +	 * Host checks sp->unsync
> > +	 * and skips sync
> > +	 *                          Marks SP as unsync
> > +	 *
> > +	 * Guest wrongly accesses
> > +	 * page using old shadow
> > +	 * mapping
> > +	 *
> > +	 */
> > +	smp_wmb();
> > +
> >  	return false;
> >  }
> >
> > @@ -3365,21 +3396,30 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
> >  	*root_hpa = INVALID_PAGE;
> >  }
> >
> > -void kvm_mmu_free_roots(struct kvm_vcpu *vcpu)
> > +/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
> > +void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, uint roots_to_free)
> >  {
> >  	int i;
> >  	LIST_HEAD(invalid_list);
> >  	struct kvm_mmu *mmu = &vcpu->arch.mmu;
> > +	bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
> > +	bool free_prev_root = roots_to_free & KVM_MMU_ROOT_PREVIOUS;
> >
> > -	if (!VALID_PAGE(mmu->root_hpa))
> > +	if ((!VALID_PAGE(mmu->root_hpa) || !free_active_root) &&
> > +	    (!VALID_PAGE(mmu->prev_root_hpa) || !free_prev_root))
> >  		return;
>
> Shouldn't the condition instead be:
>
> 	if ((free_active_root && !VALID_PAGE(mmu->root_hpa)) ||
> 	    (free_prev_root && !VALID_PAGE(mmu->prev_root_hpa)))
> 		return;
>
> ?
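Spelling the two variants out for myself, purely as a sketch using the
names from the patch (I may be misreading which behaviour is intended):

	/* Patch version: bail out only when none of the requested roots
	 * is actually valid, i.e. there is nothing to free. */
	if ((!VALID_PAGE(mmu->root_hpa) || !free_active_root) &&
	    (!VALID_PAGE(mmu->prev_root_hpa) || !free_prev_root))
		return;

	/* Variant from my question above: bail out as soon as any
	 * requested root is invalid. */
	if ((free_active_root && !VALID_PAGE(mmu->root_hpa)) ||
	    (free_prev_root && !VALID_PAGE(mmu->prev_root_hpa)))
		return;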
> >
> >  	spin_lock(&vcpu->kvm->mmu_lock);
> >
> >  	if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
> >  	    (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
> > -		mmu_free_root_page(vcpu->kvm, &mmu->root_hpa, &invalid_list);
> > -	} else {
> > +		if (free_prev_root)
> > +			mmu_free_root_page(vcpu->kvm, &mmu->prev_root_hpa,
> > +					   &invalid_list);
> > +		if (free_active_root)
> > +			mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
> > +					   &invalid_list);
> > +	} else if (free_active_root) {
> >  		for (i = 0; i < 4; ++i)
> >  			if (mmu->pae_root[i] != 0)
> >  				mmu_free_root_page(vcpu->kvm, &mmu->pae_root[i],
> > @@ -3553,7 +3593,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
> >  		return mmu_alloc_shadow_roots(vcpu);
> >  }
> >
> > -static void mmu_sync_roots(struct kvm_vcpu *vcpu)
> > +void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
> >  {
> >  	int i;
> >  	struct kvm_mmu_page *sp;
> > @@ -3565,14 +3605,39 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
> >  		return;
> >
> >  	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
> > -	kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
> > +
> >  	if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
> >  		hpa_t root = vcpu->arch.mmu.root_hpa;
> > +
> >  		sp = page_header(root);
> > +
> > +		/*
> > +		 * Even if another VCPU was marking the SP as unsync-ed
> > +		 * simultaneously, any guest page table changes are not
> > +		 * guaranteed to be visible anyway until this VCPU issues a TLB
> > +		 * flush strictly after those changes are made. We only need to
> > +		 * ensure that the other CPU sets these flags before any actual
> > +		 * changes to the page tables are made. The comments in
> > +		 * kvm_unsync_pages() describe what could go wrong if this
> > +		 * requirement isn't satisfied.
> > +		 */
> > +		smp_rmb();
> > +		if (!sp->unsync && !sp->unsync_children)
> > +			return;
> > +
> > +		spin_lock(&vcpu->kvm->mmu_lock);
> > +		kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
> > +
> >  		mmu_sync_children(vcpu, sp);
> > +
> >  		kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
> > +		spin_unlock(&vcpu->kvm->mmu_lock);
> >  		return;
> >  	}
> > +
> > +	spin_lock(&vcpu->kvm->mmu_lock);
> > +	kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
> > +
> >  	for (i = 0; i < 4; ++i) {
> >  		hpa_t root = vcpu->arch.mmu.pae_root[i];
> >
> > @@ -3582,13 +3647,8 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
> >  			mmu_sync_children(vcpu, sp);
> >  		}
> >  	}
> > -	kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
> > -}
> >
> > -void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
> > -{
> > -	spin_lock(&vcpu->kvm->mmu_lock);
> > -	mmu_sync_roots(vcpu);
> > +	kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
> >  	spin_unlock(&vcpu->kvm->mmu_lock);
> >  }
> >  EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
> > @@ -3948,13 +4008,60 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu,
> >  	context->root_level = 0;
> >  	context->shadow_root_level = PT32E_ROOT_LEVEL;
> >  	context->root_hpa = INVALID_PAGE;
> > +	context->prev_root_hpa = INVALID_PAGE;
> > +	context->prev_cr3 = INVALID_PAGE;
> >  	context->direct_map = true;
> >  	context->nx = false;
> >  }
> >
> > -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu)
> > +static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t old_cr3)
> >  {
> > -	kvm_mmu_free_roots(vcpu);
> > +	struct kvm_mmu *mmu = &vcpu->arch.mmu;
> > +	gpa_t new_cr3 = kvm_read_cr3(vcpu);
> > +
> > +	/*
> > +	 * For now, limit the fast switch to 64-bit hosts+VMs. We may add
> > +	 * support for 32-bit VMs later.
> > +	 */
> > +	if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL &&
> > +	    vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
> > +		if (old_cr3 == new_cr3) {
> > +			kvm_err("Not expected to reach %s if old and new CR3 are the same\n",
> > +				__func__);
> > +			return false;
> > +		}
> > +
> > +		if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT))
> > +			return false;
> > +
> > +		swap(mmu->root_hpa, mmu->prev_root_hpa);
> > +
> > +		if (new_cr3 == mmu->prev_cr3 && VALID_PAGE(mmu->root_hpa)) {
> > +			/*
> > +			 * It is possible that the cached previous root page is
> > +			 * obsolete because of a change in the MMU
> > +			 * generation number. However, that is accompanied by
> > +			 * KVM_REQ_MMU_RELOAD, which will free the root that we
> > +			 * have set here and allocate a new one.
> > +			 */
> > +
> > +			kvm_mmu_sync_roots(vcpu);
> > +			__clear_sp_write_flooding_count(
> > +						page_header(mmu->root_hpa));
> > +			mmu->set_cr3(vcpu, mmu->root_hpa);
> > +			return true;
> > +		}
> > +	}
> > +
> > +	return false;
> > +}
> > +
> > +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t old_cr3)
> > +{
> > +	if (!fast_cr3_switch(vcpu, old_cr3))
> > +		kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_CURRENT);
> > +
> > +	vcpu->arch.mmu.prev_cr3 = old_cr3;
> >  }
>
> It's probably just me, but I fail to understand the optimization
> regarding fast_cr3_switch().
> How can fast_cr3_switch() reach the case that (new_cr3 == mmu->prev_cr3)
> while kvm_set_cr3() hasn't seen (cr3 == old_cr3 && !pdptrs_changed(vcpu))?
> IIUC, this can only happen if pdptrs_changed(vcpu).
> However, the fast_cr3_switch() optimization is only relevant for 64-bit
> guests, which don't use the PDPTRs cache...
> Can you clarify this for me? I'm probably missing something trivial.

You can ignore my entire comment block here. I noticed the trivial part
I was missing: mmu.prev_cr3 is assigned with old_cr3. For some reason I
read it in my head as new_cr3... Now this makes sense :)

>
> In general, it seems that all the diff regarding fast_cr3_switch() and
> saving mmu.prev_cr3 is unrelated to the optimization described in the
> commit message: optimization of the common case where CR3 was changed
> while no guest page table changes happened in between.
> For this optimization, we only need the change regarding
> mmu_sync_roots() to not grab mmu_lock when
> (!root->unsync && !root->unsync_children).
> Am I missing something?

I still think that this should be split into 2 separate patches.
The first only optimizing mmu_sync_roots() to not grab mmu_lock.
The second optimizing to not free the previous root SPT (as we are
probably about to return to it because of how KPTI works).
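To spell out the KPTI back-and-forth I mean for the second patch, here is
the trace I convinced myself with (hypothetical kernel/user CR3 values K
and U, read only from the diff above; untested):

	Guest switches K -> U:  kvm_set_cr3(U), old_cr3 = K
	  fast_cr3_switch(): swap() parks K's root in prev_root_hpa, but
	                     new_cr3 (U) != prev_cr3 (INVALID) -> false
	  kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_CURRENT)  /* root_hpa already INVALID */
	  prev_cr3 = K; kvm_mmu_reload() later builds a fresh root for U

	Guest switches U -> K:  kvm_set_cr3(K), old_cr3 = U
	  fast_cr3_switch(): swap() brings K's cached root back into root_hpa,
	                     new_cr3 (K) == prev_cr3 (K) -> sync roots, set_cr3(), true
	  prev_cr3 = U; every subsequent K <-> U switch hits the fast path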
> >
> >  static unsigned long get_cr3(struct kvm_vcpu *vcpu)
> > @@ -4432,6 +4539,8 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
> >  	context->update_pte = paging64_update_pte;
> >  	context->shadow_root_level = level;
> >  	context->root_hpa = INVALID_PAGE;
> > +	context->prev_root_hpa = INVALID_PAGE;
> > +	context->prev_cr3 = INVALID_PAGE;
> >  	context->direct_map = false;
> >  }
> >
> > @@ -4462,6 +4571,8 @@ static void paging32_init_context(struct kvm_vcpu *vcpu,
> >  	context->update_pte = paging32_update_pte;
> >  	context->shadow_root_level = PT32E_ROOT_LEVEL;
> >  	context->root_hpa = INVALID_PAGE;
> > +	context->prev_root_hpa = INVALID_PAGE;
> > +	context->prev_cr3 = INVALID_PAGE;
> >  	context->direct_map = false;
> >  }
> >
> > @@ -4484,6 +4595,8 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
> >  	context->update_pte = nonpaging_update_pte;
> >  	context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
> >  	context->root_hpa = INVALID_PAGE;
> > +	context->prev_root_hpa = INVALID_PAGE;
> > +	context->prev_cr3 = INVALID_PAGE;
> >  	context->direct_map = true;
> >  	context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
> >  	context->get_cr3 = get_cr3;
> > @@ -4565,6 +4678,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
> >  	context->update_pte = ept_update_pte;
> >  	context->root_level = PT64_ROOT_4LEVEL;
> >  	context->root_hpa = INVALID_PAGE;
> > +	context->prev_root_hpa = INVALID_PAGE;
> > +	context->prev_cr3 = INVALID_PAGE;
> >  	context->direct_map = false;
> >  	context->base_role.ad_disabled = !accessed_dirty;
> >
> > @@ -4667,7 +4782,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load);
> >
> >  void kvm_mmu_unload(struct kvm_vcpu *vcpu)
> >  {
> > -	kvm_mmu_free_roots(vcpu);
> > +	kvm_mmu_free_roots(vcpu, KVM_MMU_ROOTS_ALL);
> >  	WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
> >  }
> >  EXPORT_SYMBOL_GPL(kvm_mmu_unload);
> > @@ -5046,6 +5161,8 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
> >  {
> >  	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
> >  	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
> > +	vcpu->arch.mmu.prev_root_hpa = INVALID_PAGE;
> > +	vcpu->arch.mmu.prev_cr3 = INVALID_PAGE;
> >  	vcpu->arch.mmu.translate_gpa = translate_gpa;
> >  	vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
> >
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index b2ff74b12ec4..847ce7f0a2c8 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -842,11 +842,13 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
> >
> >  int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
> >  {
> > +	unsigned long old_cr3 = kvm_read_cr3(vcpu);
> > +
> >  #ifdef CONFIG_X86_64
> >  	cr3 &= ~CR3_PCID_INVD;
> >  #endif
> >
> > -	if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
> > +	if (cr3 == old_cr3 && !pdptrs_changed(vcpu)) {
> >  		kvm_mmu_sync_roots(vcpu);
> >  		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
> >  		return 0;
> > @@ -861,7 +863,8 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
> >
> >  	vcpu->arch.cr3 = cr3;
> >  	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
> > -	kvm_mmu_new_cr3(vcpu);
> > +	kvm_mmu_new_cr3(vcpu, old_cr3);
> > +
> >  	return 0;
> >  }
> >  EXPORT_SYMBOL_GPL(kvm_set_cr3);
> > --
> > 2.17.0.441.gb46fe60e1d-goog