----- liran.alon@xxxxxxxxxx wrote:
> ----- junaids@xxxxxxxxxx wrote:
> >
> > When using shadow paging, a CR3 switch in the guest results in a VM Exit.
> > In the common case, that VM exit doesn't require much processing by KVM.
> > However, it does acquire the MMU lock, which can start showing signs of
> > contention under some workloads even on a 2 VCPU VM when the guest is
> > using KPTI. Therefore, we add a fast path that avoids acquiring the MMU
> > lock in the most common cases e.g. when switching back and forth between
> > the kernel and user mode CR3s used by KPTI with no guest page table
> > changes in between.
> >
> > For now, this fast path is implemented only for 64-bit guests and hosts,
> > but it can be extended later to 32-bit guests and/or hosts as well.
>
> It is unclear to me from the commit message and the patch itself why this
> decision was made.
> It is best if you clarify in the commit message and comments what the
> complexity of 32-bit guests/hosts is.
>
> >
> > Signed-off-by: Junaid Shahid <junaids@xxxxxxxxxx>
> > ---
> >  arch/x86/include/asm/kvm_host.h |  10 ++-
> >  arch/x86/kvm/mmu.c              | 147 ++++++++++++++++++++++++++++----
> >  arch/x86/kvm/x86.c              |   7 +-
> >  3 files changed, 145 insertions(+), 19 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 46b68fa1f599..e9c56f21ac3c 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -352,6 +352,8 @@ struct kvm_mmu {
> >  	u8 shadow_root_level;
> >  	u8 ept_ad;
> >  	bool direct_map;
> > +	hpa_t prev_root_hpa;
> > +	gpa_t prev_cr3;
> >
> >  	/*
> >  	 * Bitmap; bit set = permission fault
> > @@ -1265,6 +1267,10 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state,
> >  	return !!(*irq_state);
> >  }
> >
> > +#define KVM_MMU_ROOT_CURRENT	BIT(0)
> > +#define KVM_MMU_ROOT_PREVIOUS	BIT(1)
> > +#define KVM_MMU_ROOTS_ALL	(KVM_MMU_ROOT_CURRENT | KVM_MMU_ROOT_PREVIOUS)
> > +
>
> I think it is a bit weird that this patch introduces "uint roots_to_free"
> as flags while it is really used as a bool: either you free both the prev
> & active roots or you free just the active root.
> Therefore it makes more sense at this point for the parameter to be
> "bool free_prev_root".
> Another weird effect of this is: what does it mean to call
> kvm_mmu_free_roots() with roots_to_free=0?
> The interface seems a bit strange.
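To illustrate what I had in mind with the bool parameter (just a sketch
against the names used in this patch, not tested):

	/* Free the active root and, optionally, the cached previous root. */
	void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, bool free_prev_root);

	/* kvm_mmu_unload() would then free everything: */
	kvm_mmu_free_roots(vcpu, true);

	/* while kvm_mmu_new_cr3(), when the fast switch fails, would keep
	 * the previous root cached: */
	kvm_mmu_free_roots(vcpu, false);

That way there is no roots_to_free=0 case to reason about.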
> >  int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level);
> >  void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
> >
> > @@ -1276,7 +1282,7 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
> >  int kvm_mmu_load(struct kvm_vcpu *vcpu);
> >  void kvm_mmu_unload(struct kvm_vcpu *vcpu);
> >  void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
> > -void kvm_mmu_free_roots(struct kvm_vcpu *vcpu);
> > +void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, uint roots_to_free);
> >  gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
> >  			   struct x86_exception *exception);
> >  gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
> > @@ -1295,7 +1301,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
> >  int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code,
> >  		       void *insn, int insn_len);
> >  void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
> > -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu);
> > +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t old_cr3);
> >
> >  void kvm_enable_tdp(void);
> >  void kvm_disable_tdp(void);
> > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > index 9f8c693f2d2f..0107129d8ed9 100644
> > --- a/arch/x86/kvm/mmu.c
> > +++ b/arch/x86/kvm/mmu.c
> > @@ -2702,6 +2702,37 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
> >  		kvm_unsync_page(vcpu, sp);
> >  	}
> >
> > +	/*
> > +	 * We need to ensure that the marking of unsync pages is visible
> > +	 * before the PTE is updated to reflect write-protection because
> > +	 * kvm_mmu_sync_roots() checks the unsync flags without holding
> > +	 * the MMU lock. If the PTE was updated before the page has been
> > +	 * marked as unsync-ed, something like the following could
> > +	 * happen:
> > +	 *
> > +	 * CPU 1                    CPU 2                    CPU 3
> > +	 * -------------------------------------------------------------
> > +	 *                          Host updates SPTE
> > +	 *                          to be writable
> > +	 *                                                   Guest writes to
> > +	 *                                                   its page table
> > +	 *
> > +	 *                                                   Guest signals CPU
> > +	 *                                                   1 to flush TLB
> > +	 * Guest issues TLB flush
> > +	 * triggering VM Exit
> > +	 *
> > +	 * Host checks sp->unsync
> > +	 * and skips sync
> > +	 *                          Marks SP as unsync
> > +	 *
> > +	 * Guest wrongly accesses
> > +	 * page using old shadow
> > +	 * mapping
> > +	 *
> > +	 */
> > +	smp_wmb();
> > +
> >  	return false;
> >  }
> >
> > @@ -3365,21 +3396,30 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
> >  	*root_hpa = INVALID_PAGE;
> >  }
> >
> > -void kvm_mmu_free_roots(struct kvm_vcpu *vcpu)
> > +/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
> > +void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, uint roots_to_free)
> >  {
> >  	int i;
> >  	LIST_HEAD(invalid_list);
> >  	struct kvm_mmu *mmu = &vcpu->arch.mmu;
> > +	bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
> > +	bool free_prev_root = roots_to_free & KVM_MMU_ROOT_PREVIOUS;
> >
> > -	if (!VALID_PAGE(mmu->root_hpa))
> > +	if ((!VALID_PAGE(mmu->root_hpa) || !free_active_root) &&
> > +	    (!VALID_PAGE(mmu->prev_root_hpa) || !free_prev_root))
> >  		return;
>
> Shouldn't the condition instead be:
>
> 	if ((free_active_root && !VALID_PAGE(mmu->root_hpa)) ||
> 	    (free_prev_root && !VALID_PAGE(mmu->prev_root_hpa)))
> 		return;
>
> ?
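Spelling the two variants out for myself, purely as a sketch using the
names from the patch (I may be misreading which behaviour is intended):

	/* Patch version: bail out only when none of the requested roots
	 * is actually valid, i.e. there is nothing to free. */
	if ((!VALID_PAGE(mmu->root_hpa) || !free_active_root) &&
	    (!VALID_PAGE(mmu->prev_root_hpa) || !free_prev_root))
		return;

	/* Variant from my question above: bail out as soon as any
	 * requested root is invalid. */
	if ((free_active_root && !VALID_PAGE(mmu->root_hpa)) ||
	    (free_prev_root && !VALID_PAGE(mmu->prev_root_hpa)))
		return;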
> >
> >  	spin_lock(&vcpu->kvm->mmu_lock);
> >
> >  	if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
> >  	    (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
> > -		mmu_free_root_page(vcpu->kvm, &mmu->root_hpa, &invalid_list);
> > -	} else {
> > +		if (free_prev_root)
> > +			mmu_free_root_page(vcpu->kvm, &mmu->prev_root_hpa,
> > +					   &invalid_list);
> > +		if (free_active_root)
> > +			mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
> > +					   &invalid_list);
> > +	} else if (free_active_root) {
> >  		for (i = 0; i < 4; ++i)
> >  			if (mmu->pae_root[i] != 0)
> >  				mmu_free_root_page(vcpu->kvm, &mmu->pae_root[i],
> > @@ -3553,7 +3593,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
> >  		return mmu_alloc_shadow_roots(vcpu);
> >  }
> >
> > -static void mmu_sync_roots(struct kvm_vcpu *vcpu)
> > +void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
> >  {
> >  	int i;
> >  	struct kvm_mmu_page *sp;
> > @@ -3565,14 +3605,39 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
> >  		return;
> >
> >  	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
> > -	kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
> > +
> >  	if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
> >  		hpa_t root = vcpu->arch.mmu.root_hpa;
> > +
> >  		sp = page_header(root);
> > +
> > +		/*
> > +		 * Even if another VCPU was marking the SP as unsync-ed
> > +		 * simultaneously, any guest page table changes are not
> > +		 * guaranteed to be visible anyway until this VCPU issues a TLB
> > +		 * flush strictly after those changes are made. We only need to
> > +		 * ensure that the other CPU sets these flags before any actual
> > +		 * changes to the page tables are made. The comments in
> > +		 * kvm_unsync_pages() describe what could go wrong if this
> > +		 * requirement isn't satisfied.
> > +		 */
> > +		smp_rmb();
> > +		if (!sp->unsync && !sp->unsync_children)
> > +			return;
> > +
> > +		spin_lock(&vcpu->kvm->mmu_lock);
> > +		kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
> > +
> >  		mmu_sync_children(vcpu, sp);
> > +
> >  		kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
> > +		spin_unlock(&vcpu->kvm->mmu_lock);
> >  		return;
> >  	}
> > +
> > +	spin_lock(&vcpu->kvm->mmu_lock);
> > +	kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
> > +
> >  	for (i = 0; i < 4; ++i) {
> >  		hpa_t root = vcpu->arch.mmu.pae_root[i];
> >
> > @@ -3582,13 +3647,8 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
> >  			mmu_sync_children(vcpu, sp);
> >  		}
> >  	}
> > -	kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
> > -}
> >
> > -void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
> > -{
> > -	spin_lock(&vcpu->kvm->mmu_lock);
> > -	mmu_sync_roots(vcpu);
> > +	kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
> >  	spin_unlock(&vcpu->kvm->mmu_lock);
> >  }
> >  EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
> > @@ -3948,13 +4008,60 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu,
> >  	context->root_level = 0;
> >  	context->shadow_root_level = PT32E_ROOT_LEVEL;
> >  	context->root_hpa = INVALID_PAGE;
> > +	context->prev_root_hpa = INVALID_PAGE;
> > +	context->prev_cr3 = INVALID_PAGE;
> >  	context->direct_map = true;
> >  	context->nx = false;
> >  }
> >
> > -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu)
> > +static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t old_cr3)
> >  {
> > -	kvm_mmu_free_roots(vcpu);
> > +	struct kvm_mmu *mmu = &vcpu->arch.mmu;
> > +	gpa_t new_cr3 = kvm_read_cr3(vcpu);
> > +
> > +	/*
> > +	 * For now, limit the fast switch to 64-bit hosts+VMs. We may add
> > +	 * support for 32-bit VMs later.
> > +	 */
> > +	if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL &&
> > +	    vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
> > +		if (old_cr3 == new_cr3) {
> > +			kvm_err("Not expected to reach %s if old and new CR3 are the same\n",
> > +				__func__);
> > +			return false;
> > +		}
> > +
> > +		if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT))
> > +			return false;
> > +
> > +		swap(mmu->root_hpa, mmu->prev_root_hpa);
> > +
> > +		if (new_cr3 == mmu->prev_cr3 && VALID_PAGE(mmu->root_hpa)) {
> > +			/*
> > +			 * It is possible that the cached previous root page is
> > +			 * obsolete because of a change in the MMU
> > +			 * generation number. However, that is accompanied by
> > +			 * KVM_REQ_MMU_RELOAD, which will free the root that we
> > +			 * have set here and allocate a new one.
> > +			 */
> > +
> > +			kvm_mmu_sync_roots(vcpu);
> > +			__clear_sp_write_flooding_count(
> > +						page_header(mmu->root_hpa));
> > +			mmu->set_cr3(vcpu, mmu->root_hpa);
> > +			return true;
> > +		}
> > +	}
> > +
> > +	return false;
> > +}
> > +
> > +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t old_cr3)
> > +{
> > +	if (!fast_cr3_switch(vcpu, old_cr3))
> > +		kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_CURRENT);
> > +
> > +	vcpu->arch.mmu.prev_cr3 = old_cr3;
> >  }
>
> It's probably just me, but I fail to understand the optimization
> regarding fast_cr3_switch().
> How can fast_cr3_switch() reach the case that (new_cr3 == mmu->prev_cr3)
> while kvm_set_cr3() hasn't seen (cr3 == old_cr3 && !pdptrs_changed(vcpu))?
> IIUC, this can only happen if pdptrs_changed(vcpu).
> However, the fast_cr3_switch() optimization is only relevant for 64-bit
> guests, which don't use the PDPTRs cache...
> Can you clarify this for me? I'm probably missing something trivial.

You can ignore my entire comment block here. I noticed the trivial part
I was missing: mmu.prev_cr3 is assigned with old_cr3. For some reason I
read it in my head as new_cr3... Now this makes sense :)

>
> In general, it seems that all the diff regarding fast_cr3_switch() and
> saving mmu.prev_cr3 is unrelated to the optimization described in the
> commit message: optimization of the common case where CR3 was changed
> while no guest page table changes happened in between.
> For this optimization, we only need the change regarding
> mmu_sync_roots() to not grab mmu_lock when
> (!root->unsync && !root->unsync_children).
> Am I missing something?

I still think that this should be split into 2 separate patches.
The first only optimizing mmu_sync_roots() to not grab mmu_lock.
The second optimizing to not free the previous root SPT (as we are
probably about to return to it because of how KPTI works).
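To spell out the KPTI back-and-forth I mean for the second patch, here is
the trace I convinced myself with (hypothetical kernel/user CR3 values K
and U, read only from the diff above; untested):

	Guest switches K -> U:  kvm_set_cr3(U), old_cr3 = K
	  fast_cr3_switch(): swap() parks K's root in prev_root_hpa, but
	                     new_cr3 (U) != prev_cr3 (INVALID) -> false
	  kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_CURRENT)  /* root_hpa already INVALID */
	  prev_cr3 = K; kvm_mmu_reload() later builds a fresh root for U

	Guest switches U -> K:  kvm_set_cr3(K), old_cr3 = U
	  fast_cr3_switch(): swap() brings K's cached root back into root_hpa,
	                     new_cr3 (K) == prev_cr3 (K) -> sync roots, set_cr3(), true
	  prev_cr3 = U; every subsequent K <-> U switch hits the fast path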
> >
> >  static unsigned long get_cr3(struct kvm_vcpu *vcpu)
> > @@ -4432,6 +4539,8 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
> >  	context->update_pte = paging64_update_pte;
> >  	context->shadow_root_level = level;
> >  	context->root_hpa = INVALID_PAGE;
> > +	context->prev_root_hpa = INVALID_PAGE;
> > +	context->prev_cr3 = INVALID_PAGE;
> >  	context->direct_map = false;
> >  }
> >
> > @@ -4462,6 +4571,8 @@ static void paging32_init_context(struct kvm_vcpu *vcpu,
> >  	context->update_pte = paging32_update_pte;
> >  	context->shadow_root_level = PT32E_ROOT_LEVEL;
> >  	context->root_hpa = INVALID_PAGE;
> > +	context->prev_root_hpa = INVALID_PAGE;
> > +	context->prev_cr3 = INVALID_PAGE;
> >  	context->direct_map = false;
> >  }
> >
> > @@ -4484,6 +4595,8 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
> >  	context->update_pte = nonpaging_update_pte;
> >  	context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
> >  	context->root_hpa = INVALID_PAGE;
> > +	context->prev_root_hpa = INVALID_PAGE;
> > +	context->prev_cr3 = INVALID_PAGE;
> >  	context->direct_map = true;
> >  	context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
> >  	context->get_cr3 = get_cr3;
> > @@ -4565,6 +4678,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
> >  	context->update_pte = ept_update_pte;
> >  	context->root_level = PT64_ROOT_4LEVEL;
> >  	context->root_hpa = INVALID_PAGE;
> > +	context->prev_root_hpa = INVALID_PAGE;
> > +	context->prev_cr3 = INVALID_PAGE;
> >  	context->direct_map = false;
> >  	context->base_role.ad_disabled = !accessed_dirty;
> >
> > @@ -4667,7 +4782,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load);
> >
> >  void kvm_mmu_unload(struct kvm_vcpu *vcpu)
> >  {
> > -	kvm_mmu_free_roots(vcpu);
> > +	kvm_mmu_free_roots(vcpu, KVM_MMU_ROOTS_ALL);
> >  	WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
> >  }
> >  EXPORT_SYMBOL_GPL(kvm_mmu_unload);
> > @@ -5046,6 +5161,8 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
> >  {
> >  	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
> >  	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
> > +	vcpu->arch.mmu.prev_root_hpa = INVALID_PAGE;
> > +	vcpu->arch.mmu.prev_cr3 = INVALID_PAGE;
> >  	vcpu->arch.mmu.translate_gpa = translate_gpa;
> >  	vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
> >
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index b2ff74b12ec4..847ce7f0a2c8 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -842,11 +842,13 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
> >
> >  int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
> >  {
> > +	unsigned long old_cr3 = kvm_read_cr3(vcpu);
> > +
> >  #ifdef CONFIG_X86_64
> >  	cr3 &= ~CR3_PCID_INVD;
> >  #endif
> >
> > -	if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
> > +	if (cr3 == old_cr3 && !pdptrs_changed(vcpu)) {
> >  		kvm_mmu_sync_roots(vcpu);
> >  		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
> >  		return 0;
> > @@ -861,7 +863,8 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
> >
> >  	vcpu->arch.cr3 = cr3;
> >  	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
> > -	kvm_mmu_new_cr3(vcpu);
> > +	kvm_mmu_new_cr3(vcpu, old_cr3);
> > +
> >  	return 0;
> >  }
> >  EXPORT_SYMBOL_GPL(kvm_set_cr3);
> > --
> > 2.17.0.441.gb46fe60e1d-goog