Re: [PATCH 03/11] kvm: x86: Add fast CR3 switch code path

----- junaids@xxxxxxxxxx wrote:

> When using shadow paging, a CR3 switch in the guest results in a VM
> Exit. In the common case, that VM exit doesn't require much processing
> by KVM. However, it does acquire the MMU lock, which can start showing
> signs of contention under some workloads even on a 2 VCPU VM when the
> guest is using KPTI. Therefore, we add a fast path that avoids
> acquiring the MMU lock in the most common cases e.g. when switching
> back and forth between the kernel and user mode CR3s used by KPTI with
> no guest page table changes in between.
> 
> For now, this fast path is implemented only for 64-bit guests and
> hosts, but it can be extended later to 32-bit guests and/or hosts as
> well.

It is unclear to me, from the commit message and the patch itself, why this decision was made.
It would be best to clarify, in the commit message and in code comments, what makes 32-bit guests/hosts more complex to support.

> 
> Signed-off-by: Junaid Shahid <junaids@xxxxxxxxxx>
> ---
>  arch/x86/include/asm/kvm_host.h |  10 ++-
>  arch/x86/kvm/mmu.c              | 147 ++++++++++++++++++++++++++++----
>  arch/x86/kvm/x86.c              |   7 +-
>  3 files changed, 145 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 46b68fa1f599..e9c56f21ac3c 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -352,6 +352,8 @@ struct kvm_mmu {
>  	u8 shadow_root_level;
>  	u8 ept_ad;
>  	bool direct_map;
> +	hpa_t prev_root_hpa;
> +	gpa_t prev_cr3;
>  
>  	/*
>  	 * Bitmap; bit set = permission fault
> @@ -1265,6 +1267,10 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state,
>  	return !!(*irq_state);
>  }
>  
> +#define KVM_MMU_ROOT_CURRENT	BIT(0)
> +#define KVM_MMU_ROOT_PREVIOUS	BIT(1)
> +#define KVM_MMU_ROOTS_ALL	(KVM_MMU_ROOT_CURRENT | KVM_MMU_ROOT_PREVIOUS)
> +

I think it is a bit weird that this patch introduces "uint roots_to_free" as flags
while it is really used as a bool: either you free both the prev & active roots or you free just the active root.
Therefore it makes more sense, at this point, for the parameter to be "bool free_prev_root".
Another odd effect of this is that it is unclear what it means to call kvm_mmu_free_roots() with roots_to_free=0.
The interface seems a bit strange.
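
Just to make the suggestion concrete, this is roughly what I have in mind
(only a sketch, based on the two callers in this patch):

	void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, bool free_prev_root);

	/* kvm_mmu_new_cr3(): free only the active root */
	kvm_mmu_free_roots(vcpu, false);

	/* kvm_mmu_unload(): free both the active and the previous root */
	kvm_mmu_free_roots(vcpu, true);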

>  int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level);
>  void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
>  
> @@ -1276,7 +1282,7 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
>  int kvm_mmu_load(struct kvm_vcpu *vcpu);
>  void kvm_mmu_unload(struct kvm_vcpu *vcpu);
>  void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
> -void kvm_mmu_free_roots(struct kvm_vcpu *vcpu);
> +void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, uint roots_to_free);
>  gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
>  			   struct x86_exception *exception);
>  gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
> @@ -1295,7 +1301,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
>  int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code,
>  		       void *insn, int insn_len);
>  void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
> -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu);
> +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t old_cr3);
>  
>  void kvm_enable_tdp(void);
>  void kvm_disable_tdp(void);
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 9f8c693f2d2f..0107129d8ed9 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -2702,6 +2702,37 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
>  		kvm_unsync_page(vcpu, sp);
>  	}
>  
> +	/*
> +	 * We need to ensure that the marking of unsync pages is visible
> +	 * before the PTE is updated to reflect write-protection because
> +	 * kvm_mmu_sync_roots() checks the unsync flags without holding
> +	 * the MMU lock. If the PTE was updated before the page has been
> +	 * marked as unsync-ed, something like the following could
> +	 * happen:
> +	 *
> +	 * CPU 1                 CPU 2                CPU 3
> +	 * -------------------------------------------------------------
> +	 *                       Host updates SPTE
> +	 *                       to be writable
> +	 *                                            Guest writes to
> +	 *                                            its page table
> +	 *
> +	 *                                            Guest signals CPU
> +	 *                                            1 to flush TLB
> +	 * Guest issues TLB flush
> +	 * triggering VM Exit
> +	 *
> +	 * Host checks sp->unsync
> +	 * and skips sync
> +	 *                       Marks SP as unsync
> +	 *
> +	 * Guest wrongly accesses
> +	 * page using old shadow
> +	 * mapping
> +	 *
> +	 */
> +	smp_wmb();
> +
>  	return false;
>  }
>  
> @@ -3365,21 +3396,30 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
>  	*root_hpa = INVALID_PAGE;
>  }
>  
> -void kvm_mmu_free_roots(struct kvm_vcpu *vcpu)
> +/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
> +void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, uint roots_to_free)
>  {
>  	int i;
>  	LIST_HEAD(invalid_list);
>  	struct kvm_mmu *mmu = &vcpu->arch.mmu;
> +	bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
> +	bool free_prev_root = roots_to_free & KVM_MMU_ROOT_PREVIOUS;
>  
> -	if (!VALID_PAGE(mmu->root_hpa))
> +	if ((!VALID_PAGE(mmu->root_hpa) || !free_active_root) &&
> +	    (!VALID_PAGE(mmu->prev_root_hpa) || !free_prev_root))
>  		return;

Shouldn't the condition instead be:
if ((free_active_root && !VALID_PAGE(mmu->root_hpa)) ||
    (free_prev_root && !VALID_PAGE(mmu->prev_root_hpa)))
    return;
?

>  
>  	spin_lock(&vcpu->kvm->mmu_lock);
>  
>  	if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
>  	    (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
> -		mmu_free_root_page(vcpu->kvm, &mmu->root_hpa, &invalid_list);
> -	} else {
> +		if (free_prev_root)
> +			mmu_free_root_page(vcpu->kvm, &mmu->prev_root_hpa,
> +					   &invalid_list);
> +		if (free_active_root)
> +			mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
> +					   &invalid_list);
> +	} else if (free_active_root) {
>  		for (i = 0; i < 4; ++i)
>  			if (mmu->pae_root[i] != 0)
>  				mmu_free_root_page(vcpu->kvm, &mmu->pae_root[i],
> @@ -3553,7 +3593,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
>  		return mmu_alloc_shadow_roots(vcpu);
>  }
>  
> -static void mmu_sync_roots(struct kvm_vcpu *vcpu)
> +void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
>  {
>  	int i;
>  	struct kvm_mmu_page *sp;
> @@ -3565,14 +3605,39 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
>  		return;
>  
>  	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
> -	kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
> +
>  	if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
>  		hpa_t root = vcpu->arch.mmu.root_hpa;
> +
>  		sp = page_header(root);
> +
> +		/*
> +		 * Even if another VCPU was marking the SP as unsync-ed
> +		 * simultaneously, any guest page table changes are not
> +		 * guaranteed to be visible anyway until this VCPU issues a TLB
> +		 * flush strictly after those changes are made. We only need to
> +		 * ensure that the other CPU sets these flags before any actual
> +		 * changes to the page tables are made. The comments in
> +		 * kvm_unsync_pages() describe what could go wrong if this
> +		 * requirement isn't satisfied.
> +		 */
> +		smp_rmb();
> +		if (!sp->unsync && !sp->unsync_children)
> +			return;
> +
> +		spin_lock(&vcpu->kvm->mmu_lock);
> +		kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
> +
>  		mmu_sync_children(vcpu, sp);
> +
>  		kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
> +		spin_unlock(&vcpu->kvm->mmu_lock);
>  		return;
>  	}
> +
> +	spin_lock(&vcpu->kvm->mmu_lock);
> +	kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
> +
>  	for (i = 0; i < 4; ++i) {
>  		hpa_t root = vcpu->arch.mmu.pae_root[i];
>  
> @@ -3582,13 +3647,8 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
>  			mmu_sync_children(vcpu, sp);
>  		}
>  	}
> -	kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
> -}
>  
> -void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
> -{
> -	spin_lock(&vcpu->kvm->mmu_lock);
> -	mmu_sync_roots(vcpu);
> +	kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
>  	spin_unlock(&vcpu->kvm->mmu_lock);
>  }
>  EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
> @@ -3948,13 +4008,60 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu,
>  	context->root_level = 0;
>  	context->shadow_root_level = PT32E_ROOT_LEVEL;
>  	context->root_hpa = INVALID_PAGE;
> +	context->prev_root_hpa = INVALID_PAGE;
> +	context->prev_cr3 = INVALID_PAGE;
>  	context->direct_map = true;
>  	context->nx = false;
>  }
>  
> -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu)
> +static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t old_cr3)
>  {
> -	kvm_mmu_free_roots(vcpu);
> +	struct kvm_mmu *mmu = &vcpu->arch.mmu;
> +	gpa_t new_cr3 = kvm_read_cr3(vcpu);
> +
> +	/*
> +	 * For now, limit the fast switch to 64-bit hosts+VMs. We may add
> +	 * support for 32-bit VMs later.
> +	 */
> +	if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL &&
> +	    vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
> +		if (old_cr3 == new_cr3) {
> +			kvm_err("Not expected to reach %s if old and new CR3 are the same\n",
> +				__func__);
> +			return false;
> +		}
> +
> +		if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT))
> +			return false;
> +
> +		swap(mmu->root_hpa, mmu->prev_root_hpa);
> +
> +		if (new_cr3 == mmu->prev_cr3 && VALID_PAGE(mmu->root_hpa)) {
> +			/*
> +			 * It is possible that the cached previous root page is
> +			 * obsolete because of a change in the MMU
> +			 * generation number. However, that is accompanied by
> +			 * KVM_REQ_MMU_RELOAD, which will free the root that we
> +			 * have set here and allocate a new one.
> +			 */
> +
> +			kvm_mmu_sync_roots(vcpu);
> +			__clear_sp_write_flooding_count(
> +						page_header(mmu->root_hpa));
> +			mmu->set_cr3(vcpu, mmu->root_hpa);
> +			return true;
> +		}
> +	}
> +
> +	return false;
> +}
> +
> +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t old_cr3)
> +{
> +	if (!fast_cr3_switch(vcpu, old_cr3))
> +		kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_CURRENT);
> +
> +	vcpu->arch.mmu.prev_cr3 = old_cr3;
>  }

It's probably just me, but I fail to understand the optimization in fast_cr3_switch().
How can fast_cr3_switch() reach the case where (new_cr3 == mmu->prev_cr3)
when kvm_set_cr3() hasn't already seen (cr3 == old_cr3 && !pdptrs_changed(vcpu))?
IIUC, this can only happen if pdptrs_changed(vcpu).
However, the fast_cr3_switch() optimization is only relevant for 64-bit guests, which don't use the PDPTR cache...
Can you clarify this for me? I'm probably missing something trivial.

In general, it seems that all of the diff related to fast_cr3_switch() and to saving mmu.prev_cr3
is unrelated to the optimization described in the commit message:
optimizing the common case where CR3 is changed with no guest page table changes in between.
For that optimization, we only need the change that lets mmu_sync_roots() avoid grabbing mmu_lock when
(!root->unsync && !root->unsync_children), as sketched below.
Am I missing something?
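
To illustrate what I mean, roughly the following would be enough (untested sketch; it is
essentially the kvm_mmu_sync_roots() change already in this patch, minus the
prev_root_hpa/prev_cr3 bookkeeping):

	void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
	{
		struct kvm_mmu_page *sp;

		if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
			return;

		if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
			sp = page_header(vcpu->arch.mmu.root_hpa);

			/* pairs with the smp_wmb() after marking the SP unsync */
			smp_rmb();

			/* common case: nothing to sync, don't take mmu_lock at all */
			if (!sp->unsync && !sp->unsync_children)
				return;
		}

		spin_lock(&vcpu->kvm->mmu_lock);
		/* ... sync root / pae_root children as the existing code does ... */
		spin_unlock(&vcpu->kvm->mmu_lock);
	}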

>  
>  static unsigned long get_cr3(struct kvm_vcpu *vcpu)
> @@ -4432,6 +4539,8 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
>  	context->update_pte = paging64_update_pte;
>  	context->shadow_root_level = level;
>  	context->root_hpa = INVALID_PAGE;
> +	context->prev_root_hpa = INVALID_PAGE;
> +	context->prev_cr3 = INVALID_PAGE;
>  	context->direct_map = false;
>  }
>  
> @@ -4462,6 +4571,8 @@ static void paging32_init_context(struct kvm_vcpu *vcpu,
>  	context->update_pte = paging32_update_pte;
>  	context->shadow_root_level = PT32E_ROOT_LEVEL;
>  	context->root_hpa = INVALID_PAGE;
> +	context->prev_root_hpa = INVALID_PAGE;
> +	context->prev_cr3 = INVALID_PAGE;
>  	context->direct_map = false;
>  }
>  
> @@ -4484,6 +4595,8 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
>  	context->update_pte = nonpaging_update_pte;
>  	context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
>  	context->root_hpa = INVALID_PAGE;
> +	context->prev_root_hpa = INVALID_PAGE;
> +	context->prev_cr3 = INVALID_PAGE;
>  	context->direct_map = true;
>  	context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
>  	context->get_cr3 = get_cr3;
> @@ -4565,6 +4678,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
>  	context->update_pte = ept_update_pte;
>  	context->root_level = PT64_ROOT_4LEVEL;
>  	context->root_hpa = INVALID_PAGE;
> +	context->prev_root_hpa = INVALID_PAGE;
> +	context->prev_cr3 = INVALID_PAGE;
>  	context->direct_map = false;
>  	context->base_role.ad_disabled = !accessed_dirty;
>  
> @@ -4667,7 +4782,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load);
>  
>  void kvm_mmu_unload(struct kvm_vcpu *vcpu)
>  {
> -	kvm_mmu_free_roots(vcpu);
> +	kvm_mmu_free_roots(vcpu, KVM_MMU_ROOTS_ALL);
>  	WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
>  }
>  EXPORT_SYMBOL_GPL(kvm_mmu_unload);
> @@ -5046,6 +5161,8 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
>  {
>  	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
>  	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
> +	vcpu->arch.mmu.prev_root_hpa = INVALID_PAGE;
> +	vcpu->arch.mmu.prev_cr3 = INVALID_PAGE;
>  	vcpu->arch.mmu.translate_gpa = translate_gpa;
>  	vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
>  
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index b2ff74b12ec4..847ce7f0a2c8 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -842,11 +842,13 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
>  
>  int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
>  {
> +	unsigned long old_cr3 = kvm_read_cr3(vcpu);
> +
>  #ifdef CONFIG_X86_64
>  	cr3 &= ~CR3_PCID_INVD;
>  #endif
>  
> -	if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
> +	if (cr3 == old_cr3 && !pdptrs_changed(vcpu)) {
>  		kvm_mmu_sync_roots(vcpu);
>  		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
>  		return 0;
> @@ -861,7 +863,8 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
>  
>  	vcpu->arch.cr3 = cr3;
>  	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
> -	kvm_mmu_new_cr3(vcpu);
> +	kvm_mmu_new_cr3(vcpu, old_cr3);
> +
>  	return 0;
>  }
>  EXPORT_SYMBOL_GPL(kvm_set_cr3);
> -- 
> 2.17.0.441.gb46fe60e1d-goog



