On 04/05/2018 20:37, Junaid Shahid wrote:
> When using shadow paging, a CR3 switch in the guest results in a VM Exit.
> In the common case, that VM exit doesn't require much processing by KVM.
> However, it does acquire the MMU lock, which can start showing signs of
> contention under some workloads even on a 2 VCPU VM when the guest is
> using KPTI. Therefore, we add a fast path that avoids acquiring the MMU
> lock in the most common cases e.g. when switching back and forth between
> the kernel and user mode CR3s used by KPTI with no guest page table
> changes in between.
>
> For now, this fast path is implemented only for 64-bit guests and hosts,
> but it can be extended later to 32-bit guests and/or hosts as well.
>
> Signed-off-by: Junaid Shahid <junaids@xxxxxxxxxx>

Not having KPTI slow down to a crawl without EPT is surely useful, but I
wonder if this could be generalized a bit, so that for example we don't
acquire the MMU lock on nested vmentry/vmexit...
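Something along these lines, maybe -- a very rough and completely untested
sketch, and all of the names below are made up rather than taken from the
patch: instead of a single prev_root_hpa/prev_cr3 pair, keep a tiny per-vCPU
cache of previously used roots keyed by the CR3 (or EPTP) they were built
for, and probe it on every root switch:

    /* hypothetical; would replace prev_root_hpa/prev_cr3 in struct kvm_mmu */
    struct kvm_mmu_cached_root {
            hpa_t root_hpa;
            gpa_t cr3;
    };

    #define KVM_MMU_NUM_CACHED_ROOTS 3

    /* hypothetical new kvm_mmu members:
     *      gpa_t root_cr3;                     CR3/EPTP of the active root
     *      struct kvm_mmu_cached_root cached_roots[KVM_MMU_NUM_CACHED_ROOTS];
     */

    /*
     * Swap in a cached root for new_cr3, if we have one.  Returns true on a
     * hit; the displaced root stays in the cache for later reuse.
     */
    static bool mmu_probe_cached_root(struct kvm_mmu *mmu, gpa_t new_cr3)
    {
            int i;

            for (i = 0; i < KVM_MMU_NUM_CACHED_ROOTS; i++) {
                    struct kvm_mmu_cached_root *c = &mmu->cached_roots[i];

                    if (c->cr3 == new_cr3 && VALID_PAGE(c->root_hpa)) {
                            swap(mmu->root_hpa, c->root_hpa);
                            swap(mmu->root_cr3, c->cr3);
                            return true;
                    }
            }

            return false;
    }

Then the KPTI user/kernel toggle and a nested vmentry/vmexit would just be
two users of the same cache.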
Paolo

> ---
>  arch/x86/include/asm/kvm_host.h |  10 ++-
>  arch/x86/kvm/mmu.c              | 147 ++++++++++++++++++++++++++++----
>  arch/x86/kvm/x86.c              |   7 +-
>  3 files changed, 145 insertions(+), 19 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 46b68fa1f599..e9c56f21ac3c 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -352,6 +352,8 @@ struct kvm_mmu {
>          u8 shadow_root_level;
>          u8 ept_ad;
>          bool direct_map;
> +        hpa_t prev_root_hpa;
> +        gpa_t prev_cr3;
>
>          /*
>           * Bitmap; bit set = permission fault
> @@ -1265,6 +1267,10 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state,
>          return !!(*irq_state);
>  }
>
> +#define KVM_MMU_ROOT_CURRENT    BIT(0)
> +#define KVM_MMU_ROOT_PREVIOUS   BIT(1)
> +#define KVM_MMU_ROOTS_ALL       (KVM_MMU_ROOT_CURRENT | KVM_MMU_ROOT_PREVIOUS)
> +
>  int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level);
>  void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
>
> @@ -1276,7 +1282,7 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
>  int kvm_mmu_load(struct kvm_vcpu *vcpu);
>  void kvm_mmu_unload(struct kvm_vcpu *vcpu);
>  void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
> -void kvm_mmu_free_roots(struct kvm_vcpu *vcpu);
> +void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, uint roots_to_free);
>  gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
>                             struct x86_exception *exception);
>  gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
> @@ -1295,7 +1301,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
>  int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code,
>                         void *insn, int insn_len);
>  void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
> -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu);
> +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t old_cr3);
>
>  void kvm_enable_tdp(void);
>  void kvm_disable_tdp(void);
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 9f8c693f2d2f..0107129d8ed9 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -2702,6 +2702,37 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
>                  kvm_unsync_page(vcpu, sp);
>          }
>
> +        /*
> +         * We need to ensure that the marking of unsync pages is visible
> +         * before the PTE is updated to reflect write-protection because
> +         * kvm_mmu_sync_roots() checks the unsync flags without holding
> +         * the MMU lock. If the PTE was updated before the page has been
> +         * marked as unsync-ed, something like the following could
> +         * happen:
> +         *
> +         * CPU 1                  CPU 2                     CPU 3
> +         * -------------------------------------------------------------
> +         *                        Host updates SPTE
> +         *                        to be writable
> +         *                                                  Guest writes to
> +         *                                                  its page table
> +         *
> +         *                                                  Guest signals CPU
> +         *                                                  1 to flush TLB
> +         * Guest issues TLB flush
> +         * triggering VM Exit
> +         *
> +         * Host checks sp->unsync
> +         * and skips sync
> +         *                        Marks SP as unsync
> +         *
> +         * Guest wrongly accesses
> +         * page using old shadow
> +         * mapping
> +         *
> +         */
> +        smp_wmb();
> +
>          return false;
>  }
>
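Just to spell out the pairing for anyone else reading along -- this is my
paraphrase with simplified and partly made-up helpers, not the literal code
in the patch:

    /* vCPU A, in mmu_need_write_protect(): making the gfn writable */
    sp->unsync = true;                  /* mark the shadow page unsync */
    smp_wmb();                          /* flag visible before ... */
    mmu_spte_update(sptep, new_spte);   /* ... the now-writable SPTE */

    /* vCPU B, in kvm_mmu_sync_roots(), on the guest's TLB flush exit */
    smp_rmb();                          /* pairs with the smp_wmb() above */
    if (!sp->unsync && !sp->unsync_children)
            return;                     /* only then is skipping mmu_lock safe */

i.e. if vCPU B can observe the guest PTE write that went through the newly
writable SPTE, it must also observe sp->unsync == true.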
> @@ -3365,21 +3396,30 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
>          *root_hpa = INVALID_PAGE;
>  }
>
> -void kvm_mmu_free_roots(struct kvm_vcpu *vcpu)
> +/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
> +void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, uint roots_to_free)
>  {
>          int i;
>          LIST_HEAD(invalid_list);
>          struct kvm_mmu *mmu = &vcpu->arch.mmu;
> +        bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
> +        bool free_prev_root = roots_to_free & KVM_MMU_ROOT_PREVIOUS;
>
> -        if (!VALID_PAGE(mmu->root_hpa))
> +        if ((!VALID_PAGE(mmu->root_hpa) || !free_active_root) &&
> +            (!VALID_PAGE(mmu->prev_root_hpa) || !free_prev_root))
>                  return;
>
>          spin_lock(&vcpu->kvm->mmu_lock);
>
>          if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
>              (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
> -                mmu_free_root_page(vcpu->kvm, &mmu->root_hpa, &invalid_list);
> -        } else {
> +                if (free_prev_root)
> +                        mmu_free_root_page(vcpu->kvm, &mmu->prev_root_hpa,
> +                                           &invalid_list);
> +                if (free_active_root)
> +                        mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
> +                                           &invalid_list);
> +        } else if (free_active_root) {
>                  for (i = 0; i < 4; ++i)
>                          if (mmu->pae_root[i] != 0)
>                                  mmu_free_root_page(vcpu->kvm, &mmu->pae_root[i],
> @@ -3553,7 +3593,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
>          return mmu_alloc_shadow_roots(vcpu);
>  }
>
> -static void mmu_sync_roots(struct kvm_vcpu *vcpu)
> +void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
>  {
>          int i;
>          struct kvm_mmu_page *sp;
> @@ -3565,14 +3605,39 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
>                  return;
>
>          vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
> -        kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
> +
>          if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
>                  hpa_t root = vcpu->arch.mmu.root_hpa;
> +
>                  sp = page_header(root);
> +
> +                /*
> +                 * Even if another VCPU was marking the SP as unsync-ed
> +                 * simultaneously, any guest page table changes are not
> +                 * guaranteed to be visible anyway until this VCPU issues a TLB
> +                 * flush strictly after those changes are made. We only need to
> +                 * ensure that the other CPU sets these flags before any actual
> +                 * changes to the page tables are made. The comments in
> +                 * kvm_unsync_pages() describe what could go wrong if this
> +                 * requirement isn't satisfied.
> +                 */
> +                smp_rmb();
> +                if (!sp->unsync && !sp->unsync_children)
> +                        return;
> +
> +                spin_lock(&vcpu->kvm->mmu_lock);
> +                kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
> +
>                  mmu_sync_children(vcpu, sp);
> +
>                  kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
> +                spin_unlock(&vcpu->kvm->mmu_lock);
>                  return;
>          }
> +
> +        spin_lock(&vcpu->kvm->mmu_lock);
> +        kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
> +
>          for (i = 0; i < 4; ++i) {
>                  hpa_t root = vcpu->arch.mmu.pae_root[i];
>
> @@ -3582,13 +3647,8 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
>                          mmu_sync_children(vcpu, sp);
>                  }
>          }
> -        kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
> -}
>
> -void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
> -{
> -        spin_lock(&vcpu->kvm->mmu_lock);
> -        mmu_sync_roots(vcpu);
> +        kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
>          spin_unlock(&vcpu->kvm->mmu_lock);
>  }
>  EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
> @@ -3948,13 +4008,60 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu,
>          context->root_level = 0;
>          context->shadow_root_level = PT32E_ROOT_LEVEL;
>          context->root_hpa = INVALID_PAGE;
> +        context->prev_root_hpa = INVALID_PAGE;
> +        context->prev_cr3 = INVALID_PAGE;
>          context->direct_map = true;
>          context->nx = false;
>  }
>
> -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu)
> +static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t old_cr3)
>  {
> -        kvm_mmu_free_roots(vcpu);
> +        struct kvm_mmu *mmu = &vcpu->arch.mmu;
> +        gpa_t new_cr3 = kvm_read_cr3(vcpu);
> +
> +        /*
> +         * For now, limit the fast switch to 64-bit hosts+VMs. We may add
> +         * support for 32-bit VMs later.
> +         */
> +        if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL &&
> +            vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
> +                if (old_cr3 == new_cr3) {
> +                        kvm_err("Not expected to reach %s if old and new CR3 are the same\n",
> +                                __func__);
> +                        return false;
> +                }
> +
> +                if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT))
> +                        return false;
> +
> +                swap(mmu->root_hpa, mmu->prev_root_hpa);
> +
> +                if (new_cr3 == mmu->prev_cr3 && VALID_PAGE(mmu->root_hpa)) {
> +                        /*
> +                         * It is possible that the cached previous root page is
> +                         * obsolete because of a change in the MMU
> +                         * generation number. However, that is accompanied by
> +                         * KVM_REQ_MMU_RELOAD, which will free the root that we
> +                         * have set here and allocate a new one.
> +                         */
> +
> +                        kvm_mmu_sync_roots(vcpu);
> +                        __clear_sp_write_flooding_count(
> +                                page_header(mmu->root_hpa));
> +                        mmu->set_cr3(vcpu, mmu->root_hpa);
> +                        return true;
> +                }
> +        }
> +
> +        return false;
> +}
> +
> +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t old_cr3)
> +{
> +        if (!fast_cr3_switch(vcpu, old_cr3))
> +                kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_CURRENT);
> +
> +        vcpu->arch.mmu.prev_cr3 = old_cr3;
>  }
>
>  static unsigned long get_cr3(struct kvm_vcpu *vcpu)
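If I'm following the state machine correctly, for a KPTI guest bouncing
between a kernel CR3 K and a user CR3 U this behaves as follows (my own
walk-through, with root(X) meaning the shadow root built for X):

    initially        root_hpa = root(K), prev_root_hpa = INVALID,
                     prev_cr3 = INVALID

    mov U to %cr3    swap() leaves root_hpa = INVALID and prev_root_hpa =
                     root(K); U != prev_cr3, so the fast path misses and
                     root(U) is built under mmu_lock as before;
                     prev_cr3 becomes K

    mov K to %cr3    swap() brings back root_hpa = root(K); K == prev_cr3
                     and the root is valid, so we only sync roots (now
                     usually without mmu_lock) and call ->set_cr3();
                     prev_root_hpa = root(U), prev_cr3 = U

    mov U to %cr3    symmetric hit in the other direction

So after the first round trip both directions stay off mmu_lock as long as
the guest page tables don't change, which looks right to me.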
> @@ -4432,6 +4539,8 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
>          context->update_pte = paging64_update_pte;
>          context->shadow_root_level = level;
>          context->root_hpa = INVALID_PAGE;
> +        context->prev_root_hpa = INVALID_PAGE;
> +        context->prev_cr3 = INVALID_PAGE;
>          context->direct_map = false;
>  }
>
> @@ -4462,6 +4571,8 @@ static void paging32_init_context(struct kvm_vcpu *vcpu,
>          context->update_pte = paging32_update_pte;
>          context->shadow_root_level = PT32E_ROOT_LEVEL;
>          context->root_hpa = INVALID_PAGE;
> +        context->prev_root_hpa = INVALID_PAGE;
> +        context->prev_cr3 = INVALID_PAGE;
>          context->direct_map = false;
>  }
>
> @@ -4484,6 +4595,8 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
>          context->update_pte = nonpaging_update_pte;
>          context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
>          context->root_hpa = INVALID_PAGE;
> +        context->prev_root_hpa = INVALID_PAGE;
> +        context->prev_cr3 = INVALID_PAGE;
>          context->direct_map = true;
>          context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
>          context->get_cr3 = get_cr3;
> @@ -4565,6 +4678,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
>          context->update_pte = ept_update_pte;
>          context->root_level = PT64_ROOT_4LEVEL;
>          context->root_hpa = INVALID_PAGE;
> +        context->prev_root_hpa = INVALID_PAGE;
> +        context->prev_cr3 = INVALID_PAGE;
>          context->direct_map = false;
>          context->base_role.ad_disabled = !accessed_dirty;
>
> @@ -4667,7 +4782,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load);
>
>  void kvm_mmu_unload(struct kvm_vcpu *vcpu)
>  {
> -        kvm_mmu_free_roots(vcpu);
> +        kvm_mmu_free_roots(vcpu, KVM_MMU_ROOTS_ALL);
>          WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
>  }
>  EXPORT_SYMBOL_GPL(kvm_mmu_unload);
> @@ -5046,6 +5161,8 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
>  {
>          vcpu->arch.walk_mmu = &vcpu->arch.mmu;
>          vcpu->arch.mmu.root_hpa = INVALID_PAGE;
> +        vcpu->arch.mmu.prev_root_hpa = INVALID_PAGE;
> +        vcpu->arch.mmu.prev_cr3 = INVALID_PAGE;
>          vcpu->arch.mmu.translate_gpa = translate_gpa;
>          vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index b2ff74b12ec4..847ce7f0a2c8 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -842,11 +842,13 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
>
>  int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
>  {
> +        unsigned long old_cr3 = kvm_read_cr3(vcpu);
> +
>  #ifdef CONFIG_X86_64
>          cr3 &= ~CR3_PCID_INVD;
>  #endif
>
> -        if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
> +        if (cr3 == old_cr3 && !pdptrs_changed(vcpu)) {
>                  kvm_mmu_sync_roots(vcpu);
>                  kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
>                  return 0;
> @@ -861,7 +863,8 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
>
>          vcpu->arch.cr3 = cr3;
>          __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
> -        kvm_mmu_new_cr3(vcpu);
> +        kvm_mmu_new_cr3(vcpu, old_cr3);
> +
>          return 0;
>  }
>  EXPORT_SYMBOL_GPL(kvm_set_cr3);
>
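And the kvm_set_cr3() hook-up above is, I think, roughly the shape that a
nested vmentry/vmexit user of the same mechanism would take.  Purely
hypothetical, nothing like this is in the patch:

    /* somewhere on the nested transition path, shadow-paging case */
    unsigned long old_cr3 = kvm_read_cr3(vcpu);

    vcpu->arch.cr3 = vmcs12->guest_cr3;     /* or ->host_cr3 on vmexit */
    __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
    kvm_mmu_new_cr3(vcpu, old_cr3);         /* reuse a cached root, if any */

which would be one more argument for making the cache a bit more general
than a single previous root.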