----- junaids@xxxxxxxxxx wrote:

> When PCIDs are enabled, the MSb of the source operand for a MOV-to-CR3
> instruction indicates that the TLB doesn't need to be flushed.
>
> This change enables this optimization for MOV-to-CR3s in the guest
> that have been intercepted by KVM for shadow paging and are handled
> within the fast CR3 switch path.
>
> Signed-off-by: Junaid Shahid <junaids@xxxxxxxxxx>
> ---
>  arch/x86/include/asm/kvm_host.h |  2 +-
>  arch/x86/kvm/mmu.c              | 10 ++++++----
>  arch/x86/kvm/x86.c              | 11 ++++++++---
>  3 files changed, 15 insertions(+), 8 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 9b30003b4429..9f97ed12c951 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1305,7 +1305,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code,
>                         void *insn, int insn_len);
>  void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
>  void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
> -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t old_cr3);
> +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t old_cr3, bool skip_tlb_flush);
>
>  void kvm_enable_tdp(void);
>  void kvm_disable_tdp(void);
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 1258b8e4718b..7ecd25a21534 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -4014,7 +4014,8 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu,
>          context->nx = false;
>  }
>
> -static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t old_cr3)
> +static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t old_cr3,
> +                            bool skip_tlb_flush)
>  {
>          struct kvm_mmu *mmu = &vcpu->arch.mmu;
>          gpa_t new_cr3 = kvm_read_cr3(vcpu);
> @@ -4050,7 +4051,8 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t old_cr3)
>                          kvm_mmu_sync_roots(vcpu);
>                          __clear_sp_write_flooding_count(
>                                  page_header(mmu->root_hpa));
> -                        mmu->set_cr3(vcpu, mmu->root_hpa | pcid, false);
> +                        mmu->set_cr3(vcpu, mmu->root_hpa | pcid,
> +                                     skip_tlb_flush);
>                          return true;
>                  }
>          }
> @@ -4058,9 +4060,9 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t old_cr3)
>          return false;
>  }
>
> -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t old_cr3)
> +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t old_cr3, bool skip_tlb_flush)
>  {
> -        if (!fast_cr3_switch(vcpu, old_cr3))
> +        if (!fast_cr3_switch(vcpu, old_cr3, skip_tlb_flush))
>                  kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_CURRENT);
>
>          vcpu->arch.mmu.prev_cr3 = old_cr3;
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 5a19d220a9c3..baeb8447ede2 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -843,17 +843,22 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
>  int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
>  {
>          unsigned long old_cr3 = kvm_read_cr3(vcpu);
> +        bool skip_tlb_flush = false;
>
>  #ifdef CONFIG_X86_64
>          bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
>
> -        if (pcid_enabled)
> +        if (pcid_enabled) {
> +                skip_tlb_flush = cr3 & CR3_PCID_INVD;
>                  cr3 &= ~CR3_PCID_INVD;
> +        }
>  #endif
>
>          if (cr3 == old_cr3 && !pdptrs_changed(vcpu)) {
>                  kvm_mmu_sync_roots(vcpu);
> -                kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
> +
> +                if (!skip_tlb_flush)
> +                        kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
>                  return 0;
>          }
>
> @@ -866,7 +871,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
>
>          vcpu->arch.cr3 = cr3;
>          __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
> -        kvm_mmu_new_cr3(vcpu, old_cr3);
> +        kvm_mmu_new_cr3(vcpu, old_cr3, skip_tlb_flush);
>
>          return 0;
>  }
> --
> 2.17.0.441.gb46fe60e1d-goog

I think this patch breaks the INVPCID handling of a previous patch in the series.

In patch 7 ("kvm: vmx: Support INVPCID in shadow paging mode"), INVPCID was implemented such that it only syncs SPTs (shadow page tables) and flushes the TLB when INVPCID is invoked with the currently active PCID. This is because it assumes that on every guest CR3 switch the host will both sync SPTs and flush the TLB, since the guest CR3 switch causes the host to switch CR3 as well. However, this patch partially breaks that assumption by not flushing TLB entries when switching between "current_cr3" and "prev_cr3" with bit CR3_PCID_INVD set.

I am thinking about the following scenario. Let's assume {current_cr3=X, current_pcid=A, prev_cr3=Y, prev_pcid=B}. What if the guest now executes INVPCID for address G and PCID B (individual-address invalidation), and then immediately switches CR3 to value Y (==prev_cr3) with bit CR3_PCID_INVD set?

In this case, the INVPCID handling does nothing because current_pcid != B, but we also don't flush the TLB when the CR3 switch to prev_cr3=Y is done. Therefore, the physical TLB could still hold a stale entry that maps G under PCID B, in contrast to a real CPU, which would honor the INVPCID.
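
To make the sequence concrete, here is a rough guest-side sketch of what I have in mind. This is only an illustration, not code from the series: invpcid_flush_one()/native_write_cr3()/X86_CR3_PCID_NOFLUSH are stand-ins for the raw INVPCID and MOV-to-CR3 operations, and prev_cr3_y/addr_g/pcid_b correspond to the Y/G/B placeholders above.

#include <asm/tlbflush.h>          /* invpcid_flush_one() */
#include <asm/special_insns.h>     /* native_write_cr3() */
#include <asm/processor-flags.h>   /* X86_CR3_PCID_NOFLUSH (bit 63) */

/*
 * Illustrative guest-side sequence.  The guest is currently running
 * with CR3 = X / PCID = A, while prev_cr3_y carries PCID B and addr_g
 * still has a translation cached in the TLB under PCID B.
 */
static void guest_stale_tlb_sequence(unsigned long prev_cr3_y,
                                     unsigned long addr_g,
                                     unsigned long pcid_b)
{
        /*
         * Individual-address invalidation for (PCID B, G).  Since
         * current_pcid == A != B, the INVPCID handler from patch 7
         * neither syncs SPTs nor flushes the TLB.
         */
        invpcid_flush_one(pcid_b, addr_g);

        /*
         * MOV to CR3 back to Y with bit 63 set (CR3_PCID_INVD in KVM,
         * X86_CR3_PCID_NOFLUSH on the guest side).  With this patch
         * the fast CR3 switch path passes skip_tlb_flush == true, so
         * the host does not flush here either, and the stale
         * translation for G under PCID B remains usable.
         */
        native_write_cr3(prev_cr3_y | X86_CR3_PCID_NOFLUSH);
}

If that reading is right, the fast CR3 switch path probably needs to either flush when returning to prev_cr3, or the INVPCID handler needs to also act on the PCID cached in prev_cr3.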