On Mon, Jun 27, 2022 at 02:53:48PM -0700, isaku.yamahata@xxxxxxxxx wrote:
> From: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>

Subject: s/git/bit

> 
> With TDX, all GFNs are private at guest boot time. At run time guest TD
> can explicitly change it to shared from private or vice-versa by MapGPA
> hypercall. If it's specified, the given GFN can't be used as otherwise.
> That's is, if a guest tells KVM that the GFN is shared, it can't be used
> as private. or vice-versa.
> 
> Steal software usable bit, SPTE_SHARED_MASK, for it from MMIO counter to
> record it. Use the bit SPTE_SHARED_MASK in shared or private EPT to
> determine which mapping, shared or private, is allowed. If requested
> mapping isn't allowed, return RET_PF_RETRY to wait for other vcpu to change
> it. The bit is recorded in both shared and private shadow page to avoid
> traverse one more shadow page when resolving KVM page fault.
> 
> The bit needs to be kept over zapping the EPT entry. Currently the EPT
> entry is initialized SHADOW_NONPRESENT_VALUE unconditionally to clear
> SPTE_SHARED_MASK bit. To carry SPTE_SHARED_MASK bit, introduce a helper
> function to get initial value for zapped entry with SPTE_SHARED_MASK bit.
> Replace SHADOW_NONPRESENT_VALUE with it.
> 
> Signed-off-by: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
> ---
>  arch/x86/kvm/mmu/spte.h    | 17 +++++++---
>  arch/x86/kvm/mmu/tdp_mmu.c | 65 ++++++++++++++++++++++++++++++++------
>  2 files changed, 68 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
> index 96312ab4fffb..7c1aaf0e963e 100644
> --- a/arch/x86/kvm/mmu/spte.h
> +++ b/arch/x86/kvm/mmu/spte.h
> @@ -14,6 +14,9 @@
>   */
>  #define SPTE_MMU_PRESENT_MASK		BIT_ULL(11)
>  
> +/* Masks that used to track for shared GPA **/
> +#define SPTE_SHARED_MASK		BIT_ULL(62)
> +
>  /*
>   * TDP SPTES (more specifically, EPT SPTEs) may not have A/D bits, and may also
>   * be restricted to using write-protection (for L2 when CPU dirty logging, i.e.
> @@ -104,7 +107,7 @@ static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
>   * the memslots generation and is derived as follows:
>   *
>   * Bits 0-7 of the MMIO generation are propagated to spte bits 3-10
> - * Bits 8-18 of the MMIO generation are propagated to spte bits 52-62
> + * Bits 8-18 of the MMIO generation are propagated to spte bits 52-61

Should be 8-17.
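If it helps to double-check the arithmetic, here is a throwaway userspace
snippet (not kernel code; the low_*/high_* names are local to this example,
not the kernel macros). Bits 52-61 are 10 bits, so together with the 8 low
bits the SPTE can carry generation bits 0-17:

	#include <assert.h>

	int main(void)
	{
		/* SPTE bit ranges from the quoted comment, after this patch. */
		int low_start = 3,   low_end = 10;	/* carries generation bits 0-7 */
		int high_start = 52, high_end = 61;	/* carries the remaining bits  */

		int low_bits  = low_end  - low_start  + 1;	/* 8  */
		int high_bits = high_end - high_start + 1;	/* 10 */

		assert(low_bits == 8 && high_bits == 10);
		/* Highest MMIO generation bit that fits: 8 + 10 - 1 = 17. */
		assert(low_bits + high_bits - 1 == 17);
		return 0;
	}
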
>   *
>   * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
>   * the MMIO generation number, as doing so would require stealing a bit from
> @@ -118,7 +121,7 @@ static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
>  #define MMIO_SPTE_GEN_LOW_END		10
>  
>  #define MMIO_SPTE_GEN_HIGH_START	52
> -#define MMIO_SPTE_GEN_HIGH_END		62
> +#define MMIO_SPTE_GEN_HIGH_END		61
>  
>  #define MMIO_SPTE_GEN_LOW_MASK		GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
> 						    MMIO_SPTE_GEN_LOW_START)
> @@ -131,7 +134,7 @@ static_assert(!(SPTE_MMU_PRESENT_MASK &
>  #define MMIO_SPTE_GEN_HIGH_BITS	(MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1)
>  
>  /* remember to adjust the comment above as well if you change these */
> -static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11);
> +static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 10);
>  
>  #define MMIO_SPTE_GEN_LOW_SHIFT		(MMIO_SPTE_GEN_LOW_START - 0)
>  #define MMIO_SPTE_GEN_HIGH_SHIFT	(MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS)
> @@ -208,6 +211,7 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
>  /* Removed SPTEs must not be misconstrued as shadow present PTEs. */
>  static_assert(!(__REMOVED_SPTE & SPTE_MMU_PRESENT_MASK));
>  static_assert(!(__REMOVED_SPTE & SHADOW_NONPRESENT_VALUE));
> +static_assert(!(__REMOVED_SPTE & SPTE_SHARED_MASK));
>  
>  /*
>   * See above comment around __REMOVED_SPTE. REMOVED_SPTE is the actual
> @@ -217,7 +221,12 @@ static_assert(!(__REMOVED_SPTE & SHADOW_NONPRESENT_VALUE));
>  
>  static inline bool is_removed_spte(u64 spte)
>  {
> -	return spte == REMOVED_SPTE;
> +	return (spte & ~SPTE_SHARED_MASK) == REMOVED_SPTE;
> +}
> +
> +static inline u64 spte_shared_mask(u64 spte)
> +{
> +	return spte & SPTE_SHARED_MASK;
>  }
>  
>  /*
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index fef6246086a8..4f279700b3cc 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -758,6 +758,11 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
>  	return 0;
>  }
>  
> +static u64 shadow_nonpresent_spte(u64 old_spte)
> +{
> +	return SHADOW_NONPRESENT_VALUE | spte_shared_mask(old_spte);
> +}
> +
>  static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
>  					  struct tdp_iter *iter)
>  {
> @@ -791,7 +796,8 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
>  	 * SHADOW_NONPRESENT_VALUE (which sets "suppress #VE" bit) so it
>  	 * can be set when EPT table entries are zapped.
>  	 */
> -	__kvm_tdp_mmu_write_spte(iter->sptep, SHADOW_NONPRESENT_VALUE);
> +	__kvm_tdp_mmu_write_spte(iter->sptep,
> +				 shadow_nonpresent_spte(iter->old_spte));
>  
>  	return 0;
>  }
> @@ -975,8 +981,11 @@ static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
>  			continue;
>  
>  		if (!shared)
> -			tdp_mmu_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
> -		else if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
> +			tdp_mmu_set_spte(kvm, &iter,
> +					 shadow_nonpresent_spte(iter.old_spte));
> +		else if (tdp_mmu_set_spte_atomic(
> +				 kvm, &iter,
> +				 shadow_nonpresent_spte(iter.old_spte)))
>  			goto retry;
>  	}
>  }
> @@ -1033,7 +1042,8 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
>  		return false;
>  
>  	__tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte,
> -			   SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1,
> +			   shadow_nonpresent_spte(old_spte),
> +			   sp->gfn, sp->role.level + 1,
>  			   true, true, is_private_sp(sp));
>  
>  	return true;
> @@ -1075,11 +1085,20 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
>  			continue;
>  		}
>  
> +		/*
> +		 * SPTE_SHARED_MASK is stored as 4K granularity. The
> +		 * information is lost if we delete upper level SPTE page.
> +		 * TODO: support large page.
> +		 */
> +		if (kvm_gfn_shared_mask(kvm) && iter.level > PG_LEVEL_4K)
> +			continue;
> +
>  		if (!is_shadow_present_pte(iter.old_spte) ||
>  		    !is_last_spte(iter.old_spte, iter.level))
>  			continue;
>  
> -		tdp_mmu_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
> +		tdp_mmu_set_spte(kvm, &iter,
> +				 shadow_nonpresent_spte(iter.old_spte));
>  		flush = true;
>  	}
>  
> @@ -1195,18 +1214,44 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
>  	gfn_t gfn_unalias = iter->gfn & ~kvm_gfn_shared_mask(vcpu->kvm);
>  
>  	WARN_ON(sp->role.level != fault->goal_level);
> +	WARN_ON(is_private_sptep(iter->sptep) != fault->is_private);
>  
> -	/* TDX shared GPAs are no executable, enforce this for the SDV. */
> -	if (kvm_gfn_shared_mask(vcpu->kvm) && !fault->is_private)
> -		pte_access &= ~ACC_EXEC_MASK;
> +	if (kvm_gfn_shared_mask(vcpu->kvm)) {
> +		if (fault->is_private) {
> +			/*
> +			 * SPTE allows only RWX mapping. PFN can't be mapped it
> +			 * as READONLY in GPA.
> +			 */
> +			if (fault->slot && !fault->map_writable)
> +				return RET_PF_RETRY;
> +			/*
> +			 * This GPA is not allowed to map as private. Let
> +			 * vcpu loop in page fault until other vcpu change it
> +			 * by MapGPA hypercall.
> +			 */
> +			if (fault->slot &&

Please consider merging this if into the above "if (fault->slot) {}" block
(see the sketch after this hunk).

> +			    spte_shared_mask(iter->old_spte))
> +				return RET_PF_RETRY;
> +		} else {
> +			/* This GPA is not allowed to map as shared. */
> +			if (fault->slot &&
> +			    !spte_shared_mask(iter->old_spte))
> +				return RET_PF_RETRY;
> +			/* TDX shared GPAs are no executable, enforce this. */
> +			pte_access &= ~ACC_EXEC_MASK;
> +		}
> +	}
>  
>  	if (unlikely(!fault->slot))
>  		new_spte = make_mmio_spte(vcpu, gfn_unalias, pte_access);
> -	else
> +	else {
>  		wrprot = make_spte(vcpu, sp, fault->slot, pte_access,
>  				   gfn_unalias, fault->pfn, iter->old_spte,
>  				   fault->prefetch, true, fault->map_writable,
>  				   &new_spte);
> +		if (spte_shared_mask(iter->old_spte))
> +			new_spte |= SPTE_SHARED_MASK;
> +	}

The if can be eliminated:

	new_spte |= spte_shared_mask(iter->old_spte);
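Putting the two comments together, this part of the function could end up
roughly like below. Untested sketch against the quoted hunk only, reusing the
patch's helpers and locals; it relies on spte_shared_mask() returning either
0 or SPTE_SHARED_MASK:

	if (kvm_gfn_shared_mask(vcpu->kvm)) {
		if (fault->is_private) {
			if (fault->slot) {
				/*
				 * SPTE allows only RWX mapping.  The PFN can't
				 * be mapped as READONLY in GPA.
				 */
				if (!fault->map_writable)
					return RET_PF_RETRY;
				/*
				 * This GPA is not allowed to map as private.
				 * Let the vcpu loop in page fault until
				 * another vcpu changes it by MapGPA hypercall.
				 */
				if (spte_shared_mask(iter->old_spte))
					return RET_PF_RETRY;
			}
		} else {
			/* This GPA is not allowed to map as shared. */
			if (fault->slot &&
			    !spte_shared_mask(iter->old_spte))
				return RET_PF_RETRY;
			/* TDX shared GPAs are not executable, enforce this. */
			pte_access &= ~ACC_EXEC_MASK;
		}
	}

	if (unlikely(!fault->slot))
		new_spte = make_mmio_spte(vcpu, gfn_unalias, pte_access);
	else {
		wrprot = make_spte(vcpu, sp, fault->slot, pte_access,
				   gfn_unalias, fault->pfn, iter->old_spte,
				   fault->prefetch, true, fault->map_writable,
				   &new_spte);
		/* spte_shared_mask() is 0 or SPTE_SHARED_MASK, OR it in directly. */
		new_spte |= spte_shared_mask(iter->old_spte);
	}
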
> 
>  	if (new_spte == iter->old_spte)
>  		ret = RET_PF_SPURIOUS;
> @@ -1509,7 +1554,7 @@ static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
>  	 * invariant that the PFN of a present leaf SPTE can never change.
>  	 * See __handle_changed_spte().
>  	 */
> -	tdp_mmu_set_spte(kvm, iter, SHADOW_NONPRESENT_VALUE);
> +	tdp_mmu_set_spte(kvm, iter, shadow_nonpresent_spte(iter->old_spte));
>  
>  	if (!pte_write(range->pte)) {
>  		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
> -- 
> 2.25.1
> 