Re: [RFC PATCH 02/28] kvm: mmu: Separate pte generation from set_spte

On Thu, Sep 26, 2019 at 04:17:58PM -0700, Ben Gardon wrote:
> Separate the functions for generating leaf page table entries from the
> function that inserts them into the paging structure. This refactoring
> will allow changes to the MMU synchronization model to use atomic
> compare / exchanges (which are not guaranteed to succeed) instead of a
> monolithic MMU lock.
> 
> Signed-off-by: Ben Gardon <bgardon@xxxxxxxxxx>
> ---
>  arch/x86/kvm/mmu.c | 93 ++++++++++++++++++++++++++++------------------
>  1 file changed, 57 insertions(+), 36 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 781c2ca7455e3..7e5ab9c6e2b09 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -2964,21 +2964,15 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
>  #define SET_SPTE_WRITE_PROTECTED_PT	BIT(0)
>  #define SET_SPTE_NEED_REMOTE_TLB_FLUSH	BIT(1)
>  
> -static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
> -		    unsigned pte_access, int level,
> -		    gfn_t gfn, kvm_pfn_t pfn, bool speculative,
> -		    bool can_unsync, bool host_writable)
> +static int generate_pte(struct kvm_vcpu *vcpu, unsigned pte_access, int level,

Similar comment on "generate".  Note, I don't necessarily like the names
get_mmio_spte_value() or get_spte_value() either, as they could be
misinterpreted as reading the value from memory.  Maybe
calc_{mmio_}spte_value()?
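
E.g. a purely illustrative sketch of this patch's new prototype with that
naming, parameters unchanged from the patch:

	static int calc_spte_value(struct kvm_vcpu *vcpu, unsigned pte_access,
				   int level, gfn_t gfn, kvm_pfn_t pfn,
				   u64 old_spte, bool speculative,
				   bool can_unsync, bool host_writable,
				   bool ad_disabled, u64 *sptep);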

> +		    gfn_t gfn, kvm_pfn_t pfn, u64 old_pte, bool speculative,
> +		    bool can_unsync, bool host_writable, bool ad_disabled,
> +		    u64 *ptep)
>  {
> -	u64 spte = 0;
> +	u64 pte;

Renames and unrelated refactoring (leaving the variable uninitialized and
setting it directly to shadow_present_mask) belong in separate patches.
The renames especially make this patch much more difficult to review.  And
I disagree with the rename: I think it's important to keep the "spte"
nomenclature, even though it's a bit of a misnomer for TDP entries, so that
it is easy to differentiate data that is coming from the host PTEs versus
data that is for KVM's MMU.

>  	int ret = 0;
> -	struct kvm_mmu_page *sp;
> -
> -	if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
> -		return 0;
>  
> -	sp = page_header(__pa(sptep));
> -	if (sp_ad_disabled(sp))
> -		spte |= shadow_acc_track_value;
> +	*ptep = 0;
>  
>  	/*
>  	 * For the EPT case, shadow_present_mask is 0 if hardware
> @@ -2986,36 +2980,39 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
>  	 * ACC_USER_MASK and shadow_user_mask are used to represent
>  	 * read access.  See FNAME(gpte_access) in paging_tmpl.h.
>  	 */
> -	spte |= shadow_present_mask;
> +	pte = shadow_present_mask;
> +
> +	if (ad_disabled)
> +		pte |= shadow_acc_track_value;
> +
>  	if (!speculative)
> -		spte |= spte_shadow_accessed_mask(spte);
> +		pte |= spte_shadow_accessed_mask(pte);
>  
>  	if (pte_access & ACC_EXEC_MASK)
> -		spte |= shadow_x_mask;
> +		pte |= shadow_x_mask;
>  	else
> -		spte |= shadow_nx_mask;
> +		pte |= shadow_nx_mask;
>  
>  	if (pte_access & ACC_USER_MASK)
> -		spte |= shadow_user_mask;
> +		pte |= shadow_user_mask;
>  
>  	if (level > PT_PAGE_TABLE_LEVEL)
> -		spte |= PT_PAGE_SIZE_MASK;
> +		pte |= PT_PAGE_SIZE_MASK;
>  	if (tdp_enabled)
> -		spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
> +		pte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
>  			kvm_is_mmio_pfn(pfn));
>  
>  	if (host_writable)
> -		spte |= SPTE_HOST_WRITEABLE;
> +		pte |= SPTE_HOST_WRITEABLE;
>  	else
>  		pte_access &= ~ACC_WRITE_MASK;
>  
>  	if (!kvm_is_mmio_pfn(pfn))
> -		spte |= shadow_me_mask;
> +		pte |= shadow_me_mask;
>  
> -	spte |= (u64)pfn << PAGE_SHIFT;
> +	pte |= (u64)pfn << PAGE_SHIFT;
>  
>  	if (pte_access & ACC_WRITE_MASK) {
> -
>  		/*
>  		 * Other vcpu creates new sp in the window between
>  		 * mapping_level() and acquiring mmu-lock. We can
> @@ -3024,9 +3021,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
>  		 */
>  		if (level > PT_PAGE_TABLE_LEVEL &&
>  		    mmu_gfn_lpage_is_disallowed(vcpu, gfn, level))
> -			goto done;
> +			return 0;
>  
> -		spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
> +		pte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
>  
>  		/*
>  		 * Optimization: for pte sync, if spte was writable the hash
> @@ -3034,30 +3031,54 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
>  		 * is responsibility of mmu_get_page / kvm_sync_page.
>  		 * Same reasoning can be applied to dirty page accounting.
>  		 */
> -		if (!can_unsync && is_writable_pte(*sptep))
> -			goto set_pte;
> +		if (!can_unsync && is_writable_pte(old_pte)) {
> +			*ptep = pte;
> +			return 0;
> +		}
>  
>  		if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
>  			pgprintk("%s: found shadow page for %llx, marking ro\n",
>  				 __func__, gfn);
> -			ret |= SET_SPTE_WRITE_PROTECTED_PT;
> +			ret = SET_SPTE_WRITE_PROTECTED_PT;

More unnecessary refactoring.

>  			pte_access &= ~ACC_WRITE_MASK;
> -			spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
> +			pte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
>  		}
>  	}
>  
> -	if (pte_access & ACC_WRITE_MASK) {
> -		kvm_vcpu_mark_page_dirty(vcpu, gfn);
> -		spte |= spte_shadow_dirty_mask(spte);
> -	}
> +	if (pte_access & ACC_WRITE_MASK)
> +		pte |= spte_shadow_dirty_mask(pte);
>  
>  	if (speculative)
> -		spte = mark_spte_for_access_track(spte);
> +		pte = mark_spte_for_access_track(pte);
> +
> +	*ptep = pte;
> +	return ret;
> +}
> +
> +static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
> +		    int level, gfn_t gfn, kvm_pfn_t pfn, bool speculative,
> +		    bool can_unsync, bool host_writable)
> +{
> +	u64 spte;
> +	int ret;
> +	struct kvm_mmu_page *sp;
> +
> +	if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
> +		return 0;
> +
> +	sp = page_header(__pa(sptep));
> +
> +	ret = generate_pte(vcpu, pte_access, level, gfn, pfn, *sptep,
> +			   speculative, can_unsync, host_writable,
> +			   sp_ad_disabled(sp), &spte);

Yowsers, that's a big prototype.  This is something that came up in an
unrelated internal discussion.  I wonder if it would make sense to define
a struct to hold all of the data needed to insert an spte and pass that
on the stack instead of having a bajillion parameters.  Just spitballing,
no idea if it's feasible and/or reasonable.
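
Something like the below is the rough idea; all names and fields here are
hypothetical:

	/* Everything needed to compute an spte, minus the vcpu. */
	struct spte_calc_info {
		unsigned pte_access;
		int level;
		gfn_t gfn;
		kvm_pfn_t pfn;
		u64 old_spte;
		bool speculative;
		bool can_unsync;
		bool host_writable;
		bool ad_disabled;
	};

	static int calc_spte_value(struct kvm_vcpu *vcpu,
				   const struct spte_calc_info *info,
				   u64 *sptep);

That would at least shrink the prototypes, and in theory would make it
possible to thread new state through without churning every caller.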

> +	if (!spte)
> +		return 0;
> +
> +	if (spte & PT_WRITABLE_MASK)
> +		kvm_vcpu_mark_page_dirty(vcpu, gfn);
>  
> -set_pte:
>  	if (mmu_spte_update(sptep, spte))
>  		ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
> -done:
>  	return ret;
>  }
>  
> -- 
> 2.23.0.444.g18eeb5a265-goog
> 


