Re: [PATCH v2 4/5] kvm: x86: mmu: Lockless access tracking for Intel CPUs without EPT A bits.

On 09/11/2016 00:00, Junaid Shahid wrote:
> This change implements lockless access tracking for Intel CPUs without EPT
> A bits. This is achieved by marking the PTEs as not-present (but not
> completely clearing them) when clear_flush_young() is called after marking
> the pages as accessed. When an EPT Violation is generated as a result of
> the VM accessing those pages, the PTEs are restored to their original values.
> 
> Signed-off-by: Junaid Shahid <junaids@xxxxxxxxxx>
> ---
>  arch/x86/include/asm/vmx.h |  39 ++++++
>  arch/x86/kvm/mmu.c         | 314 ++++++++++++++++++++++++++++++++++-----------
>  arch/x86/kvm/mmu.h         |   2 +
>  arch/x86/kvm/vmx.c         |  20 ++-
>  4 files changed, 301 insertions(+), 74 deletions(-)
> 
> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> index 60991fb..3d63098 100644
> --- a/arch/x86/include/asm/vmx.h
> +++ b/arch/x86/include/asm/vmx.h
> @@ -434,6 +434,45 @@ enum vmcs_field {
>  #define VMX_EPT_IPAT_BIT    			(1ull << 6)
>  #define VMX_EPT_ACCESS_BIT				(1ull << 8)
>  #define VMX_EPT_DIRTY_BIT				(1ull << 9)
> +#define VMX_EPT_RWX_MASK                        (VMX_EPT_READABLE_MASK |       \
> +						 VMX_EPT_WRITABLE_MASK |       \
> +						 VMX_EPT_EXECUTABLE_MASK)
> +#define VMX_EPT_MT_MASK				(7ull << VMX_EPT_MT_EPTE_SHIFT)
> +
> +/* The mask to use to trigger an EPT Misconfiguration in order to track MMIO */
> +#define VMX_EPT_MISCONFIG_WX_VALUE		(VMX_EPT_WRITABLE_MASK |       \
> +						 VMX_EPT_EXECUTABLE_MASK)
> +

So far so good.

> +/*
> + * The shift to use for saving the original RWX value when marking the PTE as
> + * not-present for tracking purposes.
> + */
> +#define VMX_EPT_RWX_SAVE_SHIFT			52
> +
> +/*
> + * The shift/mask for determining the type of tracking (if any) being used for a
> + * not-present PTE. Currently, only two bits are used, but more can be added.
> + *
> + * NOTE: Bit 63 is an architecturally ignored bit (and hence can be used for our
> + *       purpose) when the EPT PTE is in a misconfigured state. However, it is
> + *       not necessarily an ignored bit otherwise (even in a not-present state).
> + *       Since the existing MMIO code already uses this bit and since KVM
> + *       doesn't use #VEs currently (where this bit comes into play), so we can
> + *       continue to use it for storing the type. But to be on the safe side,
> + *       we should not set it to 1 in those TRACK_TYPEs where the tracking is
> + *       done via EPT Violations instead of EPT Misconfigurations.
> + */
> +#define VMX_EPT_TRACK_TYPE_SHIFT		62
> +#define VMX_EPT_TRACK_TYPE_MASK			(3ull <<                       \
> +						 VMX_EPT_TRACK_TYPE_SHIFT)
> +
> +/* Sets only bit 62 as the tracking is done by EPT Violations. See note above */
> +#define VMX_EPT_TRACK_ACCESS			(1ull <<                       \
> +						 VMX_EPT_TRACK_TYPE_SHIFT)
> +/* Sets bits 62 and 63. See note above */
> +#define VMX_EPT_TRACK_MMIO			(3ull <<                       \
> +						 VMX_EPT_TRACK_TYPE_SHIFT)
> +

I think this is overengineered.  Let's just say bit 62 in the SPTE
denotes a "special SPTE", covering both the access-tracking and MMIO
kinds; no need to use bit 63 as well.  Let's call it SPTE_SPECIAL_MASK
and, first of all, make KVM use it for the fast MMIO case.

Second, is_shadow_present_pte can just be

	return (pte != 0) && !is_mmio_spte(pte);

Third, we can add this on top, but let's keep the cases simple.  You can
hard-code:

- shadow_acc_track_saved_bits_mask to 7 (for XWR in the EPT case, but it would work just as well for UWP for "traditional" page tables)

- shadow_acc_track_saved_bits_shift to PT64_SECOND_AVAIL_BITS_SHIFT

- shadow_acc_track_value to SPTE_SPECIAL_MASK aka bit 62

In the end, all that needs customization is shadow_acc_track_mask.  
That one is 0 if fast access tracking is not used, and 
VMX_EPT_RWX_MASK|SPTE_SPECIAL_MASK for ept && !eptad.  So you can add a 
single argument to kvm_mmu_set_mask_ptes.
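
Roughly (untested, just to show the shape I have in mind; the exact names
are of course up to you):

	/* An SPTE bit the CPU ignores, so software can use it (see above). */
	#define SPTE_SPECIAL_MASK			(1ull << 62)

	/* These no longer need per-vendor configuration: */
	#define SHADOW_ACC_TRACK_SAVED_BITS_MASK	7ull
	#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT	PT64_SECOND_AVAIL_BITS_SHIFT
	#define SHADOW_ACC_TRACK_VALUE			SPTE_SPECIAL_MASK

	/* The only remaining knob; 0 means fast access tracking is unused. */
	static u64 __read_mostly shadow_acc_track_mask;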

> @@ -490,6 +532,9 @@ static bool spte_has_volatile_bits(u64 spte)
>  	if (spte_can_locklessly_be_made_writable(spte))
>  		return true;
>  
> +	if (is_access_track_spte(spte))
> +		return true;
> +
>  	if (!shadow_accessed_mask)
>  		return false;

I think it's worth rewriting the whole function:

	if (spte_can_locklessly_be_made_writable(spte))
		return true;

	if (!is_shadow_present_pte(spte))
		return false;

	if (shadow_accessed_mask)
		return ((spte & shadow_accessed_mask) == 0 ||
			(is_writable_pte(spte) &&
			 (spte & shadow_dirty_mask) == 0));
	else
		return is_access_track_spte(spte);

> @@ -533,17 +578,21 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
>   * will find a read-only spte, even though the writable spte
>   * might be cached on a CPU's TLB, the return value indicates this
>   * case.
> + *
> + * Returns true if the TLB needs to be flushed
>   */
>  static bool mmu_spte_update(u64 *sptep, u64 new_spte)
>  {
>  	u64 old_spte = *sptep;
> -	bool ret = false;
> +	bool flush = false;
> +	bool writable_cleared;
> +	bool acc_track_enabled;
>  
>  	WARN_ON(!is_shadow_present_pte(new_spte));
>  
>  	if (!is_shadow_present_pte(old_spte)) {
>  		mmu_spte_set(sptep, new_spte);
> -		return ret;
> +		return flush;
>  	}
>  
>  	if (!spte_has_volatile_bits(old_spte))
> @@ -551,24 +600,16 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
>  	else
>  		old_spte = __update_clear_spte_slow(sptep, new_spte);
>  
> +	BUG_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));

WARN_ON, please.

>  	/*
>  	 * For the spte updated out of mmu-lock is safe, since
>  	 * we always atomically update it, see the comments in
>  	 * spte_has_volatile_bits().
>  	 */
>  	if (spte_can_locklessly_be_made_writable(old_spte) &&
> -	      !is_writable_pte(new_spte))
> -		ret = true;
> -
> -	if (!shadow_accessed_mask) {
> -		/*
> -		 * We don't set page dirty when dropping non-writable spte.
> -		 * So do it now if the new spte is becoming non-writable.
> -		 */
> -		if (ret)
> -			kvm_set_pfn_dirty(spte_to_pfn(old_spte));
> -		return ret;
> -	}
> +	    !is_writable_pte(new_spte))
> +		flush = true;
>  
>  	/*
>  	 * Flush TLB when accessed/dirty bits are changed in the page tables,
> @@ -576,20 +617,34 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
>  	 */
>  	if (spte_is_bit_changed(old_spte, new_spte,
>                                  shadow_accessed_mask | shadow_dirty_mask))
> -		ret = true;
> +		flush = true;
>  
> -	if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
> +	writable_cleared = is_writable_pte(old_spte) &&
> +			   !is_writable_pte(new_spte);
> +	acc_track_enabled = !is_access_track_spte(old_spte) &&
> +			    is_access_track_spte(new_spte);
> +
> +	if (writable_cleared || acc_track_enabled)
> +		flush = true;
> +
> +	if (shadow_accessed_mask ?
> +	    spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask) :
> +	    acc_track_enabled)

Please introduce a new function spte_is_access_tracking_enabled(u64 
old_spte, u64 new_spte) and use it here:

	if (shadow_accessed_mask ?
	    spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask) :
	    spte_is_access_tracking_enabled(old_spte, new_spte)) {
		flush |= !shadow_accessed_mask;
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
	}
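
where the helper would simply be something like

	static bool spte_is_access_tracking_enabled(u64 old_spte, u64 new_spte)
	{
		return !is_access_track_spte(old_spte) &&
		       is_access_track_spte(new_spte);
	}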


>  		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
> -	if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
> +
> +	if (shadow_dirty_mask ?
> +	    spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask) :
> +	    writable_cleared)

writable_cleared can be inlined here and written as

	spte_is_bit_cleared(old_spte, new_spte, PT_WRITABLE_MASK)

so

	if (shadow_dirty_mask ?
	    spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask) :
	    spte_is_bit_cleared(old_spte, new_spte, PT_WRITABLE_MASK)) {
		flush |= !shadow_dirty_mask;
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
	}
>  		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
>  
> -	return ret;
> +	return flush;
>  }

But please move the changes to this function, except of course for
introducing is_access_track_spte, into a separate function first.

>  /*
>   * Rules for using mmu_spte_clear_track_bits:
>   * It sets the sptep from present to nonpresent, and track the
>   * state bits, it is used to clear the last level sptep.
> + * Returns non-zero if the PTE was previously valid.
>   */
>  static int mmu_spte_clear_track_bits(u64 *sptep)
>  {
> @@ -604,6 +659,13 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
>  	if (!is_shadow_present_pte(old_spte))
>  		return 0;
>  
> +	/*
> +	 * For access tracking SPTEs, the pfn was already marked accessed/dirty
> +	 * when the SPTE was marked for access tracking, so nothing to do here.
> +	 */
> +	if (is_access_track_spte(old_spte))
> +		return 1;

This should go after the "WARN_ON", since that's a valuable check.  In 
addition, I think it's a good idea to keep similar idioms between 
mmu_spte_update and mmu_spte_clear_track_bits, like this:

	if (shadow_accessed_mask
	    ? old_spte & shadow_accessed_mask
	    : !is_access_track_spte(old_spte))
		kvm_set_pfn_accessed(pfn);
	if (shadow_dirty_mask
	    ? old_spte & shadow_dirty_mask
	    : old_spte & PT_WRITABLE_MASK)
		kvm_set_pfn_dirty(pfn);

	return 1;

or (you pick)

	if (shadow_accessed_mask
	    ? !(old_spte & shadow_accessed_mask)
	    : is_access_track_spte(old_spte))
		return 1;

	kvm_set_pfn_accessed(pfn);
	if (shadow_dirty_mask
	    ? old_spte & shadow_dirty_mask
	    : old_spte & PT_WRITABLE_MASK)
		kvm_set_pfn_dirty(pfn);

	return 1;

>  	pfn = spte_to_pfn(old_spte);
>  
>  	/*
> @@ -618,6 +680,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
>  	if (old_spte & (shadow_dirty_mask ? shadow_dirty_mask :
>  					    PT_WRITABLE_MASK))
>  		kvm_set_pfn_dirty(pfn);
> +
>  	return 1;
>  }
>  
> @@ -636,6 +699,52 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
>  	return __get_spte_lockless(sptep);
>  }
>  
> +static u64 mark_spte_for_access_track(u64 spte)
> +{
> +	if (shadow_acc_track_mask == 0)
> +		return spte;

Should this return spte & ~shadow_accessed_mask if shadow_accessed_mask 
is nonzero?  See for example:

 			new_spte &= ~shadow_accessed_mask;
 
+			new_spte = mark_spte_for_access_track(new_spte);
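
i.e. perhaps simply

	if (shadow_acc_track_mask == 0)
		return spte & ~shadow_accessed_mask;

which also does the right thing when shadow_accessed_mask is zero.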

> +	/*
> +	 * Verify that the write-protection that we do below will be fixable
> +	 * via the fast page fault path. Currently, that is always the case, at
> +	 * least when using EPT (which is when access tracking would be used).
> +	 */
> +	WARN_ONCE((spte & PT_WRITABLE_MASK) &&
> +		  !spte_can_locklessly_be_made_writable(spte),
> +		  "Writable SPTE is not locklessly dirty-trackable\n");
> +
> +	/*
> +	 * Any PTE marked for access tracking should also be marked for dirty
> +	 * tracking (by being non-writable)
> +	 */
> +	spte &= ~PT_WRITABLE_MASK;

This should be implicit in the definition of 
shadow_acc_track_mask/value, so it's not necessary (or it can be a 
WARN_ONCE).

> +	spte &= ~(shadow_acc_track_saved_bits_mask <<
> +		  shadow_acc_track_saved_bits_shift);

Should there be a WARN_ON if these bits are not zero?
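
e.g. something like

	WARN_ON_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
			     shadow_acc_track_saved_bits_shift));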

> +	spte |= (spte & shadow_acc_track_saved_bits_mask) <<
> +		shadow_acc_track_saved_bits_shift;
> +	spte &= ~shadow_acc_track_mask;
> +	spte |= shadow_acc_track_value;
> +
> +	return spte;
> +}
> +
> +/* Returns true if the TLB needs to be flushed */
> +static bool mmu_spte_enable_access_track(u64 *sptep)
> +{
> +	u64 spte = mmu_spte_get_lockless(sptep);
> +
> +	if (is_access_track_spte(spte))
> +		return false;
> +
> +	/* Access tracking should not be enabled if CPU supports A/D bits */
> +	BUG_ON(shadow_accessed_mask != 0);

WARN_ONCE please.  However, I think mmu_spte_enable_access_track should 
be renamed to mmu_spte_age and handle the shadow_accessed_mask case as

	clear_bit((ffs(shadow_accessed_mask) - 1),
		  (unsigned long *)sptep);

similar to kvm_age_rmapp.  See below about kvm_age_rmapp.
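
Roughly (untested; adjust the return convention to whatever the callers
end up needing):

	/* Returns true if the SPTE was found to be accessed (i.e. young). */
	static bool mmu_spte_age(u64 *sptep)
	{
		u64 spte = mmu_spte_get_lockless(sptep);

		if (shadow_accessed_mask) {
			if (!(spte & shadow_accessed_mask))
				return false;

			clear_bit((ffs(shadow_accessed_mask) - 1),
				  (unsigned long *)sptep);
			return true;
		}

		if (is_access_track_spte(spte))
			return false;

		/* mmu_spte_update does the accessed/dirty bookkeeping. */
		mmu_spte_update(sptep, mark_spte_for_access_track(spte));
		return true;
	}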


> +	spte = mark_spte_for_access_track(spte);
> +
> +	return mmu_spte_update(sptep, spte);
> +}
> +
>  static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
>  {
>  	/*
> @@ -1403,6 +1512,25 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
>  	return kvm_zap_rmapp(kvm, rmap_head);
>  }
>  
> +static int kvm_acc_track_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
> +			       struct kvm_memory_slot *slot, gfn_t gfn,
> +			       int level, unsigned long data)
> +{
> +	u64 *sptep;
> +	struct rmap_iterator iter;
> +	int need_tlb_flush = 0;
> +
> +	for_each_rmap_spte(rmap_head, &iter, sptep) {
> +

Unnecessary blank line---but see below about kvm_acc_track_rmapp.

> +		rmap_printk("kvm_acc_track_rmapp: spte %p %llx gfn %llx (%d)\n",
> +			    sptep, *sptep, gfn, level);
> +
> +		need_tlb_flush |= mmu_spte_enable_access_track(sptep);
> +	}
> +
> +	return need_tlb_flush;
> +}
> +
>  static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
>  			     struct kvm_memory_slot *slot, gfn_t gfn, int level,
>  			     unsigned long data)
> @@ -1419,8 +1547,9 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
>  
>  restart:
>  	for_each_rmap_spte(rmap_head, &iter, sptep) {
> +

Unnecessary blank line.

>  		rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
> -			     sptep, *sptep, gfn, level);
> +			    sptep, *sptep, gfn, level);
>  
>  		need_flush = 1;
>  
> @@ -1435,6 +1564,8 @@ restart:
>  			new_spte &= ~SPTE_HOST_WRITEABLE;
>  			new_spte &= ~shadow_accessed_mask;
>  
> +			new_spte = mark_spte_for_access_track(new_spte);
> +
>  			mmu_spte_clear_track_bits(sptep);
>  			mmu_spte_set(sptep, new_spte);
>  		}
> @@ -1615,24 +1746,14 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
>  {
>  	u64 *sptep;
>  	struct rmap_iterator iter;
> -	int young = 0;
> -
> -	/*
> -	 * If there's no access bit in the secondary pte set by the
> -	 * hardware it's up to gup-fast/gup to set the access bit in
> -	 * the primary pte or in the page structure.
> -	 */
> -	if (!shadow_accessed_mask)
> -		goto out;
>  
>  	for_each_rmap_spte(rmap_head, &iter, sptep) {
> -		if (*sptep & shadow_accessed_mask) {
> -			young = 1;
> -			break;
> -		}
> +		if ((*sptep & shadow_accessed_mask) ||
> +		    (!shadow_accessed_mask && !is_access_track_spte(*sptep)))

This can also be written, like before, as

	if (shadow_accessed_mask
	    ? *sptep & shadow_accessed_mask
	    : !is_access_track_spte(*sptep))

Introducing a new helper is_accessed_spte starts to look like a good idea!
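
Something like

	static bool is_accessed_spte(u64 spte)
	{
		return shadow_accessed_mask ? spte & shadow_accessed_mask
					    : !is_access_track_spte(spte);
	}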

> +			return 1;
>  	}
> -out:
> -	return young;
> +
> +	return 0;
>  }
>  
>  #define RMAP_RECYCLE_THRESHOLD 1000
> @@ -1669,7 +1790,9 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
>  		 */
>  		kvm->mmu_notifier_seq++;
>  		return kvm_handle_hva_range(kvm, start, end, 0,
> -					    kvm_unmap_rmapp);
> +					    shadow_acc_track_mask != 0
> +					    ? kvm_acc_track_rmapp
> +					    : kvm_unmap_rmapp);

Please rewrite kvm_age_rmapp to use the new mmu_spte_age instead, and test

	if (shadow_accessed_mask || shadow_acc_track_mask)

in kvm_age_hva.
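
i.e. roughly

	static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
				 struct kvm_memory_slot *slot, gfn_t gfn,
				 int level, unsigned long data)
	{
		u64 *sptep;
		struct rmap_iterator iter;
		int young = 0;

		for_each_rmap_spte(rmap_head, &iter, sptep)
			young |= mmu_spte_age(sptep);

		trace_kvm_age_page(gfn, level, slot, young);
		return young;
	}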


> @@ -2877,16 +3003,27 @@ static bool page_fault_can_be_fast(u32 error_code)
>  	if (unlikely(error_code & PFERR_RSVD_MASK))
>  		return false;
>  
> -	/*
> -	 * #PF can be fast only if the shadow page table is present and it
> -	 * is caused by write-protect, that means we just need change the
> -	 * W bit of the spte which can be done out of mmu-lock.
> -	 */
> -	if (!(error_code & PFERR_PRESENT_MASK) ||
> -	      !(error_code & PFERR_WRITE_MASK))
> +	/* See if the page fault is due to an NX violation */
> +	if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))
> +		      == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))))
>  		return false;
>  
> -	return true;
> +	/*
> +	 * #PF can be fast if:
> +	 * 1. The shadow page table entry is not present, which could mean that
> +	 *    the fault is potentially caused by access tracking (if enabled).
> +	 * 2. The shadow page table entry is present and the fault
> +	 *    is caused by write-protect, that means we just need change the W
> +	 *    bit of the spte which can be done out of mmu-lock.
> +	 *
> +	 * However, if Access Tracking is disabled, then the first condition
> +	 * above cannot be handled by the fast path. So if access tracking is
> +	 * disabled, we return true only if the second condition is met.

Better:

However, if access tracking is disabled we know that a non-present page 
must be a genuine page fault where we have to create a new SPTE.  So, 
if access tracking is disabled, we return true only for write accesses 
to a present page.

> +	 */
> +
> +	return shadow_acc_track_mask != 0 ||
> +	       ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK))
> +		== (PFERR_WRITE_MASK | PFERR_PRESENT_MASK));
>  }
>  
>  /*
> @@ -2895,17 +3032,24 @@ static bool page_fault_can_be_fast(u32 error_code)
>   */
>  static bool
>  fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> -			u64 *sptep, u64 spte)
> +			u64 *sptep, u64 old_spte,
> +			bool remove_write_prot, bool remove_acc_track)
>  {
>  	gfn_t gfn;
> +	u64 new_spte = old_spte;
>  
>  	WARN_ON(!sp->role.direct);
>  
> -	/*
> -	 * The gfn of direct spte is stable since it is calculated
> -	 * by sp->gfn.
> -	 */
> -	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
> +	if (remove_acc_track) {
> +		u64 saved_bits = old_spte & (shadow_acc_track_saved_bits_mask <<
> +					     shadow_acc_track_saved_bits_shift);

You can shift right old_spte here...

> +		new_spte &= ~shadow_acc_track_mask;
> +		new_spte |= saved_bits >> shadow_acc_track_saved_bits_shift;

... instead of shifting saved_bits left here.

> +	}
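
I.e. something like

	if (remove_acc_track) {
		u64 restored = (old_spte >> shadow_acc_track_saved_bits_shift) &
			       shadow_acc_track_saved_bits_mask;

		new_spte &= ~shadow_acc_track_mask;
		new_spte |= restored;
	}
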
> +
> +	if (remove_write_prot)
> +		new_spte |= PT_WRITABLE_MASK;
>  
>  	/*
>  	 * Theoretically we could also set dirty bit (and flush TLB) here in
> @@ -2919,10 +3063,17 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
>  	 *
>  	 * Compare with set_spte where instead shadow_dirty_mask is set.
>  	 */
> -	if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) != spte)
> +	if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
>  		return false;
>  
> -	kvm_vcpu_mark_page_dirty(vcpu, gfn);
> +	if (remove_write_prot) {

Stupid question ahead: why not call kvm_vcpu_mark_page_accessed in this
function?

> +		/*
> +		 * The gfn of direct spte is stable since it is
> +		 * calculated by sp->gfn.
> +		 */
> +		gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
> +		kvm_vcpu_mark_page_dirty(vcpu, gfn);
> +	}
>  
>  	return true;
>  }
> @@ -2937,7 +3088,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
>  {
>  	struct kvm_shadow_walk_iterator iterator;
>  	struct kvm_mmu_page *sp;
> -	bool ret = false;
> +	bool fault_handled = false;
>  	u64 spte = 0ull;
>  	uint retry_count = 0;
>  
> @@ -2953,36 +3104,43 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
>  			break;
>  
>  	do {
> -		/*
> -		 * If the mapping has been changed, let the vcpu fault on the
> -		 * same address again.
> -		 */
> -		if (!is_shadow_present_pte(spte)) {
> -			ret = true;
> -			break;
> -		}
> +		bool remove_write_prot = (error_code & PFERR_WRITE_MASK) &&
> +					 !(spte & PT_WRITABLE_MASK);
> +		bool remove_acc_track;
> +		bool valid_exec_access = (error_code & PFERR_FETCH_MASK) &&
> +					 (spte & shadow_x_mask);
>  
>  		sp = page_header(__pa(iterator.sptep));
>  		if (!is_last_spte(spte, sp->role.level))
>  			break;
>  
>  		/*
> -		 * Check if it is a spurious fault caused by TLB lazily flushed.
> +		 * Check whether the memory access that caused the fault would
> +		 * still cause it if it were to be performed right now. If not,
> +		 * then this is a spurious fault caused by TLB lazily flushed,
> +		 * or some other CPU has already fixed the PTE after the
> +		 * current CPU took the fault.
> +		 *
> +		 * If Write-Only mappings ever become supported, then the
> +		 * condition below would need to be changed appropriately.
>  		 *
>  		 * Need not check the access of upper level table entries since
>  		 * they are always ACC_ALL.
>  		 */
> -		if (is_writable_pte(spte)) {
> -			ret = true;
> +		if (((spte & PT_PRESENT_MASK) && !remove_write_prot) ||
> +		    valid_exec_access) {
> +			fault_handled = true;
>  			break;
>  		}

Let's separate the three conditions (R/W/X):

		if (error_code & PFERR_FETCH_MASK) {
			if ((spte & (shadow_x_mask|shadow_nx_mask))
			    == shadow_x_mask) {
				fault_handled = true;
				break;
			}
		}
		if (error_code & PFERR_WRITE_MASK) {
			if (is_writable_pte(spte)) {
				fault_handled = true;
				break;
			}
			remove_write_prot =
				spte_can_locklessly_be_made_writable(spte);
		}
		if (!(error_code & PFERR_PRESENT_MASK)) {
			if (!is_access_track_spte(spte)) {
				fault_handled = true;
				break;
			}
			remove_acc_track = true;
		}

> +		remove_acc_track = is_access_track_spte(spte);
> +
>  		/*
> -		 * Currently, to simplify the code, only the spte
> -		 * write-protected by dirty-log can be fast fixed.
> +		 * Currently, to simplify the code, write-protection can be
> +		 * removed in the fast path only if the SPTE was write-protected
> +		 * for dirty-logging.
>  		 */
> -		if (!spte_can_locklessly_be_made_writable(spte))
> -			break;
> +		remove_write_prot &= spte_can_locklessly_be_made_writable(spte);
>  
>  		/*
>  		 * Do not fix write-permission on the large spte since we only
> @@ -2998,13 +3156,20 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
>  		if (sp->role.level > PT_PAGE_TABLE_LEVEL)
>  			break;
>  
> +		/* Verify that the fault can be handled in the fast path */
> +		if (!remove_acc_track && !remove_write_prot)
> +			break;
> +
>  		/*
>  		 * Currently, fast page fault only works for direct mapping
>  		 * since the gfn is not stable for indirect shadow page. See
>  		 * Documentation/virtual/kvm/locking.txt to get more detail.
>  		 */
> -		ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte);
> -		if (ret)
> +		fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
> +							iterator.sptep, spte,
> +							remove_write_prot,
> +							remove_acc_track);
> +		if (fault_handled)
>  			break;
>  
>  		if (++retry_count > 4) {
> @@ -3018,10 +3183,10 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
>  	} while (true);
>  
>  	trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
> -			      spte, ret);
> +			      spte, fault_handled);
>  	walk_shadow_page_lockless_end(vcpu);
>  
> -	return ret;
> +	return fault_handled;
>  }
>  
>  static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
> @@ -4300,6 +4465,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
>  	vcpu->arch.mmu.update_pte(vcpu, sp, spte, new);
>  }
>  
> +/* This is only supposed to be used for non-EPT mappings */

It's only used for non-EPT mappings, so why is it only *supposed* to be
used for non-EPT mappings?  It seems to me that it would work.

>  static bool need_remote_flush(u64 old, u64 new)
>  {
>  	if (!is_shadow_present_pte(old))
> @@ -5067,6 +5233,8 @@ static void mmu_destroy_caches(void)
>  
>  int kvm_mmu_module_init(void)
>  {
> +	kvm_mmu_clear_all_pte_masks();
> +
>  	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
>  					    sizeof(struct pte_list_desc),
>  					    0, 0, NULL);
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index ddc56e9..dfd3056 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -52,6 +52,8 @@ static inline u64 rsvd_bits(int s, int e)
>  }
>  
>  void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
> +void kvm_mmu_set_access_track_masks(u64 acc_track_mask, u64 acc_track_value,
> +				    u64 saved_bits_mask, u64 saved_bits_shift);
>  
>  void
>  reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 88e3b02..363517e 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -5019,7 +5019,22 @@ static void ept_set_mmio_spte_mask(void)
>  	 * Also, magic bits (0x3ull << 62) is set to quickly identify mmio
>  	 * spte.
>  	 */
> -	kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull);
> +	kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE |
> +				   VMX_EPT_TRACK_MMIO);
> +}
> +
> +static void ept_set_acc_track_spte_mask(void)
> +{
> +	/*
> +	 * For access track PTEs we use a non-present PTE to trigger an EPT
> +	 * Violation. The original RWX value is saved in some unused bits in
> +	 * the PTE and restored when the violation is fixed.
> +	 */
> +	kvm_mmu_set_access_track_masks(VMX_EPT_RWX_MASK |
> +				       VMX_EPT_TRACK_TYPE_MASK,
> +				       VMX_EPT_TRACK_ACCESS,
> +				       VMX_EPT_RWX_MASK,
> +				       VMX_EPT_RWX_SAVE_SHIFT);
>  }
>  
>  #define VMX_XSS_EXIT_BITMAP 0
> @@ -6551,6 +6566,9 @@ static __init int hardware_setup(void)
>  				      0ull : VMX_EPT_READABLE_MASK);
>  		ept_set_mmio_spte_mask();
>  		kvm_enable_tdp();
> +
> +		if (!enable_ept_ad_bits)
> +			ept_set_acc_track_spte_mask();

Let's put the whole "then" block in a single function vmx_enable_tdp.
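
e.g. (just a sketch; the existing calls move as-is):

	static void vmx_enable_tdp(void)
	{
		/*
		 * The existing kvm_mmu_set_mask_ptes() and
		 * ept_set_mmio_spte_mask() calls move here unchanged, then:
		 */
		if (!enable_ept_ad_bits)
			ept_set_acc_track_spte_mask();

		kvm_enable_tdp();
	}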

Thanks,

Paolo

>  	} else
>  		kvm_disable_tdp();
>  
> 

