Re: [PATCH] KVM: x86/mmu: consider the hva in mmu_notifer retry

Sean Christopherson <seanjc@xxxxxxxxxx> · Mon, 25 Jan 2021 10:35:08 -0800

+Cc the other architectures, I'm guessing this would be a helpful optimization
for all archs.

Quite a few comments, but they're all little more than nits.  Nice!

On Mon, Jan 25, 2021, David Stevens wrote:
> From: David Stevens <stevensd@xxxxxxxxxxxx>
> 
> Use the range passed to mmu_notifer's invalidate_range_start to prevent

s/mmu_notifer/mmu_notifier.  

And maybe avoid calling out invalidate_range_start() by name?  It took me a few
reads to understand it's referring to the function, i.e. the start of the
invalidation, not the start of the range.

> spurious page fault retries due to changes in unrelated host virtual
> addresses.

This needs to elaborate on the exact scenario this is handling, as is it sounds
like KVM is tracking the history of invalidations or something.  Understanding
this patch requires a priori knowledge of mmu_notifier_count.  Something like:

  Track the range being invalidated by mmu_notifier and skip page fault
  retries if the fault address is not affected by the in-progress
  invalidation.  Disable the optimization if multiple invalidations are
  in-progress to keep things simple, as tracking multiple ranges has
  diminishing returns.

> This has the secondary effect of greatly reducing the likelihood of extreme

Out of curiosity, is this really the _secondary_ effect?  I would expect this
change to primarily benefit scenarios where the invalidation has gotten
waylaid for whatever reason.

> latency when handing a page fault due to another thread having been preempted
> while modifying host virtual addresses.
> 
> Signed-off-by: David Stevens <stevensd@xxxxxxxxxxxx>
> ---

...

> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 6d16481aa29d..79166288ed03 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3658,8 +3658,8 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>  }
>  
>  static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
> -			 gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
> -			 bool *writable)
> +			 gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva,
> +			 bool write, bool *writable)

Side topic, I'm all for creating a 'struct kvm_page_fault' or whatever to hold
all these variables.  The helper functions stacks are getting unwieldy.
Definitely doesn't need to be addressed here, this just reminded of how ugly
these stacks are.

>  {
>  	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
>  	bool async;
> @@ -3672,7 +3672,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
>  	}
>  
>  	async = false;
> -	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
> +	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async,
> +				    write, writable, hva);
>  	if (!async)
>  		return false; /* *pfn has correct page already */
>  
> @@ -3686,7 +3687,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
>  			return true;
>  	}
>  
> -	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
> +	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL,
> +				    write, writable, hva);
>  	return false;
>  }
>  
> @@ -3699,6 +3701,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
>  	gfn_t gfn = gpa >> PAGE_SHIFT;
>  	unsigned long mmu_seq;
>  	kvm_pfn_t pfn;
> +	hva_t hva;
>  	int r;
>  
>  	if (page_fault_handle_page_track(vcpu, error_code, gfn))
> @@ -3717,7 +3720,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
>  	mmu_seq = vcpu->kvm->mmu_notifier_seq;
>  	smp_rmb();
>  
> -	if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
> +	if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva,
> +			 write, &map_writable))
>  		return RET_PF_RETRY;
>  
>  	if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))
> @@ -3725,7 +3729,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
>  
>  	r = RET_PF_RETRY;
>  	spin_lock(&vcpu->kvm->mmu_lock);
> -	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
> +	if (mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva))
>  		goto out_unlock;
>  	r = make_mmu_pages_available(vcpu);
>  	if (r)
> diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
> index 50e268eb8e1a..3171784139a4 100644
> --- a/arch/x86/kvm/mmu/paging_tmpl.h
> +++ b/arch/x86/kvm/mmu/paging_tmpl.h
> @@ -790,6 +790,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
>  	struct guest_walker walker;
>  	int r;
>  	kvm_pfn_t pfn;
> +	hva_t hva;
>  	unsigned long mmu_seq;
>  	bool map_writable, is_self_change_mapping;
>  	int max_level;
> @@ -840,8 +841,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
>  	mmu_seq = vcpu->kvm->mmu_notifier_seq;
>  	smp_rmb();
>  
> -	if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
> -			 &map_writable))
> +	if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, &hva,
> +			 write_fault, &map_writable))
>  		return RET_PF_RETRY;
>  
>  	if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
> @@ -869,7 +870,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
>  
>  	r = RET_PF_RETRY;
>  	spin_lock(&vcpu->kvm->mmu_lock);
> -	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
> +	if (mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva))
>  		goto out_unlock;
>  
>  	kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index f3b1013fb22c..b70097685249 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -502,6 +502,8 @@ struct kvm {
>  	struct mmu_notifier mmu_notifier;
>  	unsigned long mmu_notifier_seq;
>  	long mmu_notifier_count;
> +	unsigned long mmu_notifier_range_start;
> +	unsigned long mmu_notifier_range_end;
>  #endif
>  	long tlbs_dirty;
>  	struct list_head devices;
> @@ -729,7 +731,7 @@ kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
>  kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn);
>  kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
>  			       bool atomic, bool *async, bool write_fault,
> -			       bool *writable);
> +			       bool *writable, hva_t *hva);
>  
>  void kvm_release_pfn_clean(kvm_pfn_t pfn);
>  void kvm_release_pfn_dirty(kvm_pfn_t pfn);
> @@ -1203,6 +1205,24 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
>  		return 1;
>  	return 0;
>  }
> +
> +static inline int mmu_notifier_retry_hva(struct kvm *kvm,
> +					 unsigned long mmu_seq,
> +					 unsigned long hva)
> +{
> +	/*
> +	 * Unlike mmu_notifier_retry, this function relies on
> +	 * kvm->mmu_lock for consistency.

mmu_notifier_retry is the outlier due to PPC behavior.  Maybe just add a lockdep
annonation and call it good?

> +	 */

This needs a comment to explicitly state that 'count > 1' cannot be done at
this time.  My initial thought is that it would be more intuitive to check for
'count > 1' here, but that would potentially check the wrong wrange when count
goes from 2->1.  The comment about persistence in invalidate_range_start() is a
good hint, but I think it's worth being explicit to avoid bad "cleanup" in the
future.

> +	if (unlikely(kvm->mmu_notifier_count)) {
> +		if (kvm->mmu_notifier_range_start <= hva &&
> +		    hva < kvm->mmu_notifier_range_end)

Combine these into a single statement?  I think the result is easier to read?

	if (unlikely(kvm->mmu_notifier_count) &&
	    kvm->mmu_notifier_range_start <= hva &&
	    hva < kvm->mmu_notifier_range_end)

> +			return 1;
> +	}
> +	if (kvm->mmu_notifier_seq != mmu_seq)
> +		return 1;
> +	return 0;
> +}
>  #endif
>  
>  #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index fa9e3614d30e..d6e1ef5cb184 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -483,6 +483,18 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
>  	 * count is also read inside the mmu_lock critical section.
>  	 */
>  	kvm->mmu_notifier_count++;
> +	if (likely(kvm->mmu_notifier_count == 1)) {
> +		kvm->mmu_notifier_range_start = range->start;
> +		kvm->mmu_notifier_range_end = range->end;
> +	} else {
> +		/**
> +		 * Tracking multiple concurrent ranges has diminishing returns,
> +		 * so just use the maximum range. This persists until after all
> +		 * outstanding invalidation operations complete.
> +		 */
> +		kvm->mmu_notifier_range_start = 0;
> +		kvm->mmu_notifier_range_end = ULONG_MAX;

Hrm, I don't think there's a corner case in practice, but ULONG_MAX is a legal
virtual address and range_end is exclusive.  E.g. passing hva=-1ul would get a
false negative in mmu_notifier_retry_hva().  It's not an issue as written
because hva is generated from the gfn, and hva can't be a kernel address.  I'm
guessing mmu_notifier also doesn't fire on kernel addresses.  I assume that all
holds true for other architectures, and adding checks in mmu_notifier_retry_hva()
feels like a waste of cycles, but it still bugs me. :-)

Maybe zero out range_end and explicitly check for that, just to be paranoid?

	if (unlikely(kvm->mmu_notifier_count) &&
	    (!kvm->mmu_notifier_range_end ||
            (kvm->mmu_notifier_range_start <= hva &&
             hva < kvm->mmu_notifier_range_end))

> +	}
>  	need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end,
>  					     range->flags);
>  	/* we've to flush the tlb before the pages can be freed */
> @@ -2010,9 +2022,11 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
>  
>  kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
>  			       bool atomic, bool *async, bool write_fault,
> -			       bool *writable)
> +			       bool *writable, hva_t *hva)

Hrm, it feels like we should really split gfn->hva and hva->pfn into separate
operations, but pretty much every arch needs the hva error handling.  Splitting
it would probably do more harm than good, at least not without a lot of
additional refactoring.  Bummer.

>  {
>  	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);

Newline here.

> +	if (hva)
> +		*hva = addr;
>  
>  	if (addr == KVM_HVA_ERR_RO_BAD) {
>  		if (writable)
> @@ -2041,19 +2055,19 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
>  		      bool *writable)
>  {
>  	return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
> -				    write_fault, writable);
> +				    write_fault, writable, NULL);
>  }
>  EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
>  
>  kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
>  {
> -	return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
> +	return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL);
>  }
>  EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
>  
>  kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
>  {
> -	return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
> +	return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL);
>  }
>  EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
>  
> -- 
> 2.30.0.280.ga3ce27912f-goog
>