Re: [PATCH 3/3] KVM: x86/mmu: Split huge pages mapped by the TDP MMU on fault

Sean Christopherson <seanjc@xxxxxxxxxx> · Thu, 7 Apr 2022 19:39:42 +0000

On Fri, Apr 01, 2022, David Matlack wrote:
> ---
>  arch/x86/kvm/mmu/tdp_mmu.c | 37 +++++++++++++++++++++++++------------
>  1 file changed, 25 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 9263765c8068..5a2120d85347 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -1131,6 +1131,10 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
>  	return 0;
>  }
>  
> +static int tdp_mmu_split_huge_page_atomic(struct kvm_vcpu *vcpu,
> +					  struct tdp_iter *iter,
> +					  bool account_nx);
> +
>  /*
>   * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
>   * page tables and SPTEs to translate the faulting guest physical address.
> @@ -1140,6 +1144,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
>  	struct kvm_mmu *mmu = vcpu->arch.mmu;
>  	struct tdp_iter iter;
>  	struct kvm_mmu_page *sp;
> +	bool account_nx;
>  	int ret;
>  
>  	kvm_mmu_hugepage_adjust(vcpu, fault);
> @@ -1155,28 +1160,22 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
>  		if (iter.level == fault->goal_level)
>  			break;
>  
> +		account_nx = fault->huge_page_disallowed &&
> +			     fault->req_level >= iter.level;
> +
>  		/*
>  		 * If there is an SPTE mapping a large page at a higher level
> -		 * than the target, that SPTE must be cleared and replaced
> -		 * with a non-leaf SPTE.
> +		 * than the target, split it down one level.
>  		 */
>  		if (is_shadow_present_pte(iter.old_spte) &&
>  		    is_large_pte(iter.old_spte)) {
> -			if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
> +			if (tdp_mmu_split_huge_page_atomic(vcpu, &iter, account_nx))

As Ben brought up in patch 2, this conflicts in nasty ways with Mingwei's series
to more preciesly check sp->lpage_disallowed.  There's apparently a bug in that
code when using shadow paging, but assuming said bug isn't a blocking issue, I'd
prefer to land this on top of Mingwei's series.

With a bit of massaging, I think we can make the whole thing a bit more
straightforward.  This is what I ended up with (compile tested only, your patch 2
dropped, might split moving the "init" to a prep patch).   I'll give it a spin,
and assuming it works and Mingwei's bug is resolved, I'll post this and Mingwei's
series as a single thing.

---
 arch/x86/kvm/mmu/tdp_mmu.c | 99 ++++++++++++++++++--------------------
 1 file changed, 48 insertions(+), 51 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index f046af20f3d6..b0abf14570ea 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1126,6 +1126,9 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
 	return 0;
 }

+static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
+				   struct kvm_mmu_page *sp, bool shared);
+
 /*
  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
  * page tables and SPTEs to translate the faulting guest physical address.
@@ -1136,7 +1139,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 	struct kvm *kvm = vcpu->kvm;
 	struct tdp_iter iter;
 	struct kvm_mmu_page *sp;
-	int ret;
+	bool account_nx;
+	int ret, r;

 	kvm_mmu_hugepage_adjust(vcpu, fault);

@@ -1151,57 +1155,50 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 		if (iter.level == fault->goal_level)
 			break;

-		/*
-		 * If there is an SPTE mapping a large page at a higher level
-		 * than the target, that SPTE must be cleared and replaced
-		 * with a non-leaf SPTE.
-		 */
+		/* Nothing to do if there's already a shadow page installed. */
 		if (is_shadow_present_pte(iter.old_spte) &&
-		    is_large_pte(iter.old_spte)) {
-			if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
-				break;
-
-			/*
-			 * The iter must explicitly re-read the spte here
-			 * because the new value informs the !present
-			 * path below.
-			 */
-			iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep);
+		    !is_large_pte(iter.old_spte))
+			continue;
+
+		/*
+		 * If the SPTE has been frozen by another thread, just give up
+		 * and retry to avoid unnecessary page table alloc and free.
+		 */
+		if (is_removed_spte(iter.old_spte))
+			break;
+
+		/*
+		 * The SPTE is either invalid or points at a huge page that
+		 * needs to be split.
+		 */
+		sp = tdp_mmu_alloc_sp(vcpu);
+		tdp_mmu_init_child_sp(sp, &iter);
+
+		account_nx = fault->huge_page_disallowed &&
+			     fault->req_level >= iter.level;
+
+		sp->lpage_disallowed = account_nx;
+		/*
+		 * Ensure lpage_disallowed is visible before the SP is marked
+		 * present (or not-huge), as mmu_lock is held for read.  Pairs
+		 * with the smp_rmb() in disallowed_hugepage_adjust().
+		 */
+		smp_wmb();
+
+		if (!is_shadow_present_pte(iter.old_spte))
+			r = tdp_mmu_link_sp(kvm, &iter, sp, true);
+		else
+			r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
+
+		if (r) {
+			tdp_mmu_free_sp(sp);
+			break;
 		}

-		if (!is_shadow_present_pte(iter.old_spte)) {
-			bool account_nx = fault->huge_page_disallowed &&
-					  fault->req_level >= iter.level;
-
-			/*
-			 * If SPTE has been frozen by another thread, just
-			 * give up and retry, avoiding unnecessary page table
-			 * allocation and free.
-			 */
-			if (is_removed_spte(iter.old_spte))
-				break;
-
-			sp = tdp_mmu_alloc_sp(vcpu);
-			tdp_mmu_init_child_sp(sp, &iter);
-
-			sp->lpage_disallowed = account_nx;
-			/*
-			 * Ensure lpage_disallowed is visible before the SP is
-			 * marked present, as mmu_lock is held for read.  Pairs
-			 * with the smp_rmb() in disallowed_hugepage_adjust().
-			 */
-			smp_wmb();
-
-			if (tdp_mmu_link_sp(kvm, &iter, sp, true)) {
-				tdp_mmu_free_sp(sp);
-				break;
-			}
-
-			if (account_nx) {
-				spin_lock(&kvm->arch.tdp_mmu_pages_lock);
-				__account_huge_nx_page(kvm, sp);
-				spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
-			}
+		if (account_nx) {
+			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+			__account_huge_nx_page(kvm, sp);
+			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 		}
 	}

@@ -1472,8 +1469,6 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
 	const int level = iter->level;
 	int ret, i;

-	tdp_mmu_init_child_sp(sp, iter);
-
 	/*
 	 * No need for atomics when writing to sp->spt since the page table has
 	 * not been linked in yet and thus is not reachable from any other CPU.
@@ -1549,6 +1544,8 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
 				continue;
 		}

+		tdp_mmu_init_child_sp(sp, &iter);
+
 		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
 			goto retry;


base-commit: f06d9d4f3d89912c40c57da45d64b9827d8580ac
--