Re: [PATCH v19 062/130] KVM: x86/tdp_mmu: Support TDX private mapping for TDP MMU

Chao Gao <chao.gao@xxxxxxxxx> · Wed, 27 Mar 2024 21:07:21 +0800

>--- a/arch/x86/kvm/mmu/mmu.c
>+++ b/arch/x86/kvm/mmu/mmu.c
>@@ -3717,7 +3717,12 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
> 		goto out_unlock;
> 
> 	if (tdp_mmu_enabled) {
>-		root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
>+		if (kvm_gfn_shared_mask(vcpu->kvm) &&
>+		    !VALID_PAGE(mmu->private_root_hpa)) {
>+			root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu, true);
>+			mmu->private_root_hpa = root;

just
			mmu->private_root_hpa =
				kvm_tdp_mmu_get_vcpu_root_hpa(vcpu, true);
>+		}
>+		root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu, false);
> 		mmu->root.hpa = root;

ditto

> 	} else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
> 		root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level);
>@@ -4627,7 +4632,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> 	if (kvm_mmu_honors_guest_mtrrs(vcpu->kvm)) {
> 		for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) {
> 			int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
>-			gfn_t base = gfn_round_for_level(fault->gfn,
>+			gfn_t base = gfn_round_for_level(gpa_to_gfn(fault->addr),
> 							 fault->max_level);

...

> 
> 			if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
>@@ -4662,6 +4667,7 @@ int kvm_mmu_map_tdp_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
> 	};
> 
> 	WARN_ON_ONCE(!vcpu->arch.mmu->root_role.direct);
>+	fault.gfn = gpa_to_gfn(fault.addr) & ~kvm_gfn_shared_mask(vcpu->kvm);

Could you clarify when shared bits need to be masked out or kept? shared bits
are masked out here but kept in the hunk right above and ..

>+++ b/arch/x86/kvm/mmu/tdp_iter.h
>@@ -91,7 +91,7 @@ struct tdp_iter {
> 	tdp_ptep_t pt_path[PT64_ROOT_MAX_LEVEL];
> 	/* A pointer to the current SPTE */
> 	tdp_ptep_t sptep;
>-	/* The lowest GFN mapped by the current SPTE */
>+	/* The lowest GFN (shared bits included) mapped by the current SPTE */
> 	gfn_t gfn;

.. in @gfn of tdp_iter.

> 	/* The level of the root page given to the iterator */
> 	int root_level;

> 
>-hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
>+static struct kvm_mmu_page *kvm_tdp_mmu_get_vcpu_root(struct kvm_vcpu *vcpu,

Maybe fold it into its sole caller.

>+						      bool private)
> {
> 	union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
> 	struct kvm *kvm = vcpu->kvm;
>@@ -221,6 +225,8 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
> 	 * Check for an existing root before allocating a new one.  Note, the
> 	 * role check prevents consuming an invalid root.
> 	 */
>+	if (private)
>+		kvm_mmu_page_role_set_private(&role);
> 	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
> 		if (root->role.word == role.word &&
> 		    kvm_tdp_mmu_get_root(root))
>@@ -244,12 +250,17 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
> 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
> 
> out:
>-	return __pa(root->spt);
>+	return root;
>+}
>+
>+hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu, bool private)
>+{
>+	return __pa(kvm_tdp_mmu_get_vcpu_root(vcpu, private)->spt);
> }
> 
> static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
>-				u64 old_spte, u64 new_spte, int level,
>-				bool shared);
>+				u64 old_spte, u64 new_spte,
>+				union kvm_mmu_page_role role, bool shared);
> 
> static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
> {
>@@ -376,12 +387,78 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
> 							  REMOVED_SPTE, level);
> 		}
> 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
>-				    old_spte, REMOVED_SPTE, level, shared);
>+				    old_spte, REMOVED_SPTE, sp->role,
>+				    shared);
>+	}
>+
>+	if (is_private_sp(sp) &&
>+	    WARN_ON(static_call(kvm_x86_free_private_spt)(kvm, sp->gfn, sp->role.level,

WARN_ON_ONCE()?

>+							  kvm_mmu_private_spt(sp)))) {
>+		/*
>+		 * Failed to unlink Secure EPT page and there is nothing to do
>+		 * further.  Intentionally leak the page to prevent the kernel
>+		 * from accessing the encrypted page.
>+		 */
>+		kvm_mmu_init_private_spt(sp, NULL);
> 	}
> 
> 	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
> }
> 

> 	rcu_read_lock();
> 
> 	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
>@@ -960,10 +1158,26 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
> 
> 	if (unlikely(!fault->slot))
> 		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
>-	else
>-		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
>-					 fault->pfn, iter->old_spte, fault->prefetch, true,
>-					 fault->map_writable, &new_spte);
>+	else {
>+		unsigned long pte_access = ACC_ALL;
>+		gfn_t gfn = iter->gfn;
>+
>+		if (kvm_gfn_shared_mask(vcpu->kvm)) {
>+			if (fault->is_private)
>+				gfn |= kvm_gfn_shared_mask(vcpu->kvm);

this is an open-coded kvm_gfn_to_shared().

I don't get why a spte is installed for a shared gfn when fault->is_private
is true. could you elaborate?

>+			else
>+				/*
>+				 * TDX shared GPAs are no executable, enforce
>+				 * this for the SDV.
>+				 */

what do you mean by the SDV?

>+				pte_access &= ~ACC_EXEC_MASK;
>+		}
>+
>+		wrprot = make_spte(vcpu, sp, fault->slot, pte_access, gfn,
>+				   fault->pfn, iter->old_spte,
>+				   fault->prefetch, true, fault->map_writable,
>+				   &new_spte);
>+	}
> 
> 	if (new_spte == iter->old_spte)
> 		ret = RET_PF_SPURIOUS;
>@@ -1041,6 +1255,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> 	struct kvm *kvm = vcpu->kvm;
> 	struct tdp_iter iter;
> 	struct kvm_mmu_page *sp;
>+	gfn_t raw_gfn;
>+	bool is_private = fault->is_private && kvm_gfn_shared_mask(kvm);
> 	int ret = RET_PF_RETRY;
> 
> 	kvm_mmu_hugepage_adjust(vcpu, fault);
>@@ -1049,7 +1265,17 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> 
> 	rcu_read_lock();
> 
>-	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
>+	raw_gfn = gpa_to_gfn(fault->addr);
>+
>+	if (is_error_noslot_pfn(fault->pfn) ||
>+	    !kvm_pfn_to_refcounted_page(fault->pfn)) {
>+		if (is_private) {
>+			rcu_read_unlock();
>+			return -EFAULT;

This needs a comment. why this check is necessary? does this imply some
kernel bugs?