Re: [PATCH 14/21] KVM: TDX: Implement hooks to propagate changes of TDP MMU mirror page table

"Huang, Kai" <kai.huang@xxxxxxxxx> · Fri, 6 Sep 2024 14:10:03 +1200

--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -36,9 +36,21 @@ static __init int vt_hardware_setup(void)
  	 * is KVM may allocate couple of more bytes than needed for
  	 * each VM.
  	 */
-	if (enable_tdx)
+	if (enable_tdx) {
  		vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size,
  				sizeof(struct kvm_tdx));
+		/*
+		 * Note, TDX may fail to initialize in a later time in
+		 * vt_init(), in which case it is not necessary to setup
+		 * those callbacks.  But making them valid here even
+		 * when TDX fails to init later is fine because those
+		 * callbacks won't be called if the VM isn't TDX guest.
+		 */
+		vt_x86_ops.link_external_spt = tdx_sept_link_private_spt;
+		vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
+		vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
+		vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;

Nit:  The callbacks in 'struct kvm_x86_ops' have name "external", but 
TDX callbacks have name "private".  Should we rename TDX callbacks to 
make them aligned?

+	}
  
  	return 0;
  }

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 6feb3ab96926..b8cd5a629a80 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -447,6 +447,177 @@ void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
  	td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
  }
  
+static void tdx_unpin(struct kvm *kvm, kvm_pfn_t pfn)
+{
+	struct page *page = pfn_to_page(pfn);
+
+	put_page(page);
+}
+
+static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
+			    enum pg_level level, kvm_pfn_t pfn)
+{
+	int tdx_level = pg_level_to_tdx_sept_level(level);
+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+	hpa_t hpa = pfn_to_hpa(pfn);
+	gpa_t gpa = gfn_to_gpa(gfn);
+	u64 entry, level_state;
+	u64 err;
+
+	err = tdh_mem_page_aug(kvm_tdx, gpa, hpa, &entry, &level_state);
+	if (unlikely(err == TDX_ERROR_SEPT_BUSY)) {
+		tdx_unpin(kvm, pfn);
+		return -EAGAIN;
+	}

Nit: Here (and other non-fatal error cases) I think we should return 
-EBUSY to make it consistent with non-TDX case?  E.g., the non-TDX case has:

                if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
                        return -EBUSY;

And the comment of tdp_mmu_set_spte_atomic() currently says it can only 
return 0 or -EBUSY.  It needs to be patched to reflect it can also 
return other non-0 errors like -EIO but those are fatal.  In terms of 
non-fatal error I don't think we need another -EAGAIN.

/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically

[...]

 * Return:
 * * 0      - If the SPTE was set.
 * * -EBUSY - If the SPTE cannot be set. In this case this function will
 *	      have no side-effects other than setting iter->old_spte to
 *            the last known value of the spte.
 */

[...]

+
+static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
+				      enum pg_level level, kvm_pfn_t pfn)
+{

[...]

+
+	hpa_with_hkid = set_hkid_to_hpa(hpa, (u16)kvm_tdx->hkid);
+	do {
+		/*
+		 * TDX_OPERAND_BUSY can happen on locking PAMT entry.  Because
+		 * this page was removed above, other thread shouldn't be
+		 * repeatedly operating on this page.  Just retry loop.
+		 */
+		err = tdh_phymem_page_wbinvd(hpa_with_hkid);
+	} while (unlikely(err == (TDX_OPERAND_BUSY | TDX_OPERAND_ID_RCX)));

In what case(s) other threads can concurrently lock the PAMT entry, 
leading to the above BUSY?

[...]

+
+int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
+				 enum pg_level level, kvm_pfn_t pfn)
+{
+	int ret;
+
+	/*
+	 * HKID is released when vm_free() which is after closing gmem_fd

From latest dev branch HKID is freed from vt_vm_destroy(), but not 
vm_free() (which should be tdx_vm_free() btw).

static void vt_vm_destroy(struct kvm *kvm)
{
        if (is_td(kvm))
                return tdx_mmu_release_hkid(kvm);

        vmx_vm_destroy(kvm);
}

Btw, why not have a tdx_vm_destroy() wrapper?  Seems all other vt_xx()s 
have a tdx_xx() but only this one calls tdx_mmu_release_hkid() directly.

+	 * which causes gmem invalidation to zap all spte.
+	 * Population is only allowed after KVM_TDX_INIT_VM.
+	 */

What does the second sentence ("Population ...")  meaning?  Why is it 
relevant here?