Re: [PATCH v10 016/108] KVM: TDX: create/destroy VM structure

Ackerley Tng <ackerleytng@xxxxxxxxxx> · Fri, 09 Dec 2022 11:15:35 -0800

In tdx_vm_init, it is possible to have a double-reclaim, which
eventually causes a host crash. I have a selftest that reliably
reproduces this, and I believe the problem is that withiin
tdx_vm_free(), we don't reset kvm->tdcs = NULL and kvm->tdr.added to
false.

+int tdx_vm_init(struct kvm *kvm)
+{
+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+	cpumask_var_t packages;
+	int ret, i;
+	u64 err;
+
+	ret = tdx_keyid_alloc();
+	if (ret < 0)
+		return ret;
+	kvm_tdx->hkid = ret;
+
+	ret = tdx_alloc_td_page(&kvm_tdx->tdr);
+	if (ret)
+		goto free_hkid;
+
+	kvm_tdx->tdcs = kcalloc(tdx_caps.tdcs_nr_pages, sizeof(*kvm_tdx->tdcs),
+				GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+	if (!kvm_tdx->tdcs)
+		goto free_tdr;
+	for (i = 0; i < tdx_caps.tdcs_nr_pages; i++) {
+		ret = tdx_alloc_td_page(&kvm_tdx->tdcs[i]);
+		if (ret)
+			goto free_tdcs;
+	}
+
+	if (!zalloc_cpumask_var(&packages, GFP_KERNEL)) {
+		ret = -ENOMEM;
+		goto free_tdcs;
+	}
+	cpus_read_lock();
+	/*
+	 * Need at least one CPU of the package to be online in order to
+	 * program all packages for host key id.  Check it.
+	 */
+	for_each_present_cpu(i)
+		cpumask_set_cpu(topology_physical_package_id(i), packages);
+	for_each_online_cpu(i)
+		cpumask_clear_cpu(topology_physical_package_id(i), packages);
+	if (!cpumask_empty(packages)) {
+		ret = -EIO;
+		/*
+		 * Because it's hard for human operator to figure out the
+		 * reason, warn it.
+		 */
+		pr_warn("All packages need to have online CPU to create TD. Online CPU  
and retry.\n");
+		goto free_packages;
+	}
+
+	/*
+	 * Acquire global lock to avoid TDX_OPERAND_BUSY:
+	 * TDH.MNG.CREATE and other APIs try to lock the global Key Owner
+	 * Table (KOT) to track the assigned TDX private HKID.  It doesn't spin
+	 * to acquire the lock, returns TDX_OPERAND_BUSY instead, and let the
+	 * caller to handle the contention.  This is because of time limitation
+	 * usable inside the TDX module and OS/VMM knows better about process
+	 * scheduling.
+	 *
+	 * APIs to acquire the lock of KOT:
+	 * TDH.MNG.CREATE, TDH.MNG.KEY.FREEID, TDH.MNG.VPFLUSHDONE, and
+	 * TDH.PHYMEM.CACHE.WB.
+	 */
+	mutex_lock(&tdx_lock);
+	err = tdh_mng_create(kvm_tdx->tdr.pa, kvm_tdx->hkid);
+	mutex_unlock(&tdx_lock);
+	if (WARN_ON_ONCE(err)) {
+		pr_tdx_error(TDH_MNG_CREATE, err, NULL);
+		ret = -EIO;
+		goto free_packages;
+	}
+	tdx_mark_td_page_added(&kvm_tdx->tdr);
+
+	for_each_online_cpu(i) {
+		int pkg = topology_physical_package_id(i);
+
+		if (cpumask_test_and_set_cpu(pkg, packages))
+			continue;
+
+		/*
+		 * Program the memory controller in the package with an
+		 * encryption key associated to a TDX private host key id
+		 * assigned to this TDR.  Concurrent operations on same memory
+		 * controller results in TDX_OPERAND_BUSY.  Avoid this race by
+		 * mutex.
+		 */
+		mutex_lock(&tdx_mng_key_config_lock[pkg]);
+		ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
+				      &kvm_tdx->tdr.pa, true);
+		mutex_unlock(&tdx_mng_key_config_lock[pkg]);
+		if (ret)
+			break;
+	}
+	cpus_read_unlock();
+	free_cpumask_var(packages);
+	if (ret)
+		goto teardown;
+
+	for (i = 0; i < tdx_caps.tdcs_nr_pages; i++) {
+		err = tdh_mng_addcx(kvm_tdx->tdr.pa, kvm_tdx->tdcs[i].pa);
+		if (WARN_ON_ONCE(err)) {
+			pr_tdx_error(TDH_MNG_ADDCX, err, NULL);
+			ret = -EIO;
+			goto teardown;
+		}
+		tdx_mark_td_page_added(&kvm_tdx->tdcs[i]);
+	}
+
+	/*
+	 * Note, TDH_MNG_INIT cannot be invoked here.  TDH_MNG_INIT requires a  
dedicated
+	 * ioctl() to define the configure CPUID values for the TD.
+	 */
+	return 0;
+
+	/*
+	 * The sequence for freeing resources from a partially initialized TD
+	 * varies based on where in the initialization flow failure occurred.
+	 * Simply use the full teardown and destroy, which naturally play nice
+	 * with partial initialization.
+	 */
+teardown:
+	tdx_mmu_release_hkid(kvm);
+	tdx_vm_free(kvm);
+	return ret;

If there is some error that causes an exit through teardown,
tdx_vm_free() will be called, which causes the resources to be
freed. However, tdx_vm_free() is called a second time when the selftest
(or qemu) exits, which causes a second reclaim to be performed.

+
+free_packages:
+	cpus_read_unlock();
+	free_cpumask_var(packages);
+free_tdcs:
+	for (i = 0; i < tdx_caps.tdcs_nr_pages; i++) {
+		if (!kvm_tdx->tdcs[i].va)
+			continue;
+		free_page(kvm_tdx->tdcs[i].va);
+	}
+	kfree(kvm_tdx->tdcs);
+	kvm_tdx->tdcs = NULL;
+free_tdr:
+	if (kvm_tdx->tdr.va) {
+		free_page(kvm_tdx->tdr.va);
+		kvm_tdx->tdr.added = false;
+		kvm_tdx->tdr.va = 0;
+		kvm_tdx->tdr.pa = 0;
+	}
+free_hkid:
+	if (kvm_tdx->hkid != -1)
+		tdx_hkid_free(kvm_tdx);
+	return ret;
+}

The second reclaim is performed because kvm_tdx->tdcs is still set, and
kvm_tdx->tdr.added is still set, so the second two if blocks in
tdx_vm_free() are executed.

+void tdx_vm_free(struct kvm *kvm)
+{
+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+	int i;
+
+	/* Can't reclaim or free TD pages if teardown failed. */
+	if (is_hkid_assigned(kvm_tdx))
+		return;
+
+	if (kvm_tdx->tdcs) {
+		for (i = 0; i < tdx_caps.tdcs_nr_pages; i++)
+			tdx_reclaim_td_page(&kvm_tdx->tdcs[i]);
+		kfree(kvm_tdx->tdcs);
+	}
+
+	/*
+	 * TDX module maps TDR with TDX global HKID.  TDX module may access TDR
+	 * while operating on TD (Especially reclaiming TDCS).  Cache flush with
+	 * TDX global HKID is needed.
+	 */
+	if (kvm_tdx->tdr.added &&
+		tdx_reclaim_page(kvm_tdx->tdr.va, kvm_tdx->tdr.pa, true,
+				tdx_global_keyid))
+		return;
+
+	free_page(kvm_tdx->tdr.va);
+}

Here's the fix that stopped the crash I was observing

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 2e7916fb72a7..41d1ff1510c3 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -405,6 +405,7 @@ void tdx_vm_free(struct kvm *kvm)
                for (i = 0; i < tdx_caps.tdcs_nr_pages; i++)
                        tdx_reclaim_td_page(&kvm_tdx->tdcs[i]);
                kfree(kvm_tdx->tdcs);
+               kvm_tdx->tdcs = NULL;
        }

        /*
@@ -418,6 +419,9 @@ void tdx_vm_free(struct kvm *kvm)
                return;

        free_page(kvm_tdx->tdr.va);
+       kvm_tdx->tdr.added = false;
+       kvm_tdx->tdr.va = 0;
+       kvm_tdx->tdr.pa = 0;
 }

 static int tdx_do_tdh_mng_key_config(void *param)