+
struct kvm_x86_ops vt_x86_ops __initdata = {
.name = "kvm_intel",
@@ -58,6 +70,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.is_vm_type_supported = vt_is_vm_type_supported,
.vm_size = sizeof(struct kvm_vmx),
.vm_init = vt_vm_init,
+ .mmu_prezap = vt_mmu_prezap,
+ .vm_free = vt_vm_free,
.vcpu_create = vmx_vcpu_create,
.vcpu_free = vmx_vcpu_free,
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 1c8222f54764..702953fd365f 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -31,14 +31,324 @@ struct tdx_capabilities {
struct tdx_cpuid_config cpuid_configs[TDX_MAX_NR_CPUID_CONFIGS];
};
+/* The TDX module's global KeyID, used e.g. to encrypt TDR pages. */
+static u32 tdx_global_keyid __read_mostly;
+
/* Capabilities of KVM + the TDX module. */
struct tdx_capabilities tdx_caps;
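+/*
+ * Serializes SEAMCALLs that contend on TDX-module-global state,
+ * e.g. TDH.MNG.CREATE, TDH.MNG.KEY.FREEID and TDH.PHYMEM.CACHE.WB.
+ */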
+static DEFINE_MUTEX(tdx_lock);
static struct mutex *tdx_mng_key_config_lock;
static u64 hkid_mask __ro_after_init;
static u8 hkid_start_pos __ro_after_init;
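+/*
+ * A TDX KeyID sits in the high physical address bits.  Encode @hkid into
+ * @pa for SEAMCALLs, e.g. TDH.PHYMEM.PAGE.WBINVD, that target a specific
+ * KeyID.
+ */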
+static __always_inline hpa_t set_hkid_to_hpa(hpa_t pa, u16 hkid)
+{
+ pa &= ~hkid_mask;
+ pa |= (u64)hkid << hkid_start_pos;
+
+ return pa;
+}
+
+static inline bool is_td_created(struct kvm_tdx *kvm_tdx)
+{
+ return kvm_tdx->tdr.added;
+}
+
+static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx)
+{
+ tdx_keyid_free(kvm_tdx->hkid);
+ kvm_tdx->hkid = -1;
+}
+
+static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
+{
+ return kvm_tdx->hkid > 0;
+}
+
+static void tdx_clear_page(unsigned long page)
+{
+ const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
+ unsigned long i;
+
+ /*
+ * Zeroing the page is only necessary for systems with MKTME-i:
+ * reading a page with a new KeyID before it has been rewritten
+ * triggers an integrity error, so scrub it with MOVDIR64B, which
+ * writes whole cache lines with the current KeyID.
+ */
+ if (!static_cpu_has(X86_FEATURE_MOVDIR64B))
+ return;
+
+ for (i = 0; i < 4096; i += 64)
+ /* MOVDIR64B [rdx], es:rdi */
+ asm (".byte 0x66, 0x0f, 0x38, 0xf8, 0x3a"
+ : : "d" (zero_page), "D" (page + i) : "memory");
+}
+
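+/*
+ * Reclaim a page from the TDX module: TDH.PHYMEM.PAGE.RECLAIM removes the
+ * page from the module's tracking, TDH.PHYMEM.PAGE.WBINVD (if @do_wb)
+ * flushes cache lines dirtied with @hkid, and tdx_clear_page() scrubs the
+ * page before it is returned to the kernel allocator.
+ */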
+static int __tdx_reclaim_page(unsigned long va, hpa_t pa, bool do_wb, u16 hkid)
+{
+ struct tdx_module_output out;
+ u64 err;
+
+ err = tdh_phymem_page_reclaim(pa, &out);
+ if (WARN_ON_ONCE(err)) {
+ pr_tdx_error(TDH_PHYMEM_PAGE_RECLAIM, err, &out);
+ return -EIO;
+ }
+
+ if (do_wb) {
+ err = tdh_phymem_page_wbinvd(set_hkid_to_hpa(pa, hkid));
+ if (WARN_ON_ONCE(err)) {
+ pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err, NULL);
+ return -EIO;
+ }
+ }
+
+ tdx_clear_page(va);
+ return 0;
+}
+
+static int tdx_reclaim_page(unsigned long va, hpa_t pa)
+{
+ return __tdx_reclaim_page(va, pa, false, 0);
+}
+
+static int tdx_alloc_td_page(struct tdx_td_page *page)
+{
+ page->va = __get_free_page(GFP_KERNEL_ACCOUNT);
+ if (!page->va)
+ return -ENOMEM;
+
+ page->pa = __pa(page->va);
+ return 0;
+}
+
+static void tdx_mark_td_page_added(struct tdx_td_page *page)
+{
+ WARN_ON_ONCE(page->added);
+ page->added = true;
+}
+
+static void tdx_reclaim_td_page(struct tdx_td_page *page)
+{
+ if (page->added) {
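+ /*
+ * Intentionally leak the page on reclaim failure: freeing a
+ * page that the TDX module still considers private would let
+ * the kernel reuse memory it must not touch.
+ */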
+ if (tdx_reclaim_page(page->va, page->pa))
+ return;
+
+ page->added = false;
+ }
+ free_page(page->va);
+}
+
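+/*
+ * TDH.PHYMEM.CACHE.WB writes back caches associated with TDX private
+ * KeyIDs.  It operates on the package of the executing CPU, hence the
+ * per-package smp_call_on_cpu() in tdx_mmu_prezap() below.
+ */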
+static int tdx_do_tdh_phymem_cache_wb(void *param)
+{
+ u64 err = 0;
+
+ /*
+ * Multiple guest TDs may be destroyed simultaneously.  Serialize the
+ * calls to prevent tdh_phymem_cache_wb() from returning TDX_BUSY.
+ */
+ mutex_lock(&tdx_lock);
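+ /*
+ * TDH.PHYMEM.CACHE.WB is interruptible.  The boolean argument tells
+ * the TDX module whether to resume a previously interrupted write
+ * back, hence !!err: false on the first call, true on retries.
+ */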
+ do {
+ err = tdh_phymem_cache_wb(!!err);
+ } while (err == TDX_INTERRUPTED_RESUMABLE);
+ mutex_unlock(&tdx_lock);
+
+ /* Another thread may have already done the write back for us. */
+ if (err == TDX_NO_HKID_READY_TO_WBCACHE)
+ err = TDX_SUCCESS;
+ if (WARN_ON_ONCE(err)) {
+ pr_tdx_error(TDH_PHYMEM_CACHE_WB, err, NULL);
+ return -EIO;
+ }
+
+ return 0;
+}
+
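+/*
+ * Tear down the TD's KeyID: mark it for reclaim (TDH.MNG.KEY.RECLAIMID),
+ * write back caches on one CPU of every package (TDH.PHYMEM.CACHE.WB),
+ * then release the KeyID for reuse (TDH.MNG.KEY.FREEID).
+ */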
+void tdx_mmu_prezap(struct kvm *kvm)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ cpumask_var_t packages;
+ bool cpumask_allocated;
+ u64 err;
+ int ret;
+ int i;
+
+ if (!is_hkid_assigned(kvm_tdx))
+ return;
+
+ if (!is_td_created(kvm_tdx))
+ goto free_hkid;
+
+ mutex_lock(&tdx_lock);
+ err = tdh_mng_key_reclaimid(kvm_tdx->tdr.pa);
+ mutex_unlock(&tdx_lock);
+ if (WARN_ON_ONCE(err)) {
+ pr_tdx_error(TDH_MNG_KEY_RECLAIMID, err, NULL);
+ return;
+ }
+
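+ /*
+ * The write back is needed only once per package.  If the cpumask
+ * allocation fails, fall back to issuing it on every online CPU;
+ * redundant write backs are wasteful but harmless.
+ */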
+ cpumask_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
+ for_each_online_cpu(i) {
+ if (cpumask_allocated &&
+ cpumask_test_and_set_cpu(topology_physical_package_id(i),
+ packages))
+ continue;
+
+ ret = smp_call_on_cpu(i, tdx_do_tdh_phymem_cache_wb, NULL, 1);
+ if (ret)
+ break;
+ }
+ free_cpumask_var(packages);
+
+ mutex_lock(&tdx_lock);
+ err = tdh_mng_key_freeid(kvm_tdx->tdr.pa);
+ mutex_unlock(&tdx_lock);
+ if (WARN_ON_ONCE(err)) {
+ pr_tdx_error(TDH_MNG_KEY_FREEID, err, NULL);
+ return;
+ }
+
+free_hkid:
+ tdx_hkid_free(kvm_tdx);
+}
+
+void tdx_vm_free(struct kvm *kvm)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ int i;
+
+ /* Can't reclaim or free TD pages if teardown failed. */
+ if (is_hkid_assigned(kvm_tdx))
+ return;
+
+ for (i = 0; i < tdx_caps.tdcs_nr_pages; i++)
+ tdx_reclaim_td_page(&kvm_tdx->tdcs[i]);
+ kfree(kvm_tdx->tdcs);
+
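+ /*
+ * The TDX module encrypts the TDR page with its global KeyID, so
+ * unlike the TDCS pages above, reclaiming the TDR also requires a
+ * write back and invalidate with tdx_global_keyid.
+ */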
+ if (kvm_tdx->tdr.added &&
+ __tdx_reclaim_page(kvm_tdx->tdr.va, kvm_tdx->tdr.pa, true,
+ tdx_global_keyid))
+ return;
+
+ free_page(kvm_tdx->tdr.va);
+}
+
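+/*
+ * TDH.MNG.KEY.CONFIG programs the TD's KeyID on the package of the
+ * executing CPU.  Key generation can fail transiently (e.g. for lack of
+ * entropy), in which case TDX_KEY_GENERATION_FAILED is simply retried.
+ */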
+static int tdx_do_tdh_mng_key_config(void *param)
+{
+ hpa_t *tdr_p = param;
+ int cpu, cur_pkg;
+ u64 err;
+
+ cpu = raw_smp_processor_id();
+ cur_pkg = topology_physical_package_id(cpu);
+
+ mutex_lock(&tdx_mng_key_config_lock[cur_pkg]);
+ do {
+ err = tdh_mng_key_config(*tdr_p);
+ } while (err == TDX_KEY_GENERATION_FAILED);
+ mutex_unlock(&tdx_mng_key_config_lock[cur_pkg]);
+
+ if (WARN_ON_ONCE(err)) {
+ pr_tdx_error(TDH_MNG_KEY_CONFIG, err, NULL);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+int tdx_vm_init(struct kvm *kvm)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ cpumask_var_t packages;
+ int ret, i;
+ u64 err;
+
+ /* vCPUs can't be created until after KVM_TDX_INIT_VM. */
+ kvm->max_vcpus = 0;
+
+ kvm_tdx->hkid = tdx_keyid_alloc();
+ if (kvm_tdx->hkid < 0)
+ return -EBUSY;
+
+ ret = tdx_alloc_td_page(&kvm_tdx->tdr);
+ if (ret)
+ goto free_hkid;
+
+ kvm_tdx->tdcs = kcalloc(tdx_caps.tdcs_nr_pages, sizeof(*kvm_tdx->tdcs),
+ GFP_KERNEL_ACCOUNT);
+ if (!kvm_tdx->tdcs) {
+ ret = -ENOMEM;
+ goto free_tdr;
+ }
+ for (i = 0; i < tdx_caps.tdcs_nr_pages; i++) {
+ ret = tdx_alloc_td_page(&kvm_tdx->tdcs[i]);
+ if (ret)
+ goto free_tdcs;
+ }
+
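+ /*
+ * TDH.MNG.CREATE associates the HKID with the TD and converts the
+ * TDR page to TDX module ownership.  From this point on, failures
+ * must use the full teardown path rather than simply freeing pages.
+ */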
+ mutex_lock(&tdx_lock);
+ err = tdh_mng_create(kvm_tdx->tdr.pa, kvm_tdx->hkid);
+ mutex_unlock(&tdx_lock);
+ if (WARN_ON_ONCE(err)) {
+ pr_tdx_error(TDH_MNG_CREATE, err, NULL);
+ ret = -EIO;
+ goto free_tdcs;
+ }
+ tdx_mark_td_page_added(&kvm_tdx->tdr);
+
+ if (!zalloc_cpumask_var(&packages, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto teardown;
+ }
+ for_each_online_cpu(i) {
+ if (cpumask_test_and_set_cpu(topology_physical_package_id(i),
+ packages))
+ continue;
+
+ ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
+ &kvm_tdx->tdr.pa, 1);
+ if (ret)
+ break;
+ }
+ free_cpumask_var(packages);
+ if (ret)
+ goto teardown;
+
+ for (i = 0; i < tdx_caps.tdcs_nr_pages; i++) {
+ err = tdh_mng_addcx(kvm_tdx->tdr.pa, kvm_tdx->tdcs[i].pa);
+ if (WARN_ON_ONCE(err)) {
+ pr_tdx_error(TDH_MNG_ADDCX, err, NULL);
+ ret = -EIO;
+ goto teardown;
+ }
+ tdx_mark_td_page_added(&kvm_tdx->tdcs[i]);
+ }
+
+ /*
+ * Note, TDH_MNG_INIT cannot be invoked here.  TDH_MNG_INIT requires
+ * a dedicated ioctl() to configure the CPUID values for the TD.
+ */
+ return 0;
+
+ /*
+ * The sequence for freeing resources from a partially initialized TD
+ * varies based on where in the initialization flow failure occurred.
+ * Simply use the full teardown and destroy, which naturally play nice
+ * with partial initialization.
+ */
+teardown:
+ tdx_mmu_prezap(kvm);
+ tdx_vm_free(kvm);
+ return ret;
+
+free_tdcs:
+ /* @i points one past the last successfully allocated TDCS page. */
+ for (--i; i >= 0; i--)
+ free_page(kvm_tdx->tdcs[i].va);
+ kfree(kvm_tdx->tdcs);
+free_tdr:
+ free_page(kvm_tdx->tdr.va);
+free_hkid:
+ tdx_hkid_free(kvm_tdx);
+ return ret;
+}
+
static int __tdx_module_setup(void)
{
const struct tdsysinfo_struct *tdsysinfo;
@@ -59,6 +369,8 @@ static int __tdx_module_setup(void)
return ret;
}
+ tdx_global_keyid = tdx_get_global_keyid();
+
tdsysinfo = tdx_get_sysinfo();
if (tdx_caps.nr_cpuid_configs > TDX_MAX_NR_CPUID_CONFIGS)
return -EIO;
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index e4bb8831764e..860136ed70f5 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -19,6 +19,8 @@ struct kvm_tdx {
struct tdx_td_page tdr;
struct tdx_td_page *tdcs;
+
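+ /* TDX private host KeyID (HKID) assigned to this TD. */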
+ int hkid;
};
struct vcpu_tdx {
diff --git a/arch/x86/kvm/vmx/tdx_errno.h b/arch/x86/kvm/vmx/tdx_errno.h
index 5c878488795d..590fcfdd1899 100644
--- a/arch/x86/kvm/vmx/tdx_errno.h
+++ b/arch/x86/kvm/vmx/tdx_errno.h
@@ -12,11 +12,11 @@
#define TDX_SUCCESS 0x0000000000000000ULL
#define TDX_NON_RECOVERABLE_VCPU 0x4000000100000000ULL
#define TDX_INTERRUPTED_RESUMABLE 0x8000000300000000ULL
-#define TDX_LIFECYCLE_STATE_INCORRECT 0xC000060700000000ULL
#define TDX_VCPU_NOT_ASSOCIATED 0x8000070200000000ULL
#define TDX_KEY_GENERATION_FAILED 0x8000080000000000ULL
#define TDX_KEY_STATE_INCORRECT 0xC000081100000000ULL
#define TDX_KEY_CONFIGURED 0x0000081500000000ULL
+#define TDX_NO_HKID_READY_TO_WBCACHE 0x0000082100000000ULL
#define TDX_EPT_WALK_FAILED 0xC0000B0000000000ULL
/*
diff --git a/arch/x86/kvm/vmx/tdx_ops.h b/arch/x86/kvm/vmx/tdx_ops.h
index 0bed43879b82..3dd5b4c3f04c 100644
--- a/arch/x86/kvm/vmx/tdx_ops.h
+++ b/arch/x86/kvm/vmx/tdx_ops.h
@@ -6,6 +6,7 @@
#include <linux/compiler.h>
+#include <asm/cacheflush.h>
#include <asm/asm.h>
#include <asm/kvm_host.h>
@@ -15,8 +16,14 @@
#ifdef CONFIG_INTEL_TDX_HOST
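+/*
+ * Flush cache lines written with the host KeyID before handing a page to
+ * the TDX module; otherwise a later write back of stale host-KeyID lines
+ * could corrupt the page after the module has taken ownership of it.
+ */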
+static inline void tdx_clflush_page(hpa_t addr)
+{
+ clflush_cache_range(__va(addr), PAGE_SIZE);
+}
+
static inline u64 tdh_mng_addcx(hpa_t tdr, hpa_t addr)
{
+ tdx_clflush_page(addr);
return kvm_seamcall(TDH_MNG_ADDCX, addr, tdr, 0, 0, 0, NULL);
}
@@ -56,6 +63,7 @@ static inline u64 tdh_mng_key_config(hpa_t tdr)
static inline u64 tdh_mng_create(hpa_t tdr, int hkid)
{
+ tdx_clflush_page(tdr);
return kvm_seamcall(TDH_MNG_CREATE, tdr, hkid, 0, 0, 0, NULL);
}
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index da32b4b86b19..2b2738c768d6 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -132,12 +132,20 @@ void __init tdx_pre_kvm_init(unsigned int *vcpu_size,
bool tdx_is_vm_type_supported(unsigned long type);
void __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops);
void tdx_hardware_unsetup(void);
+
+int tdx_vm_init(struct kvm *kvm);
+void tdx_mmu_prezap(struct kvm *kvm);
+void tdx_vm_free(struct kvm *kvm);
#else
static inline void tdx_pre_kvm_init(
unsigned int *vcpu_size, unsigned int *vcpu_align, unsigned int *vm_size) {}
static inline bool tdx_is_vm_type_supported(unsigned long type) { return false; }
static inline void tdx_hardware_setup(struct kvm_x86_ops *x86_ops) {}
static inline void tdx_hardware_unsetup(void) {}
+
+static inline int tdx_vm_init(struct kvm *kvm) { return -EOPNOTSUPP; }
+static inline void tdx_mmu_prezap(struct kvm *kvm) {}
+static inline void tdx_vm_free(struct kvm *kvm) {}
#endif
#endif /* __KVM_X86_VMX_X86_OPS_H */