From: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>

Introduce a new VM-scoped KVM_MEMORY_ENCRYPT_OP IOCTL subcommand,
KVM_TDX_FINALIZE_VM, to perform TD Measurement Finalization.

The API documentation is provided in a separate patch:
"Documentation/virt/kvm: Document on Trust Domain Extensions (TDX)".

Enhance TDX's set_external_spte() hook to record the pre-mapping count
instead of returning without action when the TD is not finalized.

Adjust the pre-mapping count when pages are added or when the mapping is
dropped.

Set pre_fault_allowed to true after finalization is complete.

Note: TD Measurement Finalization is the process by which the initial
state of the TDX VM is measured for attestation purposes. It uses the
SEAMCALL TDH.MR.FINALIZE, after which:
1. The VMM can no longer add TD private pages with arbitrary content.
2. The TDX VM becomes runnable.

Signed-off-by: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
Co-developed-by: Adrian Hunter <adrian.hunter@xxxxxxxxx>
Signed-off-by: Adrian Hunter <adrian.hunter@xxxxxxxxx>
Co-developed-by: Rick Edgecombe <rick.p.edgecombe@xxxxxxxxx>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@xxxxxxxxx>
Signed-off-by: Yan Zhao <yan.y.zhao@xxxxxxxxx>
---
TDX MMU part 2 v2:
 - Merge changes from patch "KVM: TDX: Premap initial guest memory" into
   this patch (Paolo)
 - Consolidate nr_premapped counting into this patch (Paolo)
 - Page level check should be (and is) in tdx_sept_set_private_spte() in
   patch "KVM: TDX: Implement hooks to propagate changes of TDP MMU mirror
   page table", not in tdx_mem_page_record_premap_cnt() (Paolo)
 - Protect finalization using kvm->slots_lock (Paolo)
 - Set kvm->arch.pre_fault_allowed to true after finalization is done
   (Paolo)
 - Add a memory barrier to ensure correct ordering of the updates to
   kvm_tdx->finalized and kvm->arch.pre_fault_allowed (Adrian)
 - pre_fault_allowed must not be true before finalization is done.
   Highlight that fact by checking it in tdx_mem_page_record_premap_cnt()
   (Adrian)
 - No need for is_td_finalized() (Rick)
 - Fixup SEAMCALL call sites due to function parameter changes to the
   SEAMCALL wrappers (Kai)
 - Add nr_premapped where it's first used (Tao)

TDX MMU part 2 v1:
 - Added premapped check.
 - Update for the wrapper functions for SEAMCALLs. (Sean)
 - Add check if nr_premapped is zero. If not, return error.
 - Use KVM_BUG_ON() in tdx_td_finalizer() for consistency.
 - Change tdx_td_finalizemr() to take struct kvm_tdx_cmd *cmd and return
   error (Adrian)
 - Handle TDX_OPERAND_BUSY case (Adrian)
 - Updates from seamcall overhaul (Kai)
 - Rename error->hw_error

v18:
 - Remove the change of tools/arch/x86/include/uapi/asm/kvm.h.

v15:
 - Removed unconditional tdx_track() by tdx_flush_tlb_current() that does
   tdx_track().
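For reviewers, here is a minimal sketch of how a VMM could drive the new
subcommand from userspace. It is illustrative only and not part of this
patch; it assumes the struct kvm_tdx_cmd layout used by this series
(id/flags/data/hw_error) and a hypothetical vm_fd that has already completed
KVM_TDX_INIT_VM, vCPU init and KVM_TDX_INIT_MEM_REGION:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical helper: finalize TD measurement of an already-populated TD. */
static int tdx_finalize_vm(int vm_fd)
{
	struct kvm_tdx_cmd cmd;
	int ret;

	memset(&cmd, 0, sizeof(cmd));
	cmd.id = KVM_TDX_FINALIZE_VM;

	do {
		/* KVM returns -EAGAIN while the TDX module reports BUSY. */
		ret = ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
	} while (ret < 0 && errno == EAGAIN);

	if (ret < 0)
		fprintf(stderr, "KVM_TDX_FINALIZE_VM: errno=%d hw_error=0x%llx\n",
			errno, (unsigned long long)cmd.hw_error);
	return ret;
}

Once this call succeeds, TDH.MR.FINALIZE has been issued: the TD becomes
runnable, no further TD private pages can be added with arbitrary content,
and pre-faulting is allowed from that point on.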
---
 arch/x86/include/uapi/asm/kvm.h |  1 +
 arch/x86/kvm/vmx/tdx.c          | 78 ++++++++++++++++++++++++++++++---
 arch/x86/kvm/vmx/tdx.h          |  3 ++
 3 files changed, 75 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index a19cd84cec76..eee6de05f261 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -932,6 +932,7 @@ enum kvm_tdx_cmd_id {
 	KVM_TDX_INIT_VM,
 	KVM_TDX_INIT_VCPU,
 	KVM_TDX_INIT_MEM_REGION,
+	KVM_TDX_FINALIZE_VM,
 	KVM_TDX_GET_CPUID,
 
 	KVM_TDX_CMD_NR_MAX,
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 15cedacd717a..acaa11be1031 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -563,6 +563,31 @@ static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
 	return 0;
 }
 
+/*
+ * KVM_TDX_INIT_MEM_REGION calls kvm_gmem_populate() to get guest pages and
+ * tdx_gmem_post_populate() to premap page table pages into private EPT.
+ * Mapping guest pages into private EPT before TD is finalized should use a
+ * seamcall TDH.MEM.PAGE.ADD(), which copies page content from a source page
+ * from user to target guest pages to be added. This source page is not
+ * available via common interface kvm_tdp_map_page(). So, currently,
+ * kvm_tdp_map_page() only premaps guest pages into KVM mirrored root.
+ * A counter nr_premapped is increased here to record status. The counter will
+ * be decreased after TDH.MEM.PAGE.ADD() is called after the kvm_tdp_map_page()
+ * in tdx_gmem_post_populate().
+ */
+static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn,
+					  enum pg_level level, kvm_pfn_t pfn)
+{
+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+
+	if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm))
+		return -EINVAL;
+
+	/* nr_premapped will be decreased when tdh_mem_page_add() is called. */
+	atomic64_inc(&kvm_tdx->nr_premapped);
+	return 0;
+}
+
 int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
 			      enum pg_level level, kvm_pfn_t pfn)
 {
@@ -582,14 +607,15 @@ int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
 	 */
 	get_page(pfn_to_page(pfn));
 
+	/*
+	 * To match ordering of 'finalized' and 'pre_fault_allowed' in
+	 * tdx_td_finalizemr().
+	 */
+	smp_rmb();
 	if (likely(kvm_tdx->state == TD_STATE_RUNNABLE))
 		return tdx_mem_page_aug(kvm, gfn, level, pfn);
 
-	/*
-	 * TODO: KVM_TDX_INIT_MEM_REGION support to populate before finalize
-	 * comes here for the initial memory.
-	 */
-	return -EOPNOTSUPP;
+	return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn);
 }
 
 static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
@@ -621,10 +647,12 @@ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
 	if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE &&
 		     err == (TDX_EPT_WALK_FAILED | TDX_OPERAND_ID_RCX))) {
 		/*
-		 * This page was mapped with KVM_MAP_MEMORY, but
-		 * KVM_TDX_INIT_MEM_REGION is not issued yet.
+		 * Page is mapped by KVM_TDX_INIT_MEM_REGION, but hasn't called
+		 * tdh_mem_page_add().
 		 */
 		if (!is_last_spte(entry, level) || !(entry & VMX_EPT_RWX_MASK)) {
+			WARN_ON_ONCE(!atomic64_read(&kvm_tdx->nr_premapped));
+			atomic64_dec(&kvm_tdx->nr_premapped);
 			tdx_unpin(kvm, pfn);
 			return 0;
 		}
@@ -1368,6 +1396,36 @@ void tdx_flush_tlb_all(struct kvm_vcpu *vcpu)
 	ept_sync_global();
 }
 
+static int tdx_td_finalizemr(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
+{
+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+
+	guard(mutex)(&kvm->slots_lock);
+
+	if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
+		return -EINVAL;
+	/*
+	 * Pages are pending for KVM_TDX_INIT_MEM_REGION to issue
+	 * TDH.MEM.PAGE.ADD().
+	 */
+	if (atomic64_read(&kvm_tdx->nr_premapped))
+		return -EINVAL;
+
+	cmd->hw_error = tdh_mr_finalize(kvm_tdx->tdr_pa);
+	if ((cmd->hw_error & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY)
+		return -EAGAIN;
+	if (KVM_BUG_ON(cmd->hw_error, kvm)) {
+		pr_tdx_error(TDH_MR_FINALIZE, cmd->hw_error);
+		return -EIO;
+	}
+
+	kvm_tdx->state = TD_STATE_RUNNABLE;
+	/* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */
+	smp_wmb();
+	kvm->arch.pre_fault_allowed = true;
+	return 0;
+}
+
 int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
 {
 	struct kvm_tdx_cmd tdx_cmd;
@@ -1392,6 +1450,9 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
 	case KVM_TDX_INIT_VM:
 		r = tdx_td_init(kvm, &tdx_cmd);
 		break;
+	case KVM_TDX_FINALIZE_VM:
+		r = tdx_td_finalizemr(kvm, &tdx_cmd);
+		break;
 	default:
 		r = -EINVAL;
 		goto out;
@@ -1656,6 +1717,9 @@ static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
 		goto out;
 	}
 
+	WARN_ON_ONCE(!atomic64_read(&kvm_tdx->nr_premapped));
+	atomic64_dec(&kvm_tdx->nr_premapped);
+
 	if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) {
 		for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
 			err = tdh_mr_extend(kvm_tdx->tdr_pa, gpa + i, &entry,
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index 727bcf25d731..aeddf2bb0a94 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -32,6 +32,9 @@ struct kvm_tdx {
 	u64 tsc_offset;
 
 	enum kvm_tdx_state state;
+
+	/* For KVM_TDX_INIT_MEM_REGION. */
+	atomic64_t nr_premapped;
 };
 
 /* TDX module vCPU states */
-- 
2.43.2