Re: [PATCH v19 070/130] KVM: TDX: TDP MMU TDX support

Binbin Wu <binbin.wu@xxxxxxxxxxxxxxx> · Tue, 2 Apr 2024 14:21:41 +0800

On 2/26/2024 4:26 PM, isaku.yamahata@xxxxxxxxx wrote:
From: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>

Implement hooks of TDP MMU for TDX backend.  TLB flush, TLB shootdown,
propagating the change private EPT entry to Secure EPT and freeing Secure
EPT page. TLB flush handles both shared EPT and private EPT.  It flushes
shared EPT same as VMX.  It also waits for the TDX TLB shootdown.  For the
hook to free Secure EPT page, unlinks the Secure EPT page from the Secure
EPT so that the page can be freed to OS.

Propagate the entry change to Secure EPT.  The possible entry changes are
present -> non-present(zapping) and non-present -> present(population).  On
population just link the Secure EPT page or the private guest page to the
Secure EPT by TDX SEAMCALL. Because TDP MMU allows concurrent
zapping/population, zapping requires synchronous TLB shoot down with the
frozen EPT entry.

But for private memory, zapping holds write lock, right?

   It zaps the secure entry, increments TLB counter, sends
IPI to remote vcpus to trigger TLB flush, and then unlinks the private
guest page from the Secure EPT. For simplicity, batched zapping with
exclude lock is handled as concurrent zapping.

exclude lock -> exclusive lock

How to understand this sentence?
Since it's holding exclusive lock, how it can be handled as concurrent 
zapping?
Or you want to describe the current implementation prevents concurrent 
zapping?

Although it's inefficient,
it can be optimized in the future.

For MMIO SPTE, the spte value changes as follows.
initial value (suppress VE bit is set)
-> Guest issues MMIO and triggers EPT violation
-> KVM updates SPTE value to MMIO value (suppress VE bit is cleared)
-> Guest MMIO resumes.  It triggers VE exception in guest TD
-> Guest VE handler issues TDG.VP.VMCALL<MMIO>
-> KVM handles MMIO
-> Guest VE handler resumes its execution after MMIO instruction

Signed-off-by: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>

---
v19:
- Compile fix when CONFIG_HYPERV != y.
   It's due to the following patch.  Catch it up.
   https://lore.kernel.org/all/20231018192325.1893896-1-seanjc@xxxxxxxxxx/
- Add comments on tlb shootdown to explan the sequence.
- Use gmem_max_level callback, delete tdp_max_page_level.

v18:
- rename tdx_sept_page_aug() -> tdx_mem_page_aug()
- checkpatch: space => tab

v15 -> v16:
- Add the handling of TD_ATTR_SEPT_VE_DISABLE case.

v14 -> v15:
- Implemented tdx_flush_tlb_current()
- Removed unnecessary invept in tdx_flush_tlb().  It was carry over
   from the very old code base.

Signed-off-by: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
---
  arch/x86/kvm/mmu/spte.c    |   3 +-
  arch/x86/kvm/vmx/main.c    |  91 ++++++++-
  arch/x86/kvm/vmx/tdx.c     | 372 +++++++++++++++++++++++++++++++++++++
  arch/x86/kvm/vmx/tdx.h     |   2 +-
  arch/x86/kvm/vmx/tdx_ops.h |   6 +
  arch/x86/kvm/vmx/x86_ops.h |  13 ++
  6 files changed, 481 insertions(+), 6 deletions(-)

[...]
+
+static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
+			    enum pg_level level, kvm_pfn_t pfn)
+{
+	int tdx_level = pg_level_to_tdx_sept_level(level);
+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+	union tdx_sept_level_state level_state;
+	hpa_t hpa = pfn_to_hpa(pfn);
+	gpa_t gpa = gfn_to_gpa(gfn);
+	struct tdx_module_args out;
+	union tdx_sept_entry entry;
+	u64 err;
+
+	err = tdh_mem_page_aug(kvm_tdx->tdr_pa, gpa, hpa, &out);
+	if (unlikely(err == TDX_ERROR_SEPT_BUSY)) {
+		tdx_unpin(kvm, pfn);
+		return -EAGAIN;
+	}
+	if (unlikely(err == (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX))) {
+		entry.raw = out.rcx;
+		level_state.raw = out.rdx;
+		if (level_state.level == tdx_level &&
+		    level_state.state == TDX_SEPT_PENDING &&
+		    entry.leaf && entry.pfn == pfn && entry.sve) {
+			tdx_unpin(kvm, pfn);
+			WARN_ON_ONCE(!(to_kvm_tdx(kvm)->attributes &
+				       TDX_TD_ATTR_SEPT_VE_DISABLE));

to_kvm_tdx(kvm) -> kvm_tdx

Since the implementation requires attributes.TDX_TD_ATTR_SEPT_VE_DISABLE is set, should it check the value passed from userspace?
And the reason should be described somewhere in changelog or/and comment.

+			return -EAGAIN;
+		}
+	}
+	if (KVM_BUG_ON(err, kvm)) {
+		pr_tdx_error(TDH_MEM_PAGE_AUG, err, &out);
+		tdx_unpin(kvm, pfn);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
+				     enum pg_level level, kvm_pfn_t pfn)
+{
+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+
+	/* TODO: handle large pages. */
+	if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
+		return -EINVAL;
+
+	/*
+	 * Because restricted mem

The term "restricted mem" is not used anymore, right? Should update the 
comment.

doesn't support page migration with
+	 * a_ops->migrate_page (yet), no callback isn't triggered for KVM on

no callback isn't -> no callback is

+	 * page migration.  Until restricted mem supports page migration,

"restricted mem" -> guest_mem

+	 * prevent page migration.
+	 * TODO: Once restricted mem introduces callback on page migration,

ditto

+	 * implement it and remove get_page/put_page().
+	 */
+	get_page(pfn_to_page(pfn));
+
+	if (likely(is_td_finalized(kvm_tdx)))
+		return tdx_mem_page_aug(kvm, gfn, level, pfn);
+
+	/* TODO: tdh_mem_page_add() comes here for the initial memory. */
+
+	return 0;
+}
+
+static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
+				       enum pg_level level, kvm_pfn_t pfn)
+{
+	int tdx_level = pg_level_to_tdx_sept_level(level);
+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+	struct tdx_module_args out;
+	gpa_t gpa = gfn_to_gpa(gfn);
+	hpa_t hpa = pfn_to_hpa(pfn);
+	hpa_t hpa_with_hkid;
+	u64 err;
+
+	/* TODO: handle large pages. */
+	if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
+		return -EINVAL;
+
+	if (unlikely(!is_hkid_assigned(kvm_tdx))) {
+		/*
+		 * The HKID assigned to this TD was already freed and cache
+		 * was already flushed. We don't have to flush again.
+		 */
+		err = tdx_reclaim_page(hpa);
+		if (KVM_BUG_ON(err, kvm))
+			return -EIO;
+		tdx_unpin(kvm, pfn);
+		return 0;
+	}
+
+	do {
+		/*
+		 * When zapping private page, write lock is held. So no race
+		 * condition with other vcpu sept operation.  Race only with
+		 * TDH.VP.ENTER.
+		 */
+		err = tdh_mem_page_remove(kvm_tdx->tdr_pa, gpa, tdx_level, &out);
+	} while (unlikely(err == TDX_ERROR_SEPT_BUSY));
+	if (KVM_BUG_ON(err, kvm)) {
+		pr_tdx_error(TDH_MEM_PAGE_REMOVE, err, &out);
+		return -EIO;
+	}
+
+	hpa_with_hkid = set_hkid_to_hpa(hpa, (u16)kvm_tdx->hkid);
+	do {
+		/*
+		 * TDX_OPERAND_BUSY can happen on locking PAMT entry.  Because
+		 * this page was removed above, other thread shouldn't be
+		 * repeatedly operating on this page.  Just retry loop.
+		 */
+		err = tdh_phymem_page_wbinvd(hpa_with_hkid);
+	} while (unlikely(err == (TDX_OPERAND_BUSY | TDX_OPERAND_ID_RCX)));
+	if (KVM_BUG_ON(err, kvm)) {
+		pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err, NULL);
+		return -EIO;
+	}
+	tdx_clear_page(hpa);
+	tdx_unpin(kvm, pfn);
+	return 0;
+}
+
+static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
+				     enum pg_level level, void *private_spt)
+{
+	int tdx_level = pg_level_to_tdx_sept_level(level);
+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+	gpa_t gpa = gfn_to_gpa(gfn);
+	hpa_t hpa = __pa(private_spt);
+	struct tdx_module_args out;
+	u64 err;
+
+	err = tdh_mem_sept_add(kvm_tdx->tdr_pa, gpa, tdx_level, hpa, &out);

kvm_tdx is only used here, can drop the local var.

+	if (unlikely(err == TDX_ERROR_SEPT_BUSY))
+		return -EAGAIN;
+	if (KVM_BUG_ON(err, kvm)) {
+		pr_tdx_error(TDH_MEM_SEPT_ADD, err, &out);
+		return -EIO;
+	}
+
+	return 0;
+}
+