--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -36,9 +36,21 @@ static __init int vt_hardware_setup(void)
* is KVM may allocate couple of more bytes than needed for
* each VM.
*/
- if (enable_tdx)
+ if (enable_tdx) {
vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size,
sizeof(struct kvm_tdx));
+ /*
+ * Note, TDX may fail to initialize in a later time in
+ * vt_init(), in which case it is not necessary to setup
+ * those callbacks. But making them valid here even
+ * when TDX fails to init later is fine because those
+ * callbacks won't be called if the VM isn't TDX guest.
+ */
+ vt_x86_ops.link_external_spt = tdx_sept_link_private_spt;
+ vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
+ vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
+ vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
Nit: The callbacks in 'struct kvm_x86_ops' have name "external", but
TDX callbacks have name "private". Should we rename TDX callbacks to
make them aligned?
+ }
return 0;
}
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 6feb3ab96926..b8cd5a629a80 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -447,6 +447,177 @@ void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
}
+static void tdx_unpin(struct kvm *kvm, kvm_pfn_t pfn)
+{
+ struct page *page = pfn_to_page(pfn);
+
+ put_page(page);
+}
+
+static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, kvm_pfn_t pfn)
+{
+ int tdx_level = pg_level_to_tdx_sept_level(level);
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ hpa_t hpa = pfn_to_hpa(pfn);
+ gpa_t gpa = gfn_to_gpa(gfn);
+ u64 entry, level_state;
+ u64 err;
+
+ err = tdh_mem_page_aug(kvm_tdx, gpa, hpa, &entry, &level_state);
+ if (unlikely(err == TDX_ERROR_SEPT_BUSY)) {
+ tdx_unpin(kvm, pfn);
+ return -EAGAIN;
+ }
Nit: Here (and other non-fatal error cases) I think we should return
-EBUSY to make it consistent with non-TDX case? E.g., the non-TDX case has:
if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
return -EBUSY;
And the comment of tdp_mmu_set_spte_atomic() currently says it can only
return 0 or -EBUSY. It needs to be patched to reflect it can also
return other non-0 errors like -EIO but those are fatal. In terms of
non-fatal error I don't think we need another -EAGAIN.
/*
* tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
[...]
* Return:
* * 0 - If the SPTE was set.
* * -EBUSY - If the SPTE cannot be set. In this case this function will
* have no side-effects other than setting iter->old_spte to
* the last known value of the spte.
*/
[...]
+
+static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, kvm_pfn_t pfn)
+{
[...]
+
+ hpa_with_hkid = set_hkid_to_hpa(hpa, (u16)kvm_tdx->hkid);
+ do {
+ /*
+ * TDX_OPERAND_BUSY can happen on locking PAMT entry. Because
+ * this page was removed above, other thread shouldn't be
+ * repeatedly operating on this page. Just retry loop.
+ */
+ err = tdh_phymem_page_wbinvd(hpa_with_hkid);
+ } while (unlikely(err == (TDX_OPERAND_BUSY | TDX_OPERAND_ID_RCX)));
In what case(s) other threads can concurrently lock the PAMT entry,
leading to the above BUSY?
[...]
+
+int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, kvm_pfn_t pfn)
+{
+ int ret;
+
+ /*
+ * HKID is released when vm_free() which is after closing gmem_fd
From latest dev branch HKID is freed from vt_vm_destroy(), but not
vm_free() (which should be tdx_vm_free() btw).
static void vt_vm_destroy(struct kvm *kvm)
{
if (is_td(kvm))
return tdx_mmu_release_hkid(kvm);
vmx_vm_destroy(kvm);
}
Btw, why not have a tdx_vm_destroy() wrapper? Seems all other vt_xx()s
have a tdx_xx() but only this one calls tdx_mmu_release_hkid() directly.
+ * which causes gmem invalidation to zap all spte.
+ * Population is only allowed after KVM_TDX_INIT_VM.
+ */
What does the second sentence ("Population ...") meaning? Why is it
relevant here?