Re: [PATCH v2 08/25] x86/virt/tdx: Add SEAMCALL wrappers for TDX page cache management

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Thu, 2024-10-31 at 11:57 +0800, Yan Zhao wrote:
> When TD is tearing down,
> - TD pages are removed and freed when hkid is assigned, so
>   tdh_phymem_page_reclaim() will not be called for them.
> - after vt_vm_destroy() releasing the hkid, kvm_arch_destroy_vm() calls
>   kvm_destroy_vcpus(), kvm_mmu_uninit_tdp_mmu() and tdx_vm_free() to reclaim
>   TDCX/TDVPR/EPT/TDR pages sequentially in a single thread.
> 
> So, there should be no contentions expected for current KVM to call
> tdh_phymem_page_reclaim().

This links into the question of how much of the wrappers should be in KVM code
vs arch/x86. I got the impression Dave would like to not see SEAMCALLs just
getting wrapped on KVM's side with what it really needs. Towards that, it could
be tempting to move tdx_reclaim_page() (see "[PATCH v2 17/25] KVM: TDX:
create/destroy VM structure") into arch/x86 and have arch/x86 handle the
tdx_clear_page() part too. That would also be more symmetric with what arch/x86
already does for clflush on the calls that hand pages to the TDX module.

But the analysis of why we don't need to worry about TDX_OPERAND_BUSY is based
on KVM's current use of tdh_phymem_page_reclaim(). So KVM still has to be the
one to reason about TDX_OPERAND_BUSY, and the more we wrap the low level
SEAMCALLs, the more brittle and spread out the solution to dance around the TDX
module locks becomes.

I took a look at dropping the retry loop and moving tdx_reclaim_page() into
arch/x86 anyway:

 arch/x86/include/asm/tdx.h  |  3 +--
 arch/x86/kvm/vmx/tdx.c      | 74 ++++--------------------------------------------------------------------------
 arch/x86/virt/vmx/tdx/tdx.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++-------------
 3 files changed, 55 insertions(+), 85 deletions(-)


diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 051465261155..790d6d99d895 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -145,13 +145,12 @@ u64 tdh_vp_init(u64 tdvpr, u64 initial_rcx);
 u64 tdh_vp_rd(u64 tdvpr, u64 field, u64 *data);
 u64 tdh_vp_wr(u64 tdvpr, u64 field, u64 data, u64 mask);
 u64 tdh_vp_init_apicid(u64 tdvpr, u64 initial_rcx, u32 x2apicid);
-u64 tdh_phymem_page_reclaim(u64 page, u64 *rcx, u64 *rdx, u64 *r8);
+u64 tdx_reclaim_page(u64 pa, bool wbind_global_key);
 u64 tdh_mem_page_remove(u64 tdr, u64 gpa, u64 level, u64 *rcx, u64 *rdx);
 u64 tdh_mem_sept_remove(u64 tdr, u64 gpa, u64 level, u64 *rcx, u64 *rdx);
 u64 tdh_mem_track(u64 tdr);
 u64 tdh_mem_range_unblock(u64 tdr, u64 gpa, u64 level, u64 *rcx, u64 *rdx);
 u64 tdh_phymem_cache_wb(bool resume);
-u64 tdh_phymem_page_wbinvd_tdr(u64 tdr);
 u64 tdh_phymem_page_wbinvd_hkid(u64 hpa, u64 hkid);
 #else
 static inline void tdx_init(void) { }
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 0ee8ec86d02a..aca73d942344 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -291,67 +291,6 @@ static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
        vcpu->cpu = -1;
 }
 
-static void tdx_clear_page(unsigned long page_pa)
-{
-       const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
-       void *page = __va(page_pa);
-       unsigned long i;
-
-       /*
-        * The page could have been poisoned.  MOVDIR64B also clears
-        * the poison bit so the kernel can safely use the page again.
-        */
-       for (i = 0; i < PAGE_SIZE; i += 64)
-               movdir64b(page + i, zero_page);
-       /*
-        * MOVDIR64B store uses WC buffer.  Prevent following memory reads
-        * from seeing potentially poisoned cache.
-        */
-       __mb();
-}
-
-/* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
-static int __tdx_reclaim_page(hpa_t pa)
-{
-       u64 err, rcx, rdx, r8;
-       int i;
-
-       for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) {
-               err = tdh_phymem_page_reclaim(pa, &rcx, &rdx, &r8);
-
-               /*
-                * TDH.PHYMEM.PAGE.RECLAIM is allowed only when TD is shutdown.
-                * state.  i.e. destructing TD.
-                * TDH.PHYMEM.PAGE.RECLAIM requires TDR and target page.
-                * Because we're destructing TD, it's rare to contend with TDR.
-                */
-               switch (err) {
-               case TDX_OPERAND_BUSY | TDX_OPERAND_ID_RCX:
-               case TDX_OPERAND_BUSY | TDX_OPERAND_ID_TDR:
-                       cond_resched();
-                       continue;
-               default:
-                       goto out;
-               }
-       }
-
-out:
-       if (WARN_ON_ONCE(err)) {
-               pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8);
-               return -EIO;
-       }
-       return 0;
-}
-
-static int tdx_reclaim_page(hpa_t pa)
-{
-       int r;
-
-       r = __tdx_reclaim_page(pa);
-       if (!r)
-               tdx_clear_page(pa);
-       return r;
-}
 
 
 /*
@@ -365,7 +304,7 @@ static void tdx_reclaim_control_page(unsigned long ctrl_page_pa)
         * Leak the page if the kernel failed to reclaim the page.
         * The kernel cannot use it safely anymore.
         */
-       if (tdx_reclaim_page(ctrl_page_pa))
+       if (tdx_reclaim_page(ctrl_page_pa, false))
                return;
 
        free_page((unsigned long)__va(ctrl_page_pa));
@@ -581,20 +520,16 @@ static void tdx_reclaim_td_control_pages(struct kvm *kvm)
        if (!kvm_tdx->tdr_pa)
                return;
 
-       if (__tdx_reclaim_page(kvm_tdx->tdr_pa))
-               return;
-
        /*
         * Use a SEAMCALL to ask the TDX module to flush the cache based on the
         * KeyID. TDX module may access TDR while operating on TD (Especially
         * when it is reclaiming TDCS).
         */
-       err = tdh_phymem_page_wbinvd_tdr(kvm_tdx->tdr_pa);
+       err = tdx_reclaim_page(kvm_tdx->tdr_pa, true);
        if (KVM_BUG_ON(err, kvm)) {
-               pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
+               pr_tdx_error(tdx_reclaim_page, err);
                return;
        }
-       tdx_clear_page(kvm_tdx->tdr_pa);
 
        free_page((unsigned long)__va(kvm_tdx->tdr_pa));
        kvm_tdx->tdr_pa = 0;
@@ -1694,7 +1629,6 @@ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
                pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
                return -EIO;
        }
-       tdx_clear_page(hpa);
        tdx_unpin(kvm, pfn);
        return 0;
 }
@@ -1805,7 +1739,7 @@ int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
         * The HKID assigned to this TD was already freed and cache was
         * already flushed. We don't have to flush again.
         */
-       return tdx_reclaim_page(__pa(private_spt));
+       return tdx_reclaim_page(__pa(private_spt), false);
 }
 
 int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index bad83f6a3b0c..bb7cdb867581 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -1892,7 +1892,7 @@ u64 tdh_vp_init_apicid(u64 tdvpr, u64 initial_rcx, u32 x2apicid)
 }
 EXPORT_SYMBOL_GPL(tdh_vp_init_apicid);
 
-u64 tdh_phymem_page_reclaim(u64 page, u64 *rcx, u64 *rdx, u64 *r8)
+static u64 tdh_phymem_page_reclaim(u64 page, u64 *rcx, u64 *rdx, u64 *r8)
 {
        struct tdx_module_args args = {
                .rcx = page,
@@ -1914,7 +1914,49 @@ u64 tdh_phymem_page_reclaim(u64 page, u64 *rcx, u64 *rdx, u64 *r8)
 
        return ret;
 }
-EXPORT_SYMBOL_GPL(tdh_phymem_page_reclaim);
+
+static void tdx_clear_page(unsigned long page_pa)
+{
+       const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
+       void *page = __va(page_pa);
+       unsigned long i;
+
+       /*
+        * The page could have been poisoned.  MOVDIR64B also clears
+        * the poison bit so the kernel can safely use the page again.
+        */
+       for (i = 0; i < PAGE_SIZE; i += 64)
+               movdir64b(page + i, zero_page);
+       /*
+        * MOVDIR64B store uses WC buffer.  Prevent following memory reads
+        * from seeing potentially poisoned cache.
+        */
+       __mb();
+}
+
+/*
+ * tdx_reclaim_page() calls tdh_phymem_page_reclaim() internally. Callers
+ * should be prepared to handle TDX_OPERAND_BUSY.
+ * If the return code is not an error, the page has been cleared with MOVDIR64B.
+ */
+u64 tdx_reclaim_page(u64 pa, bool wbind_global_key)
+{
+       u64 rcx, rdx, r8;
+       u64 r;
+
+       r = tdh_phymem_page_reclaim(pa, &rcx, &rdx, &r8);
+       if (r)
+               return r;
+
+       /* tdh_phymem_page_wbinvd_hkid() will do tdx_clear_page() */
+       if (wbind_global_key)
+               return tdh_phymem_page_wbinvd_hkid(pa, tdx_global_keyid);
+
+       tdx_clear_page(pa);
+
+       return r;
+}
+EXPORT_SYMBOL_GPL(tdx_reclaim_page);
 
 u64 tdh_mem_page_remove(u64 tdr, u64 gpa, u64 level, u64 *rcx, u64 *rdx)
 {
@@ -1987,22 +2029,17 @@ u64 tdh_phymem_cache_wb(bool resume)
 }
 EXPORT_SYMBOL_GPL(tdh_phymem_cache_wb);
 
-u64 tdh_phymem_page_wbinvd_tdr(u64 tdr)
-{
-       struct tdx_module_args args = {};
-
-       args.rcx = tdr | ((u64)tdx_global_keyid << boot_cpu_data.x86_phys_bits);
-
-       return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
-}
-EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_tdr);
-
 u64 tdh_phymem_page_wbinvd_hkid(u64 hpa, u64 hkid)
 {
        struct tdx_module_args args = {};
+       u64 err;
 
        args.rcx = hpa | (hkid << boot_cpu_data.x86_phys_bits);
 
-       return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
+       err = seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
+       if (!err)
+               tdx_clear_page(hpa);
+
+       return err;
 }
 EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_hkid);




[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux