On Thu, 2024-10-31 at 11:57 +0800, Yan Zhao wrote:
> When TD is tearing down,
> - TD pages are removed and freed when hkid is assigned, so
>   tdh_phymem_page_reclaim() will not be called for them.
> - after vt_vm_destroy() releasing the hkid, kvm_arch_destroy_vm() calls
>   kvm_destroy_vcpus(), kvm_mmu_uninit_tdp_mmu() and tdx_vm_free() to reclaim
>   TDCX/TDVPR/EPT/TDR pages sequentially in a single thread.
>
> So, there should be no contentions expected for current KVM to call
> tdh_phymem_page_reclaim().

This ties into the question of how much of the wrappers should live in KVM
code vs arch/x86. I got the impression Dave would rather not see the
SEAMCALLs just getting wrapped again on KVM's side into what KVM really
needs.

Towards that, it could be tempting to move tdx_reclaim_page() (see "[PATCH
v2 17/25] KVM: TDX: create/destroy VM structure") into arch/x86 and have
arch/x86 handle the tdx_clear_page() part too. That would also be more
symmetric with what arch/x86 already does for clflush on the calls that
hand pages to the TDX module.

But the analysis of why we don't need to worry about TDX_OPERAND_BUSY is
based on KVM's current use of tdh_phymem_page_reclaim(). So KVM still has
to be the one to reason about TDX_OPERAND_BUSY, and the more we wrap the
low-level SEAMCALLs, the more brittle and spread out the solution for
dancing around the TDX module locks becomes.
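For contrast, keeping the retry policy on KVM's side on top of a thin
arch/x86 helper would look roughly like the sketch below. This is only an
illustration of the middle ground (kvm_tdx_reclaim_page() is a made-up
name); the diff further down instead drops the retry loop entirely:

/* Sketch only: retry policy stays in KVM, clearing stays in arch/x86. */
static int kvm_tdx_reclaim_page(hpa_t pa)
{
	u64 err;
	int i;

	for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) {
		err = tdx_reclaim_page(pa, false);
		if (err != (TDX_OPERAND_BUSY | TDX_OPERAND_ID_RCX) &&
		    err != (TDX_OPERAND_BUSY | TDX_OPERAND_ID_TDR))
			break;
		cond_resched();
	}

	return WARN_ON_ONCE(err) ? -EIO : 0;
}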
I took a look at dropping the retry loop and moving tdx_reclaim_page() into
arch/x86 anyway:

 arch/x86/include/asm/tdx.h  |  3 +--
 arch/x86/kvm/vmx/tdx.c      | 74 ++++------------------------------------------
 arch/x86/virt/vmx/tdx/tdx.c | 63 ++++++++++++++++++++++++++++++++++-----------
 3 files changed, 55 insertions(+), 85 deletions(-)

diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 051465261155..790d6d99d895 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -145,13 +145,12 @@ u64 tdh_vp_init(u64 tdvpr, u64 initial_rcx);
 u64 tdh_vp_rd(u64 tdvpr, u64 field, u64 *data);
 u64 tdh_vp_wr(u64 tdvpr, u64 field, u64 data, u64 mask);
 u64 tdh_vp_init_apicid(u64 tdvpr, u64 initial_rcx, u32 x2apicid);
-u64 tdh_phymem_page_reclaim(u64 page, u64 *rcx, u64 *rdx, u64 *r8);
+u64 tdx_reclaim_page(u64 pa, bool wbind_global_key);
 u64 tdh_mem_page_remove(u64 tdr, u64 gpa, u64 level, u64 *rcx, u64 *rdx);
 u64 tdh_mem_sept_remove(u64 tdr, u64 gpa, u64 level, u64 *rcx, u64 *rdx);
 u64 tdh_mem_track(u64 tdr);
 u64 tdh_mem_range_unblock(u64 tdr, u64 gpa, u64 level, u64 *rcx, u64 *rdx);
 u64 tdh_phymem_cache_wb(bool resume);
-u64 tdh_phymem_page_wbinvd_tdr(u64 tdr);
 u64 tdh_phymem_page_wbinvd_hkid(u64 hpa, u64 hkid);
 #else
 static inline void tdx_init(void) { }
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 0ee8ec86d02a..aca73d942344 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -291,67 +291,6 @@ static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
 	vcpu->cpu = -1;
 }
 
-static void tdx_clear_page(unsigned long page_pa)
-{
-	const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
-	void *page = __va(page_pa);
-	unsigned long i;
-
-	/*
-	 * The page could have been poisoned. MOVDIR64B also clears
-	 * the poison bit so the kernel can safely use the page again.
-	 */
-	for (i = 0; i < PAGE_SIZE; i += 64)
-		movdir64b(page + i, zero_page);
-	/*
-	 * MOVDIR64B store uses WC buffer. Prevent following memory reads
-	 * from seeing potentially poisoned cache.
-	 */
-	__mb();
-}
-
-/* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
-static int __tdx_reclaim_page(hpa_t pa)
-{
-	u64 err, rcx, rdx, r8;
-	int i;
-
-	for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) {
-		err = tdh_phymem_page_reclaim(pa, &rcx, &rdx, &r8);
-
-		/*
-		 * TDH.PHYMEM.PAGE.RECLAIM is allowed only when TD is shutdown.
-		 * state. i.e. destructing TD.
-		 * TDH.PHYMEM.PAGE.RECLAIM requires TDR and target page.
-		 * Because we're destructing TD, it's rare to contend with TDR.
-		 */
-		switch (err) {
-		case TDX_OPERAND_BUSY | TDX_OPERAND_ID_RCX:
-		case TDX_OPERAND_BUSY | TDX_OPERAND_ID_TDR:
-			cond_resched();
-			continue;
-		default:
-			goto out;
-		}
-	}
-
-out:
-	if (WARN_ON_ONCE(err)) {
-		pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8);
-		return -EIO;
-	}
-	return 0;
-}
-
-static int tdx_reclaim_page(hpa_t pa)
-{
-	int r;
-
-	r = __tdx_reclaim_page(pa);
-	if (!r)
-		tdx_clear_page(pa);
-	return r;
-}
 
 /*
@@ -365,7 +304,7 @@ static void tdx_reclaim_control_page(unsigned long ctrl_page_pa)
 	 * Leak the page if the kernel failed to reclaim the page.
 	 * The kernel cannot use it safely anymore.
 	 */
-	if (tdx_reclaim_page(ctrl_page_pa))
+	if (tdx_reclaim_page(ctrl_page_pa, false))
 		return;
 
 	free_page((unsigned long)__va(ctrl_page_pa));
@@ -581,20 +520,16 @@ static void tdx_reclaim_td_control_pages(struct kvm *kvm)
 	if (!kvm_tdx->tdr_pa)
 		return;
 
-	if (__tdx_reclaim_page(kvm_tdx->tdr_pa))
-		return;
-
 	/*
 	 * Use a SEAMCALL to ask the TDX module to flush the cache based on the
 	 * KeyID. TDX module may access TDR while operating on TD (Especially
 	 * when it is reclaiming TDCS).
 	 */
-	err = tdh_phymem_page_wbinvd_tdr(kvm_tdx->tdr_pa);
+	err = tdx_reclaim_page(kvm_tdx->tdr_pa, true);
 	if (KVM_BUG_ON(err, kvm)) {
-		pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
+		pr_tdx_error(tdx_reclaim_page, err);
 		return;
 	}
-	tdx_clear_page(kvm_tdx->tdr_pa);
 	free_page((unsigned long)__va(kvm_tdx->tdr_pa));
 
 	kvm_tdx->tdr_pa = 0;
@@ -1694,7 +1629,6 @@ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
 		pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
 		return -EIO;
 	}
-	tdx_clear_page(hpa);
 	tdx_unpin(kvm, pfn);
 	return 0;
 }
@@ -1805,7 +1739,7 @@ int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
 	 * The HKID assigned to this TD was already freed and cache was
 	 * already flushed. We don't have to flush again.
 	 */
-	return tdx_reclaim_page(__pa(private_spt));
+	return tdx_reclaim_page(__pa(private_spt), false);
 }
 
 int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index bad83f6a3b0c..bb7cdb867581 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -1892,7 +1892,7 @@ u64 tdh_vp_init_apicid(u64 tdvpr, u64 initial_rcx, u32 x2apicid)
 }
 EXPORT_SYMBOL_GPL(tdh_vp_init_apicid);
 
-u64 tdh_phymem_page_reclaim(u64 page, u64 *rcx, u64 *rdx, u64 *r8)
+static u64 tdh_phymem_page_reclaim(u64 page, u64 *rcx, u64 *rdx, u64 *r8)
 {
 	struct tdx_module_args args = {
 		.rcx = page,
@@ -1914,7 +1914,49 @@ u64 tdh_phymem_page_reclaim(u64 page, u64 *rcx, u64 *rdx, u64 *r8)
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(tdh_phymem_page_reclaim);
+
+static void tdx_clear_page(unsigned long page_pa)
+{
+	const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
+	void *page = __va(page_pa);
+	unsigned long i;
+
+	/*
+	 * The page could have been poisoned. MOVDIR64B also clears
+	 * the poison bit so the kernel can safely use the page again.
+	 */
+	for (i = 0; i < PAGE_SIZE; i += 64)
+		movdir64b(page + i, zero_page);
+	/*
+	 * MOVDIR64B store uses WC buffer. Prevent following memory reads
+	 * from seeing potentially poisoned cache.
+	 */
+	__mb();
+}
+
+/*
+ * tdx_reclaim_page() calls tdh_phymem_page_reclaim() internally. Callers
+ * should be prepared to handle TDX_OPERAND_BUSY. If the return code is
+ * not an error, the page has been cleared with MOVDIR64B.
+ */
+u64 tdx_reclaim_page(u64 pa, bool wbind_global_key)
+{
+	u64 rcx, rdx, r8;
+	u64 r;
+
+	r = tdh_phymem_page_reclaim(pa, &rcx, &rdx, &r8);
+	if (r)
+		return r;
+
+	/* tdh_phymem_page_wbinvd_hkid() will do tdx_clear_page() */
+	if (wbind_global_key)
+		return tdh_phymem_page_wbinvd_hkid(pa, tdx_global_keyid);
+
+	tdx_clear_page(pa);
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(tdx_reclaim_page);
 
 u64 tdh_mem_page_remove(u64 tdr, u64 gpa, u64 level, u64 *rcx, u64 *rdx)
 {
@@ -1987,22 +2029,17 @@ u64 tdh_phymem_cache_wb(bool resume)
 }
 EXPORT_SYMBOL_GPL(tdh_phymem_cache_wb);
 
-u64 tdh_phymem_page_wbinvd_tdr(u64 tdr)
-{
-	struct tdx_module_args args = {};
-
-	args.rcx = tdr | ((u64)tdx_global_keyid << boot_cpu_data.x86_phys_bits);
-
-	return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
-}
-EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_tdr);
-
 u64 tdh_phymem_page_wbinvd_hkid(u64 hpa, u64 hkid)
 {
 	struct tdx_module_args args = {};
+	u64 err;
 
 	args.rcx = hpa | (hkid << boot_cpu_data.x86_phys_bits);
 
-	return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
+	err = seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
+	if (!err)
+		tdx_clear_page(hpa);
+
+	return err;
 }
 EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_hkid);
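One caveat with the combined helper above: a non-zero return can now come
from either TDH.PHYMEM.PAGE.RECLAIM or TDH.PHYMEM.PAGE.WBINVD, and the
helper doesn't say which. If the BUSY dance ever needed to come back, the
caller would have to decode the status itself, roughly like this
(illustrative only, assuming the usual TDX_SEAMCALL_STATUS_MASK
definition):

	err = tdx_reclaim_page(kvm_tdx->tdr_pa, true);
	if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY) {
		/*
		 * BUSY could be from RECLAIM or from WBINVD; a targeted
		 * retry isn't possible without splitting the SEAMCALLs
		 * back apart in the caller.
		 */
	}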