Hi Isaku, On 2/26/2024 12:26 AM, isaku.yamahata@xxxxxxxxx wrote: > From: Isaku Yamahata <isaku.yamahata@xxxxxxxxx> ... > @@ -218,6 +257,87 @@ static void tdx_reclaim_control_page(unsigned long td_page_pa) > free_page((unsigned long)__va(td_page_pa)); > } > > +struct tdx_flush_vp_arg { > + struct kvm_vcpu *vcpu; > + u64 err; > +}; > + > +static void tdx_flush_vp(void *arg_) > +{ > + struct tdx_flush_vp_arg *arg = arg_; > + struct kvm_vcpu *vcpu = arg->vcpu; > + u64 err; > + > + arg->err = 0; > + lockdep_assert_irqs_disabled(); > + > + /* Task migration can race with CPU offlining. */ > + if (unlikely(vcpu->cpu != raw_smp_processor_id())) > + return; > + > + /* > + * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized. The > + * list tracking still needs to be updated so that it's correct if/when > + * the vCPU does get initialized. > + */ > + if (is_td_vcpu_created(to_tdx(vcpu))) { > + /* > + * No need to retry. TDX Resources needed for TDH.VP.FLUSH are, > + * TDVPR as exclusive, TDR as shared, and TDCS as shared. This > + * vp flush function is called when destructing vcpu/TD or vcpu > + * migration. No other thread uses TDVPR in those cases. > + */ (I have comment later that refer back to this comment about needing retry.) ... > @@ -233,26 +353,31 @@ static void tdx_do_tdh_phymem_cache_wb(void *unused) > pr_tdx_error(TDH_PHYMEM_CACHE_WB, err, NULL); > } > > -void tdx_mmu_release_hkid(struct kvm *kvm) > +static int __tdx_mmu_release_hkid(struct kvm *kvm) > { > bool packages_allocated, targets_allocated; > struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); > cpumask_var_t packages, targets; > + struct kvm_vcpu *vcpu; > + unsigned long j; > + int i, ret = 0; > u64 err; > - int i; > > if (!is_hkid_assigned(kvm_tdx)) > - return; > + return 0; > > if (!is_td_created(kvm_tdx)) { > tdx_hkid_free(kvm_tdx); > - return; > + return 0; > } > > packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL); > targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL); > cpus_read_lock(); > > + kvm_for_each_vcpu(j, vcpu, kvm) > + tdx_flush_vp_on_cpu(vcpu); > + > /* > * We can destroy multiple guest TDs simultaneously. Prevent > * tdh_phymem_cache_wb from returning TDX_BUSY by serialization. > @@ -270,6 +395,19 @@ void tdx_mmu_release_hkid(struct kvm *kvm) > */ > write_lock(&kvm->mmu_lock); > > + err = tdh_mng_vpflushdone(kvm_tdx->tdr_pa); > + if (err == TDX_FLUSHVP_NOT_DONE) { > + ret = -EBUSY; > + goto out; > + } > + if (WARN_ON_ONCE(err)) { > + pr_tdx_error(TDH_MNG_VPFLUSHDONE, err, NULL); > + pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n", > + kvm_tdx->hkid); > + ret = -EIO; > + goto out; > + } > + > for_each_online_cpu(i) { > if (packages_allocated && > cpumask_test_and_set_cpu(topology_physical_package_id(i), > @@ -291,14 +429,24 @@ void tdx_mmu_release_hkid(struct kvm *kvm) > pr_tdx_error(TDH_MNG_KEY_FREEID, err, NULL); > pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n", > kvm_tdx->hkid); > + ret = -EIO; > } else > tdx_hkid_free(kvm_tdx); > > +out: > write_unlock(&kvm->mmu_lock); > mutex_unlock(&tdx_lock); > cpus_read_unlock(); > free_cpumask_var(targets); > free_cpumask_var(packages); > + > + return ret; > +} > + > +void tdx_mmu_release_hkid(struct kvm *kvm) > +{ > + while (__tdx_mmu_release_hkid(kvm) == -EBUSY) > + ; > } As I understand, __tdx_mmu_release_hkid() returns -EBUSY after TDH.VP.FLUSH has been sent for every vCPU followed by TDH.MNG.VPFLUSHDONE, which returns TDX_FLUSHVP_NOT_DONE. Considering earlier comment that a retry of TDH.VP.FLUSH is not needed, why is this while() loop here that sends the TDH.VP.FLUSH again to all vCPUs instead of just a loop within __tdx_mmu_release_hkid() to _just_ resend TDH.MNG.VPFLUSHDONE? Could it be possible for a vCPU to appear during this time, thus be missed in one TDH.VP.FLUSH cycle, to require a new cycle of TDH.VP.FLUSH? I note that TDX_FLUSHVP_NOT_DONE is distinct from TDX_OPERAND_BUSY that can also be returned from TDH.MNG.VPFLUSHDONE and wonder if a retry may be needed in that case also/instead? It looks like TDH.MNG.VPFLUSHDONE needs exclusive access to all operands and I do not know enough yet if this is the case here. Reinette