On Tue, Nov 07, 2023 at 06:56:37AM -0800, isaku.yamahata@xxxxxxxxx wrote:
> From: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
> 
> For vcpu migration, in the case of VMX, the VMCS is flushed on the source
> pcpu and loaded on the target pcpu. There are corresponding TDX SEAMCALL
> APIs; call them on vcpu migration. The logic is mostly the same as VMX,
> except that the TDX SEAMCALLs are used.
> 
> When shutting down the machine, (VMX or TDX) vcpus need to be shut down on
> each pcpu. Do the same for TDX with the TDX SEAMCALL APIs.
> 
> Signed-off-by: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
> ---
>  arch/x86/kvm/vmx/main.c    |  32 ++++++-
>  arch/x86/kvm/vmx/tdx.c     | 190 ++++++++++++++++++++++++++++++++++++-
>  arch/x86/kvm/vmx/tdx.h     |   2 +
>  arch/x86/kvm/vmx/x86_ops.h |   4 +
>  4 files changed, 221 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
> index e7c570686736..8b109d0fe764 100644
> --- a/arch/x86/kvm/vmx/main.c
> +++ b/arch/x86/kvm/vmx/main.c
> @@ -44,6 +44,14 @@ static int vt_hardware_enable(void)
>  	return ret;
>  }
> 
......
> -void tdx_mmu_release_hkid(struct kvm *kvm)
> +static int __tdx_mmu_release_hkid(struct kvm *kvm)
>  {
>  	bool packages_allocated, targets_allocated;
>  	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
>  	cpumask_var_t packages, targets;
> +	struct kvm_vcpu *vcpu;
> +	unsigned long j;
> +	int i, ret = 0;
>  	u64 err;
> -	int i;
> 
>  	if (!is_hkid_assigned(kvm_tdx))
> -		return;
> +		return 0;
> 
>  	if (!is_td_created(kvm_tdx)) {
>  		tdx_hkid_free(kvm_tdx);
> -		return;
> +		return 0;
>  	}
> 
>  	packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
>  	targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
>  	cpus_read_lock();
> 
> +	kvm_for_each_vcpu(j, vcpu, kvm)
> +		tdx_flush_vp_on_cpu(vcpu);
> +
>  	/*
>  	 * We can destroy multiple guest TDs simultaneously.  Prevent
>  	 * tdh_phymem_cache_wb from returning TDX_BUSY by serialization.
> @@ -236,6 +361,19 @@ void tdx_mmu_release_hkid(struct kvm *kvm)
>  	 */
>  	write_lock(&kvm->mmu_lock);
> 
> +	err = tdh_mng_vpflushdone(kvm_tdx->tdr_pa);
> +	if (err == TDX_FLUSHVP_NOT_DONE) {

Not sure if I understand correctly: __tdx_mmu_release_hkid() is called from
the MMU release callback, which means all threads of the process have
already dropped the mm via do_exit() and won't run KVM code anymore, and
tdx_flush_vp_on_cpu() above is called for each pcpu they last ran on, so
can this error really happen?
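If that reasoning holds, TDX_FLUSHVP_NOT_DONE at this point would indicate
a KVM bug rather than a transient condition worth retrying, and the -EBUSY
plumbing plus the busy-wait in tdx_mmu_release_hkid() below look
unnecessary to me.  Just a rough, untested sketch of what I have in mind,
assuming the kvm_for_each_vcpu() flush above really does cover every vcpu:

	err = tdh_mng_vpflushdone(kvm_tdx->tdr_pa);
	/*
	 * All vcpus were flushed via tdx_flush_vp_on_cpu() above and no new
	 * association can happen anymore, so a leftover association is a
	 * bug, not something to retry.
	 */
	if (WARN_ON_ONCE(err)) {
		pr_tdx_error(TDH_MNG_VPFLUSHDONE, err, NULL);
		pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
		       kvm_tdx->hkid);
		ret = -EIO;
		goto out;
	}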
> +		ret = -EBUSY;
> +		goto out;
> +	}
> +	if (WARN_ON_ONCE(err)) {
> +		pr_tdx_error(TDH_MNG_VPFLUSHDONE, err, NULL);
> +		pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
> +		       kvm_tdx->hkid);
> +		ret = -EIO;
> +		goto out;
> +	}
> +
>  	for_each_online_cpu(i) {
>  		if (packages_allocated &&
>  		    cpumask_test_and_set_cpu(topology_physical_package_id(i),
> @@ -258,14 +396,24 @@ void tdx_mmu_release_hkid(struct kvm *kvm)
>  			pr_tdx_error(TDH_MNG_KEY_FREEID, err, NULL);
>  			pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
>  			       kvm_tdx->hkid);
> +			ret = -EIO;
>  		} else
>  			tdx_hkid_free(kvm_tdx);
> 
> +out:
>  	write_unlock(&kvm->mmu_lock);
>  	mutex_unlock(&tdx_lock);
>  	cpus_read_unlock();
>  	free_cpumask_var(targets);
>  	free_cpumask_var(packages);
> +
> +	return ret;
> +}
> +
> +void tdx_mmu_release_hkid(struct kvm *kvm)
> +{
> +	while (__tdx_mmu_release_hkid(kvm) == -EBUSY)
> +		;
>  }
> 
>  void tdx_vm_free(struct kvm *kvm)
> @@ -429,6 +577,26 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu)
>  	return 0;
>  }
> 
> +void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> +{
> +	struct vcpu_tdx *tdx = to_tdx(vcpu);
> +
> +	if (vcpu->cpu == cpu)
> +		return;
> +
> +	tdx_flush_vp_on_cpu(vcpu);
> +
> +	local_irq_disable();
> +	/*
> +	 * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure
> +	 * vcpu->cpu is read before tdx->cpu_list.
> +	 */
> +	smp_rmb();
> +
> +	list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu));
> +	local_irq_enable();
> +}
> +
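To make sure I read the pairing correctly: tdx_disassociate_vp() isn't
quoted here, so below is only my understanding of the write side, not
necessarily the exact code:

	/* in tdx_disassociate_vp(), roughly: */
	list_del(&tdx->cpu_list);
	/*
	 * Make the removal from the old pcpu's associated_tdvcpus list
	 * visible before vcpu->cpu is reset, so that a CPU observing the
	 * updated vcpu->cpu cannot still find this vcpu on the old list.
	 */
	smp_wmb();
	vcpu->cpu = -1;

i.e. the smp_rmb() above orders the vcpu->cpu read against the list_add()
on the new pcpu's list, which matches the comment.  Please correct me if I
misread the intent.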
>  void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
>  {
>  	struct vcpu_tdx *tdx = to_tdx(vcpu);
> @@ -469,6 +637,16 @@ void tdx_vcpu_free(struct kvm_vcpu *vcpu)
>  	struct vcpu_tdx *tdx = to_tdx(vcpu);
>  	int i;
> 
> +	/*
> +	 * When destroying the VM, kvm_unload_vcpu_mmu() calls vcpu_load() for
> +	 * every vcpu after they have already been disassociated from the
> +	 * per-cpu list by tdx_mmu_release_hkid(). So we need to disassociate
> +	 * them again, otherwise the freed vcpu data would be accessed by
> +	 * list_{del,add}() on the associated_tdvcpus list later.
> +	 */
> +	tdx_disassociate_vp_on_cpu(vcpu);
> +	WARN_ON_ONCE(vcpu->cpu != -1);
> +
>  	/*
>  	 * This method can be called when vcpu allocation/initialization
>  	 * failed. So it's possible that hkid, tdvpx and tdvpr are not assigned
> @@ -1873,6 +2051,10 @@ int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
>  		return -EINVAL;
>  	}
> 
> +	/* tdx_hardware_disable() uses associated_tdvcpus. */
> +	for_each_possible_cpu(i)
> +		INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i));
> +
>  	for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) {
>  		/*
>  		 * Here it checks if MSRs (tdx_uret_msrs) can be saved/restored
> diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
> index c700792c08e2..4f803814126a 100644
> --- a/arch/x86/kvm/vmx/tdx.h
> +++ b/arch/x86/kvm/vmx/tdx.h
> @@ -70,6 +70,8 @@ struct vcpu_tdx {
>  	unsigned long tdvpr_pa;
>  	unsigned long *tdvpx_pa;
> 
> +	struct list_head cpu_list;
> +
>  	union tdx_exit_reason exit_reason;
> 
>  	bool initialized;
> diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
> index 4c9793b5b30d..911ef1e8eeda 100644
> --- a/arch/x86/kvm/vmx/x86_ops.h
> +++ b/arch/x86/kvm/vmx/x86_ops.h
> @@ -137,6 +137,7 @@ void vmx_setup_mce(struct kvm_vcpu *vcpu);
>  #ifdef CONFIG_INTEL_TDX_HOST
>  int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops);
>  void tdx_hardware_unsetup(void);
> +void tdx_hardware_disable(void);
>  bool tdx_is_vm_type_supported(unsigned long type);
>  int tdx_offline_cpu(void);
> 
> @@ -153,6 +154,7 @@ void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
>  fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu);
>  void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
>  void tdx_vcpu_put(struct kvm_vcpu *vcpu);
> +void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
>  u8 tdx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
> 
>  int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
> 
> @@ -164,6 +166,7 @@ void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
>  #else
>  static inline int tdx_hardware_setup(struct kvm_x86_ops *x86_ops) { return -EOPNOTSUPP; }
>  static inline void tdx_hardware_unsetup(void) {}
> +static inline void tdx_hardware_disable(void) {}
>  static inline bool tdx_is_vm_type_supported(unsigned long type) { return false; }
>  static inline int tdx_offline_cpu(void) { return 0; }
> 
> @@ -183,6 +186,7 @@ static inline void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) {}
>  static inline fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu) { return EXIT_FASTPATH_NONE; }
>  static inline void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) {}
>  static inline void tdx_vcpu_put(struct kvm_vcpu *vcpu) {}
> +static inline void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) {}
>  static inline u8 tdx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { return 0; }
> 
>  static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { return -EOPNOTSUPP; }
> --
> 2.25.1
> 
> 