From: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>

When shutting down the machine, vCPUs (VMX or TDX) need to be shut down
on each pCPU. Do the same for TDX as for VMX, using the TDX SEAMCALL
APIs.

Signed-off-by: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
---
 arch/x86/kvm/vmx/main.c    | 23 +++++++++++--
 arch/x86/kvm/vmx/tdx.c     | 70 ++++++++++++++++++++++++++++++++++++--
 arch/x86/kvm/vmx/tdx.h     |  2 ++
 arch/x86/kvm/vmx/x86_ops.h |  4 +++
 4 files changed, 95 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 2cd5ba0e8788..882358ac270b 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -13,6 +13,25 @@ static bool vt_is_vm_type_supported(unsigned long type)
 	return type == KVM_X86_DEFAULT_VM || tdx_is_vm_type_supported(type);
 }
 
+static int vt_hardware_enable(void)
+{
+	int ret;
+
+	ret = vmx_hardware_enable();
+	if (ret)
+		return ret;
+
+	tdx_hardware_enable();
+	return 0;
+}
+
+static void vt_hardware_disable(void)
+{
+	/* Note, TDX *and* VMX need to be disabled if TDX is enabled. */
+	tdx_hardware_disable();
+	vmx_hardware_disable();
+}
+
 static __init int vt_hardware_setup(void)
 {
 	int ret;
@@ -199,8 +218,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
 
 	.hardware_unsetup = vt_hardware_unsetup,
 
-	.hardware_enable = vmx_hardware_enable,
-	.hardware_disable = vmx_hardware_disable,
+	.hardware_enable = vt_hardware_enable,
+	.hardware_disable = vt_hardware_disable,
 	.cpu_has_accelerated_tpr = report_flexpriority,
 	.has_emulated_msr = vmx_has_emulated_msr,
 
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index a6b1a8ce888d..690298fb99c7 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -48,6 +48,14 @@ struct tdx_capabilities tdx_caps;
 
 static DEFINE_MUTEX(tdx_lock);
 static struct mutex *tdx_mng_key_config_lock;
 
+/*
+ * A per-CPU list of TD vCPUs associated with a given CPU.  Used when a CPU
+ * is brought down to invoke TDH_VP_FLUSH on the appropriate TD vCPUs.
+ * Protected by interrupt mask.  The list is manipulated in the vCPU's
+ * process context and in the IPI callback.  See tdx_flush_vp_on_cpu().
+ */
+static DEFINE_PER_CPU(struct list_head, associated_tdvcpus);
+
 static u64 hkid_mask __ro_after_init;
 static u8 hkid_start_pos __ro_after_init;
@@ -87,6 +95,8 @@ static inline bool is_td_finalized(struct kvm_tdx *kvm_tdx)
 
 static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
 {
+	list_del(&to_tdx(vcpu)->cpu_list);
+
 	/*
 	 * Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1,
 	 * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU
@@ -97,6 +107,22 @@ static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
 	vcpu->cpu = -1;
 }
 
+void tdx_hardware_enable(void)
+{
+	INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, raw_smp_processor_id()));
+}
+
+void tdx_hardware_disable(void)
+{
+	int cpu = raw_smp_processor_id();
+	struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu);
+	struct vcpu_tdx *tdx, *tmp;
+
+	/* Safe variant needed as tdx_disassociate_vp() deletes the entry. */
+	list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list)
+		tdx_disassociate_vp(&tdx->vcpu);
+}
+
 static void tdx_clear_page(unsigned long page)
 {
 	const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
@@ -230,9 +256,11 @@ void tdx_mmu_prezap(struct kvm *kvm)
 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
 	cpumask_var_t packages;
 	bool cpumask_allocated;
+	struct kvm_vcpu *vcpu;
 	u64 err;
 	int ret;
 	int i;
+	unsigned long j;
 
 	if (!is_hkid_assigned(kvm_tdx))
 		return;
@@ -248,6 +276,17 @@ void tdx_mmu_prezap(struct kvm *kvm)
 		return;
 	}
 
+	kvm_for_each_vcpu(j, vcpu, kvm)
+		tdx_flush_vp_on_cpu(vcpu);
+
+	mutex_lock(&tdx_lock);
+	err = tdh_mng_vpflushdone(kvm_tdx->tdr.pa);
+	mutex_unlock(&tdx_lock);
+	if (WARN_ON_ONCE(err)) {
+		pr_tdx_error(TDH_MNG_VPFLUSHDONE, err, NULL);
+		return;
+	}
+
 	cpumask_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
 	for_each_online_cpu(i) {
 		if (cpumask_allocated &&
@@ -472,8 +511,22 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu)
 
 void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
-	if (vcpu->cpu != cpu)
-		tdx_flush_vp_on_cpu(vcpu);
+	struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+	if (vcpu->cpu == cpu)
+		return;
+
+	tdx_flush_vp_on_cpu(vcpu);
+
+	local_irq_disable();
+	/*
+	 * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure
+	 * vcpu->cpu is read before tdx->cpu_list.
+	 */
+	smp_rmb();
+
+	list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu));
+	local_irq_enable();
 }
 
 void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
@@ -522,6 +575,19 @@ void tdx_vcpu_free(struct kvm_vcpu *vcpu)
 			tdx_reclaim_td_page(&tdx->tdvpx[i]);
 	kfree(tdx->tdvpx);
 	tdx_reclaim_td_page(&tdx->tdvpr);
+
+	/*
+	 * kvm_free_vcpus()
+	 *    -> kvm_unload_vcpu_mmu()
+	 *
+	 * does vcpu_load() for every vCPU after they have already been
+	 * disassociated from the per-CPU list by tdx_vm_teardown().  So we
+	 * need to disassociate them again; otherwise the freed vCPU data
+	 * would be accessed when list_{del,add}() is done on the
+	 * associated_tdvcpus list later.
+	 */
+	tdx_flush_vp_on_cpu(vcpu);
+	WARN_ON(vcpu->cpu != -1);
 }
 
 void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index 8b1cf9c158e3..180360a65545 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -81,6 +81,8 @@ struct vcpu_tdx {
 	struct tdx_td_page tdvpr;
 	struct tdx_td_page *tdvpx;
 
+	struct list_head cpu_list;
+
 	union tdx_exit_reason exit_reason;
 
 	bool initialized;
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index ceafd6e18f4e..aae0f4449ec5 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -132,6 +132,8 @@ void __init tdx_pre_kvm_init(unsigned int *vcpu_size,
 bool tdx_is_vm_type_supported(unsigned long type);
 void __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops);
 void tdx_hardware_unsetup(void);
+void tdx_hardware_enable(void);
+void tdx_hardware_disable(void);
 
 int tdx_vm_init(struct kvm *kvm);
 void tdx_mmu_prezap(struct kvm *kvm);
@@ -156,6 +158,8 @@ static inline void tdx_pre_kvm_init(
 static inline bool tdx_is_vm_type_supported(unsigned long type) { return false; }
 static inline void tdx_hardware_setup(struct kvm_x86_ops *x86_ops) {}
 static inline void tdx_hardware_unsetup(void) {}
+static inline void tdx_hardware_enable(void) {}
+static inline void tdx_hardware_disable(void) {}
 
 static inline int tdx_vm_init(struct kvm *kvm) { return -EOPNOTSUPP; }
 static inline void tdx_mmu_prezap(struct kvm *kvm) {}
-- 
2.25.1
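
For context, since the diff leans on it in several places: tdx_flush_vp_on_cpu()
is introduced earlier in the series and is not part of this patch. A minimal
sketch of the shape it plausibly takes, to show how the per-CPU association
interacts with the IPI callback. The tdh_vp_flush() wrapper and the function
bodies below are assumptions on my part, not code from this series; to_tdx(),
tdx_disassociate_vp() and smp_call_function_single() are real.

/*
 * Sketch only (assumed bodies): flush a vCPU's TD state on the pCPU it
 * is currently associated with, via a synchronous IPI.
 */
static void tdx_flush_vp(void *arg)
{
	struct kvm_vcpu *vcpu = arg;

	/* IPI callback: runs on vcpu->cpu with IRQs disabled. */
	tdh_vp_flush(to_tdx(vcpu)->tdvpr.pa);	/* TDH.VP.FLUSH SEAMCALL (assumed wrapper) */
	tdx_disassociate_vp(vcpu);		/* list_del() + vcpu->cpu = -1 */
}

static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu)
{
	/* Nothing to do if the vCPU is not associated with any pCPU. */
	if (vcpu->cpu == -1)
		return;

	smp_call_function_single(vcpu->cpu, tdx_flush_vp, vcpu, 1);
}

With this shape, the "protected by interrupt mask" rule holds for every
manipulation of associated_tdvcpus: tdx_vcpu_load() brackets list_add()
with local_irq_disable()/local_irq_enable(), the IPI callback deletes
entries with IRQs disabled, and tdx_hardware_disable() runs from KVM's
hardware-disable path, which is likewise expected to run with IRQs off.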