On Tue, Nov 07, 2023 at 06:56:37AM -0800, isaku.yamahata@xxxxxxxxx wrote:
> From: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
> 
> For vcpu migration, in the case of VMX, the VMCS is flushed on the source
> pcpu and loaded on the target pcpu. There are corresponding TDX SEAMCALL
> APIs; call them on vcpu migration. The logic is mostly the same as VMX,
> except that the TDX SEAMCALLs are used.
> 
> When shutting down the machine, (VMX or TDX) vcpus need to be shut down on
> each pcpu. Do the same for TDX with the TDX SEAMCALL APIs.
> 
> Signed-off-by: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
> ---
>  arch/x86/kvm/vmx/main.c    |  32 ++++++-
>  arch/x86/kvm/vmx/tdx.c     | 190 ++++++++++++++++++++++++++++++++++++-
>  arch/x86/kvm/vmx/tdx.h     |   2 +
>  arch/x86/kvm/vmx/x86_ops.h |   4 +
>  4 files changed, 221 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
> index e7c570686736..8b109d0fe764 100644
> --- a/arch/x86/kvm/vmx/main.c
> +++ b/arch/x86/kvm/vmx/main.c
> @@ -44,6 +44,14 @@ static int vt_hardware_enable(void)
>  	return ret;
>  }
> 
......
> -void tdx_mmu_release_hkid(struct kvm *kvm)
> +static int __tdx_mmu_release_hkid(struct kvm *kvm)
>  {
>  	bool packages_allocated, targets_allocated;
>  	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
>  	cpumask_var_t packages, targets;
> +	struct kvm_vcpu *vcpu;
> +	unsigned long j;
> +	int i, ret = 0;
>  	u64 err;
> -	int i;
> 
>  	if (!is_hkid_assigned(kvm_tdx))
> -		return;
> +		return 0;
> 
>  	if (!is_td_created(kvm_tdx)) {
>  		tdx_hkid_free(kvm_tdx);
> -		return;
> +		return 0;
>  	}
> 
>  	packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
>  	targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
>  	cpus_read_lock();
> 
> +	kvm_for_each_vcpu(j, vcpu, kvm)
> +		tdx_flush_vp_on_cpu(vcpu);
> +
>  	/*
>  	 * We can destroy multiple guest TDs simultaneously.  Prevent
>  	 * tdh_phymem_cache_wb from returning TDX_BUSY by serialization.
> @@ -236,6 +361,19 @@ void tdx_mmu_release_hkid(struct kvm *kvm)
>  	 */
>  	write_lock(&kvm->mmu_lock);
> 
> +	err = tdh_mng_vpflushdone(kvm_tdx->tdr_pa);
> +	if (err == TDX_FLUSHVP_NOT_DONE) {

Not sure if I understand correctly: __tdx_mmu_release_hkid() is called from
the MMU release callback, which means all threads of the process have
already dropped the mm via do_exit() and won't run KVM code anymore, and
tdx_flush_vp_on_cpu() above is called for each pcpu they last ran on, so
can this error really happen?
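If that reasoning holds, TDX_FLUSHVP_NOT_DONE at this point would indicate
a KVM bug rather than a transient condition worth retrying, and the -EBUSY
plumbing plus the busy-wait in tdx_mmu_release_hkid() below look
unnecessary to me.  Just a rough, untested sketch of what I have in mind,
assuming the kvm_for_each_vcpu() flush above really does cover every vcpu:

	err = tdh_mng_vpflushdone(kvm_tdx->tdr_pa);
	/*
	 * All vcpus were flushed via tdx_flush_vp_on_cpu() above and no new
	 * association can happen anymore, so a leftover association is a
	 * bug, not something to retry.
	 */
	if (WARN_ON_ONCE(err)) {
		pr_tdx_error(TDH_MNG_VPFLUSHDONE, err, NULL);
		pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
		       kvm_tdx->hkid);
		ret = -EIO;
		goto out;
	}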
> +		ret = -EBUSY;
> +		goto out;
> +	}
> +	if (WARN_ON_ONCE(err)) {
> +		pr_tdx_error(TDH_MNG_VPFLUSHDONE, err, NULL);
> +		pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
> +		       kvm_tdx->hkid);
> +		ret = -EIO;
> +		goto out;
> +	}
> +
>  	for_each_online_cpu(i) {
>  		if (packages_allocated &&
>  		    cpumask_test_and_set_cpu(topology_physical_package_id(i),
> @@ -258,14 +396,24 @@ void tdx_mmu_release_hkid(struct kvm *kvm)
>  			pr_tdx_error(TDH_MNG_KEY_FREEID, err, NULL);
>  			pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
>  			       kvm_tdx->hkid);
> +			ret = -EIO;
>  		} else
>  			tdx_hkid_free(kvm_tdx);
> 
> +out:
>  	write_unlock(&kvm->mmu_lock);
>  	mutex_unlock(&tdx_lock);
>  	cpus_read_unlock();
>  	free_cpumask_var(targets);
>  	free_cpumask_var(packages);
> +
> +	return ret;
> +}
> +
> +void tdx_mmu_release_hkid(struct kvm *kvm)
> +{
> +	while (__tdx_mmu_release_hkid(kvm) == -EBUSY)
> +		;
>  }
> 
>  void tdx_vm_free(struct kvm *kvm)
> @@ -429,6 +577,26 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu)
>  	return 0;
>  }
> 
> +void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> +{
> +	struct vcpu_tdx *tdx = to_tdx(vcpu);
> +
> +	if (vcpu->cpu == cpu)
> +		return;
> +
> +	tdx_flush_vp_on_cpu(vcpu);
> +
> +	local_irq_disable();
> +	/*
> +	 * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure
> +	 * vcpu->cpu is read before tdx->cpu_list.
> +	 */
> +	smp_rmb();
> +
> +	list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu));
> +	local_irq_enable();
> +}
> +
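To make sure I read the pairing correctly: tdx_disassociate_vp() isn't
quoted here, so below is only my understanding of the write side, not
necessarily the exact code:

	/* in tdx_disassociate_vp(), roughly: */
	list_del(&tdx->cpu_list);
	/*
	 * Make the removal from the old pcpu's associated_tdvcpus list
	 * visible before vcpu->cpu is reset, so that a CPU observing the
	 * updated vcpu->cpu cannot still find this vcpu on the old list.
	 */
	smp_wmb();
	vcpu->cpu = -1;

i.e. the smp_rmb() above orders the vcpu->cpu read against the list_add()
on the new pcpu's list, which matches the comment.  Please correct me if I
misread the intent.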
>  void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
>  {
>  	struct vcpu_tdx *tdx = to_tdx(vcpu);
> @@ -469,6 +637,16 @@ void tdx_vcpu_free(struct kvm_vcpu *vcpu)
>  	struct vcpu_tdx *tdx = to_tdx(vcpu);
>  	int i;
> 
> +	/*
> +	 * When destroying the VM, kvm_unload_vcpu_mmu() calls vcpu_load() for
> +	 * every vcpu after they have already been disassociated from the
> +	 * per-cpu list by tdx_mmu_release_hkid(). So we need to disassociate
> +	 * them again, otherwise the freed vcpu data would be accessed by
> +	 * list_{del,add}() on the associated_tdvcpus list later.
> +	 */
> +	tdx_disassociate_vp_on_cpu(vcpu);
> +	WARN_ON_ONCE(vcpu->cpu != -1);
> +
>  	/*
>  	 * This method can be called when vcpu allocation/initialization
>  	 * failed. So it's possible that hkid, tdvpx and tdvpr are not assigned
> @@ -1873,6 +2051,10 @@ int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
>  		return -EINVAL;
>  	}
> 
> +	/* tdx_hardware_disable() uses associated_tdvcpus. */
> +	for_each_possible_cpu(i)
> +		INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i));
> +
>  	for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) {
>  		/*
>  		 * Here it checks if MSRs (tdx_uret_msrs) can be saved/restored
> diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
> index c700792c08e2..4f803814126a 100644
> --- a/arch/x86/kvm/vmx/tdx.h
> +++ b/arch/x86/kvm/vmx/tdx.h
> @@ -70,6 +70,8 @@ struct vcpu_tdx {
>  	unsigned long tdvpr_pa;
>  	unsigned long *tdvpx_pa;
> 
> +	struct list_head cpu_list;
> +
>  	union tdx_exit_reason exit_reason;
> 
>  	bool initialized;
> diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
> index 4c9793b5b30d..911ef1e8eeda 100644
> --- a/arch/x86/kvm/vmx/x86_ops.h
> +++ b/arch/x86/kvm/vmx/x86_ops.h
> @@ -137,6 +137,7 @@ void vmx_setup_mce(struct kvm_vcpu *vcpu);
>  #ifdef CONFIG_INTEL_TDX_HOST
>  int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops);
>  void tdx_hardware_unsetup(void);
> +void tdx_hardware_disable(void);
>  bool tdx_is_vm_type_supported(unsigned long type);
>  int tdx_offline_cpu(void);
> 
> @@ -153,6 +154,7 @@ void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
>  fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu);
>  void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
>  void tdx_vcpu_put(struct kvm_vcpu *vcpu);
> +void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
>  u8 tdx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
> 
>  int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
> 
> @@ -164,6 +166,7 @@ void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
>  #else
>  static inline int tdx_hardware_setup(struct kvm_x86_ops *x86_ops) { return -EOPNOTSUPP; }
>  static inline void tdx_hardware_unsetup(void) {}
> +static inline void tdx_hardware_disable(void) {}
>  static inline bool tdx_is_vm_type_supported(unsigned long type) { return false; }
>  static inline int tdx_offline_cpu(void) { return 0; }
> 
> @@ -183,6 +186,7 @@ static inline void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) {}
>  static inline fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu) { return EXIT_FASTPATH_NONE; }
>  static inline void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) {}
>  static inline void tdx_vcpu_put(struct kvm_vcpu *vcpu) {}
> +static inline void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) {}
>  static inline u8 tdx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { return 0; }
> 
>  static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { return -EOPNOTSUPP; }
> --
> 2.25.1
> 
> 