Re: [PATCH v19 087/130] KVM: TDX: handle vcpu migration over logical processor

Reinette Chatre <reinette.chatre@xxxxxxxxx> · Fri, 12 Apr 2024 09:15:29 -0700

Hi Isaku,

On 2/26/2024 12:26 AM, isaku.yamahata@xxxxxxxxx wrote:
> From: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>

...

> @@ -218,6 +257,87 @@ static void tdx_reclaim_control_page(unsigned long td_page_pa)
>  	free_page((unsigned long)__va(td_page_pa));
>  }
>  
> +struct tdx_flush_vp_arg {
> +	struct kvm_vcpu *vcpu;
> +	u64 err;
> +};
> +
> +static void tdx_flush_vp(void *arg_)
> +{
> +	struct tdx_flush_vp_arg *arg = arg_;
> +	struct kvm_vcpu *vcpu = arg->vcpu;
> +	u64 err;
> +
> +	arg->err = 0;
> +	lockdep_assert_irqs_disabled();
> +
> +	/* Task migration can race with CPU offlining. */
> +	if (unlikely(vcpu->cpu != raw_smp_processor_id()))
> +		return;
> +
> +	/*
> +	 * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized.  The
> +	 * list tracking still needs to be updated so that it's correct if/when
> +	 * the vCPU does get initialized.
> +	 */
> +	if (is_td_vcpu_created(to_tdx(vcpu))) {
> +		/*
> +		 * No need to retry.  TDX Resources needed for TDH.VP.FLUSH are,
> +		 * TDVPR as exclusive, TDR as shared, and TDCS as shared.  This
> +		 * vp flush function is called when destructing vcpu/TD or vcpu
> +		 * migration.  No other thread uses TDVPR in those cases.
> +		 */

(I have comment later that refer back to this comment about needing retry.)

...

> @@ -233,26 +353,31 @@ static void tdx_do_tdh_phymem_cache_wb(void *unused)
>  		pr_tdx_error(TDH_PHYMEM_CACHE_WB, err, NULL);
>  }
>  
> -void tdx_mmu_release_hkid(struct kvm *kvm)
> +static int __tdx_mmu_release_hkid(struct kvm *kvm)
>  {
>  	bool packages_allocated, targets_allocated;
>  	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
>  	cpumask_var_t packages, targets;
> +	struct kvm_vcpu *vcpu;
> +	unsigned long j;
> +	int i, ret = 0;
>  	u64 err;
> -	int i;
>  
>  	if (!is_hkid_assigned(kvm_tdx))
> -		return;
> +		return 0;
>  
>  	if (!is_td_created(kvm_tdx)) {
>  		tdx_hkid_free(kvm_tdx);
> -		return;
> +		return 0;
>  	}
>  
>  	packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
>  	targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
>  	cpus_read_lock();
>  
> +	kvm_for_each_vcpu(j, vcpu, kvm)
> +		tdx_flush_vp_on_cpu(vcpu);
> +
>  	/*
>  	 * We can destroy multiple guest TDs simultaneously.  Prevent
>  	 * tdh_phymem_cache_wb from returning TDX_BUSY by serialization.
> @@ -270,6 +395,19 @@ void tdx_mmu_release_hkid(struct kvm *kvm)
>  	 */
>  	write_lock(&kvm->mmu_lock);
>  
> +	err = tdh_mng_vpflushdone(kvm_tdx->tdr_pa);
> +	if (err == TDX_FLUSHVP_NOT_DONE) {
> +		ret = -EBUSY;
> +		goto out;
> +	}
> +	if (WARN_ON_ONCE(err)) {
> +		pr_tdx_error(TDH_MNG_VPFLUSHDONE, err, NULL);
> +		pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
> +		       kvm_tdx->hkid);
> +		ret = -EIO;
> +		goto out;
> +	}
> +
>  	for_each_online_cpu(i) {
>  		if (packages_allocated &&
>  		    cpumask_test_and_set_cpu(topology_physical_package_id(i),
> @@ -291,14 +429,24 @@ void tdx_mmu_release_hkid(struct kvm *kvm)
>  		pr_tdx_error(TDH_MNG_KEY_FREEID, err, NULL);
>  		pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
>  		       kvm_tdx->hkid);
> +		ret = -EIO;
>  	} else
>  		tdx_hkid_free(kvm_tdx);
>  
> +out:
>  	write_unlock(&kvm->mmu_lock);
>  	mutex_unlock(&tdx_lock);
>  	cpus_read_unlock();
>  	free_cpumask_var(targets);
>  	free_cpumask_var(packages);
> +
> +	return ret;
> +}
> +
> +void tdx_mmu_release_hkid(struct kvm *kvm)
> +{
> +	while (__tdx_mmu_release_hkid(kvm) == -EBUSY)
> +		;
>  }

As I understand, __tdx_mmu_release_hkid() returns -EBUSY
after TDH.VP.FLUSH has been sent for every vCPU followed by
TDH.MNG.VPFLUSHDONE, which returns TDX_FLUSHVP_NOT_DONE.

Considering earlier comment that a retry of TDH.VP.FLUSH is not
needed, why is this while() loop here that sends the
TDH.VP.FLUSH again to all vCPUs instead of just a loop within
__tdx_mmu_release_hkid() to _just_ resend TDH.MNG.VPFLUSHDONE?

Could it be possible for a vCPU to appear during this time, thus
be missed in one TDH.VP.FLUSH cycle, to require a new cycle of
TDH.VP.FLUSH?

I note that TDX_FLUSHVP_NOT_DONE is distinct from TDX_OPERAND_BUSY
that can also be returned from TDH.MNG.VPFLUSHDONE and
wonder if a retry may be needed in that case also/instead? It looks like
TDH.MNG.VPFLUSHDONE needs exclusive access to all operands and I
do not know enough yet if this is the case here.

Reinette