Re: [PATCH v19 038/130] KVM: TDX: create/destroy VM structure

"Huang, Kai" <kai.huang@xxxxxxxxx> · Thu, 28 Mar 2024 12:33:35 +1300




... to continue the previous review ...

  
+static int __tdx_td_init(struct kvm *kvm)
+{
+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+	cpumask_var_t packages;
+	unsigned long *tdcs_pa = NULL;
+	unsigned long tdr_pa = 0;
+	unsigned long va;
+	int ret, i;
+	u64 err;
+
+	ret = tdx_guest_keyid_alloc();
+	if (ret < 0)
+		return ret;
+	kvm_tdx->hkid = ret;
+
+	va = __get_free_page(GFP_KERNEL_ACCOUNT);
+	if (!va)
+		goto free_hkid;
+	tdr_pa = __pa(va);
+
+	tdcs_pa = kcalloc(tdx_info->nr_tdcs_pages, sizeof(*kvm_tdx->tdcs_pa),
+			  GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+	if (!tdcs_pa)
+		goto free_tdr;
Empty line.

+	for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
+		va = __get_free_page(GFP_KERNEL_ACCOUNT);
+		if (!va)
+			goto free_tdcs;
+		tdcs_pa[i] = __pa(va);
+	}
+
+	if (!zalloc_cpumask_var(&packages, GFP_KERNEL)) {
+		ret = -ENOMEM;
+		goto free_tdcs;
+	}
Empty line.

+	cpus_read_lock();
+	/*
+	 * Need at least one CPU of the package to be online in order to
+	 * program all packages for host key id.  Check it.
+	 */
+	for_each_present_cpu(i)
+		cpumask_set_cpu(topology_physical_package_id(i), packages);
+	for_each_online_cpu(i)
+		cpumask_clear_cpu(topology_physical_package_id(i), packages);
+	if (!cpumask_empty(packages)) {
+		ret = -EIO;
+		/*
+		 * Because it's hard for human operator to figure out the
+		 * reason, warn it.
+		 */
+#define MSG_ALLPKG	"All packages need to have online CPU to create TD. Online CPU and retry.\n"
+		pr_warn_ratelimited(MSG_ALLPKG);
+		goto free_packages;
+	}
+
+	/*
+	 * Acquire global lock to avoid TDX_OPERAND_BUSY:
+	 * TDH.MNG.CREATE and other APIs try to lock the global Key Owner
+	 * Table (KOT) to track the assigned TDX private HKID.  It doesn't spin
+	 * to acquire the lock, returns TDX_OPERAND_BUSY instead, and let the
+	 * caller to handle the contention.  This is because of time limitation
+	 * usable inside the TDX module and OS/VMM knows better about process
+	 * scheduling.
+	 *
+	 * APIs to acquire the lock of KOT:
+	 * TDH.MNG.CREATE, TDH.MNG.KEY.FREEID, TDH.MNG.VPFLUSHDONE, and
+	 * TDH.PHYMEM.CACHE.WB. > +	 */
Don't need to mention all SEAMCALLs here, but put a comment where 
appliciable, i.e., where they are used.
	/*
	 * TDH.MNG.CREATE tries to grab the global TDX module and fails
	 * with TDX_OPERAND_BUSY when it fails to grab.  Take the global
	 * lock to prevent it from failure.
	 */
+	mutex_lock(&tdx_lock);
+	err = tdh_mng_create(tdr_pa, kvm_tdx->hkid);
+	mutex_unlock(&tdx_lock);
Empty line.

+	if (err == TDX_RND_NO_ENTROPY) {
+		ret = -EAGAIN;
+		goto free_packages;
+	}
+	if (WARN_ON_ONCE(err)) {
+		pr_tdx_error(TDH_MNG_CREATE, err, NULL);
+		ret = -EIO;
+		goto free_packages;
+	}
I would prefer more empty lines.

+	kvm_tdx->tdr_pa = tdr_pa;
+
+	for_each_online_cpu(i) {
+		int pkg = topology_physical_package_id(i);
+
+		if (cpumask_test_and_set_cpu(pkg, packages))
+			continue;
+
+		/*
+		 * Program the memory controller in the package with an
+		 * encryption key associated to a TDX private host key id
+		 * assigned to this TDR.  Concurrent operations on same memory
+		 * controller results in TDX_OPERAND_BUSY.  Avoid this race by
+		 * mutex.
+		 */
IIUC the race can only happen when you are creating multiple TDX guests 
simulatenously?  Please clarify this in the comment.
And I even don't think you need all these TDX module details:

		/*
		 * Concurrent run of TDH.MNG.KEY.CONFIG on the same
		 * package resluts in TDX_OPERAND_BUSY.  When creating
		 * multiple TDX guests simultaneously this can run
		 * concurrently.  Take the per-package lock to
		 * serialize.
		 */
+		mutex_lock(&tdx_mng_key_config_lock[pkg]);
+		ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
+				      &kvm_tdx->tdr_pa, true);
+		mutex_unlock(&tdx_mng_key_config_lock[pkg]);
+		if (ret)
+			break;
+	}
+	cpus_read_unlock();
+	free_cpumask_var(packages);
+	if (ret) {
+		i = 0;
+		goto teardown;
+	}
+
+	kvm_tdx->tdcs_pa = tdcs_pa;
+	for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
+		err = tdh_mng_addcx(kvm_tdx->tdr_pa, tdcs_pa[i]);
+		if (err == TDX_RND_NO_ENTROPY) {
+			/* Here it's hard to allow userspace to retry. */
+			ret = -EBUSY;
+			goto teardown;
+		}
+		if (WARN_ON_ONCE(err)) {
+			pr_tdx_error(TDH_MNG_ADDCX, err, NULL);
+			ret = -EIO;
+			goto teardown;
+		}
+	}
+
+	/*
+	 * Note, TDH_MNG_INIT cannot be invoked here.  TDH_MNG_INIT requires a dedicated
+	 * ioctl() to define the configure CPUID values for the TD.
+	 */
Then, how about renaming this function to __tdx_td_create()?

+	return 0;
+
+	/*
+	 * The sequence for freeing resources from a partially initialized TD
+	 * varies based on where in the initialization flow failure occurred.
+	 * Simply use the full teardown and destroy, which naturally play nice
+	 * with partial initialization.
+	 */
+teardown:
+	for (; i < tdx_info->nr_tdcs_pages; i++) {
+		if (tdcs_pa[i]) {
+			free_page((unsigned long)__va(tdcs_pa[i]));
+			tdcs_pa[i] = 0;
+		}
+	}
+	if (!kvm_tdx->tdcs_pa)
+		kfree(tdcs_pa);
The code to "free TDCS pages in a loop and free the array" is done below 
with duplicated code.  I am wondering whether we have way to eliminate one.
But I have lost track here, so perhaps we can review again after we 
split the patch to smaller pieces.
+	tdx_mmu_release_hkid(kvm);
+	tdx_vm_free(kvm);
+	return ret;
+
+free_packages:
+	cpus_read_unlock();
+	free_cpumask_var(packages);
+free_tdcs:
+	for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
+		if (tdcs_pa[i])
+			free_page((unsigned long)__va(tdcs_pa[i]));
+	}
+	kfree(tdcs_pa);
+	kvm_tdx->tdcs_pa = NULL;
+
+free_tdr:
+	if (tdr_pa)
+		free_page((unsigned long)__va(tdr_pa));
+	kvm_tdx->tdr_pa = 0;
+free_hkid:
+	if (is_hkid_assigned(kvm_tdx))
+		tdx_hkid_free(kvm_tdx);
+	return ret;
+}
+
  int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
  {
  	struct kvm_tdx_cmd tdx_cmd;
@@ -215,12 +664,13 @@ static int tdx_md_read(struct tdx_md_map *maps, int nr_maps)
  
  static int __init tdx_module_setup(void)
  {
-	u16 num_cpuid_config;
+	u16 num_cpuid_config, tdcs_base_size;
  	int ret;
  	u32 i;
  
  	struct tdx_md_map mds[] = {
  		TDX_MD_MAP(NUM_CPUID_CONFIG, &num_cpuid_config),
+		TDX_MD_MAP(TDCS_BASE_SIZE, &tdcs_base_size),
  	};
  
  	struct tdx_metadata_field_mapping fields[] = {
@@ -273,6 +723,8 @@ static int __init tdx_module_setup(void)
  		c->edx = ecx_edx >> 32;
  	}
  
+	tdx_info->nr_tdcs_pages = tdcs_base_size / PAGE_SIZE;
+
Round up the 'tdcs_base_size' to make sure you have enough room, or put 
a WARN() here if not page aligned?
  	return 0;
  
  error_out:
@@ -319,13 +771,27 @@ int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
  	struct tdx_enabled enable = {
  		.err = ATOMIC_INIT(0),
  	};
+	int max_pkgs;
  	int r = 0;
+	int i;
Nit: you can put the 3 into one line.

  
+	if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
+		pr_warn("MOVDIR64B is reqiured for TDX\n");
It's better to make it more clear:

"Disable TDX: MOVDIR64B is not supported or disabled by the kernel."

Or, to match below:

"Cannot enable TDX w/o MOVDIR64B".

+		return -EOPNOTSUPP;
+	}
  	if (!enable_ept) {
  		pr_warn("Cannot enable TDX with EPT disabled\n");
  		return -EINVAL;
  	}
  
+	max_pkgs = topology_max_packages();
+	tdx_mng_key_config_lock = kcalloc(max_pkgs, sizeof(*tdx_mng_key_config_lock),
+				   GFP_KERNEL);
+	if (!tdx_mng_key_config_lock)
+		return -ENOMEM;
+	for (i = 0; i < max_pkgs; i++)
+		mutex_init(&tdx_mng_key_config_lock[i]);
+
Using a per-socket lock looks a little bit overkill to me.  I don't know 
whether we need to do in the initial version.  Will leave to others.
Please at least add a comment to explain this is for better performance 
when creating multiple TDX guests IIUC?
  	if (!zalloc_cpumask_var(&enable.enabled, GFP_KERNEL)) {
  		r = -ENOMEM;
  		goto out;
@@ -350,4 +816,5 @@ int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
  void tdx_hardware_unsetup(void)
  {
  	kfree(tdx_info);
+	kfree(tdx_mng_key_config_lock);
The kernel actually has a mutex_destroy().  It is empty when 
CONFIG_DEBUG_LOCK_ALLOC is off, but I think it should be standard 
proceedure to also mutex_destory()?
[...]