Re: [PATCH v4 09/18] nitro_enclaves: Add logic for enclave vcpu creation

Alexander Graf <graf@xxxxxxxxx> · Mon, 6 Jul 2020 12:12:31 +0200

On 22.06.20 22:03, Andra Paraschiv wrote:
An enclave, before being started, has its resources set. One of its
resources is CPU.

The NE CPU pool is set for choosing CPUs for enclaves from it. Offline
the CPUs from the NE CPU pool during the pool setup and online them back
during the NE CPU pool teardown.

The enclave CPUs need to be full cores and from the same NUMA node. CPU
0 and its siblings have to remain available to the primary / parent VM.

Add ioctl command logic for enclave vCPU creation. Return as result a
file descriptor that is associated with the enclave vCPU.

Signed-off-by: Alexandru Vasile <lexnv@xxxxxxxxxx>
Signed-off-by: Andra Paraschiv <andraprs@xxxxxxxxxx>
---
Changelog

v3 -> v4

* Setup the NE CPU pool at runtime via a sysfs file for the kernel
   parameter.
* Check enclave CPUs to be from the same NUMA node.
* Use dev_err instead of custom NE log pattern.
* Update the NE ioctl call to match the decoupling from the KVM API.

v2 -> v3

* Remove the WARN_ON calls.
* Update static calls sanity checks.
* Update kzfree() calls to kfree().
* Remove file ops that do nothing for now - open, ioctl and release.

v1 -> v2

* Add log pattern for NE.
* Update goto labels to match their purpose.
* Remove the BUG_ON calls.
* Check if enclave state is init when setting enclave vcpu.
---
  drivers/virt/nitro_enclaves/ne_misc_dev.c | 491 ++++++++++++++++++++++
  1 file changed, 491 insertions(+)

diff --git a/drivers/virt/nitro_enclaves/ne_misc_dev.c b/drivers/virt/nitro_enclaves/ne_misc_dev.c
index f70496813033..d6777008f685 100644
--- a/drivers/virt/nitro_enclaves/ne_misc_dev.c
+++ b/drivers/virt/nitro_enclaves/ne_misc_dev.c
@@ -39,7 +39,11 @@
   * TODO: Update logic to create new sysfs entries instead of using
   * a kernel parameter e.g. if multiple sysfs files needed.
   */
+static int ne_set_kernel_param(const char *val, const struct kernel_param *kp);
+
  static const struct kernel_param_ops ne_cpu_pool_ops = {
+	.get = param_get_string,
+	.set = ne_set_kernel_param,
  };
  
  static char ne_cpus[PAGE_SIZE];
@@ -60,6 +64,485 @@ struct ne_cpu_pool {
  
  static struct ne_cpu_pool ne_cpu_pool;
  
+static const struct file_operations ne_enclave_vcpu_fops = {
+	.owner		= THIS_MODULE,
+	.llseek		= noop_llseek,
+};

Do we really need an fd for an object without operations? I think the 
general flow to add CPUs from the pool to the VM is very sensible. But I 
don't think we really need an fd as return value from that operation.

+
+/**
+ * ne_check_enclaves_created - Verify if at least one enclave has been created.
+ *
+ * @pdev: PCI device used for enclave lifetime management.
+ *
+ * @returns: true if at least one enclave is created, false otherwise.
+ */
+static bool ne_check_enclaves_created(struct pci_dev *pdev)
+{
+	struct ne_pci_dev *ne_pci_dev = pci_get_drvdata(pdev);
+
+	if (!ne_pci_dev)
+		return false;

Please pass in the ne_pci_dev into this function directly.

+
+	mutex_lock(&ne_pci_dev->enclaves_list_mutex);
+
+	if (list_empty(&ne_pci_dev->enclaves_list)) {
+		mutex_unlock(&ne_pci_dev->enclaves_list_mutex);
+
+		return false;

If you make this a return variable, you save on the unlock duplication.

+	}
+
+	mutex_unlock(&ne_pci_dev->enclaves_list_mutex);
+
+	return true;
+}
+
+/**
+ * ne_setup_cpu_pool - Set the NE CPU pool after handling sanity checks such as
+ * not sharing CPU cores with the primary / parent VM or not using CPU 0, which
+ * should remain available for the primary / parent VM. Offline the CPUs from
+ * the pool after the checks passed.
+ *
+ * @pdev: PCI device used for enclave lifetime management.
+ * @ne_cpu_list: the CPU list used for setting NE CPU pool.
+ *
+ * @returns: 0 on success, negative return value on failure.
+ */
+static int ne_setup_cpu_pool(struct pci_dev *pdev, const char *ne_cpu_list)
+{
+	unsigned int cpu = 0;
+	unsigned int cpu_sibling = 0;
+	int numa_node = -1;
+	int rc = -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN)) {
+		dev_err(&pdev->dev, "No admin capability for CPU pool setup\n");

No need to print anything here. It only gives non-admin users a chance 
to spill the kernel log. If non-admin users can write at all? Can they?

Also, isn't this at the wrong abstraction level? I would expect such a 
check to happen on the file write function, not here.

+
+		return -EPERM;
+	}
+
+	if (!ne_cpu_list)
+		return 0;
+
+	if (ne_check_enclaves_created(pdev)) {
+		dev_err(&pdev->dev, "The CPU pool is used, enclaves created\n");
+
+		return -EINVAL;
+	}
+
+	mutex_lock(&ne_cpu_pool.mutex);
+
+	rc = cpulist_parse(ne_cpu_list, ne_cpu_pool.avail);
+	if (rc < 0) {
+		dev_err(&pdev->dev,
+			"Error in cpulist parse [rc=%d]\n", rc);
+
+		goto unlock_mutex;
+	}
+
+	/*
+	 * Check if CPU 0 and its siblings are included in the provided CPU pool
+	 * They should remain available for the primary / parent VM.
+	 */
+	if (cpumask_test_cpu(0, ne_cpu_pool.avail)) {
+
+		dev_err(&pdev->dev,
+			"CPU 0 has to remain available for the primary VM\n");

Shouldn't this also change the read value of the sysfs file?

+
+		rc = -EINVAL;
+
+		goto unlock_mutex;
+	}
+
+	for_each_cpu(cpu_sibling, topology_sibling_cpumask(0)) {
+		if (cpumask_test_cpu(cpu_sibling, ne_cpu_pool.avail)) {
+			dev_err(&pdev->dev,
+				"CPU sibling %d of CPU 0 is in the CPU pool\n",
+				cpu_sibling);

Same here. I would expect the sysfs file to reflect either the previous 
state or <empty> because failures mean no CPUs are donated anymore.

Can we somehow implement the get function of the param as something that 
gets generated dynamically?

+
+			rc = -EINVAL;
+
+			goto unlock_mutex;
+		}
+	}
+
+	/*
+	 * Check if CPU siblings are included in the provided CPU pool. The
+	 * expectation is that CPU cores are made available in the CPU pool for
+	 * enclaves.
+	 */
+	for_each_cpu(cpu, ne_cpu_pool.avail) {
+		for_each_cpu(cpu_sibling, topology_sibling_cpumask(cpu)) {
+			if (!cpumask_test_cpu(cpu_sibling, ne_cpu_pool.avail)) {
+				dev_err(&pdev->dev,
+					"CPU %d is not in the CPU pool\n",
+					cpu_sibling);
+
+				rc = -EINVAL;
+
+				goto unlock_mutex;
+			}
+		}
+	}
+
+	/*
+	 * Check if the CPUs from the NE CPU pool are from the same NUMA node.
+	 */
+	for_each_cpu(cpu, ne_cpu_pool.avail) {
+		if (numa_node < 0) {
+			numa_node = cpu_to_node(cpu);
+
+			if (numa_node < 0) {
+				dev_err(&pdev->dev,
+					"Invalid NUMA node %d\n", numa_node);
+
+				rc = -EINVAL;
+
+				goto unlock_mutex;
+			}
+		} else {
+			if (numa_node != cpu_to_node(cpu)) {
+				dev_err(&pdev->dev,
+					"CPUs are from different NUMA nodes\n");
+
+				rc = -EINVAL;
+
+				goto unlock_mutex;
+			}
+		}
+	}
+

There should be a comment here that describes the why:

/*
 * CPUs that are donated to enclaves should not be considered online
 * by Linux anymore, as the hypervisor will degrade them to floating.
 *
 * We offline them here, to not degrade performance and expose correct
 * topology to Linux and user space.
 */

+	for_each_cpu(cpu, ne_cpu_pool.avail) {
+		rc = remove_cpu(cpu);
+		if (rc != 0) {
+			dev_err(&pdev->dev,
+				"CPU %d is not offlined [rc=%d]\n", cpu, rc);
+
+			goto online_cpus;
+		}
+	}
+
+	mutex_unlock(&ne_cpu_pool.mutex);
+
+	return 0;
+
+online_cpus:
+	for_each_cpu(cpu, ne_cpu_pool.avail)
+		add_cpu(cpu);
+unlock_mutex:
+	mutex_unlock(&ne_cpu_pool.mutex);
+
+	return rc;
+}
+
+/**
+ * ne_teardown_cpu_pool - Online the CPUs from the NE CPU pool and cleanup the
+ * CPU pool.
+ *
+ * @pdev: PCI device used for enclave lifetime management.
+ */
+static void ne_teardown_cpu_pool(struct pci_dev *pdev)
+{
+	unsigned int cpu = 0;
+	int rc = -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN)) {
+		dev_err(&pdev->dev, "No admin capability for CPU pool setup\n");
+
+		return;
+	}
+
+	if (!ne_cpu_pool.avail)
+		return;
+
+	if (ne_check_enclaves_created(pdev)) {
+		dev_err(&pdev->dev, "The CPU pool is used, enclaves created\n");
+
+		return;
+	}
+
+	mutex_lock(&ne_cpu_pool.mutex);
+
+	for_each_cpu(cpu, ne_cpu_pool.avail) {
+		rc = add_cpu(cpu);
+		if (rc != 0)
+			dev_err(&pdev->dev,
+				"CPU %d is not onlined [rc=%d]\n", cpu, rc);
+	}
+
+	cpumask_clear(ne_cpu_pool.avail);
+
+	mutex_unlock(&ne_cpu_pool.mutex);
+}
+
+static int ne_set_kernel_param(const char *val, const struct kernel_param *kp)
+{
+	const char *ne_cpu_list = val;
+	struct pci_dev *pdev = pci_get_device(PCI_VENDOR_ID_AMAZON,
+					      PCI_DEVICE_ID_NE, NULL);

Isn't there a better way?

+	int rc = -EINVAL;
+
+	if (!pdev)
+		return -ENODEV;
+
+	ne_teardown_cpu_pool(pdev);
+
+	rc = ne_setup_cpu_pool(pdev, ne_cpu_list);
+	if (rc < 0) {
+		dev_err(&pdev->dev, "Error in setup CPU pool [rc=%d]\n", rc);
+
+		return rc;
+	}
+
+	return param_set_copystring(val, kp);
+}
+
+/**
+ * ne_get_cpu_from_cpu_pool - Get a CPU from the CPU pool. If the vCPU id is 0,
+ * the CPU is autogenerated and chosen from the NE CPU pool.
+ *
+ * This function gets called with the ne_enclave mutex held.
+ *
+ * @ne_enclave: private data associated with the current enclave.
+ * @vcpu_id: id of the CPU to be associated with the given slot, apic id on x86.
+ *
+ * @returns: 0 on success, negative return value on failure.
+ */
+static int ne_get_cpu_from_cpu_pool(struct ne_enclave *ne_enclave, u32 *vcpu_id)

That's a very awkward API. Can you instead just pass by-value and return 
the resulting CPU ID?

+{
+	unsigned int cpu = 0;
+	unsigned int cpu_sibling = 0;
+
+	if (*vcpu_id != 0) {
+		if (cpumask_test_cpu(*vcpu_id, ne_enclave->cpu_siblings)) {
+			cpumask_clear_cpu(*vcpu_id, ne_enclave->cpu_siblings);
+
+			return 0;
+		}
+
+		mutex_lock(&ne_cpu_pool.mutex);
+
+		if (!cpumask_test_cpu(*vcpu_id, ne_cpu_pool.avail)) {
+			dev_err_ratelimited(ne_misc_dev.this_device,
+					    "CPU %d is not in NE CPU pool\n",
+					    *vcpu_id);
+
+			mutex_unlock(&ne_cpu_pool.mutex);
+
+			return -EINVAL;

I think you're better off making the return value explicit for the 
error, so that user space can print the error message rather than us.

+		}
+
+		cpumask_clear_cpu(*vcpu_id, ne_cpu_pool.avail);
+
+		/*
+		 * Make sure the CPU siblings are not marked as available
+		 * anymore.
+		 */
+		for_each_cpu(cpu_sibling, topology_sibling_cpumask(*vcpu_id)) {
+			if (cpu_sibling != *vcpu_id) {
+				cpumask_clear_cpu(cpu_sibling,
+						  ne_cpu_pool.avail);
+
+				cpumask_set_cpu(cpu_sibling,
+						ne_enclave->cpu_siblings);
+			}
+		}
+
+		mutex_unlock(&ne_cpu_pool.mutex);
+
+		return 0;
+	}
+
+	/* There are CPU siblings available to choose from. */
+	cpu = cpumask_any(ne_enclave->cpu_siblings);
+	if (cpu < nr_cpu_ids) {
+		cpumask_clear_cpu(cpu, ne_enclave->cpu_siblings);
+
+		*vcpu_id = cpu;
+
+		return 0;
+	}
+
+	mutex_lock(&ne_cpu_pool.mutex);
+
+	/* Choose any CPU from the available CPU pool. */
+	cpu = cpumask_any(ne_cpu_pool.avail);
+	if (cpu >= nr_cpu_ids) {
+		dev_err_ratelimited(ne_misc_dev.this_device,
+				    "No CPUs available in CPU pool\n");
+
+		mutex_unlock(&ne_cpu_pool.mutex);
+
+		return -EINVAL;

I think you're better off making the return value explicit for the 
error, so that user space can print the error message rather than us.

+	}
+
+	cpumask_clear_cpu(cpu, ne_cpu_pool.avail);
+
+	/* Make sure the CPU siblings are not marked as available anymore. */
+	for_each_cpu(cpu_sibling, topology_sibling_cpumask(cpu)) {
+		if (cpu_sibling != cpu) {
+			cpumask_clear_cpu(cpu_sibling, ne_cpu_pool.avail);
+
+			cpumask_set_cpu(cpu_sibling, ne_enclave->cpu_siblings);
+		}
+	}
+
+	mutex_unlock(&ne_cpu_pool.mutex);

I find the function slightly confusingly structured. Why can't we do 
something like


  if (!vcpu_id) {
    vcpu_id = find_next_free_vcpu_id();
    if (vcpu_id < 0)
        return -ENOSPC;
  }

  [logic to handle an explicit vcpu id]

I think that would be much more readable.

+
+	*vcpu_id = cpu;
+
+	return 0;
+}
+
+/**
+ * ne_create_vcpu_ioctl - Add vCPU to the slot associated with the current
+ * enclave. Create vCPU file descriptor to be further used for CPU handling.
+ *
+ * This function gets called with the ne_enclave mutex held.
+ *
+ * @ne_enclave: private data associated with the current enclave.
+ * @vcpu_id: id of the CPU to be associated with the given slot, apic id on x86.
+ *
+ * @returns: vCPU fd on success, negative return value on failure.
+ */
+static int ne_create_vcpu_ioctl(struct ne_enclave *ne_enclave, u32 vcpu_id)
+{
+	struct ne_pci_dev_cmd_reply cmd_reply = {};
+	int fd = 0;
+	struct file *file = NULL;
+	struct ne_vcpu_id *ne_vcpu_id = NULL;
+	int rc = -EINVAL;
+	struct slot_add_vcpu_req slot_add_vcpu_req = {};
+
+	if (ne_enclave->mm != current->mm)
+		return -EIO;
+
+	ne_vcpu_id = kzalloc(sizeof(*ne_vcpu_id), GFP_KERNEL);
+	if (!ne_vcpu_id)
+		return -ENOMEM;
+
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0) {
+		rc = fd;
+
+		dev_err_ratelimited(ne_misc_dev.this_device,
+				    "Error in getting unused fd [rc=%d]\n", rc);
+
+		goto free_ne_vcpu_id;
+	}
+
+	/* TODO: Include (vcpu) id in the ne-vm-vcpu naming. */
+	file = anon_inode_getfile("ne-vm-vcpu", &ne_enclave_vcpu_fops,
+				  ne_enclave, O_RDWR);
+	if (IS_ERR(file)) {
+		rc = PTR_ERR(file);
+
+		dev_err_ratelimited(ne_misc_dev.this_device,
+				    "Error in anon inode get file [rc=%d]\n",
+				    rc);
+
+		goto put_fd;
+	}
+
+	slot_add_vcpu_req.slot_uid = ne_enclave->slot_uid;
+	slot_add_vcpu_req.vcpu_id = vcpu_id;
+
+	rc = ne_do_request(ne_enclave->pdev, SLOT_ADD_VCPU, &slot_add_vcpu_req,
+			   sizeof(slot_add_vcpu_req), &cmd_reply,
+			   sizeof(cmd_reply));
+	if (rc < 0) {
+		dev_err_ratelimited(ne_misc_dev.this_device,
+				    "Error in slot add vcpu [rc=%d]\n", rc);
+
+		goto put_file;
+	}
+
+	ne_vcpu_id->vcpu_id = vcpu_id;
+
+	list_add(&ne_vcpu_id->vcpu_id_list_entry, &ne_enclave->vcpu_ids_list);
+
+	ne_enclave->nr_vcpus++;
+
+	fd_install(fd, file);
+
+	return fd;
+
+put_file:
+	fput(file);
+put_fd:
+	put_unused_fd(fd);
+free_ne_vcpu_id:
+	kfree(ne_vcpu_id);
+
+	return rc;
+}
+
+static long ne_enclave_ioctl(struct file *file, unsigned int cmd,
+			     unsigned long arg)
+{
+	struct ne_enclave *ne_enclave = file->private_data;
+
+	if (!ne_enclave || !ne_enclave->pdev)
+		return -EINVAL;
+
+	switch (cmd) {
+	case NE_CREATE_VCPU: {

Can this be an ADD_VCPU rather than CREATE? We don't really need a vcpu 
fd after all ...


Alex

+		int rc = -EINVAL;
+		u32 vcpu_id = 0;
+
+		if (copy_from_user(&vcpu_id, (void *)arg, sizeof(vcpu_id))) {
+			dev_err_ratelimited(ne_misc_dev.this_device,
+					    "Error in copy from user\n");
+
+			return -EFAULT;
+		}
+
+		mutex_lock(&ne_enclave->enclave_info_mutex);
+
+		if (ne_enclave->state != NE_STATE_INIT) {
+			dev_err_ratelimited(ne_misc_dev.this_device,
+					    "Enclave isn't in init state\n");
+
+			mutex_unlock(&ne_enclave->enclave_info_mutex);
+
+			return -EINVAL;
+		}
+
+		/* Use the CPU pool for choosing a CPU for the enclave. */
+		rc = ne_get_cpu_from_cpu_pool(ne_enclave, &vcpu_id);
+		if (rc < 0) {
+			dev_err_ratelimited(ne_misc_dev.this_device,
+					    "Error in get CPU from pool\n");
+
+			mutex_unlock(&ne_enclave->enclave_info_mutex);
+
+			return -EINVAL;
+		}
+
+		rc = ne_create_vcpu_ioctl(ne_enclave, vcpu_id);
+
+		/* Put back the CPU in enclave cpu pool, if add vcpu error. */
+		if (rc < 0)
+			cpumask_set_cpu(vcpu_id, ne_enclave->cpu_siblings);
+
+		mutex_unlock(&ne_enclave->enclave_info_mutex);
+
+		if (copy_to_user((void *)arg, &vcpu_id, sizeof(vcpu_id))) {
+			dev_err_ratelimited(ne_misc_dev.this_device,
+					    "Error in copy to user\n");
+
+			return -EFAULT;
+		}
+
+		return rc;
+	}
+
+	default:
+		return -ENOTTY;
+	}
+
+	return 0;
+}
+
  static __poll_t ne_enclave_poll(struct file *file, poll_table *wait)
  {
  	__poll_t mask = 0;
@@ -79,6 +562,7 @@ static const struct file_operations ne_enclave_fops = {
  	.owner		= THIS_MODULE,
  	.llseek		= noop_llseek,
  	.poll		= ne_enclave_poll,
+	.unlocked_ioctl	= ne_enclave_ioctl,
  };
  
  /**
@@ -286,8 +770,15 @@ static int __init ne_init(void)
  
  static void __exit ne_exit(void)
  {
+	struct pci_dev *pdev = pci_get_device(PCI_VENDOR_ID_AMAZON,
+					      PCI_DEVICE_ID_NE, NULL);
+	if (!pdev)
+		return;
+
  	pci_unregister_driver(&ne_pci_driver);
  
+	ne_teardown_cpu_pool(pdev);
+
  	free_cpumask_var(ne_cpu_pool.avail);
  }
  




Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879