[RFC PATCH 03/23] x86/sgx: Introduce virtual EPC for use by KVM guests

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>

Add a misc device /dev/sgx_virt_epc to allow userspace to allocate "raw"
EPC without an associated enclave.  The intended and only known use case
for raw EPC allocation is to expose EPC to a KVM guest, hence the
virt_epc moniker, virt.{c,h} files and X86_SGX_VIRTUALIZATION Kconfig.

Modify sgx_init() to always try to initialize the virtual EPC driver,
even when the SGX driver is disabled because SGX Launch Control is in
locked mode or not present at all, since SGX virtualization allows SGX
to be exposed to guests that support non-LC configurations.

Implement the "raw" EPC allocation in the x86 core-SGX subsystem via
/dev/sgx_virt_epc rather than in KVM. Doing so has two major advantages:

  - Does not require changes to KVM's uAPI, e.g. EPC gets handled as
    just another memory backend for guests.

  - EPC management is wholly contained in the SGX subsystem, e.g. SGX
    does not have to export any symbols, changes to reclaim flows don't
    need to be routed through KVM, SGX's dirty laundry doesn't have to
    get aired out for the world to see, and so on and so forth.

The virtual EPC allocated to guests is currently not reclaimable,
because oversubscription of EPC for KVM guests is not yet supported. Due
to the complications of handling reclaim conflicts between guest and
host, KVM EPC oversubscription is significantly more complex than basic
support for SGX virtualization.

Signed-off-by: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>
Co-developed-by: Kai Huang <kai.huang@xxxxxxxxx>
Signed-off-by: Kai Huang <kai.huang@xxxxxxxxx>
---
 arch/x86/Kconfig                 |  12 ++
 arch/x86/kernel/cpu/sgx/Makefile |   1 +
 arch/x86/kernel/cpu/sgx/main.c   |   5 +-
 arch/x86/kernel/cpu/sgx/virt.c   | 263 +++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/sgx/virt.h   |  14 ++
 5 files changed, 294 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/kernel/cpu/sgx/virt.c
 create mode 100644 arch/x86/kernel/cpu/sgx/virt.h

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 618d1aabccb8..a7318175509b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1947,6 +1947,18 @@ config X86_SGX
 
 	  If unsure, say N.
 
+config X86_SGX_VIRTUALIZATION
+	bool "Software Guard eXtensions (SGX) Virtualization"
+	depends on X86_SGX && KVM_INTEL
+	help
+
+	  Enables KVM guests to create SGX enclaves.
+
+	  This includes support to expose "raw" unreclaimable enclave memory to
+	  guests via a device node, e.g. /dev/sgx_virt_epc.
+
+	  If unsure, say N.
+
 config EFI
 	bool "EFI runtime service support"
 	depends on ACPI
diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile
index 91d3dc784a29..7a25bf63adfb 100644
--- a/arch/x86/kernel/cpu/sgx/Makefile
+++ b/arch/x86/kernel/cpu/sgx/Makefile
@@ -3,3 +3,4 @@ obj-y += \
 	encl.o \
 	ioctl.o \
 	main.o
+obj-$(CONFIG_X86_SGX_VIRTUALIZATION)	+= virt.o
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 95aad183bb65..02993a327a1f 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -9,9 +9,11 @@
 #include <linux/sched/mm.h>
 #include <linux/sched/signal.h>
 #include <linux/slab.h>
+#include "arch.h"
 #include "driver.h"
 #include "encl.h"
 #include "encls.h"
+#include "virt.h"
 
 struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
 static int sgx_nr_epc_sections;
@@ -726,7 +728,8 @@ static void __init sgx_init(void)
 	if (!sgx_page_reclaimer_init())
 		goto err_page_cache;
 
-	ret = sgx_drv_init();
+	/* Success if the native *or* virtual EPC driver initialized cleanly. */
+	ret = !!sgx_drv_init() & !!sgx_virt_epc_init();
 	if (ret)
 		goto err_kthread;
 
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
new file mode 100644
index 000000000000..d625551ccf25
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -0,0 +1,263 @@
+// SPDX-License-Identifier: GPL-2.0
+/*  Copyright(c) 2016-20 Intel Corporation. */
+
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include <linux/xarray.h>
+#include <asm/sgx.h>
+#include <uapi/asm/sgx.h>
+
+#include "encls.h"
+#include "sgx.h"
+#include "virt.h"
+
+struct sgx_virt_epc {
+	struct xarray page_array;
+	struct mutex lock;
+	struct mm_struct *mm;
+};
+
+static struct mutex virt_epc_lock;
+static struct list_head virt_epc_zombie_pages;
+
+static int __sgx_virt_epc_fault(struct sgx_virt_epc *epc,
+				struct vm_area_struct *vma, unsigned long addr)
+{
+	struct sgx_epc_page *epc_page;
+	unsigned long index, pfn;
+	int ret;
+
+	/* The caller must hold epc->lock. */
+
+	/* Calculate index of EPC page in virtual EPC's page_array */
+	index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);
+
+	epc_page = xa_load(&epc->page_array, index);
+	if (epc_page)
+		return 0;
+
+	epc_page = sgx_alloc_epc_page(epc, false);
+	if (IS_ERR(epc_page))
+		return PTR_ERR(epc_page);
+
+	ret = xa_err(xa_store(&epc->page_array, index, epc_page, GFP_KERNEL));
+	if (ret)
+		goto err_free;
+
+	pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page));
+
+	ret = vmf_insert_pfn(vma, addr, pfn);
+	if (ret != VM_FAULT_NOPAGE) {
+		ret = -EFAULT;
+		goto err_delete;
+	}
+
+	return 0;
+
+err_delete:
+	xa_erase(&epc->page_array, index);
+err_free:
+	sgx_free_epc_page(epc_page);
+	return ret;
+}
+
+static vm_fault_t sgx_virt_epc_fault(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct sgx_virt_epc *epc = vma->vm_private_data;
+	int ret;
+
+	mutex_lock(&epc->lock);
+	ret = __sgx_virt_epc_fault(epc, vma, vmf->address);
+	mutex_unlock(&epc->lock);
+
+	if (!ret)
+		return VM_FAULT_NOPAGE;
+
+	if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) {
+		mmap_read_unlock(vma->vm_mm);
+		return VM_FAULT_RETRY;
+	}
+
+	return VM_FAULT_SIGBUS;
+}
+
+static const struct vm_operations_struct sgx_virt_epc_vm_ops = {
+	.fault = sgx_virt_epc_fault,	/* EPC pages are allocated on demand */
+};
+
+static int sgx_virt_epc_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct sgx_virt_epc *epc = file->private_data;
+
+	if (!(vma->vm_flags & VM_SHARED))
+		return -EINVAL;
+
+	/*
+	 * Only the mm that opened the device may mmap() it, e.g. not a child
+	 * after fork(), since child and parent cannot map to the same EPC.
+	 */
+	if (vma->vm_mm != epc->mm)
+		return -EINVAL;
+
+	vma->vm_ops = &sgx_virt_epc_vm_ops;
+	/* VM_DONTCOPY: don't copy the VMA to the child in fork() */
+	vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY;
+	vma->vm_private_data = file->private_data;
+
+	return 0;
+}
+
+static int sgx_virt_epc_free_page(struct sgx_epc_page *epc_page)
+{
+	int ret;
+
+	if (!epc_page)
+		return 0;
+
+	/*
+	 * Explicitly EREMOVE the virtual EPC page.  Virtual EPC is only used
+	 * by guests, and normally a guest has already done EREMOVE on all of
+	 * its EPC pages before they are freed here.  But a guest may be killed
+	 * or may crash abnormally, in which case EREMOVE has not been done.
+	 * Do EREMOVE unconditionally to cover both cases, because it is not
+	 * possible to tell whether the guest has done EREMOVE, since virtual
+	 * EPC page status is not tracked.  It is fine to EREMOVE an EPC page
+	 * multiple times.
+	 */
+	ret = __eremove(sgx_get_epc_virt_addr(epc_page));
+	if (ret) {
+		/*
+		 * Only SGX_CHILD_PRESENT is expected, which results from
+		 * EREMOVE-ing an SECS that still has children.  That case is
+		 * handled by EREMOVE-ing the SECS again after all pages in
+		 * virtual EPC have been EREMOVE-ed.  See the comments in
+		 * sgx_virt_epc_release() below.
+		 */
+		WARN_ON_ONCE(ret != SGX_CHILD_PRESENT);
+		return ret;
+	}
+
+	__sgx_free_epc_page(epc_page);
+	return 0;
+}
+
+static int sgx_virt_epc_release(struct inode *inode, struct file *file)
+{
+	struct sgx_virt_epc *epc = file->private_data;
+	struct sgx_epc_page *epc_page, *tmp, *entry;
+	unsigned long index;
+
+	LIST_HEAD(secs_pages);
+
+	mmdrop(epc->mm);
+
+	xa_for_each(&epc->page_array, index, entry) {
+		/*
+		 * Virtual EPC pages are not tracked, so it's possible for
+		 * EREMOVE to fail, e.g. because a SECS page still has children
+		 * after the guest was shut down unexpectedly.  In that case,
+		 * leave it in the xarray and retry EREMOVE below.
+		 */
+		if (sgx_virt_epc_free_page(entry))
+			continue;
+
+		xa_erase(&epc->page_array, index);
+	}
+
+	/*
+	 * Retry all failed pages after iterating through the entire tree, at
+	 * which point all children should be removed and the SECS pages can be
+	 * nuked as well...unless userspace has exposed multiple instances of
+	 * virtual EPC to a single VM.
+	 */
+	xa_for_each(&epc->page_array, index, entry) {
+		epc_page = entry;
+		/*
+		 * An error here means that EREMOVE failed because a SECS page
+		 * still has children in *another* EPC instance.  Put it on a
+		 * temporary SECS list, which will be spliced onto the 'zombie
+		 * page list' and EREMOVE-ed again when another virtual EPC
+		 * instance is freed.
+		 */
+		if (sgx_virt_epc_free_page(epc_page))
+			list_add_tail(&epc_page->list, &secs_pages);
+
+		xa_erase(&epc->page_array, index);
+	}
+
+	/*
+	 * Third time's a charm.  Try to EREMOVE zombie SECS pages from virtual
+	 * EPC instances that were previously released, i.e. free SECS pages
+	 * that were in limbo due to having children in *this* EPC instance.
+	 */
+	mutex_lock(&virt_epc_lock);
+	list_for_each_entry_safe(epc_page, tmp, &virt_epc_zombie_pages, list) {
+		/*
+		 * Speculatively remove the page from the list of zombies.  If
+		 * the page is successfully EREMOVE-ed, it will be added to the
+		 * list of free pages.  If EREMOVE fails, throw the page on the
+		 * local list, which will be spliced on at the end.
+		 */
+		list_del(&epc_page->list);
+
+		if (sgx_virt_epc_free_page(epc_page))
+			list_add_tail(&epc_page->list, &secs_pages);
+	}
+
+	if (!list_empty(&secs_pages))
+		list_splice_tail(&secs_pages, &virt_epc_zombie_pages);
+	mutex_unlock(&virt_epc_lock);
+
+	kfree(epc);
+
+	return 0;
+}
+
+static int sgx_virt_epc_open(struct inode *inode, struct file *file)
+{
+	struct sgx_virt_epc *epc;
+
+	epc = kzalloc(sizeof(struct sgx_virt_epc), GFP_KERNEL);
+	if (!epc)
+		return -ENOMEM;
+	/*
+	 * Record the opener's mm in the virtual EPC instance.  It is checked
+	 * in sgx_virt_epc_mmap() so that, after fork(), a child cannot mmap()
+	 * the same virtual EPC pages.
+	 */
+	mmgrab(current->mm);
+	epc->mm = current->mm;
+	mutex_init(&epc->lock);
+	xa_init(&epc->page_array);
+
+	file->private_data = epc;
+
+	return 0;
+}
+
+static const struct file_operations sgx_virt_epc_fops = {
+	.owner			= THIS_MODULE,
+	.open			= sgx_virt_epc_open,
+	.release		= sgx_virt_epc_release,
+	.mmap			= sgx_virt_epc_mmap,
+};
+
+static struct miscdevice sgx_virt_epc_dev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = "sgx_virt_epc",
+	.nodename = "sgx_virt_epc",
+	.fops = &sgx_virt_epc_fops,
+};
+
+int __init sgx_virt_epc_init(void)
+{
+	INIT_LIST_HEAD(&virt_epc_zombie_pages);
+	mutex_init(&virt_epc_lock);
+
+	return misc_register(&sgx_virt_epc_dev);
+}
diff --git a/arch/x86/kernel/cpu/sgx/virt.h b/arch/x86/kernel/cpu/sgx/virt.h
new file mode 100644
index 000000000000..e5434541a122
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/virt.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+#ifndef _ASM_X86_SGX_VIRT_H
+#define _ASM_X86_SGX_VIRT_H
+
+#ifdef CONFIG_X86_SGX_VIRTUALIZATION
+int __init sgx_virt_epc_init(void);
+#else
+static inline int __init sgx_virt_epc_init(void)
+{
+	return -ENODEV;
+}
+#endif
+
+#endif /* _ASM_X86_SGX_VIRT_H */
-- 
2.29.2




[Index of Archives]     [AMD Graphics]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux