Re: [RFC PATCH 03/23] x86/sgx: Introduce virtual EPC for use by KVM guests

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, Jan 06, 2021 at 02:55:20PM +1300, Kai Huang wrote:
> From: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>
> 
> Add a misc device /dev/sgx_virt_epc to allow userspace to allocate "raw"
> EPC without an associated enclave.  The intended and only known use case
> for raw EPC allocation is to expose EPC to a KVM guest, hence the
> virt_epc moniker, virt.{c,h} files and X86_SGX_VIRTUALIZATION Kconfig.
> 
> Modify sgx_init() to always try to initialize the virtual EPC driver,
> even when the SGX driver is disabled because SGX Launch Control is in
> locked mode or not present at all, since SGX virtualization allows SGX
> to be exposed to guests that support non-LC configurations.
> 
> Implement the "raw" EPC allocation in the x86 core-SGX subsystem via
> /dev/sgx_virt_epc rather than in KVM. Doing so has two major advantages:
> 
>   - Does not require changes to KVM's uAPI, e.g. EPC gets handled as
>     just another memory backend for guests.
> 
>   - EPC management is wholly contained in the SGX subsystem, e.g. SGX
>     does not have to export any symbols, changes to reclaim flows don't
>     need to be routed through KVM, SGX's dirty laundry doesn't have to
>     get aired out for the world to see, and so on and so forth.
> 
> The virtual EPC allocated to guests is currently not reclaimable because
> oversubscription of EPC for KVM guests is not yet supported. Due to the
> complications of handling reclaim conflicts between guest and host, KVM
> EPC oversubscription is significantly more complex than basic support
> for SGX virtualization.
> 
> Signed-off-by: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>
> Co-developed-by: Kai Huang <kai.huang@xxxxxxxxx>
> Signed-off-by: Kai Huang <kai.huang@xxxxxxxxx>

The commit message does not describe the code changes. It should
have an understandable explanation of fops. There is nothing about
the implementation right now.

/Jarkko

> ---
>  arch/x86/Kconfig                 |  12 ++
>  arch/x86/kernel/cpu/sgx/Makefile |   1 +
>  arch/x86/kernel/cpu/sgx/main.c   |   5 +-
>  arch/x86/kernel/cpu/sgx/virt.c   | 263 +++++++++++++++++++++++++++++++
>  arch/x86/kernel/cpu/sgx/virt.h   |  14 ++
>  5 files changed, 294 insertions(+), 1 deletion(-)
>  create mode 100644 arch/x86/kernel/cpu/sgx/virt.c
>  create mode 100644 arch/x86/kernel/cpu/sgx/virt.h
> 
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 618d1aabccb8..a7318175509b 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -1947,6 +1947,18 @@ config X86_SGX
>  
>  	  If unsure, say N.
>  
> +config X86_SGX_VIRTUALIZATION
> +	bool "Software Guard eXtensions (SGX) Virtualization"
> +	depends on X86_SGX && KVM_INTEL
> +	help
> +
> +	  Enables KVM guests to create SGX enclaves.
> +
> +	  This includes support to expose "raw" unreclaimable enclave memory to
> +	  guests via a device node, e.g. /dev/sgx_virt_epc.
> +
> +	  If unsure, say N.
> +
>  config EFI
>  	bool "EFI runtime service support"
>  	depends on ACPI
> diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile
> index 91d3dc784a29..7a25bf63adfb 100644
> --- a/arch/x86/kernel/cpu/sgx/Makefile
> +++ b/arch/x86/kernel/cpu/sgx/Makefile
> @@ -3,3 +3,4 @@ obj-y += \
>  	encl.o \
>  	ioctl.o \
>  	main.o
> +obj-$(CONFIG_X86_SGX_VIRTUALIZATION)	+= virt.o
> diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
> index 95aad183bb65..02993a327a1f 100644
> --- a/arch/x86/kernel/cpu/sgx/main.c
> +++ b/arch/x86/kernel/cpu/sgx/main.c
> @@ -9,9 +9,11 @@
>  #include <linux/sched/mm.h>
>  #include <linux/sched/signal.h>
>  #include <linux/slab.h>
> +#include "arch.h"
>  #include "driver.h"
>  #include "encl.h"
>  #include "encls.h"
> +#include "virt.h"
>  
>  struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
>  static int sgx_nr_epc_sections;
> @@ -726,7 +728,8 @@ static void __init sgx_init(void)
>  	if (!sgx_page_reclaimer_init())
>  		goto err_page_cache;
>  
> -	ret = sgx_drv_init();
> +	/* Success if the native *or* virtual EPC driver initialized cleanly. */
> +	ret = !!sgx_drv_init() & !!sgx_virt_epc_init();
>  	if (ret)
>  		goto err_kthread;
>  
> diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
> new file mode 100644
> index 000000000000..d625551ccf25
> --- /dev/null
> +++ b/arch/x86/kernel/cpu/sgx/virt.c
> @@ -0,0 +1,263 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*  Copyright(c) 2016-20 Intel Corporation. */
> +
> +#include <linux/miscdevice.h>
> +#include <linux/mm.h>
> +#include <linux/mman.h>
> +#include <linux/sched/mm.h>
> +#include <linux/sched/signal.h>
> +#include <linux/slab.h>
> +#include <linux/xarray.h>
> +#include <asm/sgx.h>
> +#include <uapi/asm/sgx.h>
> +
> +#include "encls.h"
> +#include "sgx.h"
> +#include "virt.h"
> +
> +struct sgx_virt_epc {
> +	struct xarray page_array;
> +	struct mutex lock;
> +	struct mm_struct *mm;
> +};
> +
> +static struct mutex virt_epc_lock;
> +static struct list_head virt_epc_zombie_pages;
> +
> +static int __sgx_virt_epc_fault(struct sgx_virt_epc *epc,
> +				struct vm_area_struct *vma, unsigned long addr)
> +{
> +	struct sgx_epc_page *epc_page;
> +	unsigned long index, pfn;
> +	int ret;
> +
> +	/* epc->lock must already be held by the caller */
> +
> +	/* Calculate index of EPC page in virtual EPC's page_array */
> +	index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);
> +
> +	epc_page = xa_load(&epc->page_array, index);
> +	if (epc_page)
> +		return 0;
> +
> +	epc_page = sgx_alloc_epc_page(epc, false);
> +	if (IS_ERR(epc_page))
> +		return PTR_ERR(epc_page);
> +
> +	ret = xa_err(xa_store(&epc->page_array, index, epc_page, GFP_KERNEL));
> +	if (ret)
> +		goto err_free;
> +
> +	pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page));
> +
> +	ret = vmf_insert_pfn(vma, addr, pfn);
> +	if (ret != VM_FAULT_NOPAGE) {
> +		ret = -EFAULT;
> +		goto err_delete;
> +	}
> +
> +	return 0;
> +
> +err_delete:
> +	xa_erase(&epc->page_array, index);
> +err_free:
> +	sgx_free_epc_page(epc_page);
> +	return ret;
> +}
> +
> +static vm_fault_t sgx_virt_epc_fault(struct vm_fault *vmf)
> +{
> +	struct vm_area_struct *vma = vmf->vma;
> +	struct sgx_virt_epc *epc = vma->vm_private_data;
> +	int ret;
> +
> +	mutex_lock(&epc->lock);
> +	ret = __sgx_virt_epc_fault(epc, vma, vmf->address);
> +	mutex_unlock(&epc->lock);
> +
> +	if (!ret)
> +		return VM_FAULT_NOPAGE;
> +
> +	if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) {
> +		mmap_read_unlock(vma->vm_mm);
> +		return VM_FAULT_RETRY;
> +	}
> +
> +	return VM_FAULT_SIGBUS;
> +}
> +
> +const struct vm_operations_struct sgx_virt_epc_vm_ops = {
> +	.fault = sgx_virt_epc_fault,
> +};
> +
> +static int sgx_virt_epc_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> +	struct sgx_virt_epc *epc = file->private_data;
> +
> +	if (!(vma->vm_flags & VM_SHARED))
> +		return -EINVAL;
> +
> +	/*
> +	 * Don't allow mmap() from child after fork(), since child and parent
> +	 * cannot map to the same EPC.
> +	 */
> +	if (vma->vm_mm != epc->mm)
> +		return -EINVAL;
> +
> +	vma->vm_ops = &sgx_virt_epc_vm_ops;
> +	/* Don't copy VMA in fork() */
> +	vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY;
> +	vma->vm_private_data = file->private_data;
> +
> +	return 0;
> +}
> +
> +static int sgx_virt_epc_free_page(struct sgx_epc_page *epc_page)
> +{
> +	int ret;
> +
> +	if (!epc_page)
> +		return 0;
> +
> +	/*
> +	 * Explicitly EREMOVE virtual EPC page. Virtual EPC is only used by
> +	 * guest, and in normal condition guest should have done EREMOVE for
> +	 * all EPC pages before they are freed here. But it's possible guest
> +	 * is killed or crashes abnormally, in which case EREMOVE has not been
> +	 * done. Do EREMOVE unconditionally here to cover both cases, because
> +	 * it's not possible to tell whether guest has done EREMOVE, since
> +	 * virtual EPC page status is not tracked. And it is fine to EREMOVE
> +	 * EPC page multiple times.
> +	 */
> +	ret = __eremove(sgx_get_epc_virt_addr(epc_page));
> +	if (ret) {
> +		/*
> +		 * Only SGX_CHILD_PRESENT is expected, which is because of
> +		 * EREMOVE-ing an SECS still with child, in which case it can
> +		 * be handled by EREMOVE-ing the SECS again after all pages in
> +		 * virtual EPC have been EREMOVE-ed. See the comments below in
> +		 * sgx_virt_epc_release().
> +		 */
> +		WARN_ON_ONCE(ret != SGX_CHILD_PRESENT);
> +		return ret;
> +	}
> +
> +	__sgx_free_epc_page(epc_page);
> +	return 0;
> +}
> +
> +static int sgx_virt_epc_release(struct inode *inode, struct file *file)
> +{
> +	struct sgx_virt_epc *epc = file->private_data;
> +	struct sgx_epc_page *epc_page, *tmp, *entry;
> +	unsigned long index;
> +
> +	LIST_HEAD(secs_pages);
> +
> +	mmdrop(epc->mm);
> +
> +	xa_for_each(&epc->page_array, index, entry) {
> +		/*
> +		 * Virtual EPC pages are not tracked, so it's possible for
> +		 * EREMOVE to fail, e.g. because a SECS page still has children
> +		 * when the guest was shut down unexpectedly. If so, leave the
> +		 * page in the xarray and retry EREMOVE below.
> +		 */
> +		if (sgx_virt_epc_free_page(entry))
> +			continue;
> +
> +		xa_erase(&epc->page_array, index);
> +	}
> +
> +	/*
> +	 * Retry all failed pages after iterating through the entire tree, at
> +	 * which point all children should be removed and the SECS pages can be
> +	 * nuked as well...unless userspace has exposed multiple instances of
> +	 * virtual EPC to a single VM.
> +	 */
> +	xa_for_each(&epc->page_array, index, entry) {
> +		epc_page = entry;
> +		/*
> +		 * An error here means that EREMOVE failed because a SECS page
> +		 * still has a child in *another* EPC instance.  Put it on a
> +		 * temporary SECS list, which will be spliced onto the 'zombie
> +		 * page list' and EREMOVE-ed again when another virtual EPC
> +		 * instance is freed.
> +		 */
> +		if (sgx_virt_epc_free_page(epc_page))
> +			list_add_tail(&epc_page->list, &secs_pages);
> +
> +		xa_erase(&epc->page_array, index);
> +	}
> +
> +	/*
> +	 * Third time's a charm.  Try to EREMOVE zombie SECS pages from virtual
> +	 * EPC instances that were previously released, i.e. free SECS pages
> +	 * that were in limbo due to having children in *this* EPC instance.
> +	 */
> +	mutex_lock(&virt_epc_lock);
> +	list_for_each_entry_safe(epc_page, tmp, &virt_epc_zombie_pages, list) {
> +		/*
> +		 * Speculatively remove the page from the list of zombies, if
> +		 * the page is successfully EREMOVE it will be added to the
> +		 * list of free pages.  If EREMOVE fails, throw the page on the
> +		 * local list, which will be spliced on at the end.
> +		 */
> +		list_del(&epc_page->list);
> +
> +		if (sgx_virt_epc_free_page(epc_page))
> +			list_add_tail(&epc_page->list, &secs_pages);
> +	}
> +
> +	if (!list_empty(&secs_pages))
> +		list_splice_tail(&secs_pages, &virt_epc_zombie_pages);
> +	mutex_unlock(&virt_epc_lock);
> +
> +	kfree(epc);
> +
> +	return 0;
> +}
> +
> +static int sgx_virt_epc_open(struct inode *inode, struct file *file)
> +{
> +	struct sgx_virt_epc *epc;
> +
> +	epc = kzalloc(sizeof(struct sgx_virt_epc), GFP_KERNEL);
> +	if (!epc)
> +		return -ENOMEM;
> +	/*
> +	 * Keep the current->mm to virtual EPC. It will be checked in
> +	 * sgx_virt_epc_mmap() to prevent, in case of fork, child being
> +	 * able to mmap() to the same virtual EPC pages.
> +	 */
> +	mmgrab(current->mm);
> +	epc->mm = current->mm;
> +	mutex_init(&epc->lock);
> +	xa_init(&epc->page_array);
> +
> +	file->private_data = epc;
> +
> +	return 0;
> +}
> +
> +static const struct file_operations sgx_virt_epc_fops = {
> +	.owner			= THIS_MODULE,
> +	.open			= sgx_virt_epc_open,
> +	.release		= sgx_virt_epc_release,
> +	.mmap			= sgx_virt_epc_mmap,
> +};
> +
> +static struct miscdevice sgx_virt_epc_dev = {
> +	.minor = MISC_DYNAMIC_MINOR,
> +	.name = "sgx_virt_epc",
> +	.nodename = "sgx_virt_epc",
> +	.fops = &sgx_virt_epc_fops,
> +};
> +
> +int __init sgx_virt_epc_init(void)
> +{
> +	INIT_LIST_HEAD(&virt_epc_zombie_pages);
> +	mutex_init(&virt_epc_lock);
> +
> +	return misc_register(&sgx_virt_epc_dev);
> +}
> diff --git a/arch/x86/kernel/cpu/sgx/virt.h b/arch/x86/kernel/cpu/sgx/virt.h
> new file mode 100644
> index 000000000000..e5434541a122
> --- /dev/null
> +++ b/arch/x86/kernel/cpu/sgx/virt.h
> @@ -0,0 +1,14 @@
> +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
> +#ifndef _ASM_X86_SGX_VIRT_H
> +#define _ASM_X86_SGX_VIRT_H
> +
> +#ifdef CONFIG_X86_SGX_VIRTUALIZATION
> +int __init sgx_virt_epc_init(void);
> +#else
> +static inline int __init sgx_virt_epc_init(void)
> +{
> +	return -ENODEV;
> +}
> +#endif
> +
> +#endif /* _ASM_X86_SGX_VIRT_H */
> -- 
> 2.29.2
> 
> 



[Index of Archives]     [AMD Graphics]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux