Re: [RFC PATCH 03/23] x86/sgx: Introduce virtual EPC for use by KVM guests

Sean Christopherson <seanjc@xxxxxxxxxx> · Wed, 6 Jan 2021 12:35:31 -0800

On Wed, Jan 06, 2021, Dave Hansen wrote:
> On 1/5/21 5:55 PM, Kai Huang wrote:
> > From: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>
> > diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
> > index 95aad183bb65..02993a327a1f 100644
> > --- a/arch/x86/kernel/cpu/sgx/main.c
> > +++ b/arch/x86/kernel/cpu/sgx/main.c
> > @@ -9,9 +9,11 @@
> >  #include <linux/sched/mm.h>
> >  #include <linux/sched/signal.h>
> >  #include <linux/slab.h>
> > +#include "arch.h"
> >  #include "driver.h"
> >  #include "encl.h"
> >  #include "encls.h"
> > +#include "virt.h"
> >  
> >  struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
> >  static int sgx_nr_epc_sections;
> > @@ -726,7 +728,8 @@ static void __init sgx_init(void)
> >  	if (!sgx_page_reclaimer_init())
> >  		goto err_page_cache;
> >  
> > -	ret = sgx_drv_init();
> > +	/* Success if the native *or* virtual EPC driver initialized cleanly. */
> > +	ret = !!sgx_drv_init() & !!sgx_virt_epc_init();
> >  	if (ret)
> >  		goto err_kthread;
> 
> FWIW, I hate that conditional.  But, I tried to write it to be something
> more sane and failed.

Heh, you're welcome :-D

> > diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
> > new file mode 100644
> > index 000000000000..d625551ccf25
> > --- /dev/null
> > +++ b/arch/x86/kernel/cpu/sgx/virt.c
> > @@ -0,0 +1,263 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*  Copyright(c) 2016-20 Intel Corporation. */
> > +
> > +#include <linux/miscdevice.h>
> > +#include <linux/mm.h>
> > +#include <linux/mman.h>
> > +#include <linux/sched/mm.h>
> > +#include <linux/sched/signal.h>
> > +#include <linux/slab.h>
> > +#include <linux/xarray.h>
> > +#include <asm/sgx.h>
> > +#include <uapi/asm/sgx.h>
> > +
> > +#include "encls.h"
> > +#include "sgx.h"
> > +#include "virt.h"
> > +
> > +struct sgx_virt_epc {
> > +	struct xarray page_array;
> > +	struct mutex lock;
> > +	struct mm_struct *mm;
> > +};
> > +
> > +static struct mutex virt_epc_lock;
> > +static struct list_head virt_epc_zombie_pages;
> 
> What does the lock protect?

Effectively, the list of zombie SECS pages.  Not sure why I used a generic name.

> What are zombie pages?

My own terminology for SECS pages whose virtual EPC has been destroyed but which
can't be reclaimed because they still have child EPC pages in other virtual EPCs.

> BTW, if zombies are SECS-only, shouldn't that be in the name rather than
> "epc"?

I used the virt_epc prefix/namespace to tag it as a global list.  I've no
argument against something like zombie_secs_pages.

> > +static int __sgx_virt_epc_fault(struct sgx_virt_epc *epc,
> > +				struct vm_area_struct *vma, unsigned long addr)
> > +{
> > +	struct sgx_epc_page *epc_page;
> > +	unsigned long index, pfn;
> > +	int ret;
> > +
> > +	/* epc->lock must already have been hold */
> 
> 	/* epc->lock must already be held */
> 
> Wouldn't this be better as:
> 
> WARN_ON(!mutex_is_locked(&epc->lock));
> 
> ?

Or just proper lockdep?

> > +	/* Calculate index of EPC page in virtual EPC's page_array */
> > +	index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);
> > +
> > +	epc_page = xa_load(&epc->page_array, index);
> > +	if (epc_page)
> > +		return 0;
> > +
> > +	epc_page = sgx_alloc_epc_page(epc, false);
> > +	if (IS_ERR(epc_page))
> > +		return PTR_ERR(epc_page);
> > +
> > +	ret = xa_err(xa_store(&epc->page_array, index, epc_page, GFP_KERNEL));
> > +	if (ret)
> > +		goto err_free;
> > +
> > +	pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page));
> > +
> > +	ret = vmf_insert_pfn(vma, addr, pfn);
> > +	if (ret != VM_FAULT_NOPAGE) {
> > +		ret = -EFAULT;
> > +		goto err_delete;
> > +	}
> > +
> > +	return 0;
> > +
> > +err_delete:
> > +	xa_erase(&epc->page_array, index);
> > +err_free:
> > +	sgx_free_epc_page(epc_page);
> > +	return ret;
> > +}
> > +
> > +static vm_fault_t sgx_virt_epc_fault(struct vm_fault *vmf)
> > +{
> > +	struct vm_area_struct *vma = vmf->vma;
> > +	struct sgx_virt_epc *epc = vma->vm_private_data;
> > +	int ret;
> > +
> > +	mutex_lock(&epc->lock);
> > +	ret = __sgx_virt_epc_fault(epc, vma, vmf->address);
> > +	mutex_unlock(&epc->lock);
> > +
> > +	if (!ret)
> > +		return VM_FAULT_NOPAGE;
> > +
> > +	if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) {
> > +		mmap_read_unlock(vma->vm_mm);
> > +		return VM_FAULT_RETRY;
> > +	}
> > +
> > +	return VM_FAULT_SIGBUS;
> > +}
> > +
> > +const struct vm_operations_struct sgx_virt_epc_vm_ops = {
> > +	.fault = sgx_virt_epc_fault,
> > +};
> > +
> > +static int sgx_virt_epc_mmap(struct file *file, struct vm_area_struct *vma)
> > +{
> > +	struct sgx_virt_epc *epc = file->private_data;
> > +
> > +	if (!(vma->vm_flags & VM_SHARED))
> > +		return -EINVAL;
> > +
> > +	/*
> > +	 * Don't allow mmap() from child after fork(), since child and parent
> > +	 * cannot map to the same EPC.
> > +	 */
> > +	if (vma->vm_mm != epc->mm)
> > +		return -EINVAL;
> 
> I mentioned this below, but I'm not buying this logic.  I know it would
> be *bad*, but I don't see why the kernel needs to keep it from happening.

There's no known use case (KVM doesn't support sharing a VM across multiple
mm structs), and supporting multiple mm structs is a nightmare; see the driver
for the amount of pain incurred.

And IIRC, supporting VMM (KVM) EPC oversubscription, which may or may not ever
happen, was borderline impossible if virtual EPC supports multiple mm structs as
the interaction between KVM and virtual EPC is a disaster in that case.

> > +	vma->vm_ops = &sgx_virt_epc_vm_ops;
> > +	/* Don't copy VMA in fork() */
> > +	vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY;
> > +	vma->vm_private_data = file->private_data;
> > +
> > +	return 0;
> > +}
> > +
> > +static int sgx_virt_epc_free_page(struct sgx_epc_page *epc_page)
> > +{
> > +	int ret;
> > +
> > +	if (!epc_page)
> > +		return 0;
> 
> I always worry about these.  Why is passing NULL around OK?

I suspect I did it to mimic kfree() behavior.  I don't _think_ the radix (now
xarray) usage will ever encounter a NULL entry.

> 
> > +	ret = __eremove(sgx_get_epc_virt_addr(epc_page));
> > +	if (ret) {
> > +		/*
> > +		 * Only SGX_CHILD_PRESENT is expected, which is because of
> > +		 * EREMOVE-ing an SECS still with child, in which case it can
> > +		 * be handled by EREMOVE-ing the SECS again after all pages in
> > +		 * virtual EPC have been EREMOVE-ed. See comments below in
> > +		 * sgx_virt_epc_release().
> > +		 */
> > +		WARN_ON_ONCE(ret != SGX_CHILD_PRESENT);
> > +		return ret;
> > +	}
> 
> I find myself wondering what errors could cause the WARN_ON_ONCE() to be
> hit.  The SDM indicates that it's only:
> 
> 	SGX_ENCLAVE_ACT If there are still logical processors executing
> 			inside the enclave.
> 
> Should that be mentioned in the comment?

And faults, which are also spliced into the return value by the ENCLS macros.
I do remember hitting this WARN when I broke things, though I can't remember
whether it was a fault or the SGX_ENCLAVE_ACT scenario.  Probably the latter?

> > +
> > +	__sgx_free_epc_page(epc_page);
> > +	return 0;
> > +}
> > +

...

> > +	xa_for_each(&epc->page_array, index, entry) {
> > +		epc_page = entry;
> 
> Then, talk about the error condition here:
> 
> > +		/*
> > +		 * Error here means that EREMOVE failed due to a SECS page
> > +		 * still has child on *another* EPC instance.  Put it to a
> > +		 * temporary SECS list which will be spliced to 'zombie page
> > +		 * list' and will be EREMOVE-ed again when freeing another
> > +		 * virtual EPC instance.
> > +		 */
> 
> Surprise, I've got another rewrite:
> 
> 		/*
> 		 * An EREMOVE failure here means that the SECS page
> 		 * still has children.  But, since all children in this
> 		 * 'sgx_virt_epc' have been removed, the SECS page must
> 		 * have a child on another instance.
> 		 */
> 
> > +		if (sgx_virt_epc_free_page(epc_page))
> > +			list_add_tail(&epc_page->list, &secs_pages);
> 
> Why move these over to &secs_list here?  I think it's to avoid another
> xa_for_each() below, but it's not clear.

Yes?  IIRC, the sole motivation is to make the list_splice_tail() operation as
short as possible while holding the global virt_epc_lock.

> > +		xa_erase(&epc->page_array, index);
> > +	}
> > +

...

> > +	mutex_lock(&virt_epc_lock);
> > +	list_for_each_entry_safe(epc_page, tmp, &virt_epc_zombie_pages, list) {
> > +		/*
> > +		 * Speculatively remove the page from the list of zombies, if
> > +		 * the page is successfully EREMOVE-ed it will be added to the
> > +		 * list of free pages.  If EREMOVE fails, throw the page on the
> > +		 * local list, which will be spliced on at the end.
> > +		 */
> > +		list_del(&epc_page->list);
> > +
> > +		if (sgx_virt_epc_free_page(epc_page))
> > +			list_add_tail(&epc_page->list, &secs_pages);
> 
> I don't get this.  Couldn't you do without the unconditional list_del()
> and instead just do:
> 
> 		if (!sgx_virt_epc_free_page(epc_page))
> 			list_del(&epc_page->list);
> 
> Or does the free() code clobber the list_head?  If that's the case,
> maybe you should say that explicitly.

More or less.  EPC pages need to be removed from their list before freeing; once
a page is freed it is owned by the allocator.  Deleting after freeing leads to
list corruption if a different thread allocates the page and adds it to a
different list.

> > +	}
> > +
> > +	if (!list_empty(&secs_pages))
> > +		list_splice_tail(&secs_pages, &virt_epc_zombie_pages);
> > +	mutex_unlock(&virt_epc_lock);
> > +
> > +	kfree(epc);
> > +
> > +	return 0;
> > +}