Re: [PATCH v2 05/42] s390/mm: provide memory management functions for protected KVM guests

David Hildenbrand <david@xxxxxxxxxx> · Mon, 17 Feb 2020 11:21:44 +0100

> diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
> index 85e944f04c70..4ebcf891ff3c 100644
> --- a/arch/s390/include/asm/page.h
> +++ b/arch/s390/include/asm/page.h
> @@ -153,6 +153,11 @@ static inline int devmem_is_allowed(unsigned long pfn)
>  #define HAVE_ARCH_FREE_PAGE
>  #define HAVE_ARCH_ALLOC_PAGE
>  
> +#if IS_ENABLED(CONFIG_PGSTE)
> +int arch_make_page_accessible(struct page *page);
> +#define HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
> +#endif
> +

Feels like this should have been one of the (CONFIG_)ARCH_HAVE_XXX
thingies defined via kconfig instead.

E.g., like (CONFIG_)HAVE_ARCH_TRANSPARENT_HUGEPAGE

[...]

> +
> +/*
> + * Requests the Ultravisor to encrypt a guest page and make it
> + * accessible to the host for paging (export).
> + *
> + * @paddr: Absolute host address of page to be exported
> + */
> +int uv_convert_from_secure(unsigned long paddr)
> +{
> +	struct uv_cb_cfs uvcb = {
> +		.header.cmd = UVC_CMD_CONV_FROM_SEC_STOR,
> +		.header.len = sizeof(uvcb),
> +		.paddr = paddr
> +	};
> +
> +	if (uv_call(0, (u64)&uvcb))
> +		return -EINVAL;
> +	return 0;
> +}
> +
> +/*
> + * Calculate the expected ref_count for a page that would otherwise have no
> + * further pins. This was cribbed from similar functions in other places in
> + * the kernel, but with some slight modifications. We know that a secure
> + * page can not be a huge page for example.

s/ca not cannot/

> + */
> +static int expected_page_refs(struct page *page)
> +{
> +	int res;
> +
> +	res = page_mapcount(page);
> +	if (PageSwapCache(page)) {
> +		res++;
> +	} else if (page_mapping(page)) {
> +		res++;
> +		if (page_has_private(page))
> +			res++;
> +	}
> +	return res;
> +}
> +
> +static int make_secure_pte(pte_t *ptep, unsigned long addr,
> +			   struct page *exp_page, struct uv_cb_header *uvcb)
> +{
> +	pte_t entry = READ_ONCE(*ptep);
> +	struct page *page;
> +	int expected, rc = 0;
> +
> +	if (!pte_present(entry))
> +		return -ENXIO;
> +	if (pte_val(entry) & _PAGE_INVALID)
> +		return -ENXIO;
> +
> +	page = pte_page(entry);
> +	if (page != exp_page)
> +		return -ENXIO;
> +	if (PageWriteback(page))
> +		return -EAGAIN;
> +	expected = expected_page_refs(page);
> +	if (!page_ref_freeze(page, expected))
> +		return -EBUSY;
> +	set_bit(PG_arch_1, &page->flags);
> +	rc = uv_call(0, (u64)uvcb);
> +	page_ref_unfreeze(page, expected);
> +	/* Return -ENXIO if the page was not mapped, -EINVAL otherwise */
> +	if (rc)
> +		rc = uvcb->rc == 0x10a ? -ENXIO : -EINVAL;
> +	return rc;
> +}
> +
> +/*
> + * Requests the Ultravisor to make a page accessible to a guest.
> + * If it's brought in the first time, it will be cleared. If
> + * it has been exported before, it will be decrypted and integrity
> + * checked.
> + */
> +int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb)
> +{
> +	struct vm_area_struct *vma;
> +	unsigned long uaddr;
> +	struct page *page;
> +	int rc, local_drain = 0;

local_drain could have been a bool.

> +	spinlock_t *ptelock;
> +	pte_t *ptep;
> +
> +again:
> +	rc = -EFAULT;
> +	down_read(&gmap->mm->mmap_sem);
> +
> +	uaddr = __gmap_translate(gmap, gaddr);
> +	if (IS_ERR_VALUE(uaddr))
> +		goto out;
> +	vma = find_vma(gmap->mm, uaddr);
> +	if (!vma)
> +		goto out;
> +	/*
> +	 * Secure pages cannot be huge and userspace should not combine both.
> +	 * In case userspace does it anyway this will result in an -EFAULT for
> +	 * the unpack. The guest is thus never reaching secure mode. If
> +	 * userspace is playing dirty tricky with mapping huge pages later
> +	 * on this will result in a segmenation fault.

s/segmenation/segmentation/

> +	 */
> +	if (is_vm_hugetlb_page(vma))
> +		goto out;
> +
> +	rc = -ENXIO;
> +	page = follow_page(vma, uaddr, FOLL_WRITE);
> +	if (IS_ERR_OR_NULL(page))
> +		goto out;
> +
> +	lock_page(page);
> +	ptep = get_locked_pte(gmap->mm, uaddr, &ptelock);
> +	rc = make_secure_pte(ptep, uaddr, page, uvcb);
> +	pte_unmap_unlock(ptep, ptelock);
> +	unlock_page(page);
> +out:
> +	up_read(&gmap->mm->mmap_sem);
> +
> +	if (rc == -EAGAIN) {
> +		wait_on_page_writeback(page);
> +	} else if (rc == -EBUSY) {
> +		/*
> +		 * If we have tried a local drain and the page refcount
> +		 * still does not match our expected safe value, try with a
> +		 * system wide drain. This is needed if the pagevecs holding
> +		 * the page are on a different CPU.
> +		 */
> +		if (local_drain) {
> +			lru_add_drain_all();

I do wonder if that is valid to be called with all the locks at this point.

> +			/* We give up here, and let the caller try again */
> +			return -EAGAIN;
> +		}
> +		/*
> +		 * We are here if the page refcount does not match the
> +		 * expected safe value. The main culprits are usually
> +		 * pagevecs. With lru_add_drain() we drain the pagevecs
> +		 * on the local CPU so that hopefully the refcount will
> +		 * reach the expected safe value.
> +		 */
> +		lru_add_drain();

dito ...

> +		local_drain = 1;
> +		/* And now we try again immediately after draining */
> +		goto again;
> +	} else if (rc == -ENXIO) {
> +		if (gmap_fault(gmap, gaddr, FAULT_FLAG_WRITE))
> +			return -EFAULT;
> +		return -EAGAIN;
> +	}
> +	return rc;
> +}
> +EXPORT_SYMBOL_GPL(gmap_make_secure);
> +
> +int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr)
> +{
> +	struct uv_cb_cts uvcb = {
> +		.header.cmd = UVC_CMD_CONV_TO_SEC_STOR,
> +		.header.len = sizeof(uvcb),
> +		.guest_handle = gmap->guest_handle,
> +		.gaddr = gaddr,
> +	};
> +
> +	return gmap_make_secure(gmap, gaddr, &uvcb);
> +}
> +EXPORT_SYMBOL_GPL(gmap_convert_to_secure);
> +
> +/**
> + * To be called with the page locked or with an extra reference!

Can we have races here? (IOW, two callers concurrently for the same page)

> + */
> +int arch_make_page_accessible(struct page *page)
> +{
> +	int rc = 0;
> +
> +	/* Hugepage cannot be protected, so nothing to do */
> +	if (PageHuge(page))
> +		return 0;
> +
> +	/*
> +	 * PG_arch_1 is used in 3 places:
> +	 * 1. for kernel page tables during early boot
> +	 * 2. for storage keys of huge pages and KVM
> +	 * 3. As an indication that this page might be secure. This can
> +	 *    overindicate, e.g. we set the bit before calling
> +	 *    convert_to_secure.
> +	 * As secure pages are never huge, all 3 variants can co-exists.
> +	 */
> +	if (!test_bit(PG_arch_1, &page->flags))
> +		return 0;
> +
> +	rc = uv_pin_shared(page_to_phys(page));
> +	if (!rc) {
> +		clear_bit(PG_arch_1, &page->flags);
> +		return 0;
> +	}

Overall, looks sane to me. (I am mostly concerned about possible races,
e.g., when two gmaps would be created for a single VM and nasty stuff be
done with them). But yeah, I guess you guys thought about this ;)

-- 
Thanks,

David / dhildenb