[RFC PATCH 08/18] virt/mshv: map and unmap guest memory

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Introduce ioctls for mapping and unmapping regions of guest memory.

Uses a table of memory 'slots' similar to KVM, but the slot
number is not visible to userspace.

For now, this simple implementation requires each new mapping to be
disjoint - the underlying hypercalls have no such restriction, and
implicitly overwrite any mappings on the pages in the specified regions.

Co-developed-by: Lillian Grassin-Drake <ligrassi@xxxxxxxxxxxxx>
Signed-off-by: Lillian Grassin-Drake <ligrassi@xxxxxxxxxxxxx>
Signed-off-by: Nuno Das Neves <nunodasneves@xxxxxxxxxxxxxxxxxxx>
---
 Documentation/virt/mshv/api.rst        |  15 ++
 include/asm-generic/hyperv-tlfs.h      |  15 ++
 include/linux/mshv.h                   |  14 ++
 include/uapi/asm-generic/hyperv-tlfs.h |   9 +
 include/uapi/linux/mshv.h              |  15 ++
 virt/mshv/mshv_main.c                  | 322 ++++++++++++++++++++++++-
 6 files changed, 388 insertions(+), 2 deletions(-)

diff --git a/Documentation/virt/mshv/api.rst b/Documentation/virt/mshv/api.rst
index ce651a1738e0..530efc29d354 100644
--- a/Documentation/virt/mshv/api.rst
+++ b/Documentation/virt/mshv/api.rst
@@ -72,3 +72,18 @@ it is open - this ioctl can only be called once per open.
 This ioctl creates a guest partition, returning a file descriptor to use as a
 handle for partition ioctls.
 
+3.3 MSHV_MAP_GUEST_MEMORY and MSHV_UNMAP_GUEST_MEMORY
+-----------------------------------------------------
+:Type: partition ioctl
+:Parameters: struct mshv_user_mem_region
+:Returns: 0 on success
+
+MSHV_MAP_GUEST_MEMORY maps a region of process memory at a region of guest
+physical memory in a partition; MSHV_UNMAP_GUEST_MEMORY removes that mapping.
+
+Mappings must be disjoint in process address space and guest address space.
+
+Note: In the current implementation, this memory is pinned so that Linux cannot
+move the pages, which would leave the hypervisor clobbering their old frames.
+The region is therefore backed by physical memory for as long as it is mapped.
+
diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h
index 2a49503b7396..6e5072e29897 100644
--- a/include/asm-generic/hyperv-tlfs.h
+++ b/include/asm-generic/hyperv-tlfs.h
@@ -149,6 +149,8 @@ struct ms_hyperv_tsc_page {
 #define HVCALL_GET_PARTITION_ID			0x0046
 #define HVCALL_DEPOSIT_MEMORY			0x0048
 #define HVCALL_WITHDRAW_MEMORY			0x0049
+#define HVCALL_MAP_GPA_PAGES			0x004b
+#define HVCALL_UNMAP_GPA_PAGES			0x004c
 #define HVCALL_CREATE_VP			0x004e
 #define HVCALL_GET_VP_REGISTERS			0x0050
 #define HVCALL_SET_VP_REGISTERS			0x0051
@@ -827,4 +829,17 @@ struct hv_delete_partition {
 	u64 partition_id;
 };
 
+struct hv_map_gpa_pages {
+	u64 target_partition_id;
+	u64 target_gpa_base;		/* first guest page frame number */
+	u32 map_flags;			/* presumably HV_MAP_GPA_* bits */
+	u64 source_gpa_page_list[];	/* one source PFN per rep */
+};
+
+struct hv_unmap_gpa_pages {
+	u64 target_partition_id;
+	u64 target_gpa_base;	/* first guest page frame number to unmap */
+	u32 unmap_flags;	/* the only caller in this patch passes 0 */
+};
+
 #endif
diff --git a/include/linux/mshv.h b/include/linux/mshv.h
index fc4f35089b2c..91a742f37440 100644
--- a/include/linux/mshv.h
+++ b/include/linux/mshv.h
@@ -7,13 +7,27 @@
  */
 
 #include <linux/spinlock.h>
+#include <linux/mutex.h>
 #include <uapi/linux/mshv.h>
 
 #define MSHV_MAX_PARTITIONS		128
+#define MSHV_MAX_MEM_REGIONS		64
+
+struct mshv_mem_region {
+	u64 size; /* bytes; a slot with size 0 is unused */
+	u64 guest_pfn; /* first guest page frame number of the region */
+	u64 userspace_addr; /* start of the userspace allocated memory */
+	struct page **pages; /* vzalloc'ed array of the pinned user pages */
+};
 
 struct mshv_partition {
 	u64 id;
 	refcount_t ref_count;
+	struct mutex mutex; /* held while a partition ioctl runs */
+	struct {
+		u32 count; /* slots currently holding a region */
+		struct mshv_mem_region slots[MSHV_MAX_MEM_REGIONS];
+	} regions;
 };
 
 struct mshv {
diff --git a/include/uapi/asm-generic/hyperv-tlfs.h b/include/uapi/asm-generic/hyperv-tlfs.h
index 7a858226a9c5..e7b09b9f00de 100644
--- a/include/uapi/asm-generic/hyperv-tlfs.h
+++ b/include/uapi/asm-generic/hyperv-tlfs.h
@@ -12,4 +12,13 @@
 #define HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED          BIT(4)
 #define HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED                    BIT(13)
 
+/* HV Map GPA (Guest Physical Address) Flags */
+#define HV_MAP_GPA_PERMISSIONS_NONE     0x0
+#define HV_MAP_GPA_READABLE             0x1
+#define HV_MAP_GPA_WRITABLE             0x2
+#define HV_MAP_GPA_KERNEL_EXECUTABLE    0x4
+#define HV_MAP_GPA_USER_EXECUTABLE      0x8
+#define HV_MAP_GPA_EXECUTABLE           0xC /* KERNEL_ | USER_EXECUTABLE */
+#define HV_MAP_GPA_PERMISSIONS_MASK     0xF /* all permission bits above */
+
 #endif
diff --git a/include/uapi/linux/mshv.h b/include/uapi/linux/mshv.h
index 4f8da9a6fde2..47be03ef4e86 100644
--- a/include/uapi/linux/mshv.h
+++ b/include/uapi/linux/mshv.h
@@ -18,10 +18,25 @@ struct mshv_create_partition {
 	struct hv_partition_creation_properties partition_creation_properties;
 };
 
+/*
+ * Mappings can't overlap in GPA space or in userspace address space.
+ * To unmap, size, guest_pfn and userspace_addr must match an existing mapping.
+ */
+struct mshv_user_mem_region {
+	__u64 size;		/* bytes; non-zero multiple of the page size */
+	__u64 guest_pfn;	/* first guest page frame number of the region */
+	__u64 userspace_addr;	/* page-aligned start of the userspace memory */
+	__u32 flags;		/* hypercall map flags; ignored on unmap */
+};
+
 #define MSHV_IOCTL 0xB8
 
 /* mshv device */
 #define MSHV_REQUEST_VERSION	_IOW(MSHV_IOCTL, 0x00, __u32)
 #define MSHV_CREATE_PARTITION	_IOW(MSHV_IOCTL, 0x01, struct mshv_create_partition)
 
+/* partition device */
+#define MSHV_MAP_GUEST_MEMORY	_IOW(MSHV_IOCTL, 0x02, struct mshv_user_mem_region)
+#define MSHV_UNMAP_GUEST_MEMORY	_IOW(MSHV_IOCTL, 0x03, struct mshv_user_mem_region)
+
 #endif
diff --git a/virt/mshv/mshv_main.c b/virt/mshv/mshv_main.c
index 162a1bb42a4a..ce480598e67f 100644
--- a/virt/mshv/mshv_main.c
+++ b/virt/mshv/mshv_main.c
@@ -60,6 +60,10 @@ static struct miscdevice mshv_dev = {
 
 #define HV_WITHDRAW_BATCH_SIZE	(PAGE_SIZE / sizeof(u64))
 #define HV_INIT_PARTITION_DEPOSIT_PAGES 208
+#define HV_MAP_GPA_MASK		(0x0000000FFFFFFFFFULL) /* low 36 bits */
+#define HV_MAP_GPA_BATCH_SIZE	/* PFNs that fit behind the header */ \
+	((PAGE_SIZE - sizeof(struct hv_map_gpa_pages)) / sizeof(u64))
+#define PIN_PAGES_BATCH_SIZE	(0x10000000 / PAGE_SIZE) /* pages in 256 MiB */
 
 static int
 hv_call_withdraw_memory(u64 count, int node, u64 partition_id)
@@ -245,16 +249,318 @@ hv_call_delete_partition(u64 partition_id)
 	return -hv_status_to_errno(status);
 }
 
+/*
+ * Map @page_count @pages into partition @partition_id at guest page frame
+ * @gpa_target, batching PFNs through the per-cpu hypercall input page.
+ * Returns 0, or an error (-EBADFD when only part of the range got mapped).
+ */
+static int
+hv_call_map_gpa_pages(u64 partition_id, u64 gpa_target, u64 page_count,
+		      u32 flags, struct page **pages)
+{
+	struct hv_map_gpa_pages *input_page;
+	int status, rep_count, i;
+	struct page **p;
+	u32 completed = 0;
+	u64 hypercall_status;
+	unsigned long remaining = page_count;
+	unsigned long irq_flags;
+	int ret = 0;
+
+	while (remaining) {
+		rep_count = min(remaining, HV_MAP_GPA_BATCH_SIZE);
+
+		local_irq_save(irq_flags);	/* the input page is per-cpu */
+		input_page = (struct hv_map_gpa_pages *)(*this_cpu_ptr(
+			hyperv_pcpu_input_arg));
+
+		input_page->target_partition_id = partition_id;
+		input_page->target_gpa_base = gpa_target;
+		input_page->map_flags = flags;
+
+		for (i = 0, p = pages; i < rep_count; i++, p++)
+			input_page->source_gpa_page_list[i] =
+				page_to_pfn(*p) & HV_MAP_GPA_MASK;
+		hypercall_status = hv_do_rep_hypercall(
+			HVCALL_MAP_GPA_PAGES, rep_count, 0, input_page, NULL);
+		local_irq_restore(irq_flags);
+
+		status = hypercall_status & HV_HYPERCALL_RESULT_MASK;
+		completed = (hypercall_status & HV_HYPERCALL_REP_COMP_MASK) >>
+				HV_HYPERCALL_REP_COMP_OFFSET;
+
+		if (status == HV_STATUS_INSUFFICIENT_MEMORY) {
+			ret = hv_call_deposit_pages(NUMA_NO_NODE,
+						    partition_id, 256);
+			if (ret)
+				break;	/* deposit failed; otherwise retry */
+		} else if (status != HV_STATUS_SUCCESS) {
+			pr_err("%s: completed %llu out of %llu, %s\n",
+			       __func__,
+			       page_count - remaining, page_count,
+			       hv_status_to_string(status));
+			ret = -hv_status_to_errno(status);
+			break;
+		}
+
+		pages += completed;	/* resume after the completed reps */
+		remaining -= completed;
+		gpa_target += completed;
+	}
+
+	if (ret && (completed || remaining != page_count)) {
+		pr_err("%s: Partially succeeded; mapped regions may be in invalid state\n",
+		       __func__);
+		ret = -EBADFD;	/* some pages were mapped before the failure */
+	}
+
+	return ret;
+}
+
+/*
+ * Unmap @page_count guest pages of partition @partition_id starting at guest
+ * page frame @gpa_target. Returns 0, or an error (-EBADFD on partial unmap).
+ */
+static int
+hv_call_unmap_gpa_pages(u64 partition_id, u64 gpa_target, u64 page_count,
+			u32 flags)
+{
+	struct hv_unmap_gpa_pages *input_page;
+	int status, rep_count;
+	int ret = 0;
+	u32 completed = 0;
+	u64 hypercall_status;
+	unsigned long remaining = page_count;
+	unsigned long irq_flags;
+
+	while (remaining) {
+		rep_count = min(remaining, HV_MAP_GPA_BATCH_SIZE);
+		local_irq_save(irq_flags);	/* IRQs off for one batch only */
+		input_page = (struct hv_unmap_gpa_pages *)(*this_cpu_ptr(
+			hyperv_pcpu_input_arg));
+		input_page->target_partition_id = partition_id;
+		input_page->target_gpa_base = gpa_target;
+		input_page->unmap_flags = flags;
+		hypercall_status = hv_do_rep_hypercall(
+			HVCALL_UNMAP_GPA_PAGES, rep_count, 0, input_page, NULL);
+		local_irq_restore(irq_flags);
+
+		status = hypercall_status & HV_HYPERCALL_RESULT_MASK;
+		completed = (hypercall_status & HV_HYPERCALL_REP_COMP_MASK) >>
+				HV_HYPERCALL_REP_COMP_OFFSET;
+		if (status != HV_STATUS_SUCCESS) {
+			pr_err("%s: completed %llu out of %llu, %s\n",
+			       __func__,
+			       page_count - remaining, page_count,
+			       hv_status_to_string(status));
+			ret = -hv_status_to_errno(status);
+			break;
+		}
+
+		remaining -= completed;
+		gpa_target += completed;
+	}
+
+	if (ret && (completed || remaining != page_count)) {
+		pr_err("%s: Partially succeeded; mapped regions may be in invalid state\n",
+		       __func__);
+		ret = -EBADFD;	/* some pages were unmapped before the failure */
+	}
+
+	return ret;
+}
+
+/*
+ * MSHV_MAP_GUEST_MEMORY: pin the user pages described by @user_mem, map them
+ * at the requested guest_pfn and record the region in a free slot.
+ * NOTE(review): if the map hypercall fails part-way, the pages are unpinned
+ * although some may still be mapped in the guest - confirm this is safe.
+ */
+static long
+mshv_partition_ioctl_map_memory(struct mshv_partition *partition,
+				struct mshv_user_mem_region __user *user_mem)
+{
+	struct mshv_user_mem_region mem;
+	struct mshv_mem_region *region;
+	int completed, i;
+	unsigned long remaining, batch_size;
+	struct page **pages;
+	u64 page_count, user_start, user_end, gpfn_start, gpfn_end;
+	u64 region_page_count, region_user_start, region_user_end;
+	u64 region_gpfn_start, region_gpfn_end;
+	long ret = 0;
+
+	/* Check we have enough slots */
+	if (partition->regions.count == MSHV_MAX_MEM_REGIONS) {
+		pr_err("%s: not enough memory region slots\n", __func__);
+		return -ENOSPC;
+	}
+
+	if (copy_from_user(&mem, user_mem, sizeof(mem)))
+		return -EFAULT;
+
+	/* A wrapping guest_pfn range would evade the overlap test below */
+	if (!mem.size ||
+	    (mem.size | mem.userspace_addr) & (PAGE_SIZE - 1) ||
+	    mem.guest_pfn + (mem.size >> PAGE_SHIFT) < mem.guest_pfn ||
+	    !access_ok(mem.userspace_addr, mem.size))
+		return -EINVAL;
+
+	/* Reject overlapping regions */
+	page_count = mem.size >> PAGE_SHIFT;
+	user_start = mem.userspace_addr;
+	user_end = mem.userspace_addr + mem.size;
+	gpfn_start = mem.guest_pfn;
+	gpfn_end = mem.guest_pfn + page_count;
+	for (i = 0; i < MSHV_MAX_MEM_REGIONS; ++i) {
+		region = &partition->regions.slots[i];
+		if (!region->size)
+			continue;
+		region_page_count = region->size >> PAGE_SHIFT;
+		region_user_start = region->userspace_addr;
+		region_user_end = region->userspace_addr + region->size;
+		region_gpfn_start = region->guest_pfn;
+		region_gpfn_end = region->guest_pfn + region_page_count;
+
+		if (user_start < region_user_end &&
+		    region_user_start < user_end)
+			return -EEXIST;
+		if (gpfn_start < region_gpfn_end &&
+		    region_gpfn_start < gpfn_end)
+			return -EEXIST;
+	}
+
+	/* Pin the userspace pages */
+	pages = vzalloc(sizeof(struct page *) * page_count);
+	if (!pages)
+		return -ENOMEM;
+
+	remaining = page_count;
+	while (remaining) {
+		/*
+		 * We need to batch this, as pin_user_pages_fast with the
+		 * FOLL_LONGTERM flag does a big temporary allocation
+		 * of contiguous memory
+		 */
+		batch_size = min(remaining, PIN_PAGES_BATCH_SIZE);
+		completed = pin_user_pages_fast(
+				mem.userspace_addr +
+					(page_count - remaining) * PAGE_SIZE,
+				batch_size,
+				FOLL_WRITE | FOLL_LONGTERM,
+				&pages[page_count - remaining]);
+		if (completed < 0) {
+			pr_err("%s: failed to pin user pages error %i\n",
+			       __func__, completed);
+			ret = completed;
+			goto err_unpin_pages;
+		}
+		remaining -= completed;
+	}
+
+	/* Map the pages to GPA pages */
+	ret = hv_call_map_gpa_pages(partition->id, mem.guest_pfn,
+				    page_count, mem.flags, pages);
+	if (ret)
+		goto err_unpin_pages;
+
+	/* Install the new region */
+	for (i = 0; i < MSHV_MAX_MEM_REGIONS; ++i) {
+		if (!partition->regions.slots[i].size) {
+			region = &partition->regions.slots[i];
+			break;
+		}
+	}
+	region->pages = pages;
+	region->size = mem.size;
+	region->guest_pfn = mem.guest_pfn;
+	region->userspace_addr = mem.userspace_addr;
+	partition->regions.count++;
+
+	return 0;
+
+err_unpin_pages:
+	unpin_user_pages(pages, page_count - remaining); /* only the pinned ones */
+	vfree(pages);
+
+	return ret;
+}
+
+/*
+ * MSHV_UNMAP_GUEST_MEMORY: unmap from the guest, unpin and release the region
+ * whose userspace_addr, size and guest_pfn all equal those in @user_mem.
+ */
+static long
+mshv_partition_ioctl_unmap_memory(struct mshv_partition *partition,
+				  struct mshv_user_mem_region __user *user_mem)
+{
+	struct mshv_user_mem_region req;
+	struct mshv_mem_region *slot;
+	u64 npages;
+	int idx;
+	long err;
+
+	if (!partition->regions.count)
+		return -EINVAL;
+
+	if (copy_from_user(&req, user_mem, sizeof(req)))
+		return -EFAULT;
+
+	for (idx = 0; idx < MSHV_MAX_MEM_REGIONS; ++idx) {
+		slot = &partition->regions.slots[idx];
+		if (slot->size && slot->size == req.size &&
+		    slot->userspace_addr == req.userspace_addr &&
+		    slot->guest_pfn == req.guest_pfn)
+			break;	/* in-use slot that matches the request */
+	}
+	if (idx == MSHV_MAX_MEM_REGIONS)
+		return -EINVAL;
+
+	npages = slot->size >> PAGE_SHIFT;
+	err = hv_call_unmap_gpa_pages(partition->id, slot->guest_pfn,
+				      npages, 0);
+	if (err)
+		return err;
+
+	unpin_user_pages(slot->pages, npages);
+	vfree(slot->pages);
+	memset(slot, 0, sizeof(*slot));	/* size 0 marks the slot free again */
+	partition->regions.count--;
+
+	return 0;
+}
+
 static long
 mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
 {
-	return -ENOTTY;
+	struct mshv_partition *partition = filp->private_data;
+	long ret;
+
+	if (mutex_lock_killable(&partition->mutex))
+		return -EINTR;	/* could not take partition->mutex */
+
+	switch (ioctl) {	/* handlers run with partition->mutex held */
+	case MSHV_MAP_GUEST_MEMORY:
+		ret = mshv_partition_ioctl_map_memory(partition,
+							(void __user *)arg);
+		break;
+	case MSHV_UNMAP_GUEST_MEMORY:
+		ret = mshv_partition_ioctl_unmap_memory(partition,
+							(void __user *)arg);
+		break;
+	default:
+		ret = -ENOTTY;	/* not one of the partition ioctls above */
+	}
+
+	mutex_unlock(&partition->mutex);
+	return ret;
 }
 
 static void
 destroy_partition(struct mshv_partition *partition)
 {
-	unsigned long flags;
+	unsigned long flags, page_count;
+	struct mshv_mem_region *region;
 	int i;
 
 	/* Remove from list of partitions */
@@ -286,6 +592,16 @@ destroy_partition(struct mshv_partition *partition)
 
 	hv_call_delete_partition(partition->id);
 
+	/* Remove regions and unpin the pages */
+	for (i = 0; i < MSHV_MAX_MEM_REGIONS; ++i) {
+		region = &partition->regions.slots[i];
+		if (!region->size)
+			continue;
+		page_count = region->size >> PAGE_SHIFT;
+		unpin_user_pages(region->pages, page_count);
+		vfree(region->pages);
+	}
+
 	kfree(partition);
 }
 
@@ -353,6 +669,8 @@ mshv_ioctl_create_partition(void __user *user_arg)
 	if (!partition)
 		return -ENOMEM;
 
+	mutex_init(&partition->mutex);
+
 	fd = get_unused_fd_flags(O_CLOEXEC);
 	if (fd < 0) {
 		ret = fd;
-- 
2.25.1




[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux