[PATCH 11/11] hmm/dummy_driver: add support for fake remote memory using pages.

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Jérôme Glisse <jglisse@xxxxxxxxxx>

Fake the existent of remote memory using preallocated pages and
demonstrate how to use the hmm api related to remote memory.

Signed-off-by: Jérôme Glisse <jglisse@xxxxxxxxxx>
---
 drivers/char/hmm_dummy.c       | 450 ++++++++++++++++++++++++++++++++++++++++-
 include/uapi/linux/hmm_dummy.h |   8 +-
 2 files changed, 453 insertions(+), 5 deletions(-)

diff --git a/drivers/char/hmm_dummy.c b/drivers/char/hmm_dummy.c
index e87dc7c..2443374 100644
--- a/drivers/char/hmm_dummy.c
+++ b/drivers/char/hmm_dummy.c
@@ -48,6 +48,8 @@
 
 #define HMM_DUMMY_DEVICE_NAME		"hmm_dummy_device"
 #define HMM_DUMMY_DEVICE_MAX_MIRRORS	4
+#define HMM_DUMMY_DEVICE_RMEM_SIZE	(32UL << 20UL)
+#define HMM_DUMMY_DEVICE_RMEM_NBITS	(HMM_DUMMY_DEVICE_RMEM_SIZE >> PAGE_SHIFT)
 
 struct hmm_dummy_device;
 
@@ -73,8 +75,16 @@ struct hmm_dummy_device {
 	/* device file mapping tracking (keep track of all vma) */
 	struct hmm_dummy_mirror	*dmirrors[HMM_DUMMY_DEVICE_MAX_MIRRORS];
 	struct address_space	*fmapping[HMM_DUMMY_DEVICE_MAX_MIRRORS];
+	struct page		**rmem_pages;
+	unsigned long		*rmem_bitmap;
 };
 
+struct hmm_dummy_rmem {
+	struct hmm_rmem		rmem;
+	unsigned long		fuid;
+	unsigned long		luid;
+	uint16_t		*rmem_idx;
+};
 
 /* We only create 2 device to show the inter device rmem sharing/migration
  * capabilities.
@@ -482,6 +492,51 @@ static void hmm_dummy_pt_free(struct hmm_dummy_mirror *dmirror,
 }
 
 
+/* hmm_dummy_rmem - dummy remote memory using system memory pages
+ *
+ * Helper function to allocate fake remote memory out of the device rmem_pages.
+ */
+static void hmm_dummy_rmem_free(struct hmm_dummy_rmem *drmem)
+{
+	struct hmm_dummy_device *ddevice;
+	struct hmm_rmem *rmem = &drmem->rmem;
+	unsigned long i, npages;
+
+	npages = (rmem->luid - rmem->fuid);
+	ddevice = container_of(rmem->device, struct hmm_dummy_device, device);
+	mutex_lock(&ddevice->mutex);
+	for (i = 0; i < npages; ++i) {
+		clear_bit(drmem->rmem_idx[i], ddevice->rmem_bitmap);
+	}
+	mutex_unlock(&ddevice->mutex);
+
+	kfree(drmem->rmem_idx);
+	drmem->rmem_idx = NULL;
+}
+
+static struct hmm_dummy_rmem *hmm_dummy_rmem_new(void)
+{
+	struct hmm_dummy_rmem *drmem;
+
+	drmem = kzalloc(sizeof(*drmem), GFP_KERNEL);
+	return drmem;
+}
+
+static int hmm_dummy_mirror_lmem_to_rmem(struct hmm_dummy_mirror *dmirror,
+					 unsigned long faddr,
+					 unsigned long laddr)
+{
+	struct hmm_mirror *mirror = &dmirror->mirror;
+	struct hmm_fault fault;
+	int ret;
+
+	fault.faddr = faddr & PAGE_MASK;
+	fault.laddr = PAGE_ALIGN(laddr);
+	ret = hmm_migrate_lmem_to_rmem(&fault, mirror);
+	return ret;
+}
+
+
 /* hmm_ops - hmm callback for the hmm dummy driver.
  *
  * Below are the various callback that the hmm api require for a device. The
@@ -574,7 +629,7 @@ static struct hmm_fence *hmm_dummy_lmem_update(struct hmm_mirror *mirror,
 
 			page = hmm_dummy_pte_to_page(*pldp);
 			if (page) {
-				set_page_dirty(page);
+				set_page_dirty_lock(page);
 			}
 		}
 		*pldp &= ~HMM_DUMMY_PTE_DIRTY;
@@ -631,6 +686,318 @@ static int hmm_dummy_lmem_fault(struct hmm_mirror *mirror,
 	return 0;
 }
 
+static struct hmm_rmem *hmm_dummy_rmem_alloc(struct hmm_device *device,
+					     struct hmm_fault *fault)
+{
+	struct hmm_dummy_device *ddevice;
+	struct hmm_dummy_rmem *drmem;
+	struct hmm_rmem *rmem;
+	unsigned long i, npages;
+
+	ddevice = container_of(device, struct hmm_dummy_device, device);
+
+	drmem = hmm_dummy_rmem_new();
+	if (drmem == NULL) {
+		return ERR_PTR(-ENOMEM);
+	}
+	rmem = &drmem->rmem;
+
+	npages = (fault->laddr - fault->faddr) >> PAGE_SHIFT;
+	drmem->rmem_idx = kmalloc(npages * sizeof(uint16_t), GFP_KERNEL);
+	if (drmem->rmem_idx == NULL) {
+		kfree(drmem);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	mutex_lock(&ddevice->mutex);
+	for (i = 0; i < npages; ++i) {
+		int r;
+
+		r = find_first_zero_bit(ddevice->rmem_bitmap,
+					HMM_DUMMY_DEVICE_RMEM_NBITS);
+		if (r < 0) {
+			while ((--i)) {
+				clear_bit(drmem->rmem_idx[i],
+					  ddevice->rmem_bitmap);
+			}
+			kfree(drmem->rmem_idx);
+			kfree(drmem);
+			mutex_unlock(&ddevice->mutex);
+			return ERR_PTR(-ENOMEM);
+		}
+		drmem->rmem_idx[i] = r;
+	}
+	mutex_unlock(&ddevice->mutex);
+
+	return rmem;
+}
+
+static struct hmm_fence *hmm_dummy_rmem_update(struct hmm_mirror *mirror,
+					       struct hmm_rmem *rmem,
+					       unsigned long faddr,
+					       unsigned long laddr,
+					       unsigned long fuid,
+					       enum hmm_etype etype,
+					       bool dirty)
+{
+	struct hmm_dummy_mirror *dmirror;
+	struct hmm_dummy_pt_map pt_map = {0};
+	unsigned long addr, i, mask, or, idx;
+
+	dmirror = container_of(mirror, struct hmm_dummy_mirror, mirror);
+	pt_map.dmirror = dmirror;
+	idx = fuid - rmem->fuid;
+
+	/* Sanity check for debugging hmm real device driver do not have to do that. */
+	switch (etype) {
+	case HMM_UNREGISTER:
+	case HMM_UNMAP:
+	case HMM_MUNMAP:
+	case HMM_MPROT_WONLY:
+	case HMM_MIGRATE_TO_RMEM:
+	case HMM_MIGRATE_TO_LMEM:
+		mask = 0;
+		or = 0;
+		break;
+	case HMM_MPROT_RONLY:
+	case HMM_WRITEBACK:
+		mask = ~HMM_DUMMY_PTE_WRITE;
+		or = 0;
+		break;
+	case HMM_MPROT_RANDW:
+		mask = -1L;
+		or = HMM_DUMMY_PTE_WRITE;
+		break;
+	default:
+		printk(KERN_ERR "%4d:%s invalid event type %d\n",
+		       __LINE__, __func__, etype);
+		return ERR_PTR(-EIO);
+	}
+
+	mutex_lock(&dmirror->mutex);
+	for (i = 0, addr = faddr; addr < laddr; ++i, addr += PAGE_SIZE, ++idx) {
+		unsigned long *pldp;
+
+		pldp = hmm_dummy_pt_pld_map(&pt_map, addr);
+		if (!pldp) {
+			continue;
+		}
+		if (dirty && ((*pldp) & HMM_DUMMY_PTE_DIRTY)) {
+			hmm_pfn_set_dirty(&rmem->pfns[idx]);
+		}
+		*pldp &= ~HMM_DUMMY_PTE_DIRTY;
+		*pldp &= mask;
+		*pldp |= or;
+	}
+	hmm_dummy_pt_unmap(&pt_map);
+
+	switch (etype) {
+	case HMM_UNREGISTER:
+	case HMM_MUNMAP:
+		hmm_dummy_pt_free(dmirror, faddr, laddr);
+		break;
+	default:
+		break;
+	}
+	mutex_unlock(&dmirror->mutex);
+	return NULL;
+}
+
+static int hmm_dummy_rmem_fault(struct hmm_mirror *mirror,
+				struct hmm_rmem *rmem,
+				unsigned long faddr,
+				unsigned long laddr,
+				unsigned long fuid,
+				struct hmm_fault *fault)
+{
+	struct hmm_dummy_mirror *dmirror;
+	struct hmm_dummy_device *ddevice;
+	struct hmm_dummy_pt_map pt_map = {0};
+	struct hmm_dummy_rmem *drmem;
+	unsigned long i;
+	bool write = fault ? !!(fault->flags & HMM_FAULT_WRITE) : false;
+
+	dmirror = container_of(mirror, struct hmm_dummy_mirror, mirror);
+	drmem = container_of(rmem, struct hmm_dummy_rmem, rmem);
+	ddevice = dmirror->ddevice;
+	pt_map.dmirror = dmirror;
+
+	mutex_lock(&dmirror->mutex);
+	for (i = fuid; faddr < laddr; ++i, faddr += PAGE_SIZE) {
+		unsigned long *pldp, pld_idx, pfn, idx = i - rmem->fuid;
+
+		pldp = hmm_dummy_pt_pld_map(&pt_map, faddr);
+		if (!pldp) {
+			continue;
+		}
+		pfn = page_to_pfn(ddevice->rmem_pages[drmem->rmem_idx[idx]]);
+		pld_idx = hmm_dummy_pld_index(faddr);
+		pldp[pld_idx]  = (pfn << HMM_DUMMY_PFN_SHIFT);
+		if (test_bit(HMM_PFN_WRITE, &rmem->pfns[idx])) {
+			pldp[pld_idx] |=  HMM_DUMMY_PTE_WRITE;
+			hmm_pfn_clear_lmem_uptodate(&rmem->pfns[idx]);
+		}
+		pldp[pld_idx] |= HMM_DUMMY_PTE_VALID_PAGE;
+		if (write && !test_bit(HMM_PFN_WRITE, &rmem->pfns[idx])) {
+			/* Fallback to use system memory. Other solution would be
+			 * to migrate back to system memory.
+			 */
+			hmm_pfn_clear_rmem_uptodate(&rmem->pfns[idx]);
+			if (!test_bit(HMM_PFN_LMEM_UPTODATE, &rmem->pfns[idx])) {
+				struct page *spage, *dpage;
+
+				dpage = hmm_pfn_to_page(rmem->pfns[idx]);
+				spage = ddevice->rmem_pages[drmem->rmem_idx[idx]];
+				copy_highpage(dpage, spage);
+				hmm_pfn_set_lmem_uptodate(&rmem->pfns[idx]);
+			}
+			pfn = rmem->pfns[idx] >> HMM_PFN_SHIFT;
+			pldp[pld_idx]  = (pfn << HMM_DUMMY_PFN_SHIFT);
+			pldp[pld_idx] |= HMM_DUMMY_PTE_WRITE;
+			pldp[pld_idx] |= HMM_DUMMY_PTE_VALID_PAGE;
+		}
+	}
+	hmm_dummy_pt_unmap(&pt_map);
+	mutex_unlock(&dmirror->mutex);
+	return 0;
+}
+
+struct hmm_fence *hmm_dummy_rmem_to_lmem(struct hmm_rmem *rmem,
+					 unsigned long fuid,
+					 unsigned long luid)
+{
+	struct hmm_dummy_device *ddevice;
+	struct hmm_dummy_rmem *drmem;
+	unsigned long i;
+
+	ddevice = container_of(rmem->device, struct hmm_dummy_device, device);
+	drmem = container_of(rmem, struct hmm_dummy_rmem, rmem);
+
+	for (i = fuid; i < luid; ++i) {
+		unsigned long idx = i - rmem->fuid;
+		struct page *spage, *dpage;
+
+		if (test_bit(HMM_PFN_LMEM_UPTODATE, &rmem->pfns[idx])) {
+			/* This lmem page is already uptodate. */
+			continue;
+		}
+		spage = ddevice->rmem_pages[drmem->rmem_idx[idx]];
+		dpage = hmm_pfn_to_page(rmem->pfns[idx]);
+		if (!dpage) {
+			return ERR_PTR(-EINVAL);
+		}
+		copy_highpage(dpage, spage);
+		hmm_pfn_set_lmem_uptodate(&rmem->pfns[idx]);
+	}
+
+	return NULL;
+}
+
+struct hmm_fence *hmm_dummy_lmem_to_rmem(struct hmm_rmem *rmem,
+					 unsigned long fuid,
+					 unsigned long luid)
+{
+	struct hmm_dummy_device *ddevice;
+	struct hmm_dummy_rmem *drmem;
+	unsigned long i;
+
+	ddevice = container_of(rmem->device, struct hmm_dummy_device, device);
+	drmem = container_of(rmem, struct hmm_dummy_rmem, rmem);
+
+	for (i = fuid; i < luid; ++i) {
+		unsigned long idx = i - rmem->fuid;
+		struct page *spage, *dpage;
+
+		if (test_bit(HMM_PFN_RMEM_UPTODATE, &rmem->pfns[idx])) {
+			/* This rmem page is already uptodate. */
+			continue;
+		}
+		dpage = ddevice->rmem_pages[drmem->rmem_idx[idx]];
+		spage = hmm_pfn_to_page(rmem->pfns[idx]);
+		if (!spage) {
+			return ERR_PTR(-EINVAL);
+		}
+		copy_highpage(dpage, spage);
+		hmm_pfn_set_rmem_uptodate(&rmem->pfns[idx]);
+	}
+
+	return NULL;
+}
+
+static int hmm_dummy_rmem_do_split(struct hmm_rmem *rmem,
+				   unsigned long fuid,
+				   unsigned long luid)
+{
+	struct hmm_dummy_rmem *drmem, *dnew;
+	struct hmm_fault fault;
+	struct hmm_rmem *new;
+	unsigned long i, pgoff, npages;
+	int ret;
+
+	drmem = container_of(rmem, struct hmm_dummy_rmem, rmem);
+	npages = (luid - fuid);
+	pgoff = (fuid == rmem->fuid) ? 0 : fuid - rmem->fuid;
+	fault.faddr = 0;
+	fault.laddr = npages << PAGE_SHIFT;
+	new = hmm_dummy_rmem_alloc(rmem->device, &fault);
+	if (IS_ERR(new)) {
+		return PTR_ERR(new);
+	}
+	dnew = container_of(new, struct hmm_dummy_rmem, rmem);
+
+	new->fuid = fuid;
+	new->luid = luid;
+	ret = hmm_rmem_split_new(rmem, new);
+	if (ret) {
+		return ret;
+	}
+
+	/* Update the rmem it is fine to hold no lock as no one else can access
+	 * both of this rmem object as long as the range are reserved.
+	 */
+	for (i = 0; i < npages; ++i) {
+		dnew->rmem_idx[i] = drmem->rmem_idx[i + pgoff];
+	}
+	if (!pgoff) {
+		for (i = 0; i < (rmem->luid - rmem->fuid); ++i) {
+			drmem->rmem_idx[i] = drmem->rmem_idx[i + npages];
+		}
+	}
+
+	return 0;
+}
+
+static int hmm_dummy_rmem_split(struct hmm_rmem *rmem,
+				unsigned long fuid,
+				unsigned long luid)
+{
+	int ret;
+
+	if (fuid > rmem->fuid) {
+		ret = hmm_dummy_rmem_do_split(rmem, rmem->fuid, fuid);
+		if (ret) {
+			return ret;
+		}
+	}
+	if (luid < rmem->luid) {
+		ret = hmm_dummy_rmem_do_split(rmem, luid, rmem->luid);
+		if (ret) {
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+static void hmm_dummy_rmem_destroy(struct hmm_rmem *rmem)
+{
+	struct hmm_dummy_rmem *drmem;
+
+	drmem = container_of(rmem, struct hmm_dummy_rmem, rmem);
+	hmm_dummy_rmem_free(drmem);
+	kfree(drmem);
+}
+
 static const struct hmm_device_ops hmm_dummy_ops = {
 	.device_destroy		= &hmm_dummy_device_destroy,
 	.mirror_release		= &hmm_dummy_mirror_release,
@@ -638,6 +1005,14 @@ static const struct hmm_device_ops hmm_dummy_ops = {
 	.fence_wait		= &hmm_dummy_fence_wait,
 	.lmem_update		= &hmm_dummy_lmem_update,
 	.lmem_fault		= &hmm_dummy_lmem_fault,
+	.rmem_alloc		= &hmm_dummy_rmem_alloc,
+	.rmem_update		= &hmm_dummy_rmem_update,
+	.rmem_fault		= &hmm_dummy_rmem_fault,
+	.rmem_to_lmem		= &hmm_dummy_rmem_to_lmem,
+	.lmem_to_rmem		= &hmm_dummy_lmem_to_rmem,
+	.rmem_split		= &hmm_dummy_rmem_split,
+	.rmem_split_adjust	= &hmm_dummy_rmem_split,
+	.rmem_destroy		= &hmm_dummy_rmem_destroy,
 };
 
 
@@ -880,7 +1255,7 @@ static ssize_t hmm_dummy_fops_write(struct file *filp,
 		if (!(pldp[pld_idx] & HMM_DUMMY_PTE_WRITE)) {
 			hmm_dummy_pt_unmap(&pt_map);
 			mutex_unlock(&dmirror->mutex);
-				goto fault;
+			goto fault;
 		}
 		pldp[pld_idx] |= HMM_DUMMY_PTE_DIRTY;
 		page = hmm_dummy_pte_to_page(pldp[pld_idx]);
@@ -964,8 +1339,11 @@ static long hmm_dummy_fops_unlocked_ioctl(struct file *filp,
 					  unsigned int command,
 					  unsigned long arg)
 {
+	struct hmm_dummy_migrate dmigrate;
 	struct hmm_dummy_device *ddevice;
 	struct hmm_dummy_mirror *dmirror;
+	struct hmm_mirror *mirror;
+	void __user *uarg = (void __user *)arg;
 	unsigned minor;
 	int ret;
 
@@ -1011,6 +1389,31 @@ static long hmm_dummy_fops_unlocked_ioctl(struct file *filp,
 				       "mirroring address space of %d\n",
 				       dmirror->pid);
 		return 0;
+	case HMM_DUMMY_MIGRATE_TO_RMEM:
+		mutex_lock(&ddevice->mutex);
+		dmirror = ddevice->dmirrors[minor];
+		if (!dmirror) {
+			mutex_unlock(&ddevice->mutex);
+			return -EINVAL;
+		}
+		mirror = &dmirror->mirror;
+		mutex_unlock(&ddevice->mutex);
+
+		if (copy_from_user(&dmigrate, uarg, sizeof(dmigrate))) {
+			return -EFAULT;
+		}
+
+		ret = hmm_dummy_pt_alloc(dmirror,
+					 dmigrate.faddr,
+					 dmigrate.laddr);
+		if (ret) {
+			return ret;
+		}
+
+		ret = hmm_dummy_mirror_lmem_to_rmem(dmirror,
+						    dmigrate.faddr,
+						    dmigrate.laddr);
+		return ret;
 	default:
 		return -EINVAL;
 	}
@@ -1034,7 +1437,31 @@ static const struct file_operations hmm_dummy_fops = {
  */
 static int hmm_dummy_device_init(struct hmm_dummy_device *ddevice)
 {
-	int ret, i;
+	struct page **pages;
+	unsigned long *bitmap;
+	int ret, i, npages;
+
+	npages = HMM_DUMMY_DEVICE_RMEM_SIZE >> PAGE_SHIFT;
+	bitmap = kzalloc(BITS_TO_LONGS(npages) * sizeof(long), GFP_KERNEL);
+	if (!bitmap) {
+		return -ENOMEM;
+	}
+	pages = kzalloc(npages * sizeof(void*), GFP_KERNEL);
+	if (!pages) {
+		kfree(bitmap);
+		return -ENOMEM;
+	}
+	for (i = 0; i < npages; ++i) {
+		pages[i] = alloc_page(GFP_KERNEL);
+		if (!pages[i]) {
+			while ((--i)) {
+				__free_page(pages[i]);
+			}
+			kfree(bitmap);
+			kfree(pages);
+			return -ENOMEM;
+		}
+	}
 
 	ret = alloc_chrdev_region(&ddevice->dev, 0,
 				  HMM_DUMMY_DEVICE_MAX_MIRRORS,
@@ -1066,15 +1493,23 @@ static int hmm_dummy_device_init(struct hmm_dummy_device *ddevice)
 		goto error;
 	}
 
+	ddevice->rmem_bitmap = bitmap;
+	ddevice->rmem_pages = pages;
+
 	return 0;
 
 error:
+	for (i = 0; i < npages; ++i) {
+		__free_page(pages[i]);
+	}
+	kfree(bitmap);
+	kfree(pages);
 	return ret;
 }
 
 static void hmm_dummy_device_fini(struct hmm_dummy_device *ddevice)
 {
-	unsigned i;
+	unsigned i, npages;
 
 	/* First finish hmm. */
 	for (i = 0; i < HMM_DUMMY_DEVICE_MAX_MIRRORS; i++) {
@@ -1092,6 +1527,13 @@ static void hmm_dummy_device_fini(struct hmm_dummy_device *ddevice)
 	cdev_del(&ddevice->cdev);
 	unregister_chrdev_region(ddevice->dev,
 				 HMM_DUMMY_DEVICE_MAX_MIRRORS);
+
+	npages = HMM_DUMMY_DEVICE_RMEM_SIZE >> PAGE_SHIFT;
+	for (i = 0; i < npages; ++i) {
+		__free_page(ddevice->rmem_pages[i]);
+	}
+	kfree(ddevice->rmem_bitmap);
+	kfree(ddevice->rmem_pages);
 }
 
 static int __init hmm_dummy_init(void)
diff --git a/include/uapi/linux/hmm_dummy.h b/include/uapi/linux/hmm_dummy.h
index 16ae0d3..027c453 100644
--- a/include/uapi/linux/hmm_dummy.h
+++ b/include/uapi/linux/hmm_dummy.h
@@ -29,6 +29,12 @@
 #include <linux/irqnr.h>
 
 /* Expose the address space of the calling process through hmm dummy dev file */
-#define HMM_DUMMY_EXPOSE_MM	_IO( 'R', 0x00 )
+#define HMM_DUMMY_EXPOSE_MM		_IO( 'R', 0x00 )
+#define HMM_DUMMY_MIGRATE_TO_RMEM	_IO( 'R', 0x01 )
+
+struct hmm_dummy_migrate {
+	uint64_t		faddr;
+	uint64_t		laddr;
+};
 
 #endif /* _UAPI_LINUX_RANDOM_H */
-- 
1.9.0

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]
  Powered by Linux