[RFC 9/9] prd: Add support for page struct mapping

From: Yigal Korman <yigal@xxxxxxxxxxxxx>

One of the current shortcomings of the NVDIMM/PMEM
support is that this memory has no struct pages
associated with it, and therefore it cannot be passed
to a block device or the network, or be DMAed in any
way through another device in the system.

This simple patch fixes all of that. After this patch an FS
can do:
	bdev_direct_access(,&pfn,);
	page = pfn_to_page(pfn);
and use that page for lock_page(), set_page_dirty(), and/or
anything else one might do with a struct page *.
(Note that with brd one can already do this.)
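
For illustration, here is a minimal sketch of such an FS-side
user (fs_pin_pmem_block() is a hypothetical helper; the
bdev_direct_access() signature follows the one this series
builds on and may differ in other trees):

	static int fs_pin_pmem_block(struct block_device *bdev,
				     sector_t sector)
	{
		void *addr;
		unsigned long pfn;
		long avail;
		struct page *page;

		/* Ask the pmem driver for a direct mapping */
		avail = bdev_direct_access(bdev, sector, &addr, &pfn,
					   PAGE_SIZE);
		if (avail < PAGE_SIZE)
			return avail < 0 ? avail : -ENXIO;

		/* Only valid with BLK_DEV_PMEM_USE_PAGES=y */
		page = pfn_to_page(pfn);
		lock_page(page);
		set_page_dirty(page);
		unlock_page(page);
		return 0;
	}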

[pmem-pages-ref-count]
pmem will serve its pages with ref == 0. Once an FS does
a blkdev_get_XXX(,FMODE_EXCL,), that memory is owned by the FS.
The FS needs to manage its allocation, just as it already does
for its disk blocks. The FS should set page->count = 2 before
submission to any kernel subsystem, so that when it returns the
page will never be released to the kernel's page allocators.
(page_freeze)
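
As a sketch of the rule above (fs_freeze_pmem_page() is a
hypothetical helper; the field is named _count in this kernel,
later renamed _refcount):

	static inline void fs_freeze_pmem_page(struct page *page)
	{
		/* With ref == 2 the count never drops to zero
		 * inside the kernel, so the page is never handed
		 * back to the page allocators; the FS remains the
		 * sole owner and allocator of this block.
		 */
		atomic_set(&page->_count, 2);
	}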

All that is actually needed for this is to allocate page
sections and map them into kernel virtual memory. Note that
these sections are not associated with any zone, because that
would add their pages to the page allocators.
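
For reference, the section arithmetic the code below relies on
(macros from include/linux/mmzone.h; the numbers assume x86_64
defaults of 4K pages and 128MB sections, PFN_SECTION_SHIFT == 15):

	/* pfn_to_section_nr(pfn) is pfn >> PFN_SECTION_SHIFT */
	phys_addr  = 0x100000000UL;		/* pmem at 4GB	*/
	total_size = 4UL << 30;			/* 4GB of pmem	*/
	start_pfn  = phys_addr >> PAGE_SHIFT;	/* == 0x100000	*/
	start_sec  = start_pfn >> PFN_SECTION_SHIFT;	/* == 32 */
	/* 4GB / 128MB: the hotplug loop adds sections 32..63 */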

In order to reuse existing code, prd now depends on memory hotplug
and sparse memory configuration options.

If the system has MEMORY_HOTPLUG_SPARSE enabled, then a new
config option BLK_DEV_PMEM_USE_PAGES becomes available
(default y).

We also need MEMORY_HOTREMOVE, so if BLK_DEV_PMEM_USE_PAGES
is on we "select" MEMORY_HOTREMOVE. Most distros have
MEMORY_HOTPLUG_SPARSE on but not MEMORY_HOTREMOVE; for this
feature we must have both.
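
For example, a .config that gets page structs for pmem would
contain (BLK_DEV_PMEM shown built-in here; it can equally be a
module):

	CONFIG_SPARSEMEM=y
	CONFIG_MEMORY_HOTPLUG=y
	CONFIG_MEMORY_HOTPLUG_SPARSE=y
	CONFIG_MEMORY_HOTREMOVE=y
	CONFIG_BLK_DEV_PMEM=y
	CONFIG_BLK_DEV_PMEM_USE_PAGES=y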

Signed-off-by: Yigal Korman <yigal@xxxxxxxxxxxxx>
Signed-off-by: Boaz Harrosh <boaz@xxxxxxxxxxxxx>
---
 drivers/block/Kconfig |  13 +++++
 drivers/block/prd.c   | 137 ++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 145 insertions(+), 5 deletions(-)

diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 8f0c225..8aca1b7 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -416,6 +416,19 @@ config BLK_DEV_PMEM
 	  Most normal users won't need this functionality, and can thus say N
 	  here.
 
+config BLK_DEV_PMEM_USE_PAGES
+	bool "Enable use of page struct pages with pmem"
+	depends on BLK_DEV_PMEM
+	depends on MEMORY_HOTPLUG_SPARSE
+	select MEMORY_HOTREMOVE
+	default y
+	help
+	  If a user of a PMEM device needs "struct page" entries
+	  associated with its memory, so that this memory can be sent
+	  to other block devices, sent on the network, or DMA
+	  transferred to other devices in the system, then say Y here.
+	  If unsure, say Y.
+
 config CDROM_PKTCDVD
 	tristate "Packet writing on CD/DVD media"
 	depends on !UML
diff --git a/drivers/block/prd.c b/drivers/block/prd.c
index 36b8fe4..6115553 100644
--- a/drivers/block/prd.c
+++ b/drivers/block/prd.c
@@ -241,6 +241,134 @@ MODULE_PARM_DESC(map,
 static LIST_HEAD(prd_devices);
 static DEFINE_MUTEX(prd_devices_mutex);
 
+#ifdef CONFIG_BLK_DEV_PMEM_USE_PAGES
+static int prd_add_page_mapping(phys_addr_t phys_addr, size_t total_size,
+				void **o_virt_addr)
+{
+	int nid = memory_add_physaddr_to_nid(phys_addr);
+	unsigned long start_pfn = phys_addr >> PAGE_SHIFT;
+	unsigned long nr_pages = total_size >> PAGE_SHIFT;
+	unsigned int start_sec = pfn_to_section_nr(start_pfn);
+	unsigned int end_sec = pfn_to_section_nr(start_pfn + nr_pages - 1);
+	unsigned long phys_start_pfn;
+	struct page **page_array, **mapped_page_array;
+	unsigned long i;
+	struct vm_struct *vm_area;
+	void *virt_addr;
+	int ret = 0;
+
+	for (i = start_sec; i <= end_sec; i++) {
+		phys_start_pfn = i << PFN_SECTION_SHIFT;
+
+		if (pfn_valid(phys_start_pfn)) {
+			pr_warn("prd: memory section %lu already exists.\n", i);
+			continue;
+		}
+
+		ret = sparse_add_one_section(nid, phys_start_pfn);
+		if (unlikely(ret < 0)) {
+			if (ret == -EEXIST) {
+				ret = 0;
+				continue;
+			} else {
+				pr_warn("prd: sparse_add_one_section => %d\n",
+					ret);
+				return ret;
+			}
+		}
+	}
+
+	virt_addr = page_address(pfn_to_page(phys_addr >> PAGE_SHIFT));
+
+	page_array = vmalloc(sizeof(struct page *) * nr_pages);
+	if (unlikely(!page_array)) {
+		pr_warn("prd: failed to allocate nr_pages=0x%lx\n", nr_pages);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < nr_pages; i++)
+		page_array[i] = pfn_to_page(start_pfn + i);
+
+	/* __get_vm_area requires a range of addresses from which to allocate
+	 * the vm_area. This range will include more pages than we need,
+	 * because it allocates one guard page at the end. Usually you give it
+	 * a wide range to choose from, but we want exact addresses, so add
+	 * the size of the guard page to the end of the range (otherwise this
+	 * will always fail).
+	 */
+	/* TODO: this guard page may confuse users who ask for several pmem
+	 * devices in adjacent areas (the start of the next pmem will be
+	 * occupied by the guard page of the previous pmem).
+	 */
+	vm_area = __get_vm_area(total_size, VM_USERMAP, (ulong)virt_addr,
+				(ulong)virt_addr + total_size + PAGE_SIZE);
+	if (unlikely(!vm_area)) {
+		pr_err("prd: failed to __get_vm_area.\n");
+		ret = -ENOMEM;
+		goto free_array;
+	}
+
+	mapped_page_array = page_array;
+	ret = map_vm_area(vm_area, PAGE_KERNEL, &mapped_page_array);
+	if (unlikely(ret || mapped_page_array < (page_array + nr_pages))) {
+		pr_err("prd: failed to map_vm_area => %d\n", ret);
+		free_vm_area(vm_area);
+		if (!ret)
+			ret = -ENOMEM;
+		goto free_array;
+	}
+	*o_virt_addr = virt_addr;
+
+free_array:
+	vfree(page_array);
+	return ret;
+}
+
+static void prd_remove_page_mapping(phys_addr_t phys_addr, size_t total_size,
+				    void *virt_addr)
+{
+	unsigned long start_pfn = phys_addr >> PAGE_SHIFT;
+	unsigned long nr_pages = total_size >> PAGE_SHIFT;
+	unsigned int start_sec = pfn_to_section_nr(start_pfn);
+	unsigned int end_sec = pfn_to_section_nr(start_pfn + nr_pages - 1);
+	unsigned int i;
+
+	for (i = start_sec; i <= end_sec; i++) {
+		struct mem_section *ms = __nr_to_section(i);
+		int nid = pfn_to_nid((unsigned long)i << PFN_SECTION_SHIFT);
+
+		if (!valid_section(ms)) {
+			pr_warn("prd: memory section %u is missing.\n", i);
+			continue;
+		}
+
+		sparse_remove_one_section(nid, ms);
+	}
+	vunmap(virt_addr);
+}
+
+#else /* !CONFIG_BLK_DEV_PMEM_USE_PAGES */
+static int prd_add_page_mapping(phys_addr_t phys_addr, size_t total_size,
+				void **o_virt_addr)
+{
+	void *virt_addr = ioremap_cache(phys_addr, total_size);
+
+	if (unlikely(!virt_addr))
+		return -ENXIO;
+
+	*o_virt_addr = virt_addr;
+	return 0;
+}
+
+static void prd_remove_page_mapping(phys_addr_t phys_addr, size_t total_size,
+				    void *virt_addr)
+{
+	iounmap(virt_addr);
+}
+#endif /* CONFIG_BLK_DEV_PMEM_USE_PAGES */
+
+
+
 /* prd->phys_addr and prd->size need to be set.
  * Will then set virt_addr if successful.
  */
@@ -257,11 +385,10 @@ int prd_mem_map(struct prd_device *prd)
 		return -EINVAL;
 	}
 
-	prd->virt_addr = ioremap_cache(prd->phys_addr, prd->size);
-	if (unlikely(!prd->virt_addr)) {
-		err = -ENOMEM;
+	err = prd_add_page_mapping(prd->phys_addr, prd->size, &prd->virt_addr);
+	if (unlikely(err))
 		goto out_release;
-	}
+
 	return 0;
 
 out_release:
@@ -274,7 +401,7 @@ void prd_mem_unmap(struct prd_device *prd)
 	if (unlikely(!prd->virt_addr))
 		return;
 
-	iounmap(prd->virt_addr);
+	prd_remove_page_mapping(prd->phys_addr, prd->size, prd->virt_addr);
 	release_mem_region(prd->phys_addr, prd->size);
 	prd->virt_addr = NULL;
 }
-- 
1.9.3

