[PATCH 6/7] libnvdimm/pfn: Introduce 'struct pfn_map_info'

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



The 'start_pad' accounting bug resulted from the pmem driver inferring
properties of the established pagemap to calculate pmem->phys_addr and
pmem->size, and that nd_pfn->data_offset was identical to
pmem->data_offset. The assumptions in the current implementation are
only true when 'start_pad' is zero.

The confusion resulted in the wrong offset being applied for
sector-to-physical-page translations. In advance of introducing a
corrected implementation make the mapping parameters and translation
explicit via a new 'struct pfn_map_info' to carry the mapping and
device-offset to physical address translation parameters.

No functional change intended, this cleanup preserves the existing bug.

Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>
---
 drivers/dax/pmem.c              |    9 ++-
 drivers/nvdimm/nd.h             |   13 ++++-
 drivers/nvdimm/pfn_devs.c       |   21 ++++++-
 drivers/nvdimm/pmem.c           |  111 ++++++++++++++++++---------------------
 drivers/nvdimm/pmem.h           |   12 +---
 tools/testing/nvdimm/pmem-dax.c |   15 +++--
 6 files changed, 96 insertions(+), 85 deletions(-)

diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c
index 2c1f459c0c63..80359df4f662 100644
--- a/drivers/dax/pmem.c
+++ b/drivers/dax/pmem.c
@@ -64,6 +64,7 @@ static int dax_pmem_probe(struct device *dev)
 	struct nd_pfn_sb *pfn_sb;
 	struct dev_dax *dev_dax;
 	struct dax_pmem *dax_pmem;
+	struct pfn_map_info mi;
 	struct nd_namespace_io *nsio;
 	struct dax_region *dax_region;
 	struct nd_namespace_common *ndns;
@@ -83,7 +84,7 @@ static int dax_pmem_probe(struct device *dev)
 	rc = devm_nsio_enable(dev, nsio);
 	if (rc)
 		return rc;
-	rc = nvdimm_setup_pfn(nd_pfn, &dax_pmem->pgmap);
+	rc = nvdimm_setup_pfn(nd_pfn, &dax_pmem->pgmap, &mi);
 	if (rc)
 		return rc;
 	devm_nsio_disable(dev, nsio);
@@ -117,8 +118,10 @@ static int dax_pmem_probe(struct device *dev)
 		return PTR_ERR(addr);
 
 	/* adjust the dax_region resource to the start of data */
-	memcpy(&res, &dax_pmem->pgmap.res, sizeof(res));
-	res.start += le64_to_cpu(pfn_sb->dataoff);
+	res = (struct resource) {
+		.start = mi.map_base + mi.map_pad + mi.map_offset,
+		.end = mi.map_base + mi.map_pad + mi.map_size - 1,
+	};
 
 	rc = sscanf(dev_name(&ndns->dev), "namespace%d.%d", &region_id, &id);
 	if (rc != 2)
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 379bf4305e61..4339d338928b 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -377,13 +377,22 @@ const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns,
 unsigned int pmem_sector_size(struct nd_namespace_common *ndns);
 void nvdimm_badblocks_populate(struct nd_region *nd_region,
 		struct badblocks *bb, const struct resource *res);
+struct pfn_map_info {
+	resource_size_t map_base;
+	unsigned long map_offset;
+	resource_size_t map_size;
+	unsigned long map_pad;
+	u64 pfn_flags;
+	void *map;
+};
 #if IS_ENABLED(CONFIG_ND_CLAIM)
-int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap);
+int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap,
+		struct pfn_map_info *mi);
 int devm_nsio_enable(struct device *dev, struct nd_namespace_io *nsio);
 void devm_nsio_disable(struct device *dev, struct nd_namespace_io *nsio);
 #else
 static inline int nvdimm_setup_pfn(struct nd_pfn *nd_pfn,
-				   struct dev_pagemap *pgmap)
+		struct dev_pagemap *pgmap, struct pfn_map_info *mi)
 {
 	return -ENXIO;
 }
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 110699f4c3e4..1c2a0e707da3 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -611,7 +611,8 @@ static unsigned long init_altmap_reserve(resource_size_t base)
 	return reserve;
 }
 
-static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
+static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap,
+		struct pfn_map_info *mi)
 {
 	struct resource *res = &pgmap->res;
 	struct vmem_altmap *altmap = &pgmap->altmap;
@@ -632,6 +633,19 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
 	res->start += start_pad;
 	res->end -= end_trunc;
 
+	*mi = (struct pfn_map_info) {
+		/*
+		 * TODO fix implementation to properly account for
+		 * 'start_pad' in map_base, and map_offset. As is, the
+		 * fact that __va(map_base) != __pa(map) leads
+		 * mistranslations in pmem_direct_access().
+		 */
+		.map_base = nsio->res.start,
+		.map_pad = start_pad,
+		.map_offset = offset,
+		.map_size = resource_size(res),
+	};
+
 	if (nd_pfn->mode == PFN_MODE_RAM) {
 		if (offset < reserve)
 			return -EINVAL;
@@ -789,7 +803,8 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
  * Determine the effective resource range and vmem_altmap from an nd_pfn
  * instance.
  */
-int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
+int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap,
+		struct pfn_map_info *mi)
 {
 	int rc;
 
@@ -801,6 +816,6 @@ int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
 		return rc;
 
 	/* we need a valid pfn_sb before we can init a dev_pagemap */
-	return __nvdimm_setup_pfn(nd_pfn, pgmap);
+	return __nvdimm_setup_pfn(nd_pfn, pgmap, mi);
 }
 EXPORT_SYMBOL_GPL(nvdimm_setup_pfn);
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index bc2f700feef8..46b823f3b94d 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -34,9 +34,7 @@
 #include <linux/nd.h>
 #include <linux/backing-dev.h>
 #include "pmem.h"
-#include "pfn.h"
 #include "nd.h"
-#include "nd-core.h"
 
 static struct device *to_dev(struct pmem_device *pmem)
 {
@@ -58,7 +56,7 @@ static void hwpoison_clear(struct pmem_device *pmem,
 	unsigned long pfn_start, pfn_end, pfn;
 
 	/* only pmem in the linear map supports HWPoison */
-	if (is_vmalloc_addr(pmem->virt_addr))
+	if (is_vmalloc_addr(pmem->mi.map))
 		return;
 
 	pfn_start = PHYS_PFN(phys);
@@ -80,17 +78,18 @@ static blk_status_t pmem_clear_poison(struct pmem_device *pmem,
 		phys_addr_t offset, unsigned int len)
 {
 	struct device *dev = to_dev(pmem);
+	struct pfn_map_info *mi = &pmem->mi;
 	sector_t sector;
 	long cleared;
 	blk_status_t rc = BLK_STS_OK;
 
-	sector = (offset - pmem->data_offset) / 512;
+	sector = (offset - mi->map_offset) / 512;
 
-	cleared = nvdimm_clear_poison(dev, pmem->phys_addr + offset, len);
+	cleared = nvdimm_clear_poison(dev, mi->map_base + offset, len);
 	if (cleared < len)
 		rc = BLK_STS_IOERR;
 	if (cleared > 0 && cleared / 512) {
-		hwpoison_clear(pmem, pmem->phys_addr + offset, cleared);
+		hwpoison_clear(pmem, mi->map_base + offset, cleared);
 		cleared /= 512;
 		dev_dbg(dev, "%#llx clear %ld sector%s\n",
 				(unsigned long long) sector, cleared,
@@ -100,7 +99,7 @@ static blk_status_t pmem_clear_poison(struct pmem_device *pmem,
 			sysfs_notify_dirent(pmem->bb_state);
 	}
 
-	arch_invalidate_pmem(pmem->virt_addr + offset, len);
+	arch_invalidate_pmem(mi->map + offset, len);
 
 	return rc;
 }
@@ -151,8 +150,9 @@ static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page,
 {
 	blk_status_t rc = BLK_STS_OK;
 	bool bad_pmem = false;
-	phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
-	void *pmem_addr = pmem->virt_addr + pmem_off;
+	struct pfn_map_info *mi = &pmem->mi;
+	phys_addr_t pmem_off = sector * 512 + mi->map_offset;
+	void *pmem_addr = mi->map + pmem_off;
 
 	if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
 		bad_pmem = true;
@@ -247,16 +247,17 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
 __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
 		long nr_pages, void **kaddr, pfn_t *pfn)
 {
-	resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset;
+	struct pfn_map_info *mi = &pmem->mi;
+	resource_size_t offset = PFN_PHYS(pgoff) + mi->map_offset;
 
 	if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
 					PFN_PHYS(nr_pages))))
 		return -EIO;
 
 	if (kaddr)
-		*kaddr = pmem->virt_addr + offset;
+		*kaddr = mi->map + offset;
 	if (pfn)
-		*pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
+		*pfn = phys_to_pfn_t(mi->map_base + offset, mi->pfn_flags);
 
 	/*
 	 * If badblocks are present, limit known good range to the
@@ -264,7 +265,7 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
 	 */
 	if (unlikely(pmem->bb.count))
 		return nr_pages;
-	return PHYS_PFN(pmem->size - pmem->pfn_pad - offset);
+	return PHYS_PFN(mi->map_size - offset);
 }
 
 static const struct block_device_operations pmem_fops = {
@@ -354,45 +355,49 @@ static int pmem_attach_disk(struct device *dev,
 	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
 	struct nd_region *nd_region = to_nd_region(dev->parent);
 	int nid = dev_to_node(dev), fua;
-	struct resource *res = &nsio->res;
+	struct pfn_map_info *mi;
 	struct resource bb_res;
 	struct nd_pfn *nd_pfn = NULL;
 	struct dax_device *dax_dev;
-	struct nd_pfn_sb *pfn_sb;
 	struct pmem_device *pmem;
 	struct request_queue *q;
 	struct device *gendev;
 	struct gendisk *disk;
-	void *addr;
 	int rc;
 
 	pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
 	if (!pmem)
 		return -ENOMEM;
+	mi = &pmem->mi;
 
 	/* while nsio_rw_bytes is active, parse a pfn info block if present */
 	if (is_nd_pfn(dev)) {
 		nd_pfn = to_nd_pfn(dev);
-		rc = nvdimm_setup_pfn(nd_pfn, &pmem->pgmap);
+		rc = nvdimm_setup_pfn(nd_pfn, &pmem->pgmap, mi);
 		if (rc)
 			return rc;
+	} else {
+		*mi = (struct pfn_map_info) {
+			.map_offset = 0,
+			.map_base = nsio->res.start,
+			.map_size = resource_size(&nsio->res),
+		};
 	}
 
 	/* we're attaching a block device, disable raw namespace access */
 	devm_nsio_disable(dev, nsio);
 
 	dev_set_drvdata(dev, pmem);
-	pmem->phys_addr = res->start;
-	pmem->size = resource_size(res);
 	fua = nvdimm_has_flush(nd_region);
 	if (!IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) || fua < 0) {
 		dev_warn(dev, "unable to guarantee persistence of writes\n");
 		fua = 0;
 	}
 
-	if (!devm_request_mem_region(dev, res->start, resource_size(res),
+	if (!devm_request_mem_region(dev, nsio->res.start,
+				resource_size(&nsio->res),
 				dev_name(&ndns->dev))) {
-		dev_warn(dev, "could not reserve region %pR\n", res);
+		dev_warn(dev, "could not reserve region %pR\n", &nsio->res);
 		return -EBUSY;
 	}
 
@@ -403,37 +408,27 @@ static int pmem_attach_disk(struct device *dev,
 	if (devm_add_action_or_reset(dev, pmem_release_queue, q))
 		return -ENOMEM;
 
-	pmem->pfn_flags = PFN_DEV;
+	mi->pfn_flags = PFN_DEV;
 	pmem->pgmap.ref = &q->q_usage_counter;
 	pmem->pgmap.kill = pmem_freeze_queue;
 	if (is_nd_pfn(dev)) {
 		if (setup_pagemap_fsdax(dev, &pmem->pgmap))
 			return -ENOMEM;
-		addr = devm_memremap_pages(dev, &pmem->pgmap);
-		pfn_sb = nd_pfn->pfn_sb;
-		pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
-		pmem->pfn_pad = resource_size(res) -
-			resource_size(&pmem->pgmap.res);
-		pmem->pfn_flags |= PFN_MAP;
-		memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res));
-		bb_res.start += pmem->data_offset;
+		mi->map = devm_memremap_pages(dev, &pmem->pgmap);
+		mi->pfn_flags |= PFN_MAP;
 	} else if (pmem_should_map_pages(dev)) {
 		memcpy(&pmem->pgmap.res, &nsio->res, sizeof(pmem->pgmap.res));
 		pmem->pgmap.altmap_valid = false;
 		if (setup_pagemap_fsdax(dev, &pmem->pgmap))
 			return -ENOMEM;
-		addr = devm_memremap_pages(dev, &pmem->pgmap);
-		pmem->pfn_flags |= PFN_MAP;
-		memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res));
-	} else {
-		addr = devm_memremap(dev, pmem->phys_addr,
-				pmem->size, ARCH_MEMREMAP_PMEM);
-		memcpy(&bb_res, &nsio->res, sizeof(bb_res));
-	}
+		mi->map = devm_memremap_pages(dev, &pmem->pgmap);
+		mi->pfn_flags |= PFN_MAP;
+	} else
+		mi->map = devm_memremap(dev, mi->map_base, mi->map_size,
+				ARCH_MEMREMAP_PMEM);
 
-	if (IS_ERR(addr))
-		return PTR_ERR(addr);
-	pmem->virt_addr = addr;
+	if (IS_ERR(mi->map))
+		return PTR_ERR(mi->map);
 
 	blk_queue_write_cache(q, true, fua);
 	blk_queue_make_request(q, pmem_make_request);
@@ -441,7 +436,7 @@ static int pmem_attach_disk(struct device *dev,
 	blk_queue_logical_block_size(q, pmem_sector_size(ndns));
 	blk_queue_max_hw_sectors(q, UINT_MAX);
 	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
-	if (pmem->pfn_flags & PFN_MAP)
+	if (mi->pfn_flags & PFN_MAP)
 		blk_queue_flag_set(QUEUE_FLAG_DAX, q);
 	q->queuedata = pmem;
 
@@ -455,10 +450,13 @@ static int pmem_attach_disk(struct device *dev,
 	disk->flags		= GENHD_FL_EXT_DEVT;
 	disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO;
 	nvdimm_namespace_disk_name(ndns, disk->disk_name);
-	set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset)
-			/ 512);
+	set_capacity(disk, (mi->map_size - mi->map_offset) / 512);
 	if (devm_init_badblocks(dev, &pmem->bb))
 		return -ENOMEM;
+	bb_res = (struct resource) {
+		.start = mi->map_base + mi->map_pad + mi->map_offset,
+		.end = mi->map_base + mi->map_pad + mi->map_size - 1,
+	};
 	nvdimm_badblocks_populate(nd_region, &pmem->bb, &bb_res);
 	disk->bb = &pmem->bb;
 
@@ -540,7 +538,6 @@ static void nd_pmem_shutdown(struct device *dev)
 static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
 {
 	struct nd_region *nd_region;
-	resource_size_t offset = 0, end_trunc = 0;
 	struct nd_namespace_common *ndns;
 	struct nd_namespace_io *nsio;
 	struct resource res;
@@ -556,32 +553,26 @@ static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
 		ndns = nd_btt->ndns;
 		nd_region = to_nd_region(ndns->dev.parent);
 		nsio = to_nd_namespace_io(&ndns->dev);
+		res = (struct resource) {
+			.start = nsio->res.start,
+			.end = nsio->res.end,
+		};
 		bb = &nsio->bb;
 		bb_state = NULL;
 	} else {
 		struct pmem_device *pmem = dev_get_drvdata(dev);
+		struct pfn_map_info *mi = &pmem->mi;
 
 		nd_region = to_region(pmem);
 		bb = &pmem->bb;
 		bb_state = pmem->bb_state;
 
-		if (is_nd_pfn(dev)) {
-			struct nd_pfn *nd_pfn = to_nd_pfn(dev);
-			struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
-
-			ndns = nd_pfn->ndns;
-			offset = pmem->data_offset +
-					__le32_to_cpu(pfn_sb->start_pad);
-			end_trunc = __le32_to_cpu(pfn_sb->end_trunc);
-		} else {
-			ndns = to_ndns(dev);
-		}
-
-		nsio = to_nd_namespace_io(&ndns->dev);
+		res = (struct resource) {
+			.start = mi->map_base + mi->map_pad + mi->map_offset,
+			.end = mi->map_base + mi->map_pad + mi->map_size - 1,
+		};
 	}
 
-	res.start = nsio->res.start + offset;
-	res.end = nsio->res.end - end_trunc;
 	nvdimm_badblocks_populate(nd_region, bb, &res);
 	if (bb_state)
 		sysfs_notify_dirent(bb_state);
diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h
index 59cfe13ea8a8..6c27bbae349f 100644
--- a/drivers/nvdimm/pmem.h
+++ b/drivers/nvdimm/pmem.h
@@ -6,19 +6,11 @@
 #include <linux/types.h>
 #include <linux/pfn_t.h>
 #include <linux/fs.h>
+#include "nd.h"
 
 /* this definition is in it's own header for tools/testing/nvdimm to consume */
 struct pmem_device {
-	/* One contiguous memory region per device */
-	phys_addr_t		phys_addr;
-	/* when non-zero this device is hosting a 'pfn' instance */
-	phys_addr_t		data_offset;
-	u64			pfn_flags;
-	void			*virt_addr;
-	/* immutable base size of the namespace */
-	size_t			size;
-	/* trim size when namespace capacity has been section aligned */
-	u32			pfn_pad;
+	struct pfn_map_info	mi;
 	struct kernfs_node	*bb_state;
 	struct badblocks	bb;
 	struct dax_device	*dax_dev;
diff --git a/tools/testing/nvdimm/pmem-dax.c b/tools/testing/nvdimm/pmem-dax.c
index 2e7fd8227969..52a8760d2ec5 100644
--- a/tools/testing/nvdimm/pmem-dax.c
+++ b/tools/testing/nvdimm/pmem-dax.c
@@ -18,7 +18,8 @@
 long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
 		long nr_pages, void **kaddr, pfn_t *pfn)
 {
-	resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset;
+	struct pfn_map_info *mi = &pmem->mi;
+	resource_size_t offset = PFN_PHYS(pgoff) + mi->map_offset;
 
 	if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
 					PFN_PHYS(nr_pages))))
@@ -28,12 +29,12 @@ long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
 	 * Limit dax to a single page at a time given vmalloc()-backed
 	 * in the nfit_test case.
 	 */
-	if (get_nfit_res(pmem->phys_addr + offset)) {
+	if (get_nfit_res(mi->map_base + offset)) {
 		struct page *page;
 
 		if (kaddr)
-			*kaddr = pmem->virt_addr + offset;
-		page = vmalloc_to_page(pmem->virt_addr + offset);
+			*kaddr = mi->map + offset;
+		page = vmalloc_to_page(mi->map + offset);
 		if (pfn)
 			*pfn = page_to_pfn_t(page);
 		pr_debug_ratelimited("%s: pmem: %p pgoff: %#lx pfn: %#lx\n",
@@ -43,9 +44,9 @@ long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
 	}
 
 	if (kaddr)
-		*kaddr = pmem->virt_addr + offset;
+		*kaddr = mi->map + offset;
 	if (pfn)
-		*pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
+		*pfn = phys_to_pfn_t(mi->map_base + offset, mi->pfn_flags);
 
 	/*
 	 * If badblocks are present, limit known good range to the
@@ -53,5 +54,5 @@ long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
 	 */
 	if (unlikely(pmem->bb.count))
 		return nr_pages;
-	return PHYS_PFN(pmem->size - pmem->pfn_pad - offset);
+	return PHYS_PFN(mi->map_size - offset);
 }




[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux