The 'start_pad' accounting bug resulted from the pmem driver inferring properties of the established pagemap to calculate pmem->phys_addr and pmem->size, and that nd_pfn->data_offset was identical to pmem->data_offset. The assumptions in the current implementation are only true when 'start_pad' is zero. The confusion resulted in the wrong offset being applied for sector-to-physical-page translations. In advance of introducing a corrected implementation make the mapping parameters and translation explicit via a new 'struct pfn_map_info' to carry the mapping and device-offset to physical address translation parameters. No functional change intended, this cleanup preserves the existing bug. Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx> --- drivers/dax/pmem.c | 9 ++- drivers/nvdimm/nd.h | 13 ++++- drivers/nvdimm/pfn_devs.c | 21 ++++++- drivers/nvdimm/pmem.c | 111 ++++++++++++++++++--------------------- drivers/nvdimm/pmem.h | 12 +--- tools/testing/nvdimm/pmem-dax.c | 15 +++-- 6 files changed, 96 insertions(+), 85 deletions(-) diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c index 2c1f459c0c63..80359df4f662 100644 --- a/drivers/dax/pmem.c +++ b/drivers/dax/pmem.c @@ -64,6 +64,7 @@ static int dax_pmem_probe(struct device *dev) struct nd_pfn_sb *pfn_sb; struct dev_dax *dev_dax; struct dax_pmem *dax_pmem; + struct pfn_map_info mi; struct nd_namespace_io *nsio; struct dax_region *dax_region; struct nd_namespace_common *ndns; @@ -83,7 +84,7 @@ static int dax_pmem_probe(struct device *dev) rc = devm_nsio_enable(dev, nsio); if (rc) return rc; - rc = nvdimm_setup_pfn(nd_pfn, &dax_pmem->pgmap); + rc = nvdimm_setup_pfn(nd_pfn, &dax_pmem->pgmap, &mi); if (rc) return rc; devm_nsio_disable(dev, nsio); @@ -117,8 +118,10 @@ static int dax_pmem_probe(struct device *dev) return PTR_ERR(addr); /* adjust the dax_region resource to the start of data */ - memcpy(&res, &dax_pmem->pgmap.res, sizeof(res)); - res.start += le64_to_cpu(pfn_sb->dataoff); + res = (struct resource) { + .start = mi.map_base + mi.map_pad + mi.map_offset, + .end = mi.map_base + mi.map_pad + mi.map_size - 1, + }; rc = sscanf(dev_name(&ndns->dev), "namespace%d.%d", ®ion_id, &id); if (rc != 2) diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 379bf4305e61..4339d338928b 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -377,13 +377,22 @@ const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, unsigned int pmem_sector_size(struct nd_namespace_common *ndns); void nvdimm_badblocks_populate(struct nd_region *nd_region, struct badblocks *bb, const struct resource *res); +struct pfn_map_info { + resource_size_t map_base; + unsigned long map_offset; + resource_size_t map_size; + unsigned long map_pad; + u64 pfn_flags; + void *map; +}; #if IS_ENABLED(CONFIG_ND_CLAIM) -int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap); +int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap, + struct pfn_map_info *mi); int devm_nsio_enable(struct device *dev, struct nd_namespace_io *nsio); void devm_nsio_disable(struct device *dev, struct nd_namespace_io *nsio); #else static inline int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, - struct dev_pagemap *pgmap) + struct dev_pagemap *pgmap, struct pfn_map_info *mi) { return -ENXIO; } diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 110699f4c3e4..1c2a0e707da3 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -611,7 +611,8 @@ static unsigned long init_altmap_reserve(resource_size_t base) return reserve; } -static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap) +static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap, + struct pfn_map_info *mi) { struct resource *res = &pgmap->res; struct vmem_altmap *altmap = &pgmap->altmap; @@ -632,6 +633,19 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap) res->start += start_pad; res->end -= end_trunc; + *mi = (struct pfn_map_info) { + /* + * TODO fix implementation to properly account for + * 'start_pad' in map_base, and map_offset. As is, the + * fact that __va(map_base) != __pa(map) leads + * mistranslations in pmem_direct_access(). + */ + .map_base = nsio->res.start, + .map_pad = start_pad, + .map_offset = offset, + .map_size = resource_size(res), + }; + if (nd_pfn->mode == PFN_MODE_RAM) { if (offset < reserve) return -EINVAL; @@ -789,7 +803,8 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) * Determine the effective resource range and vmem_altmap from an nd_pfn * instance. */ -int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap) +int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap, + struct pfn_map_info *mi) { int rc; @@ -801,6 +816,6 @@ int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap) return rc; /* we need a valid pfn_sb before we can init a dev_pagemap */ - return __nvdimm_setup_pfn(nd_pfn, pgmap); + return __nvdimm_setup_pfn(nd_pfn, pgmap, mi); } EXPORT_SYMBOL_GPL(nvdimm_setup_pfn); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index bc2f700feef8..46b823f3b94d 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -34,9 +34,7 @@ #include <linux/nd.h> #include <linux/backing-dev.h> #include "pmem.h" -#include "pfn.h" #include "nd.h" -#include "nd-core.h" static struct device *to_dev(struct pmem_device *pmem) { @@ -58,7 +56,7 @@ static void hwpoison_clear(struct pmem_device *pmem, unsigned long pfn_start, pfn_end, pfn; /* only pmem in the linear map supports HWPoison */ - if (is_vmalloc_addr(pmem->virt_addr)) + if (is_vmalloc_addr(pmem->mi.map)) return; pfn_start = PHYS_PFN(phys); @@ -80,17 +78,18 @@ static blk_status_t pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset, unsigned int len) { struct device *dev = to_dev(pmem); + struct pfn_map_info *mi = &pmem->mi; sector_t sector; long cleared; blk_status_t rc = BLK_STS_OK; - sector = (offset - pmem->data_offset) / 512; + sector = (offset - mi->map_offset) / 512; - cleared = nvdimm_clear_poison(dev, pmem->phys_addr + offset, len); + cleared = nvdimm_clear_poison(dev, mi->map_base + offset, len); if (cleared < len) rc = BLK_STS_IOERR; if (cleared > 0 && cleared / 512) { - hwpoison_clear(pmem, pmem->phys_addr + offset, cleared); + hwpoison_clear(pmem, mi->map_base + offset, cleared); cleared /= 512; dev_dbg(dev, "%#llx clear %ld sector%s\n", (unsigned long long) sector, cleared, @@ -100,7 +99,7 @@ static blk_status_t pmem_clear_poison(struct pmem_device *pmem, sysfs_notify_dirent(pmem->bb_state); } - arch_invalidate_pmem(pmem->virt_addr + offset, len); + arch_invalidate_pmem(mi->map + offset, len); return rc; } @@ -151,8 +150,9 @@ static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page, { blk_status_t rc = BLK_STS_OK; bool bad_pmem = false; - phys_addr_t pmem_off = sector * 512 + pmem->data_offset; - void *pmem_addr = pmem->virt_addr + pmem_off; + struct pfn_map_info *mi = &pmem->mi; + phys_addr_t pmem_off = sector * 512 + mi->map_offset; + void *pmem_addr = mi->map + pmem_off; if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) bad_pmem = true; @@ -247,16 +247,17 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn) { - resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset; + struct pfn_map_info *mi = &pmem->mi; + resource_size_t offset = PFN_PHYS(pgoff) + mi->map_offset; if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, PFN_PHYS(nr_pages)))) return -EIO; if (kaddr) - *kaddr = pmem->virt_addr + offset; + *kaddr = mi->map + offset; if (pfn) - *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags); + *pfn = phys_to_pfn_t(mi->map_base + offset, mi->pfn_flags); /* * If badblocks are present, limit known good range to the @@ -264,7 +265,7 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, */ if (unlikely(pmem->bb.count)) return nr_pages; - return PHYS_PFN(pmem->size - pmem->pfn_pad - offset); + return PHYS_PFN(mi->map_size - offset); } static const struct block_device_operations pmem_fops = { @@ -354,45 +355,49 @@ static int pmem_attach_disk(struct device *dev, struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); struct nd_region *nd_region = to_nd_region(dev->parent); int nid = dev_to_node(dev), fua; - struct resource *res = &nsio->res; + struct pfn_map_info *mi; struct resource bb_res; struct nd_pfn *nd_pfn = NULL; struct dax_device *dax_dev; - struct nd_pfn_sb *pfn_sb; struct pmem_device *pmem; struct request_queue *q; struct device *gendev; struct gendisk *disk; - void *addr; int rc; pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL); if (!pmem) return -ENOMEM; + mi = &pmem->mi; /* while nsio_rw_bytes is active, parse a pfn info block if present */ if (is_nd_pfn(dev)) { nd_pfn = to_nd_pfn(dev); - rc = nvdimm_setup_pfn(nd_pfn, &pmem->pgmap); + rc = nvdimm_setup_pfn(nd_pfn, &pmem->pgmap, mi); if (rc) return rc; + } else { + *mi = (struct pfn_map_info) { + .map_offset = 0, + .map_base = nsio->res.start, + .map_size = resource_size(&nsio->res), + }; } /* we're attaching a block device, disable raw namespace access */ devm_nsio_disable(dev, nsio); dev_set_drvdata(dev, pmem); - pmem->phys_addr = res->start; - pmem->size = resource_size(res); fua = nvdimm_has_flush(nd_region); if (!IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) || fua < 0) { dev_warn(dev, "unable to guarantee persistence of writes\n"); fua = 0; } - if (!devm_request_mem_region(dev, res->start, resource_size(res), + if (!devm_request_mem_region(dev, nsio->res.start, + resource_size(&nsio->res), dev_name(&ndns->dev))) { - dev_warn(dev, "could not reserve region %pR\n", res); + dev_warn(dev, "could not reserve region %pR\n", &nsio->res); return -EBUSY; } @@ -403,37 +408,27 @@ static int pmem_attach_disk(struct device *dev, if (devm_add_action_or_reset(dev, pmem_release_queue, q)) return -ENOMEM; - pmem->pfn_flags = PFN_DEV; + mi->pfn_flags = PFN_DEV; pmem->pgmap.ref = &q->q_usage_counter; pmem->pgmap.kill = pmem_freeze_queue; if (is_nd_pfn(dev)) { if (setup_pagemap_fsdax(dev, &pmem->pgmap)) return -ENOMEM; - addr = devm_memremap_pages(dev, &pmem->pgmap); - pfn_sb = nd_pfn->pfn_sb; - pmem->data_offset = le64_to_cpu(pfn_sb->dataoff); - pmem->pfn_pad = resource_size(res) - - resource_size(&pmem->pgmap.res); - pmem->pfn_flags |= PFN_MAP; - memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res)); - bb_res.start += pmem->data_offset; + mi->map = devm_memremap_pages(dev, &pmem->pgmap); + mi->pfn_flags |= PFN_MAP; } else if (pmem_should_map_pages(dev)) { memcpy(&pmem->pgmap.res, &nsio->res, sizeof(pmem->pgmap.res)); pmem->pgmap.altmap_valid = false; if (setup_pagemap_fsdax(dev, &pmem->pgmap)) return -ENOMEM; - addr = devm_memremap_pages(dev, &pmem->pgmap); - pmem->pfn_flags |= PFN_MAP; - memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res)); - } else { - addr = devm_memremap(dev, pmem->phys_addr, - pmem->size, ARCH_MEMREMAP_PMEM); - memcpy(&bb_res, &nsio->res, sizeof(bb_res)); - } + mi->map = devm_memremap_pages(dev, &pmem->pgmap); + mi->pfn_flags |= PFN_MAP; + } else + mi->map = devm_memremap(dev, mi->map_base, mi->map_size, + ARCH_MEMREMAP_PMEM); - if (IS_ERR(addr)) - return PTR_ERR(addr); - pmem->virt_addr = addr; + if (IS_ERR(mi->map)) + return PTR_ERR(mi->map); blk_queue_write_cache(q, true, fua); blk_queue_make_request(q, pmem_make_request); @@ -441,7 +436,7 @@ static int pmem_attach_disk(struct device *dev, blk_queue_logical_block_size(q, pmem_sector_size(ndns)); blk_queue_max_hw_sectors(q, UINT_MAX); blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - if (pmem->pfn_flags & PFN_MAP) + if (mi->pfn_flags & PFN_MAP) blk_queue_flag_set(QUEUE_FLAG_DAX, q); q->queuedata = pmem; @@ -455,10 +450,13 @@ static int pmem_attach_disk(struct device *dev, disk->flags = GENHD_FL_EXT_DEVT; disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO; nvdimm_namespace_disk_name(ndns, disk->disk_name); - set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset) - / 512); + set_capacity(disk, (mi->map_size - mi->map_offset) / 512); if (devm_init_badblocks(dev, &pmem->bb)) return -ENOMEM; + bb_res = (struct resource) { + .start = mi->map_base + mi->map_pad + mi->map_offset, + .end = mi->map_base + mi->map_pad + mi->map_size - 1, + }; nvdimm_badblocks_populate(nd_region, &pmem->bb, &bb_res); disk->bb = &pmem->bb; @@ -540,7 +538,6 @@ static void nd_pmem_shutdown(struct device *dev) static void nd_pmem_notify(struct device *dev, enum nvdimm_event event) { struct nd_region *nd_region; - resource_size_t offset = 0, end_trunc = 0; struct nd_namespace_common *ndns; struct nd_namespace_io *nsio; struct resource res; @@ -556,32 +553,26 @@ static void nd_pmem_notify(struct device *dev, enum nvdimm_event event) ndns = nd_btt->ndns; nd_region = to_nd_region(ndns->dev.parent); nsio = to_nd_namespace_io(&ndns->dev); + res = (struct resource) { + .start = nsio->res.start, + .end = nsio->res.end, + }; bb = &nsio->bb; bb_state = NULL; } else { struct pmem_device *pmem = dev_get_drvdata(dev); + struct pfn_map_info *mi = &pmem->mi; nd_region = to_region(pmem); bb = &pmem->bb; bb_state = pmem->bb_state; - if (is_nd_pfn(dev)) { - struct nd_pfn *nd_pfn = to_nd_pfn(dev); - struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; - - ndns = nd_pfn->ndns; - offset = pmem->data_offset + - __le32_to_cpu(pfn_sb->start_pad); - end_trunc = __le32_to_cpu(pfn_sb->end_trunc); - } else { - ndns = to_ndns(dev); - } - - nsio = to_nd_namespace_io(&ndns->dev); + res = (struct resource) { + .start = mi->map_base + mi->map_pad + mi->map_offset, + .end = mi->map_base + mi->map_pad + mi->map_size - 1, + }; } - res.start = nsio->res.start + offset; - res.end = nsio->res.end - end_trunc; nvdimm_badblocks_populate(nd_region, bb, &res); if (bb_state) sysfs_notify_dirent(bb_state); diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h index 59cfe13ea8a8..6c27bbae349f 100644 --- a/drivers/nvdimm/pmem.h +++ b/drivers/nvdimm/pmem.h @@ -6,19 +6,11 @@ #include <linux/types.h> #include <linux/pfn_t.h> #include <linux/fs.h> +#include "nd.h" /* this definition is in it's own header for tools/testing/nvdimm to consume */ struct pmem_device { - /* One contiguous memory region per device */ - phys_addr_t phys_addr; - /* when non-zero this device is hosting a 'pfn' instance */ - phys_addr_t data_offset; - u64 pfn_flags; - void *virt_addr; - /* immutable base size of the namespace */ - size_t size; - /* trim size when namespace capacity has been section aligned */ - u32 pfn_pad; + struct pfn_map_info mi; struct kernfs_node *bb_state; struct badblocks bb; struct dax_device *dax_dev; diff --git a/tools/testing/nvdimm/pmem-dax.c b/tools/testing/nvdimm/pmem-dax.c index 2e7fd8227969..52a8760d2ec5 100644 --- a/tools/testing/nvdimm/pmem-dax.c +++ b/tools/testing/nvdimm/pmem-dax.c @@ -18,7 +18,8 @@ long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn) { - resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset; + struct pfn_map_info *mi = &pmem->mi; + resource_size_t offset = PFN_PHYS(pgoff) + mi->map_offset; if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, PFN_PHYS(nr_pages)))) @@ -28,12 +29,12 @@ long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, * Limit dax to a single page at a time given vmalloc()-backed * in the nfit_test case. */ - if (get_nfit_res(pmem->phys_addr + offset)) { + if (get_nfit_res(mi->map_base + offset)) { struct page *page; if (kaddr) - *kaddr = pmem->virt_addr + offset; - page = vmalloc_to_page(pmem->virt_addr + offset); + *kaddr = mi->map + offset; + page = vmalloc_to_page(mi->map + offset); if (pfn) *pfn = page_to_pfn_t(page); pr_debug_ratelimited("%s: pmem: %p pgoff: %#lx pfn: %#lx\n", @@ -43,9 +44,9 @@ long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, } if (kaddr) - *kaddr = pmem->virt_addr + offset; + *kaddr = mi->map + offset; if (pfn) - *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags); + *pfn = phys_to_pfn_t(mi->map_base + offset, mi->pfn_flags); /* * If badblocks are present, limit known good range to the @@ -53,5 +54,5 @@ long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, */ if (unlikely(pmem->bb.count)) return nr_pages; - return PHYS_PFN(pmem->size - pmem->pfn_pad - offset); + return PHYS_PFN(mi->map_size - offset); }