Before we allow ZONE_DEVICE pages to be put into active use outside of the pmem driver, we need a mechanism to revoke access and assert that they are idle when the driver is shut down. devm_memunmap_pages() checks that the reference count passed in at devm_memremap_pages() time is dead, and then uses zone_device_revoke() to unmap any active inode mappings. For pmem, it uses the q_usage_counter percpu_ref from its request_queue as the reference count for devm_memremap_pages(). Cc: Jan Kara <jack@xxxxxxxx> Cc: Dave Hansen <dave@xxxxxxxx> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> Cc: Christoph Hellwig <hch@xxxxxx> Cc: Ross Zwisler <ross.zwisler@xxxxxxxxxxxxxxx> Cc: Matthew Wilcox <willy@xxxxxxxxxxxxxxx> Cc: Dave Chinner <david@xxxxxxxxxxxxx> Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx> --- drivers/nvdimm/pmem.c | 50 +++++++++++++++++++++---- fs/dax.c | 20 ++++++++++ include/linux/io.h | 17 --------- include/linux/mm.h | 25 +++++++++++++ kernel/memremap.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 182 insertions(+), 28 deletions(-) diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 09093372e5f0..aa2f1292120a 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -24,12 +24,15 @@ #include <linux/memory_hotplug.h> #include <linux/moduleparam.h> #include <linux/vmalloc.h> +#include <linux/async.h> #include <linux/slab.h> #include <linux/pmem.h> #include <linux/nd.h> #include "pfn.h" #include "nd.h" +static ASYNC_DOMAIN_EXCLUSIVE(async_pmem); + struct pmem_device { struct request_queue *pmem_queue; struct gendisk *pmem_disk; @@ -147,7 +150,8 @@ static struct pmem_device *pmem_alloc(struct device *dev, pmem->pfn_flags = PFN_DEV; if (pmem_should_map_pages(dev)) { - pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, res); + pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, res, + &q->q_usage_counter); pmem->pfn_flags |= PFN_MAP; } else pmem->virt_addr = (void __pmem *) devm_memremap(dev, @@ 
-163,14 +167,43 @@ static struct pmem_device *pmem_alloc(struct device *dev, return pmem; } -static void pmem_detach_disk(struct pmem_device *pmem) + +static void async_blk_cleanup_queue(void *data, async_cookie_t cookie) { + struct pmem_device *pmem = data; + + blk_cleanup_queue(pmem->pmem_queue); +} + +static void pmem_detach_disk(struct device *dev) +{ + struct pmem_device *pmem = dev_get_drvdata(dev); + struct request_queue *q = pmem->pmem_queue; + if (!pmem->pmem_disk) return; del_gendisk(pmem->pmem_disk); put_disk(pmem->pmem_disk); - blk_cleanup_queue(pmem->pmem_queue); + async_schedule_domain(async_blk_cleanup_queue, pmem, &async_pmem); + + if (pmem_should_map_pages(dev)) { + /* + * Wait for queue to go dead so that we know no new + * references will be taken against the pages allocated + * by devm_memremap_pages(). + */ + blk_wait_queue_dead(q); + + /* + * Manually release the page mapping so that + * blk_cleanup_queue() can complete queue draining. + */ + devm_memunmap_pages(dev, (void __force *) pmem->virt_addr); + } + + /* Wait for blk_cleanup_queue() to finish */ + async_synchronize_full_domain(&async_pmem); } static int pmem_attach_disk(struct device *dev, @@ -299,11 +332,9 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) static int nvdimm_namespace_detach_pfn(struct nd_namespace_common *ndns) { struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim); - struct pmem_device *pmem; /* free pmem disk */ - pmem = dev_get_drvdata(&nd_pfn->dev); - pmem_detach_disk(pmem); + pmem_detach_disk(&nd_pfn->dev); /* release nd_pfn resources */ kfree(nd_pfn->pfn_sb); @@ -321,6 +352,7 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns) struct nd_region *nd_region; struct nd_pfn_sb *pfn_sb; struct pmem_device *pmem; + struct request_queue *q; phys_addr_t offset; int rc; @@ -357,8 +389,10 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns) /* establish pfn range for lookup, and switch to direct map */ pmem = dev_get_drvdata(dev); + q 
= pmem->pmem_queue; devm_memunmap(dev, (void __force *) pmem->virt_addr); - pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &nsio->res); + pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &nsio->res, + &q->q_usage_counter); pmem->pfn_flags |= PFN_MAP; if (IS_ERR(pmem->virt_addr)) { rc = PTR_ERR(pmem->virt_addr); @@ -428,7 +462,7 @@ static int nd_pmem_remove(struct device *dev) else if (is_nd_pfn(dev)) nvdimm_namespace_detach_pfn(pmem->ndns); else - pmem_detach_disk(pmem); + pmem_detach_disk(dev); return 0; } diff --git a/fs/dax.c b/fs/dax.c index 1127c5e8f58f..1be771a5d8ba 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -327,6 +327,23 @@ static int copy_user_bh(struct page *to, struct inode *inode, return 0; } +/* must be called within a dax_map_atomic / dax_unmap_atomic section */ +static void dax_account_mapping(struct block_device *bdev, pfn_t pfn, + struct address_space *mapping) +{ + /* + * If we are establishing a mapping for a page mapped pfn, take an + * extra reference against the request_queue. See zone_device_revoke + * for the paired decrement. 
+ */ + if (pfn_t_has_page(pfn)) { + struct page *page = pfn_t_to_page(pfn); + + page->mapping = mapping; + percpu_ref_get(&bdev->bd_queue->q_usage_counter); + } +} + static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, struct vm_area_struct *vma, struct vm_fault *vmf) { @@ -364,6 +381,8 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, clear_pmem(addr, PAGE_SIZE); wmb_pmem(); } + + dax_account_mapping(bdev, pfn, mapping); dax_unmap_atomic(bdev, addr); error = vm_insert_mixed(vma, vaddr, pfn_t_to_pfn(pfn)); @@ -677,6 +696,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); result |= VM_FAULT_MAJOR; } + dax_account_mapping(bdev, pfn, mapping); dax_unmap_atomic(bdev, kaddr); result |= vmf_insert_pfn_pmd(vma, address, pmd, diff --git a/include/linux/io.h b/include/linux/io.h index de64c1e53612..2f2f8859abd9 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -87,23 +87,6 @@ void *devm_memremap(struct device *dev, resource_size_t offset, size_t size, unsigned long flags); void devm_memunmap(struct device *dev, void *addr); -void *__devm_memremap_pages(struct device *dev, struct resource *res); - -#ifdef CONFIG_ZONE_DEVICE -void *devm_memremap_pages(struct device *dev, struct resource *res); -#else -static inline void *devm_memremap_pages(struct device *dev, struct resource *res) -{ - /* - * Fail attempts to call devm_memremap_pages() without - * ZONE_DEVICE support enabled, this requires callers to fall - * back to plain devm_memremap() based on config - */ - WARN_ON_ONCE(1); - return ERR_PTR(-ENXIO); -} -#endif - /* * Some systems do not have legacy ISA devices. * /dev/port is not a valid interface on these systems. 
diff --git a/include/linux/mm.h b/include/linux/mm.h index a72a04b1a238..4f14a09a6451 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -717,6 +717,31 @@ static inline enum zone_type page_zonenum(const struct page *page) return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; } +struct percpu_ref; +struct resource; +struct device; +#ifdef CONFIG_ZONE_DEVICE +void devm_memunmap_pages(struct device *dev, void *addr); +void *devm_memremap_pages(struct device *dev, struct resource *res, + struct percpu_ref *ref); +#else +static inline void devm_memunmap_pages(struct device *dev, void *addr) +{ +} + +static inline void *devm_memremap_pages(struct device *dev, + struct resource *res, struct percpu_ref *ref) +{ + /* + * Fail attempts to call devm_memremap_pages() without + * ZONE_DEVICE support enabled, this requires callers to fall + * back to plain devm_memremap() based on config + */ + WARN_ON_ONCE(1); + return ERR_PTR(-ENXIO); +} +#endif + #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define SECTION_IN_PAGE_FLAGS #endif diff --git a/kernel/memremap.c b/kernel/memremap.c index 3218e8b1fc28..a73e18d8a120 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -12,9 +12,11 @@ */ #include <linux/device.h> #include <linux/types.h> +#include <linux/fs.h> #include <linux/io.h> #include <linux/mm.h> #include <linux/memory_hotplug.h> +#include <linux/percpu-refcount.h> #ifndef ioremap_cache /* temporary while we convert existing ioremap_cache users to memremap */ @@ -140,17 +142,88 @@ EXPORT_SYMBOL(devm_memunmap); #ifdef CONFIG_ZONE_DEVICE struct page_map { struct resource res; + struct percpu_ref *ref; }; -static void devm_memremap_pages_release(struct device *dev, void *res) +static unsigned long pfn_first(struct page_map *page_map) { - struct page_map *page_map = res; + const struct resource *res = &page_map->res; + + return res->start >> PAGE_SHIFT; +} + +static unsigned long pfn_end(struct page_map *page_map) +{ + const struct resource *res 
= &page_map->res; + + return (res->start + resource_size(res)) >> PAGE_SHIFT; +} + +#define for_each_device_pfn(pfn, map) \ + for (pfn = pfn_first(map); pfn < pfn_end(map); pfn++) + +static void zone_device_revoke(struct device *dev, struct page_map *page_map) +{ + unsigned long pfn; + int retry = 3; + struct percpu_ref *ref = page_map->ref; + struct address_space *mapping_prev; + + if (percpu_ref_tryget_live(ref)) { + dev_WARN(dev, "%s: page mapping is still live!\n", __func__); + percpu_ref_put(ref); + } + + retry: + mapping_prev = NULL; + for_each_device_pfn(pfn, page_map) { + struct page *page = pfn_to_page(pfn); + struct address_space *mapping = page->mapping; + struct inode *inode = mapping ? mapping->host : NULL; + + dev_WARN_ONCE(dev, atomic_read(&page->_count) < 1, + "%s: ZONE_DEVICE page was freed!\n", __func__); + + /* See dax_account_mapping */ + if (mapping) { + percpu_ref_put(ref); + page->mapping = NULL; + } + + if (!mapping || !inode || mapping == mapping_prev) { + dev_WARN_ONCE(dev, atomic_read(&page->_count) > 1, + "%s: unexpected elevated page count pfn: %lx\n", + __func__, pfn); + continue; + } + + unmap_mapping_range(mapping, 0, 0, 1); + mapping_prev = mapping; + } + + /* + * Straggling mappings may have been established immediately + * after the percpu_ref was killed. 
+ */ + if (!percpu_ref_is_zero(ref) && retry--) + goto retry; + + if (!percpu_ref_is_zero(ref)) + dev_warn(dev, "%s: not all references released\n", __func__); +} + +static void devm_memremap_pages_release(struct device *dev, void *data) +{ + struct page_map *page_map = data; + + zone_device_revoke(dev, page_map); /* pages are dead and unused, undo the arch mapping */ arch_remove_memory(page_map->res.start, resource_size(&page_map->res)); } -void *devm_memremap_pages(struct device *dev, struct resource *res) +void *devm_memremap_pages(struct device *dev, struct resource *res, + struct percpu_ref *ref) { int is_ram = region_intersects(res->start, resource_size(res), "System RAM"); @@ -172,6 +245,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res) return ERR_PTR(-ENOMEM); memcpy(&page_map->res, res, sizeof(*res)); + page_map->ref = ref; nid = dev_to_node(dev); if (nid < 0) @@ -187,4 +261,22 @@ void *devm_memremap_pages(struct device *dev, struct resource *res) return __va(res->start); } EXPORT_SYMBOL(devm_memremap_pages); + +static int page_map_match(struct device *dev, void *res, void *match_data) +{ + struct page_map *page_map = res; + resource_size_t phys = *(resource_size_t *) match_data; + + return page_map->res.start == phys; +} + +void devm_memunmap_pages(struct device *dev, void *addr) +{ + resource_size_t start = __pa(addr); + + if (devres_release(dev, devm_memremap_pages_release, page_map_match, + &start) != 0) + dev_WARN(dev, "failed to find page map to release\n"); +} +EXPORT_SYMBOL(devm_memunmap_pages); #endif /* CONFIG_ZONE_DEVICE */ -- To unsubscribe from this list: send the line "unsubscribe linux-block" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html