Catch cases where truncate encounters pages that are still under active
DMA. This warning is a canary for potential data corruption, as truncated
blocks could be allocated to a new file while the device is still
performing I/O.

Cc: Jan Kara <jack@xxxxxxx>
Cc: Jeff Moyer <jmoyer@xxxxxxxxxx>
Cc: Christoph Hellwig <hch@xxxxxx>
Cc: Matthew Wilcox <mawilcox@xxxxxxxxxxxxx>
Cc: Ross Zwisler <ross.zwisler@xxxxxxxxxxxxxxx>
Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>
---
 fs/dax.c                 |   56 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mm_types.h |   20 ++++++++++++----
 kernel/memremap.c        |   10 ++++----
 3 files changed, 76 insertions(+), 10 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index ac6497dcfebd..fd5d385988d1 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -297,6 +297,55 @@ static void put_unlocked_mapping_entry(struct address_space *mapping,
 	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
 }
 
+static unsigned long dax_entry_size(void *entry)
+{
+	if (dax_is_zero_entry(entry))
+		return 0;
+	else if (dax_is_empty_entry(entry))
+		return 0;
+	else if (dax_is_pmd_entry(entry))
+		return HPAGE_SIZE;
+	else
+		return PAGE_SIZE;
+}
+
+#define for_each_entry_pfn(entry, pfn, end_pfn) \
+	for (pfn = dax_radix_pfn(entry), \
+			end_pfn = pfn + dax_entry_size(entry) / PAGE_SIZE; \
+			pfn < end_pfn; \
+			pfn++)
+
+static void dax_associate_entry(void *entry, struct inode *inode)
+{
+	unsigned long pfn, end_pfn;
+
+	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+		return;
+
+	for_each_entry_pfn(entry, pfn, end_pfn) {
+		struct page *page = pfn_to_page(pfn);
+
+		WARN_ON_ONCE(page->inode);
+		page->inode = inode;
+	}
+}
+
+static void dax_disassociate_entry(void *entry, struct inode *inode, bool trunc)
+{
+	unsigned long pfn, end_pfn;
+
+	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+		return;
+
+	for_each_entry_pfn(entry, pfn, end_pfn) {
+		struct page *page = pfn_to_page(pfn);
+
+		WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
+		WARN_ON_ONCE(page->inode && page->inode != inode);
+		page->inode = NULL;
+	}
+}
+
 /*
  * Find radix tree entry at given index. If it points to an exceptional entry,
  * return it with the radix tree entry locked. If the radix tree doesn't
@@ -403,6 +452,7 @@ static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
 	}
 
 	if (pmd_downgrade) {
+		dax_disassociate_entry(entry, mapping->host, false);
 		radix_tree_delete(&mapping->page_tree, index);
 		mapping->nrexceptional--;
 		dax_wake_mapping_entry_waiter(mapping, index, entry,
@@ -452,6 +502,7 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping,
 	    (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
 	     radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)))
 		goto out;
+	dax_disassociate_entry(entry, mapping->host, trunc);
 	radix_tree_delete(page_tree, index);
 	mapping->nrexceptional--;
 	ret = 1;
@@ -529,6 +580,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
 {
 	struct radix_tree_root *page_tree = &mapping->page_tree;
 	unsigned long pfn = pfn_t_to_pfn(pfn_t);
+	struct inode *inode = mapping->host;
 	pgoff_t index = vmf->pgoff;
 	void *new_entry;
 
@@ -548,6 +600,10 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
 
 	spin_lock_irq(&mapping->tree_lock);
 	new_entry = dax_radix_locked_entry(pfn, flags);
+	if (dax_entry_size(entry) != dax_entry_size(new_entry)) {
+		dax_disassociate_entry(entry, inode, false);
+		dax_associate_entry(new_entry, inode);
+	}
 
 	if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
 		/*
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 46f4ecf5479a..dd976851e8d8 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -118,11 +118,21 @@ struct page {
 					 * Can be used as a generic list
 					 * by the page owner.
 					 */
-		struct dev_pagemap *pgmap; /* ZONE_DEVICE pages are never on an
-					    * lru or handled by a slab
-					    * allocator, this points to the
-					    * hosting device page map.
-					    */
+		struct {
+			/*
+			 * ZONE_DEVICE pages are never on an lru or handled by
+			 * a slab allocator, this points to the hosting device
+			 * page map.
+			 */
+			struct dev_pagemap *pgmap;
+			/*
+			 * inode association for MEMORY_DEVICE_FS_DAX page-idle
+			 * callbacks. Note that we don't use ->mapping since
+			 * that has hard coded page-cache assumptions in
+			 * several paths.
+			 */
+			struct inode *inode;
+		};
 		struct {	/* slub per cpu partial pages */
 			struct page *next;	/* Next partial slab */
 #ifdef CONFIG_64BIT
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 8a4ebfe9db4e..f9a2929fc310 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -441,13 +441,13 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
 		struct page *page = pfn_to_page(pfn);
 
 		/*
-		 * ZONE_DEVICE pages union ->lru with a ->pgmap back
-		 * pointer. It is a bug if a ZONE_DEVICE page is ever
-		 * freed or placed on a driver-private list. Seed the
-		 * storage with LIST_POISON* values.
+		 * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
+		 * and ->inode (for the MEMORY_DEVICE_FS_DAX case) association.
+		 * It is a bug if a ZONE_DEVICE page is ever freed or placed on
+		 * a driver-private list.
 		 */
-		list_del(&page->lru);
 		page->pgmap = pgmap;
+		page->inode = NULL;
 		percpu_ref_get(ref);
 		if (!(++i % 1024))
 			cond_resched();
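
For readers who want to see the association rule in isolation, here is a
minimal, self-contained userspace sketch (illustration only, not kernel code;
the model_page/model_inode names and helpers are made up for this example).
It models the idea the patch adds to fs/dax.c: every DAX-backed page records
which inode its radix tree entry belongs to, and dropping that association at
truncate time while the page still holds an extra reference (e.g. pinned for
DMA) raises the canary warning, mirroring
WARN_ON_ONCE(trunc && page_ref_count(page) > 1) in dax_disassociate_entry().

/*
 * Userspace model only, not the kernel implementation: a "page" keeps a
 * back pointer to the inode that owns its DAX entry, and dropping that
 * association at truncate time while the page is still referenced
 * (e.g. pinned for DMA) prints a warning.
 */
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct model_inode { unsigned long ino; };

struct model_page {
	struct model_inode *inode;	/* models page->inode from the patch */
	int refcount;			/* models page_ref_count()           */
};

static void model_associate(struct model_page *page, struct model_inode *inode)
{
	assert(page->inode == NULL);	/* must not belong to another inode */
	page->inode = inode;
}

static void model_disassociate(struct model_page *page,
		struct model_inode *inode, bool trunc)
{
	/* The canary: truncate racing with an active user of the page. */
	if (trunc && page->refcount > 1)
		fprintf(stderr, "WARN: inode %lu truncated while page is busy\n",
				inode->ino);
	assert(page->inode == NULL || page->inode == inode);
	page->inode = NULL;
}

int main(void)
{
	struct model_inode inode = { .ino = 42 };
	struct model_page page = { .inode = NULL, .refcount = 1 };

	model_associate(&page, &inode);
	page.refcount++;				/* e.g. pinned for DMA */
	model_disassociate(&page, &inode, true);	/* fires the warning   */
	return 0;
}

In the patch itself the trunc=true case is reached via
__dax_invalidate_mapping_entry(), so a truncate that collides with pages still
under active DMA warns loudly instead of silently letting the blocks be
reallocated to a new file.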