Hold the page lock while invalidating mapping entries to prevent races
between rmap using the address_space and the filesystem freeing the
address_space.

This is more complicated than that simple description implies because
the dev_pagemap pages that fsdax uses have no concept of page size.
Size information is stored in the radix tree entry and can only be
safely read while holding the xa_lock. Since lock_page() cannot be
taken while holding the xa_lock, drop the xa_lock and speculatively
lock all the associated pages. Once all the pages are locked, re-take
the xa_lock and revalidate that the radix entry did not change.

Cc: Jan Kara <jack@xxxxxxx>
Cc: Christoph Hellwig <hch@xxxxxx>
Cc: Matthew Wilcox <mawilcox@xxxxxxxxxxxxx>
Cc: Ross Zwisler <ross.zwisler@xxxxxxxxxxxxxxx>
Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>
---
 fs/dax.c |   91 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 85 insertions(+), 6 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 2e4682cd7c69..e6d44d336283 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -319,6 +319,13 @@ static unsigned long dax_radix_end_pfn(void *entry)
 	for (pfn = dax_radix_pfn(entry); \
 			pfn < dax_radix_end_pfn(entry); pfn++)
 
+#define for_each_mapped_pfn_reverse(entry, pfn) \
+	for (pfn = dax_radix_end_pfn(entry) - 1; \
+			dax_entry_size(entry) \
+			&& pfn >= dax_radix_pfn(entry); \
+			pfn--)
+
+
 static void dax_associate_entry(void *entry, struct address_space *mapping,
 		struct vm_area_struct *vma, unsigned long address)
 {
@@ -497,6 +504,80 @@ static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
 	return entry;
 }
 
+static bool dax_lock_pages(struct address_space *mapping, pgoff_t index,
+		void **entry)
+{
+	struct radix_tree_root *pages = &mapping->i_pages;
+	unsigned long pfn;
+	void *entry2;
+
+	xa_lock_irq(pages);
+	*entry = get_unlocked_mapping_entry(mapping, index, NULL);
+	if (!*entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(*entry))) {
+		put_unlocked_mapping_entry(mapping, index, *entry);
+		xa_unlock_irq(pages);
+		return false;
+	}
+
+	/*
+	 * In the limited case there are no races to prevent with rmap,
+	 * because rmap cannot perform pfn_to_page().
+	 */
+	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+		return true;
+
+	/*
+	 * Now, drop the xa_lock, grab all the page locks, then validate
+	 * that the entry has not changed and return with the xa_lock
+	 * held.
+	 */
+	xa_unlock_irq(pages);
+
+	/*
+	 * Retry until the entry stabilizes or someone else invalidates
+	 * the entry.
+	 */
+	for (;;) {
+		for_each_mapped_pfn(*entry, pfn)
+			lock_page(pfn_to_page(pfn));
+
+		xa_lock_irq(pages);
+		entry2 = get_unlocked_mapping_entry(mapping, index, NULL);
+		if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2))
+				|| entry2 != *entry) {
+			put_unlocked_mapping_entry(mapping, index, entry2);
+			xa_unlock_irq(pages);
+
+			for_each_mapped_pfn_reverse(*entry, pfn)
+				unlock_page(pfn_to_page(pfn));
+
+			if (!entry2 || !radix_tree_exceptional_entry(entry2))
+				return false;
+			*entry = entry2;
+			continue;
+		}
+		break;
+	}
+
+	return true;
+}
+
+static void dax_unlock_pages(struct address_space *mapping, pgoff_t index,
+		void *entry)
+{
+	struct radix_tree_root *pages = &mapping->i_pages;
+	unsigned long pfn;
+
+	put_unlocked_mapping_entry(mapping, index, entry);
+	xa_unlock_irq(pages);
+
+	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+		return;
+
+	for_each_mapped_pfn_reverse(entry, pfn)
+		unlock_page(pfn_to_page(pfn));
+}
+
 static int __dax_invalidate_mapping_entry(struct address_space *mapping,
 					  pgoff_t index, bool trunc)
 {
@@ -504,10 +585,8 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping,
 	void *entry;
 	struct radix_tree_root *pages = &mapping->i_pages;
 
-	xa_lock_irq(pages);
-	entry = get_unlocked_mapping_entry(mapping, index, NULL);
-	if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)))
-		goto out;
+	if (!dax_lock_pages(mapping, index, &entry))
+		return ret;
 	if (!trunc &&
 	    (radix_tree_tag_get(pages, index, PAGECACHE_TAG_DIRTY) ||
 	     radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE)))
@@ -517,8 +596,8 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping,
 	mapping->nrexceptional--;
 	ret = 1;
 out:
-	put_unlocked_mapping_entry(mapping, index, entry);
-	xa_unlock_irq(pages);
+	dax_unlock_pages(mapping, index, entry);
+
 	return ret;
 }
 
 /*
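
For review convenience, the locking dance that dax_lock_pages() implements
boils down to the sketch below. This is only an illustration condensed from
the hunk above (the WARN_ON_ONCE() checks, the bail-out when the entry is
invalidated, and the CONFIG_FS_DAX_LIMITED shortcut are omitted), not a
substitute for the real function:

	/*
	 * Condensed form of the dax_lock_pages() retry loop: read the
	 * entry, drop the xa_lock (lock_page() cannot nest inside it),
	 * speculatively lock the pages, then revalidate the entry.
	 */
	xa_lock_irq(pages);
	*entry = get_unlocked_mapping_entry(mapping, index, NULL);
	xa_unlock_irq(pages);

	for (;;) {
		for_each_mapped_pfn(*entry, pfn)
			lock_page(pfn_to_page(pfn));

		xa_lock_irq(pages);
		entry2 = get_unlocked_mapping_entry(mapping, index, NULL);
		if (entry2 == *entry)
			break;	/* success: xa_lock and all page locks held */

		/*
		 * The entry changed while the locks were dropped: undo
		 * everything and retry against the new entry (the real
		 * code also bails out here if the entry went away).
		 */
		put_unlocked_mapping_entry(mapping, index, entry2);
		xa_unlock_irq(pages);
		for_each_mapped_pfn_reverse(*entry, pfn)
			unlock_page(pfn_to_page(pfn));
		*entry = entry2;
	}

On success the caller holds the xa_lock plus every page lock backing the
entry, which is the state __dax_invalidate_mapping_entry() now relies on
before tearing the entry down; dax_unlock_pages() drops the xa_lock and then
releases the page locks via for_each_mapped_pfn_reverse().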