[PATCH] mm: fix livelock caused by iterating over a multi-order entry

The livelock can be triggered by the following pattern,

	while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
				min(end - index, (pgoff_t)PAGEVEC_SIZE),
				indices)) {
		...
		for (i = 0; i < pagevec_count(&pvec); i++) {
			index = indices[i];
			...
		}
		index++; /* BUG */
	}

A multi-order exceptional entry is not specially considered in
invalidate_inode_pages2_range(), which ends up in a livelock: a pmd
entry covers multiple indices but is bound to the first of them, so
the lookups at both index 0 and index 1 find the same pmd entry,
"index = indices[i]" sets index back to 0 each time, and the
subsequent index++ never gets past the entry.

Introduce a helper that takes the pmd entry's length into account
when deciding the next index, as sketched below.
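
With the helper, the loop can step over the whole pmd entry instead,
roughly (a simplified sketch of the change below; the real code only
re-checks the last entry of the pagevec, and only if it is an
exceptional entry):

	while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
				min(end - index, (pgoff_t)PAGEVEC_SIZE),
				indices)) {
		pgoff_t nr_pages = 1;

		for (i = 0; i < pagevec_count(&pvec); i++) {
			index = indices[i];
			...
			/* only the last entry can overlap the next lookup */
			if (i == pagevec_count(&pvec) - 1)
				nr_pages = dax_get_multi_order(mapping, index,
							       pvec.pages[i]);
		}
		index += nr_pages;	/* step over a possible pmd entry */
	}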

Note that there are other users of the above pattern which do not
need this fix:

- dax_layout_busy_page
It has already been fixed by commit d7782145e1ad
("filesystem-dax: Fix dax_layout_busy_page() livelock").

- truncate_inode_pages_range
This one won't loop forever since the exceptional entries are removed
from the radix tree right after each search, so the next lookup cannot
return the same entry again; see the sketch below.
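
For contrast, a minimal sketch of the truncate loop (based on the
4.19-era mm/truncate.c, details elided):

	while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
				min(end - index, (pgoff_t)PAGEVEC_SIZE),
				indices)) {
		...
		/*
		 * Deletes every exceptional entry found in this batch,
		 * so the next pagevec_lookup_entries() cannot return
		 * the same pmd entry again and index++ makes progress.
		 */
		truncate_exceptional_pvec_entries(mapping, &pvec, indices,
						  end);
		pagevec_release(&pvec);
		index++;
	}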

Fixes: 642261ac995e ("dax: add struct iomap based DAX PMD support")
Cc: <stable@xxxxxxxxxxxxxxx> # 4.9 - 4.19
Signed-off-by: Liu Bo <bo.liu@xxxxxxxxxxxxxxxxx>
---

The problem is gone after commit f280bf092d48 ("page cache: Convert
find_get_entries to XArray"), but since the XArray seems too new to
backport to 4.19, I made this fix based on the radix tree
implementation.

 fs/dax.c            | 19 +++++++++++++++++++
 include/linux/dax.h |  8 ++++++++
 mm/truncate.c       | 26 ++++++++++++++++++++++++--
 3 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index ac334bc..cd05337 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -764,6 +764,25 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
 	return __dax_invalidate_mapping_entry(mapping, index, false);
 }
 
+pgoff_t dax_get_multi_order(struct address_space *mapping, pgoff_t index,
+			    void *entry)
+{
+	struct radix_tree_root *pages = &mapping->i_pages;
+	pgoff_t nr_pages = 1;
+
+	if (!dax_mapping(mapping))
+		return nr_pages;
+
+	xa_lock_irq(pages);
+	entry = get_unlocked_mapping_entry(mapping, index, NULL);
+	if (entry)
+		nr_pages = 1UL << dax_radix_order(entry);
+	put_unlocked_mapping_entry(mapping, index, entry);
+	xa_unlock_irq(pages);
+
+	return nr_pages;
+}
+
 static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
 		sector_t sector, size_t size, struct page *to,
 		unsigned long vaddr)
diff --git a/include/linux/dax.h b/include/linux/dax.h
index a846184..f3c95c6 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -91,6 +91,8 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 struct page *dax_layout_busy_page(struct address_space *mapping);
 bool dax_lock_mapping_entry(struct page *page);
 void dax_unlock_mapping_entry(struct page *page);
+pgoff_t dax_get_multi_order(struct address_space *mapping, pgoff_t index,
+			    void *entry);
 #else
 static inline bool bdev_dax_supported(struct block_device *bdev,
 		int blocksize)
@@ -134,6 +136,12 @@ static inline bool dax_lock_mapping_entry(struct page *page)
 static inline void dax_unlock_mapping_entry(struct page *page)
 {
 }
+
+static inline pgoff_t dax_get_multi_order(struct address_space *mapping,
+					  pgoff_t index, void *entry)
+{
+	return 1;
+}
 #endif
 
 int dax_read_lock(void);
diff --git a/mm/truncate.c b/mm/truncate.c
index 71b65aa..835911f 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -557,6 +557,8 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
 	while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
 			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
 			indices)) {
+		pgoff_t nr_pages = 1;
+
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
@@ -568,6 +570,15 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
 			if (radix_tree_exceptional_entry(page)) {
 				invalidate_exceptional_entry(mapping, index,
 							     page);
+				/*
+				 * Account for multi-order entries at
+				 * the end of the pagevec.
+				 */
+				if (i < pagevec_count(&pvec) - 1)
+					continue;
+
+				nr_pages = dax_get_multi_order(mapping, index,
+							       page);
 				continue;
 			}
 
@@ -607,7 +618,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
 		pagevec_remove_exceptionals(&pvec);
 		pagevec_release(&pvec);
 		cond_resched();
-		index++;
+		index += nr_pages;
 	}
 	return count;
 }
@@ -688,6 +699,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 	while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
 			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
 			indices)) {
+		pgoff_t nr_pages = 1;
+
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
@@ -700,6 +713,15 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 				if (!invalidate_exceptional_entry2(mapping,
 								   index, page))
 					ret = -EBUSY;
+				/*
+				 * Account for multi-order entries at
+				 * the end of the pagevec.
+				 */
+				if (i < pagevec_count(&pvec) - 1)
+					continue;
+
+				nr_pages = dax_get_multi_order(mapping, index,
+							       page);
 				continue;
 			}
 
@@ -739,7 +761,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 		pagevec_remove_exceptionals(&pvec);
 		pagevec_release(&pvec);
 		cond_resched();
-		index++;
+		index += nr_pages;
 	}
 	/*
 	 * For DAX we invalidate page tables after invalidating radix tree.  We
-- 
1.8.3.1