[PATCH RFC 3/6] mm: Allow device private pages to exist in page cache

Device private pages can currently only be used for private anonymous
memory. This is because they are inaccessible from the CPU, making
shared mappings between device and CPU difficult.

For private mappings this problem is resolved by installing non-present
PTEs, which allow the pages to be migrated back to the CPU as required.
However, shared file-backed mappings are not always accessed via PTEs
(for example by read/write syscalls), so such entries are not sufficient
to prevent the CPU from trying to access device private pages.
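
For reference, a condensed sketch of how such a non-present entry gets
installed for a private mapping; dpage/ptep/mm/addr are stand-ins, but
the helpers are the existing swapops ones:

	swp_entry_t entry;

	if (vma->vm_flags & VM_WRITE)
		entry = make_writable_device_private_entry(page_to_pfn(dpage));
	else
		entry = make_readable_device_private_entry(page_to_pfn(dpage));
	/* Replace the present PTE with a device private swap entry */
	set_pte_at(mm, addr, ptep, swp_entry_to_pte(entry));

A later CPU touch then faults into do_swap_page(), which calls the
driver's pgmap->ops->migrate_to_ram() to pull the data back.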

Most other accesses do go via the pagecache, however, so they can be
intercepted there. Implement this by allowing device private pages to
exist in the pagecache. Whenever a device private entry is found in the
pagecache, migrate it back from the device to the CPU and restore the
data from disk.
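
All the pagecache lookup paths changed below follow the same
drop-the-lock-and-retry pattern, condensed here:

	if (is_device_private_page(&folio->page)) {
		rcu_read_unlock();
		migrate_device_page(&folio->page);
		folio_put(folio);
		rcu_read_lock();
		/* retry the lookup; the folio is now CPU-resident */
		goto repeat;
	}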

Drivers can create these entries using the standard migrate_vma calls.
For this migration to succeed, any buffer heads or private data must be
stripped from the page. Normally the migrate_folio() address space
operation would be used for this, if one is available for the mapping.
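
For illustration, a driver would migrate a single file-backed page to
device memory with the usual sequence below; the drv_* helpers are
hypothetical, everything else is the existing migrate_vma API:

	unsigned long src_pfn = 0, dst_pfn = 0;
	struct migrate_vma args = {
		.vma		= vma,
		.start		= addr,
		.end		= addr + PAGE_SIZE,
		.src		= &src_pfn,
		.dst		= &dst_pfn,
		.pgmap_owner	= drv,
		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
	};

	if (migrate_vma_setup(&args))
		return -EBUSY;

	if (src_pfn & MIGRATE_PFN_MIGRATE) {
		/* driver-specific allocation and copy */
		dpage = drv_alloc_device_page(drv);
		drv_copy_to_device(dpage, migrate_pfn_to_page(src_pfn));
		dst_pfn = migrate_pfn(page_to_pfn(dpage));
	}

	migrate_vma_pages(&args);
	migrate_vma_finalize(&args);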

However that is not appropriate for device private pages, because
buffers cannot be migrated to device memory and ZONE_DEVICE pages have
nowhere to store the private data. Instead the page is always cleaned
and written back to disk in an attempt to remove any buffers and/or
private data. If that fails, the migration fails.
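
Roughly, the fallback path relied on here (see fallback_migrate_folio()
in mm/migrate.c) does:

	if (folio_test_dirty(src))
		/* MIGRATE_SYNC only: write the dirty page back first */
		return writeout(mapping, src);

	/* Strip buffer heads / fs-private state, or give up */
	if (folio_test_private(src) &&
	    !filemap_release_folio(src, GFP_KERNEL))
		return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;

	return migrate_folio(mapping, dst, src, mode);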

Signed-off-by: Alistair Popple <apopple@xxxxxxxxxx>
---
 include/linux/migrate.h |  2 +-
 mm/filemap.c            | 41 ++++++++++++++++++++++++++-
 mm/memory.c             |  9 ++----
 mm/memremap.c           |  1 +-
 mm/migrate.c            | 21 +++++++++----
 mm/migrate_device.c     | 66 +++++++++++++++++++++++++++++++++++++++++-
 6 files changed, 128 insertions(+), 12 deletions(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 9023d0f..623fea4 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -62,6 +62,7 @@ extern const char *migrate_reason_names[MR_TYPES];
 
 #ifdef CONFIG_MIGRATION
 
+void migrate_device_page(struct page *page);
 void putback_movable_pages(struct list_head *l);
 int migrate_folio(struct address_space *mapping, struct folio *dst,
 		struct folio *src, enum migrate_mode mode);
@@ -82,6 +83,7 @@ int folio_migrate_mapping(struct address_space *mapping,
 
 #else
 
+static inline void migrate_device_page(struct page *page) {}
 static inline void putback_movable_pages(struct list_head *l) {}
 static inline int migrate_pages(struct list_head *l, new_folio_t new,
 		free_folio_t free, unsigned long private,
diff --git a/mm/filemap.c b/mm/filemap.c
index 804d736..ee35277 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -658,6 +658,12 @@ bool filemap_range_has_writeback(struct address_space *mapping,
 	xas_for_each(&xas, folio, max) {
 		if (xas_retry(&xas, folio))
 			continue;
+		/*
+		 * TODO: We would have to query the driver to find out if write
+		 * back is required. Probably easiest just to migrate the page
+		 * back. Need to drop the rcu lock and retry.
+		 */
+		WARN_ON(is_device_private_page(&folio->page));
 		if (xa_is_value(folio))
 			continue;
 		if (folio_test_dirty(folio) || folio_test_locked(folio) ||
@@ -1874,6 +1880,15 @@ void *filemap_get_entry(struct address_space *mapping, pgoff_t index)
 		folio_put(folio);
 		goto repeat;
 	}
+
+	if (is_device_private_page(&folio->page)) {
+		rcu_read_unlock();
+		migrate_device_page(&folio->page);
+		folio_put(folio);
+		rcu_read_lock();
+		goto repeat;
+	}
+
 out:
 	rcu_read_unlock();
 
@@ -2034,6 +2049,14 @@ static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,
 		goto reset;
 	}
 
+	if (is_device_private_page(&folio->page)) {
+		rcu_read_unlock();
+		migrate_device_page(&folio->page);
+		folio_put(folio);
+		rcu_read_lock();
+		goto reset;
+	}
+
 	return folio;
 reset:
 	xas_reset(xas);
@@ -2229,6 +2252,14 @@ unsigned filemap_get_folios_contig(struct address_space *mapping,
 		if (unlikely(folio != xas_reload(&xas)))
 			goto put_folio;
 
+		if (is_device_private_page(&folio->page)) {
+			rcu_read_unlock();
+			migrate_device_page(&folio->page);
+			folio_put(folio);
+			rcu_read_lock();
+			goto retry;
+		}
+
 		if (!folio_batch_add(fbatch, folio)) {
 			nr = folio_nr_pages(folio);
 			*start = folio->index + nr;
@@ -2361,6 +2392,14 @@ static void filemap_get_read_batch(struct address_space *mapping,
 		if (unlikely(folio != xas_reload(&xas)))
 			goto put_folio;
 
+		if (is_device_private_page(&folio->page)) {
+			rcu_read_unlock();
+			migrate_device_page(&folio->page);
+			folio_put(folio);
+			rcu_read_lock();
+			goto retry;
+		}
+
 		if (!folio_batch_add(fbatch, folio))
 			break;
 		if (!folio_test_uptodate(folio))
@@ -3642,6 +3681,8 @@ static struct folio *next_uptodate_folio(struct xa_state *xas,
 		/* Has the page moved or been split? */
 		if (unlikely(folio != xas_reload(xas)))
 			goto skip;
+		if (is_device_private_page(&folio->page))
+			goto skip;
 		if (!folio_test_uptodate(folio) || folio_test_readahead(folio))
 			goto skip;
 		if (!folio_trylock(folio))
diff --git a/mm/memory.c b/mm/memory.c
index 539c0f7..c346683 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1616,12 +1616,11 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb,
 		if (unlikely(!should_zap_folio(details, folio)))
 			return 1;
 		/*
-		 * Both device private/exclusive mappings should only
-		 * work with anonymous page so far, so we don't need to
-		 * consider uffd-wp bit when zap. For more information,
-		 * see zap_install_uffd_wp_if_needed().
+		 * TODO: Do we need to consider uffd-wp bit when zap? For more
+		 * information, see zap_install_uffd_wp_if_needed().
 		 */
-		WARN_ON_ONCE(!vma_is_anonymous(vma));
+		WARN_ON_ONCE(zap_install_uffd_wp_if_needed(vma, addr, pte, nr,
+							details, ptent));
 		rss[mm_counter(folio)]--;
 		if (is_device_private_entry(entry))
 			folio_remove_rmap_pte(folio, page, vma);
diff --git a/mm/memremap.c b/mm/memremap.c
index 40d4547..e49fdcb 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -143,7 +143,6 @@ void memunmap_pages(struct dev_pagemap *pgmap)
 	    pgmap->type != MEMORY_DEVICE_COHERENT)
 		for (i = 0; i < pgmap->nr_range; i++)
 			percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i));
-
 	wait_for_completion(&pgmap->done);
 
 	for (i = 0; i < pgmap->nr_range; i++)
diff --git a/mm/migrate.c b/mm/migrate.c
index 11fca43..21f92eb 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -248,12 +248,14 @@ static bool remove_migration_pte(struct folio *folio,
 		pte_t pte;
 		swp_entry_t entry;
 		struct page *new;
+		struct page *old;
 		unsigned long idx = 0;
 
 		/* pgoff is invalid for ksm pages, but they are never large */
 		if (folio_test_large(folio) && !folio_test_hugetlb(folio))
 			idx = linear_page_index(vma, pvmw.address) - pvmw.pgoff;
 		new = folio_page(folio, idx);
+		old = folio_page(rmap_walk_arg->folio, idx);
 
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
 		/* PMD-mapped THP migration entry */
@@ -291,7 +293,12 @@ static bool remove_migration_pte(struct folio *folio,
 			rmap_flags |= RMAP_EXCLUSIVE;
 
 		if (unlikely(is_device_private_page(new))) {
-			if (pte_write(pte))
+			/*
+			 * Page should have been written out during migration.
+			 */
+			WARN_ON_ONCE(PageDirty(old) &&
+				folio_mapping(page_folio(old)));
+			if (!folio_mapping(page_folio(old)) && pte_write(pte))
 				entry = make_writable_device_private_entry(
 							page_to_pfn(new));
 			else
@@ -758,9 +765,12 @@ static int __migrate_folio(struct address_space *mapping, struct folio *dst,
 	if (folio_ref_count(src) != expected_count)
 		return -EAGAIN;
 
-	rc = folio_mc_copy(dst, src);
-	if (unlikely(rc))
-		return rc;
+	/* Drivers will do the copy before calling migrate_device_finalize() */
+	if (!folio_is_device_private(dst) && !folio_is_device_private(src)) {
+		rc = folio_mc_copy(dst, src);
+		if (unlikely(rc))
+			return rc;
+	}
 
 	rc = __folio_migrate_mapping(mapping, dst, src, expected_count);
 	if (rc != MIGRATEPAGE_SUCCESS)
@@ -1044,7 +1054,8 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
 			rc = migrate_folio(mapping, dst, src, mode);
 		else if (mapping_inaccessible(mapping))
 			rc = -EOPNOTSUPP;
-		else if (mapping->a_ops->migrate_folio)
+		else if (!is_device_private_page(&dst->page) &&
+			 mapping->a_ops->migrate_folio)
 			/*
 			 * Most folios have a mapping and most filesystems
 			 * provide a migrate_folio callback. Anonymous folios
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 7bcc177..946e9fd 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -745,7 +745,7 @@ static void __migrate_device_pages(unsigned long *src_pfns,
 				 *
 				 * Try to get rid of swap cache if possible.
 				 */
-				if (!folio_test_anon(folio) ||
+				if (folio_test_anon(folio) &&
 				    !folio_free_swap(folio)) {
 					src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
 					continue;
@@ -862,6 +862,7 @@ void migrate_device_finalize(unsigned long *src_pfns,
 
 		if (dst != src) {
 			folio_unlock(dst);
+
 			if (folio_is_zone_device(dst))
 				folio_put(dst);
 			else
@@ -888,6 +889,69 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
 }
 EXPORT_SYMBOL(migrate_vma_finalize);
 
+/*
+ * This migrates the device private page back to the page cache. It doesn't
+ * actually copy any data though, it reads it back from the filesystem.
+ */
+void migrate_device_page(struct page *page)
+{
+	int ret;
+	struct page *newpage;
+
+	WARN_ON(!is_device_private_page(page));
+
+	/*
+	 * We don't support writeback of dirty pages from the driver yet.
+	 */
+	WARN_ON(PageDirty(page));
+
+	lock_page(page);
+	try_to_migrate(page_folio(page), 0);
+
+	/*
+	 * We should always be able to unmap device-private pages. Right?
+	 */
+	WARN_ON(page_mapped(page));
+
+	newpage = alloc_pages(GFP_HIGHUSER_MOVABLE, 0);
+	/*
+	 * OOM is fatal, so need to retry harder although 0-order allocations
+	 * should never fail?
+	 */
+	WARN_ON(!newpage);
+	lock_page(newpage);
+
+	/*
+	 * Replace the device-private page with the new page in the page cache.
+	 */
+	ret = fallback_migrate_folio(folio_mapping(page_folio(page)),
+				page_folio(newpage), page_folio(page),
+				MIGRATE_SYNC, 0);
+
+	/* This should never fail... */
+	WARN_ON_ONCE(ret != MIGRATEPAGE_SUCCESS);
+	page->mapping = NULL;
+
+	/*
+	 * We're going to read the newpage back from disk so make it not
+	 * uptodate.
+	 */
+	ClearPageUptodate(newpage);
+
+	/*
+	 * IO will unlock newpage asynchronously.
+	 */
+	folio_mapping(page_folio(newpage))->a_ops->read_folio(NULL,
+						page_folio(newpage));
+	lock_page(newpage);
+
+	remove_migration_ptes(page_folio(page), page_folio(newpage), false);
+
+	unlock_page(page);
+	unlock_page(newpage);
+	folio_putback_lru(page_folio(newpage));
+}
+
 /**
  * migrate_device_range() - migrate device private pfns to normal memory.
  * @src_pfns: array large enough to hold migrating source device private pfns.
-- 
git-series 0.9.1