[RFC 3/4] mm/zswap: add support for large folio zswapin

Usama Arif <usamaarif642@xxxxxxxxx> · Fri, 18 Oct 2024 11:48:41 +0100

At time of folio allocation, alloc_swap_folio checks if the entire
folio is in zswap to determine folio order.
During swap_read_folio, zswap_load will check if the entire folio
is in zswap, and if it is, it will iterate through the pages in
folio and decompress them.
This will mean the benefits of large folios (fewer page faults, batched
PTE and rmap manipulation, reduced lru list, TLB coalescing (for arm64
and amd) are not lost at swap out when using zswap.
This patch does not add support for hybrid backends (i.e. folios
partly present swap and zswap).

Signed-off-by: Usama Arif <usamaarif642@xxxxxxxxx>
---
 mm/memory.c | 13 +++-------
 mm/zswap.c  | 68 ++++++++++++++++++++++++-----------------------------
 2 files changed, 34 insertions(+), 47 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 49d243131169..75f7b9f5fb32 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4077,13 +4077,14 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
 
 	/*
 	 * swap_read_folio() can't handle the case a large folio is hybridly
-	 * from different backends. And they are likely corner cases. Similar
-	 * things might be added once zswap support large folios.
+	 * from different backends. And they are likely corner cases.
 	 */
 	if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages))
 		return false;
 	if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages))
 		return false;
+	if (unlikely(!zswap_present_test(entry, nr_pages)))
+		return false;
 
 	return true;
 }
@@ -4130,14 +4131,6 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
 	if (unlikely(userfaultfd_armed(vma)))
 		goto fallback;
 
-	/*
-	 * A large swapped out folio could be partially or fully in zswap. We
-	 * lack handling for such cases, so fallback to swapping in order-0
-	 * folio.
-	 */
-	if (!zswap_never_enabled())
-		goto fallback;
-
 	entry = pte_to_swp_entry(vmf->orig_pte);
 	/*
 	 * Get a list of all the (large) orders below PMD_ORDER that are enabled
diff --git a/mm/zswap.c b/mm/zswap.c
index 9cc91ae31116..a5aa86c24060 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1624,59 +1624,53 @@ bool zswap_present_test(swp_entry_t swp, int nr_pages)
 
 bool zswap_load(struct folio *folio)
 {
+	int nr_pages = folio_nr_pages(folio);
 	swp_entry_t swp = folio->swap;
+	unsigned int type = swp_type(swp);
 	pgoff_t offset = swp_offset(swp);
 	bool swapcache = folio_test_swapcache(folio);
-	struct xarray *tree = swap_zswap_tree(swp);
+	struct xarray *tree;
 	struct zswap_entry *entry;
+	int i;
 
 	VM_WARN_ON_ONCE(!folio_test_locked(folio));
 
 	if (zswap_never_enabled())
 		return false;
 
-	/*
-	 * Large folios should not be swapped in while zswap is being used, as
-	 * they are not properly handled. Zswap does not properly load large
-	 * folios, and a large folio may only be partially in zswap.
-	 *
-	 * Return true without marking the folio uptodate so that an IO error is
-	 * emitted (e.g. do_swap_page() will sigbus).
-	 */
-	if (WARN_ON_ONCE(folio_test_large(folio)))
-		return true;
-
-	/*
-	 * When reading into the swapcache, invalidate our entry. The
-	 * swapcache can be the authoritative owner of the page and
-	 * its mappings, and the pressure that results from having two
-	 * in-memory copies outweighs any benefits of caching the
-	 * compression work.
-	 *
-	 * (Most swapins go through the swapcache. The notable
-	 * exception is the singleton fault on SWP_SYNCHRONOUS_IO
-	 * files, which reads into a private page and may free it if
-	 * the fault fails. We remain the primary owner of the entry.)
-	 */
-	if (swapcache)
-		entry = xa_erase(tree, offset);
-	else
-		entry = xa_load(tree, offset);
-
-	if (!entry)
+	if (!zswap_present_test(folio->swap, nr_pages))
 		return false;
 
-	zswap_decompress(entry, &folio->page);
+	for (i = 0; i < nr_pages; ++i) {
+		tree = swap_zswap_tree(swp_entry(type, offset + i));
+		/*
+		 * When reading into the swapcache, invalidate our entry. The
+		 * swapcache can be the authoritative owner of the page and
+		 * its mappings, and the pressure that results from having two
+		 * in-memory copies outweighs any benefits of caching the
+		 * compression work.
+		 *
+		 * (Swapins with swap count > 1 go through the swapcache.
+		 * For swap count == 1, the swapcache is skipped and we
+		 * remain the primary owner of the entry.)
+		 */
+		if (swapcache)
+			entry = xa_erase(tree, offset + i);
+		else
+			entry = xa_load(tree, offset + i);
 
-	count_vm_event(ZSWPIN);
-	if (entry->objcg)
-		count_objcg_events(entry->objcg, ZSWPIN, 1);
+		zswap_decompress(entry, folio_page(folio, i));
 
-	if (swapcache) {
-		zswap_entry_free(entry);
-		folio_mark_dirty(folio);
+		if (entry->objcg)
+			count_objcg_events(entry->objcg, ZSWPIN, 1);
+		if (swapcache)
+			zswap_entry_free(entry);
 	}
 
+	count_vm_events(ZSWPIN, nr_pages);
+	if (swapcache)
+		folio_mark_dirty(folio);
+
 	folio_mark_uptodate(folio);
 	return true;
 }
-- 
2.43.5