[RFC PATCH 10/10] mm/swap: optimize synchronous swapin

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Kairui Song <kasong@xxxxxxxxxxx>

Interestingly, the major performance overhead of synchronous swapin is
actually from the workingset node updates. That's because synchronous
swapin keeps adding single folios into an xa_node, making the node no
longer a shadow node, so it has to be removed from shadow_nodes; the
folio is then removed very shortly afterwards, making the node a shadow
node again, so it has to be added back to shadow_nodes.

Mark a synchronous swapin folio with a special bit in the swap entry
embedded in folio->swap, as we still have some usable bits there. Skip
the workingset node update on insertion of such a folio because it will
be removed very quickly, and that removal will trigger the update,
ensuring the workingset info is eventually consistent.

Test result of sequential swapin/out of 30G zero page on ZRAM:

               Before (us)        After (us)
Swapout:       33853883           33886008
Swapin:        38336519           32465441 (+15.4%)
Swapout (THP): 6814619            6899938
Swapin (THP) : 38383367           33193479 (+13.6%)

Signed-off-by: Kairui Song <kasong@xxxxxxxxxxx>
---
 include/linux/swapops.h |  5 +++-
 mm/filemap.c            | 16 +++++++++---
 mm/memory.c             | 34 ++++++++++++++----------
 mm/swap.h               | 15 +++++++++++
 mm/swap_state.c         | 57 ++++++++++++++++++++++++-----------------
 mm/vmscan.c             |  6 +++++
 mm/workingset.c         |  2 +-
 7 files changed, 92 insertions(+), 43 deletions(-)

diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 48b700ba1d18..ebc0c3e4668d 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -25,7 +25,10 @@
  * swp_entry_t's are *never* stored anywhere in their arch-dependent format.
  */
 #define SWP_TYPE_SHIFT	(BITS_PER_XA_VALUE - MAX_SWAPFILES_SHIFT)
-#define SWP_OFFSET_MASK	((1UL << SWP_TYPE_SHIFT) - 1)
+#define SWP_CACHE_FLAG_BITS	1
+#define SWP_CACHE_SYNCHRONOUS	BIT(SWP_TYPE_SHIFT - 1)
+#define SWP_OFFSET_BITS	(SWP_TYPE_SHIFT - SWP_CACHE_FLAG_BITS)
+#define SWP_OFFSET_MASK	(BIT(SWP_OFFSET_BITS) - 1)
 
 /*
  * Definitions only for PFN swap entries (see is_pfn_swap_entry()).  To
diff --git a/mm/filemap.c b/mm/filemap.c
index 5e8e3fd26b8d..ac24cc65d1da 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -923,12 +923,20 @@ int __filemap_add_swapcache(struct address_space *mapping, struct folio *folio,
 			    pgoff_t index, gfp_t gfp, void **shadowp)
 {
 	XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
+	bool synchronous = swap_cache_test_synchronous(folio);
 	long nr;
 	int ret;
 
 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
 	VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
-	mapping_set_update(&xas, mapping);
+
+	/*
+	 * Skip node update for synchronous folio insertion, it will be
+	 * updated on folio deletion very soon, avoid repeated LRU locking.
+	 */
+	if (!synchronous)
+		xas_set_update(&xas, workingset_update_node);
+	xas_set_lru(&xas, &shadow_nodes);
 
 	nr = folio_nr_pages(folio);
 	folio_ref_add(folio, nr);
@@ -936,8 +944,10 @@ int __filemap_add_swapcache(struct address_space *mapping, struct folio *folio,
 	ret = __filemap_lock_store(&xas, folio, index, gfp, shadowp);
 	if (likely(!ret)) {
 		mapping->nrpages += nr;
-		__node_stat_mod_folio(folio, NR_FILE_PAGES, nr);
-		__lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr);
+		if (!synchronous) {
+			__node_stat_mod_folio(folio, NR_FILE_PAGES, nr);
+			__lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr);
+		}
 		xas_unlock_irq(&xas);
 	} else {
 		folio_put_refs(folio, nr);
diff --git a/mm/memory.c b/mm/memory.c
index 774a912eb46d..bb40202b4f29 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3933,6 +3933,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	struct swap_info_struct *si = NULL;
 	rmap_t rmap_flags = RMAP_NONE;
 	bool folio_allocated = false;
+	bool synchronous_io = false;
 	bool exclusive = false;
 	swp_entry_t entry;
 	pte_t pte;
@@ -4032,18 +4033,19 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	if (ret & VM_FAULT_RETRY)
 		goto out_release;
 
-	if (swapcache) {
-		/*
-		 * Make sure folio_free_swap() or swapoff did not release the
-		 * swapcache from under us.  The page pin, and pte_same test
-		 * below, are not enough to exclude that.  Even if it is still
-		 * swapcache, we need to check that the page's swap has not
-		 * changed.
-		 */
-		if (unlikely(!folio_test_swapcache(folio) ||
-			     page_swap_entry(page).val != entry.val))
-			goto out_page;
+	/*
+	 * Make sure folio_free_swap() or swapoff did not release the
+	 * swapcache from under us.  The page pin, and pte_same test
+	 * below, are not enough to exclude that.  Even if it is still
+	 * swapcache, we need to check that the page's swap has not
+	 * changed.
+	 */
+	if (unlikely(!folio_test_swapcache(folio) ||
+		     (page_swap_entry(page).val & ~SWP_CACHE_SYNCHRONOUS) != entry.val))
+		goto out_page;
 
+	synchronous_io = swap_cache_test_synchronous(folio);
+	if (!synchronous_io) {
 		/*
 		 * KSM sometimes has to copy on read faults, for example, if
 		 * page->index of !PageKSM() pages would be nonlinear inside the
@@ -4105,9 +4107,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	 */
 	if (!folio_test_ksm(folio)) {
 		exclusive = pte_swp_exclusive(vmf->orig_pte);
-		if (folio != swapcache) {
+		if (synchronous_io || folio != swapcache) {
 			/*
-			 * We have a fresh page that is not exposed to the
+			 * We have a fresh page that is not sharable through the
 			 * swapcache -> certainly exclusive.
 			 */
 			exclusive = true;
@@ -4148,7 +4150,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	 * yet.
 	 */
 	swap_free(entry);
-	if (should_try_to_free_swap(folio, vma, vmf->flags))
+	if (synchronous_io)
+		delete_from_swap_cache(folio);
+	else if (should_try_to_free_swap(folio, vma, vmf->flags))
 		folio_free_swap(folio);
 
 	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
@@ -4223,6 +4227,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 out_nomap:
 	if (vmf->pte)
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
+	if (synchronous_io)
+		delete_from_swap_cache(folio);
 out_page:
 	folio_unlock(folio);
 out_release:
diff --git a/mm/swap.h b/mm/swap.h
index bd872b157950..9d106eebddbd 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -31,6 +31,21 @@ extern struct address_space *swapper_spaces[];
 	(&swapper_spaces[swp_type(entry)][swp_offset(entry) \
 		>> SWAP_ADDRESS_SPACE_SHIFT])
 
+static inline void swap_cache_mark_synchronous(struct folio *folio)
+{
+	folio->swap.val |= SWP_CACHE_SYNCHRONOUS;
+}
+
+static inline bool swap_cache_test_synchronous(struct folio *folio)
+{
+	return folio->swap.val & SWP_CACHE_SYNCHRONOUS;
+}
+
+static inline void swap_cache_clear_synchronous(struct folio *folio)
+{
+	folio->swap.val &= ~SWP_CACHE_SYNCHRONOUS;
+}
+
 void show_swap_cache_info(void);
 bool add_to_swap(struct folio *folio);
 void *get_shadow_from_swap_cache(swp_entry_t entry);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index cf178dd1131a..b0b1b5391ac1 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -86,7 +86,7 @@ void *get_shadow_from_swap_cache(swp_entry_t entry)
  * but sets SwapCache flag and private instead of mapping and index.
  */
 static int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
-			     gfp_t gfp, void **shadowp)
+			     gfp_t gfp, bool synchronous, void **shadowp)
 {
 	struct address_space *address_space = swap_address_space(entry);
 	pgoff_t idx = swp_offset(entry);
@@ -98,11 +98,12 @@ static int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
 
 	folio_set_swapcache(folio);
 	folio->swap = entry;
-
+	if (synchronous)
+		swap_cache_mark_synchronous(folio);
 	ret = __filemap_add_swapcache(address_space, folio, idx, gfp, shadowp);
 	if (ret) {
-		folio_clear_swapcache(folio);
 		folio->swap.val = 0;
+		folio_clear_swapcache(folio);
 	}
 
 	return ret;
@@ -129,11 +130,13 @@ void __delete_from_swap_cache(struct folio *folio,
 	xas_set_order(&xas, idx, folio_order(folio));
 	xas_store(&xas, shadow);
 
-	folio->swap.val = 0;
 	folio_clear_swapcache(folio);
 	address_space->nrpages -= nr;
-	__node_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
-	__lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr);
+	if (!swap_cache_test_synchronous(folio)) {
+		__node_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
+		__lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr);
+	}
+	folio->swap.val = 0;
 }
 
 /**
@@ -393,7 +396,7 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping,
  * else or hitting OOM.
  */
 static struct folio *swap_cache_add_or_get(struct folio *folio,
-		swp_entry_t entry, gfp_t gfp_mask)
+		swp_entry_t entry, gfp_t gfp_mask, bool synchronous)
 {
 	int ret = 0;
 	void *shadow = NULL;
@@ -403,7 +406,7 @@ static struct folio *swap_cache_add_or_get(struct folio *folio,
 	if (folio) {
 		__folio_set_locked(folio);
 		__folio_set_swapbacked(folio);
-		ret = add_to_swap_cache(folio, entry, gfp_mask, &shadow);
+		ret = add_to_swap_cache(folio, entry, gfp_mask, synchronous, &shadow);
 		if (ret)
 			__folio_clear_locked(folio);
 	}
@@ -460,7 +463,7 @@ int swap_cache_add_wait(struct folio *folio, swp_entry_t entry, gfp_t gfp)
 	struct folio *wait_folio;
 
 	for (;;) {
-		ret = add_to_swap_cache(folio, entry, gfp, NULL);
+		ret = add_to_swap_cache(folio, entry, gfp, false, NULL);
 		if (ret != -EEXIST)
 			break;
 		wait_folio = filemap_get_folio(swap_address_space(entry),
@@ -493,7 +496,7 @@ struct folio *swap_cache_alloc_or_get(swp_entry_t entry, gfp_t gfp_mask,
 	/* We are very likely the first user, alloc and try add to the swapcache. */
 	folio = (struct folio *)alloc_pages_mpol(gfp_mask, 0, mpol, ilx,
 						 numa_node_id());
-	swapcache = swap_cache_add_or_get(folio, entry, gfp_mask);
+	swapcache = swap_cache_add_or_get(folio, entry, gfp_mask, false);
 	if (swapcache != folio) {
 		folio_put(folio);
 		goto out_no_alloc;
@@ -875,21 +878,27 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
 struct folio *swapin_direct(swp_entry_t entry, gfp_t gfp_mask,
 			    struct vm_fault *vmf, bool *folio_allocated)
 {
-	struct mempolicy *mpol;
-	struct folio *folio;
-	pgoff_t ilx;
-
-	mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
-	folio = swap_cache_alloc_or_get(entry, gfp_mask, mpol, ilx,
-					folio_allocated);
-	mpol_cond_put(mpol);
-
-	if (*folio_allocated)
+	struct folio *folio = NULL, *swapcache;
+	/* First do a racy check if cache is already loaded. */
+	swapcache = swap_cache_try_get(entry);
+	if (unlikely(swapcache))
+		goto out;
+	folio = vma_alloc_folio(gfp_mask, 0, vmf->vma, vmf->address, false);
+	swapcache = swap_cache_add_or_get(folio, entry, gfp_mask, true);
+	if (!swapcache)
+		goto out_nocache;
+	if (swapcache == folio) {
 		swap_read_folio(folio, true, NULL);
-	else if (folio)
-		swap_cache_update_ra(folio, vmf->vma, vmf->address);
-
-	return folio;
+		*folio_allocated = true;
+		return folio;
+	}
+out:
+	swap_cache_update_ra(swapcache, vmf->vma, vmf->address);
+out_nocache:
+	if (folio)
+		folio_put(folio);
+	*folio_allocated = false;
+	return swapcache;
 }
 
 /**
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c3db39393428..e71b049fee01 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1228,6 +1228,12 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
 					if (!add_to_swap(folio))
 						goto activate_locked_split;
 				}
+			} else if (swap_cache_test_synchronous(folio)) {
+				/*
+				 * We see a folio being swapped in but not activated, either
+				 * due to a missing shadow or a too-short lifetime; activate it.
+				 */
+				goto activate_locked;
 			}
 		} else if (folio_test_swapbacked(folio) &&
 			   folio_test_large(folio)) {
diff --git a/mm/workingset.c b/mm/workingset.c
index f2a0ecaf708d..83a0b409be0f 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -753,7 +753,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
 	 */
 	if (WARN_ON_ONCE(!node->nr_values))
 		goto out_invalid;
-	if (WARN_ON_ONCE(node->count != node->nr_values))
+	if (WARN_ON_ONCE(node->count != node->nr_values && mapping->host != NULL))
 		goto out_invalid;
 	xa_delete_node(node, workingset_update_node);
 	__inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM);
-- 
2.43.0





[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux