[PATCH] mm: swap: determine swap device by using page nid

Yang Shi <shy828301@xxxxxxxxx> · Wed, 6 Apr 2022 19:09:53 -0700

The swap devices are linked to per node priority lists, the swap device
closer to the node has higher priority on that node's priority list.
This is supposed to improve I/O latency, particularly for some fast
devices.  But the current code gets nid by calling numa_node_id() which
actually returns the nid that the reclaimer is running on instead of the
nid that the page belongs to.

Pass the page's nid dow to get_swap_pages() in order to pick up the
right swap device.  But it doesn't work for the swap slots cache which
is per cpu.  We could skip swap slots cache if the current node is not
the page's node, but it may be overkilling. So keep using the current
node's swap slots cache.  The issue was found by visual code inspection
so it is not sure how much improvement could be achieved due to lack of
suitable testing device.  But anyway the current code does violate the
design.

Cc: Huang Ying <ying.huang@xxxxxxxxx>
Signed-off-by: Yang Shi <shy828301@xxxxxxxxx>
---
 include/linux/swap.h | 3 ++-
 mm/swap_slots.c      | 7 ++++---
 mm/swapfile.c        | 5 ++---
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 27093b477c5f..e442cf6b61ea 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -497,7 +497,8 @@ extern void si_swapinfo(struct sysinfo *);
 extern swp_entry_t get_swap_page(struct page *page);
 extern void put_swap_page(struct page *page, swp_entry_t entry);
 extern swp_entry_t get_swap_page_of_type(int);
-extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size);
+extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size,
+			  int node);
 extern int add_swap_count_continuation(swp_entry_t, gfp_t);
 extern void swap_shmem_alloc(swp_entry_t);
 extern int swap_duplicate(swp_entry_t);
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 2b5531840583..a1c5cf6a4302 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -264,7 +264,7 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache)
 	cache->cur = 0;
 	if (swap_slot_cache_active)
 		cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE,
-					   cache->slots, 1);
+					   cache->slots, 1, numa_node_id());
 
 	return cache->nr;
 }
@@ -305,12 +305,13 @@ swp_entry_t get_swap_page(struct page *page)
 {
 	swp_entry_t entry;
 	struct swap_slots_cache *cache;
+	int nid = page_to_nid(page);
 
 	entry.val = 0;
 
 	if (PageTransHuge(page)) {
 		if (IS_ENABLED(CONFIG_THP_SWAP))
-			get_swap_pages(1, &entry, HPAGE_PMD_NR);
+			get_swap_pages(1, &entry, HPAGE_PMD_NR, nid);
 		goto out;
 	}
 
@@ -342,7 +343,7 @@ swp_entry_t get_swap_page(struct page *page)
 			goto out;
 	}
 
-	get_swap_pages(1, &entry, 1);
+	get_swap_pages(1, &entry, 1, nid);
 out:
 	if (mem_cgroup_try_charge_swap(page, entry)) {
 		put_swap_page(page, entry);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 63c61f8b2611..151fffe0fd60 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1036,13 +1036,13 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
 	swap_range_free(si, offset, SWAPFILE_CLUSTER);
 }
 
-int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
+int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size,
+		   int node)
 {
 	unsigned long size = swap_entry_size(entry_size);
 	struct swap_info_struct *si, *next;
 	long avail_pgs;
 	int n_ret = 0;
-	int node;
 
 	/* Only single cluster request supported */
 	WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
@@ -1060,7 +1060,6 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
 	atomic_long_sub(n_goal * size, &nr_swap_pages);
 
 start_over:
-	node = numa_node_id();
 	plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
 		/* requeue si to after same-priority siblings */
 		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
-- 
2.26.3