[PATCH 23/24] swap: fix multiple swap leak when after cgroup migrate

Kairui Song <ryncsn@xxxxxxxxx> · Mon, 20 Nov 2023 03:47:39 +0800

From: Kairui Song <kasong@xxxxxxxxxxx>

When a process which previously swapped some memory was moved to
another cgroup, and the cgroup it previous in is dead, then swapped in
pages will be leaked into rootcg. Previous commits fixed the bug for
no readahead path, this commit fix the same issue for readahead path.

This can be easily reproduced by:
- Setup a SSD or HDD swap.
- Create memory cgroup A, B and C.
- Spawn process P1 in cgroup A and make it swap out some pages.
- Move process P1 to memory cgroup B.
- Destroy cgroup A.
- Do a swapoff in cgroup C
- Swapped in pages is accounted into cgroup C.

This patch will fix it make the swapped in pages accounted in cgroup B.

Signed-off-by: Kairui Song <kasong@xxxxxxxxxxx>
---
 mm/swap.h       |  2 +-
 mm/swap_state.c | 19 ++++++++++---------
 mm/zswap.c      |  2 +-
 3 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/mm/swap.h b/mm/swap.h
index 795a25df87da..4374bf11ca41 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -55,7 +55,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 				   struct swap_iocb **plug);
 struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 				     struct mempolicy *mpol, pgoff_t ilx,
-				     bool *new_page_allocated);
+				     struct mm_struct *mm, bool *new_page_allocated);
 struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
 			      struct vm_fault *vmf, enum swap_cache_result *result);
 struct page *swapin_page_non_fault(swp_entry_t entry, gfp_t gfp_mask,
diff --git a/mm/swap_state.c b/mm/swap_state.c
index b377e55cb850..362a6f674b36 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -416,7 +416,7 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping,
 
 struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 				     struct mempolicy *mpol, pgoff_t ilx,
-				     bool *new_page_allocated)
+				     struct mm_struct *mm, bool *new_page_allocated)
 {
 	struct swap_info_struct *si;
 	struct folio *folio;
@@ -462,7 +462,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 						mpol, ilx, numa_node_id());
 		if (!folio)
                         goto fail_put_swap;
-		if (mem_cgroup_swapin_charge_folio(folio, NULL, gfp_mask, entry))
+		if (mem_cgroup_swapin_charge_folio(folio, mm, gfp_mask, entry))
 			goto fail_put_folio;
 
 		/*
@@ -540,7 +540,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 
 	mpol = get_vma_policy(vma, addr, 0, &ilx);
 	page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
-					&page_allocated);
+				       vma->vm_mm, &page_allocated);
 	mpol_cond_put(mpol);
 
 	if (page_allocated)
@@ -628,7 +628,8 @@ static unsigned long swapin_nr_pages(unsigned long offset)
  * are fairly likely to have been swapped out from the same node.
  */
 static struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
-					   struct mempolicy *mpol, pgoff_t ilx)
+					   struct mempolicy *mpol, pgoff_t ilx,
+					   struct mm_struct *mm)
 {
 	struct page *page;
 	unsigned long entry_offset = swp_offset(entry);
@@ -657,7 +658,7 @@ static struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 		/* Ok, do the async read-ahead now */
 		page = __read_swap_cache_async(
 				swp_entry(swp_type(entry), offset),
-				gfp_mask, mpol, ilx, &page_allocated);
+				gfp_mask, mpol, ilx, mm, &page_allocated);
 		if (!page)
 			continue;
 		if (page_allocated) {
@@ -675,7 +676,7 @@ static struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 skip:
 	/* The page was likely read above, so no need for plugging here */
 	page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
-					&page_allocated);
+				       mm, &page_allocated);
 	if (unlikely(page_allocated))
 		swap_readpage(page, false, NULL);
 	return page;
@@ -830,7 +831,7 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
 		pte_unmap(pte);
 		pte = NULL;
 		page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
-						&page_allocated);
+					       vmf->vma->vm_mm, &page_allocated);
 		if (!page)
 			continue;
 		if (page_allocated) {
@@ -850,7 +851,7 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
 skip:
 	/* The page was likely read above, so no need for plugging here */
 	page = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx,
-					&page_allocated);
+				       vmf->vma->vm_mm, &page_allocated);
 	if (unlikely(page_allocated))
 		swap_readpage(page, false, NULL);
 	return page;
@@ -980,7 +981,7 @@ struct page *swapin_page_non_fault(swp_entry_t entry, gfp_t gfp_mask,
 			workingset_refault(page_folio(page), shadow);
 		cache_result = SWAP_CACHE_BYPASS;
 	} else {
-		page = swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
+		page = swap_cluster_readahead(entry, gfp_mask, mpol, ilx, mm);
 		cache_result = SWAP_CACHE_MISS;
 	}
 done:
diff --git a/mm/zswap.c b/mm/zswap.c
index 030cc137138f..e2712ff169b1 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1081,7 +1081,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 	/* try to allocate swap cache page */
 	mpol = get_task_policy(current);
 	page = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
-				NO_INTERLEAVE_INDEX, &page_was_allocated);
+				NO_INTERLEAVE_INDEX, NULL, &page_was_allocated);
 	if (!page) {
 		ret = -ENOMEM;
 		goto fail;
-- 
2.42.0