[PATCH RFC] mm: map zero-filled pages to zero_pfn while doing swap-in

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Barry Song <v-songbaohua@xxxxxxxx>

While developing the zeromap series, Usama observed that certain
workloads may contain over 10% zero-filled pages. This may present
an opportunity to save memory by mapping zero-filled pages to zero_pfn
in do_swap_page(). If a write occurs later, do_wp_page() can
allocate a new page using the Copy-on-Write mechanism.

For workloads with numerous zero-filled pages, this can greatly
reduce the resident set size (RSS).

For example:
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 #include <sys/mman.h>

 #define SIZE (20 * 1024 * 1024)
 /*
  * Demo: fault in 20 MiB of zero-filled anonymous memory, push it to swap
  * with MADV_PAGEOUT, then read it back so the swap-in path runs.
  * RSS afterwards shows whether swap-in mapped the zero page or
  * allocated real folios.
  */
 int main()
 {
 	volatile char *buffer = (char *)mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
 			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 	volatile char data;

 	if (buffer == MAP_FAILED) {
 		perror("mmap failed");
 		exit(EXIT_FAILURE);
 	}

 	/*
 	 * memset()/madvise() take plain void *; cast away the volatile
 	 * qualifier explicitly — passing the qualified pointer directly
 	 * is a constraint violation and fails to compile cleanly.
 	 */
 	memset((void *)buffer, 0, SIZE);

 	if (madvise((void *)buffer, SIZE, MADV_PAGEOUT) != 0)
 		perror("madvise MADV_PAGEOUT failed");

 	/* Touch every byte read-only so each page swap-faults back in. */
 	for (size_t i = 0; i < SIZE; i++)
 		data = buffer[i];
 	(void)data;	/* volatile sink; suppress set-but-unused warning */

 	/* Park so RSS can be inspected with ps while the mappings live. */
 	sleep(1000);

 	return 0;
 }

~ # ./a.out &

w/o patch:
~ # ps aux | head -n 1; ps aux | grep '[a]\.out'
USER       PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
root       101  2.9 10.6  22540 21268 ttyAMA0  S    06:50   0:00 ./a.out

w/ patch:
~ # ps aux | head -n 1; ps aux | grep '[a]\.out'
USER       PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
root       141  0.1  0.3  22540   792 ttyAMA0  S    06:38   0:00 ./a.out

Signed-off-by: Barry Song <v-songbaohua@xxxxxxxx>
---
 mm/memory.c | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/mm/memory.c b/mm/memory.c
index 2bacebbf4cf6..b37f0f61d0bc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4294,6 +4294,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	struct swap_info_struct *si = NULL;
 	rmap_t rmap_flags = RMAP_NONE;
 	bool need_clear_cache = false;
+	bool map_zero_pfn = false;
 	bool exclusive = false;
 	swp_entry_t entry;
 	pte_t pte;
@@ -4364,6 +4365,39 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	swapcache = folio;
 
 	if (!folio) {
+		/* Use the zero-page for reads */
+		if (!(vmf->flags & FAULT_FLAG_WRITE) &&
+		    !mm_forbids_zeropage(vma->vm_mm) &&
+		    __swap_count(entry) == 1)  {
+			swap_zeromap_batch(entry, 1, &map_zero_pfn);
+			if (map_zero_pfn) {
+				if (swapcache_prepare(entry, 1)) {
+					add_wait_queue(&swapcache_wq, &wait);
+					schedule_timeout_uninterruptible(1);
+					remove_wait_queue(&swapcache_wq, &wait);
+					goto out;
+				}
+				nr_pages = 1;
+				need_clear_cache = true;
+				pte = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
+						vma->vm_page_prot));
+				vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+						&vmf->ptl);
+				if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte),
+						vmf->orig_pte)))
+					goto unlock;
+
+				page = pfn_to_page(my_zero_pfn(vmf->address));
+				arch_swap_restore(entry, page_folio(page));
+				swap_free_nr(entry, 1);
+				add_mm_counter(vma->vm_mm, MM_SWAPENTS, -1);
+				set_ptes(vma->vm_mm, vmf->address, vmf->pte, pte, 1);
+				arch_do_swap_page_nr(vma->vm_mm, vma, vmf->address, pte, pte, 1);
+				update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
+				goto unlock;
+			}
+		}
+
 		if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
 		    __swap_count(entry) == 1) {
 			/* skip swapcache */
-- 
2.39.3 (Apple Git-146)





[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux