From: Barry Song <v-songbaohua@xxxxxxxx>

While developing the zeromap series, Usama observed that certain
workloads may contain over 10% zero-filled pages. This presents an
opportunity to save memory by mapping zero-filled pages to zero_pfn
in do_swap_page(). If a write occurs later, do_wp_page() can allocate
a new page using the copy-on-write mechanism. For workloads with
numerous zero-filled pages, this can greatly reduce the RSS.

For example:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

#define SIZE (20 * 1024 * 1024)

int main(void)
{
        volatile char *buffer = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
                                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        volatile char data;

        if (buffer == MAP_FAILED) {
                perror("mmap failed");
                exit(EXIT_FAILURE);
        }

        memset((void *)buffer, 0, SIZE);

        if (madvise((void *)buffer, SIZE, MADV_PAGEOUT) != 0)
                perror("madvise MADV_PAGEOUT failed");

        for (size_t i = 0; i < SIZE; i++)
                data = buffer[i];

        sleep(1000);
        return 0;
}

~ # ./a.out &

w/o patch:
~ # ps aux | head -n 1; ps aux | grep '[a]\.out'
USER       PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
root       101  2.9 10.6  22540 21268 ttyAMA0  S    06:50   0:00 ./a.out

w/ patch:
~ # ps aux | head -n 1; ps aux | grep '[a]\.out'
USER       PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
root       141  0.1  0.3  22540   792 ttyAMA0  S    06:38   0:00 ./a.out

Signed-off-by: Barry Song <v-songbaohua@xxxxxxxx>
---
 mm/memory.c | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/mm/memory.c b/mm/memory.c
index 2bacebbf4cf6..b37f0f61d0bc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4294,6 +4294,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	struct swap_info_struct *si = NULL;
 	rmap_t rmap_flags = RMAP_NONE;
 	bool need_clear_cache = false;
+	bool map_zero_pfn = false;
 	bool exclusive = false;
 	swp_entry_t entry;
 	pte_t pte;
@@ -4364,6 +4365,39 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	swapcache = folio;
 
 	if (!folio) {
+		/* Use the zero-page for reads */
+		if (!(vmf->flags & FAULT_FLAG_WRITE) &&
+		    !mm_forbids_zeropage(vma->vm_mm) &&
+		    __swap_count(entry) == 1) {
+			swap_zeromap_batch(entry, 1, &map_zero_pfn);
+			if (map_zero_pfn) {
+				if (swapcache_prepare(entry, 1)) {
+					add_wait_queue(&swapcache_wq, &wait);
+					schedule_timeout_uninterruptible(1);
+					remove_wait_queue(&swapcache_wq, &wait);
+					goto out;
+				}
+				nr_pages = 1;
+				need_clear_cache = true;
+				pte = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
+							    vma->vm_page_prot));
+				vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+							       &vmf->ptl);
+				if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte),
+								    vmf->orig_pte)))
+					goto unlock;
+
+				page = pfn_to_page(my_zero_pfn(vmf->address));
+				arch_swap_restore(entry, page_folio(page));
+				swap_free_nr(entry, 1);
+				add_mm_counter(vma->vm_mm, MM_SWAPENTS, -1);
+				set_ptes(vma->vm_mm, vmf->address, vmf->pte, pte, 1);
+				arch_do_swap_page_nr(vma->vm_mm, vma, vmf->address, pte, pte, 1);
+				update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
+				goto unlock;
+			}
+		}
+
 		if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
 		    __swap_count(entry) == 1) {
 			/* skip swapcache */
-- 
2.39.3 (Apple Git-146)
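
As an additional self-check (not part of the patch), the reproducer
above can report its own RSS directly by parsing /proc/self/statm
rather than relying on ps(1). The read_rss_pages() helper below is
purely illustrative:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

#define SIZE (20 * 1024 * 1024)

/* Illustrative helper: the second field of /proc/self/statm is the
 * resident set size in pages. */
static long read_rss_pages(void)
{
        FILE *f = fopen("/proc/self/statm", "r");
        long size = 0, rss = -1;

        if (f) {
                if (fscanf(f, "%ld %ld", &size, &rss) != 2)
                        rss = -1;
                fclose(f);
        }
        return rss;
}

int main(void)
{
        volatile char *buffer = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
                                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        volatile char data;

        if (buffer == MAP_FAILED) {
                perror("mmap failed");
                exit(EXIT_FAILURE);
        }

        memset((void *)buffer, 0, SIZE);
        if (madvise((void *)buffer, SIZE, MADV_PAGEOUT) != 0)
                perror("madvise MADV_PAGEOUT failed");

        printf("RSS after pageout:   %ld pages\n", read_rss_pages());

        /* Read-only faults: with the patch these map to zero_pfn. */
        for (size_t i = 0; i < SIZE; i++)
                data = buffer[i];

        printf("RSS after read-back: %ld pages\n", read_rss_pages());
        return 0;
}

Without the patch, the second figure should come out near
SIZE / page-size (the zero-filled pages are re-allocated on swap-in);
with the patch it should stay close to the first.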