In order to pursue high performance, applications mostly use some
high-performance user-mode memory allocators, such as jemalloc or
tcmalloc. These memory allocators use madvise(MADV_DONTNEED or MADV_FREE)
to release physical memory, but neither MADV_DONTNEED nor MADV_FREE will
release page table memory, which may cause huge page table memory usage.

The following is a memory usage snapshot of one process actually observed
on our server:

        VIRT:  55t
        RES:   590g
        VmPTE: 110g

In this case, most of the page table entries are empty. For such a PTE
page where all entries are empty, we can actually free it back to the
system for others to use.

As a first step, this commit attempts to synchronously free the empty PTE
pages in zap_page_range_single() (which MADV_DONTNEED etc. will invoke).
In order to reduce overhead, we only handle the cases with a high
probability of generating empty PTE pages; the other cases are filtered
out, such as:

 - hugetlb vma (unsuitable)
 - userfaultfd_wp vma (may reinstall the pte entry)
 - writable private file mapping case (the COW-ed anon page is not zapped)
 - etc.

For the userfaultfd_wp and private file mapping cases (and, of course, the
MADV_FREE case), scanning and freeing empty PTE pages asynchronously can
be considered in the future.

The following code snippet shows the effect of the optimization (a
standalone C sketch of this loop is appended after the patch):

        mmap 50G
        while (1) {
                for (; i < 1024 * 25; i++) {
                        touch 2M memory
                        madvise MADV_DONTNEED 2M
                }
        }

As we can see, the memory usage of VmPTE is reduced:

                        before          after
        VIRT           50.0 GB        50.0 GB
        RES             3.1 MB         3.1 MB
        VmPTE        102640 KB         240 KB

Signed-off-by: Qi Zheng <zhengqi.arch@xxxxxxxxxxxxx>
---
 include/linux/pgtable.h |  14 +++++
 mm/Makefile             |   1 +
 mm/huge_memory.c        |   3 +
 mm/internal.h           |  14 +++++
 mm/khugepaged.c         |  22 ++++++-
 mm/memory.c             |   2 +
 mm/pt_reclaim.c         | 131 ++++++++++++++++++++++++++++++++++++++++
 7 files changed, 186 insertions(+), 1 deletion(-)
 create mode 100644 mm/pt_reclaim.c

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 2f32eaccf0b9..59e894f705a7 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -447,6 +447,20 @@ static inline void arch_check_zapped_pmd(struct vm_area_struct *vma,
 }
 #endif
 
+#ifndef arch_flush_tlb_before_set_huge_page
+static inline void arch_flush_tlb_before_set_huge_page(struct mm_struct *mm,
+						       unsigned long addr)
+{
+}
+#endif
+
+#ifndef arch_flush_tlb_before_set_pte_page
+static inline void arch_flush_tlb_before_set_pte_page(struct mm_struct *mm,
+						      unsigned long addr)
+{
+}
+#endif
+
 #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
 				       unsigned long address,
diff --git a/mm/Makefile b/mm/Makefile
index d2915f8c9dc0..3cb3c1f5d090 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -141,3 +141,4 @@ obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
 obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
 obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
 obj-$(CONFIG_EXECMEM) += execmem.o
+obj-$(CONFIG_PT_RECLAIM) += pt_reclaim.o
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c7ce28f6b7f3..444a1cdaf06d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -977,6 +977,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
 		folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
 		folio_add_lru_vma(folio, vma);
 		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
+		arch_flush_tlb_before_set_huge_page(vma->vm_mm, haddr);
 		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
 		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
 		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
@@ -1044,6 +1045,7 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
 	entry = mk_pmd(&zero_folio->page, vma->vm_page_prot);
 	entry = pmd_mkhuge(entry);
 	pgtable_trans_huge_deposit(mm, pmd, pgtable);
+	arch_flush_tlb_before_set_huge_page(mm, haddr);
 	set_pmd_at(mm, haddr, pmd, entry);
 	mm_inc_nr_ptes(mm);
 }
@@ -1151,6 +1153,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 		pgtable = NULL;
 	}
 
+	arch_flush_tlb_before_set_huge_page(mm, addr);
 	set_pmd_at(mm, addr, pmd, entry);
 	update_mmu_cache_pmd(vma, addr, pmd);
diff --git a/mm/internal.h b/mm/internal.h
index 1dfdad110a9a..ac1fdd4681dc 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1579,4 +1579,18 @@ void unlink_file_vma_batch_init(struct unlink_vma_file_batch *);
 void unlink_file_vma_batch_add(struct unlink_vma_file_batch *, struct vm_area_struct *);
 void unlink_file_vma_batch_final(struct unlink_vma_file_batch *);
 
+#ifdef CONFIG_PT_RECLAIM
+void try_to_reclaim_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
+			     unsigned long start_addr, unsigned long end_addr,
+			     struct zap_details *details);
+#else
+static inline void try_to_reclaim_pgtables(struct mmu_gather *tlb,
+					   struct vm_area_struct *vma,
+					   unsigned long start_addr,
+					   unsigned long end_addr,
+					   struct zap_details *details)
+{
+}
+#endif /* CONFIG_PT_RECLAIM */
+
 #endif /* __MM_INTERNAL_H */
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 7b7c858d5f99..63551077795d 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1578,7 +1578,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 	if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
 		pml = pmd_lock(mm, pmd);
 
-	start_pte = pte_offset_map_nolock(mm, pmd, NULL, haddr, &ptl);
+	start_pte = pte_offset_map_nolock(mm, pmd, &pgt_pmd, haddr, &ptl);
 	if (!start_pte)		/* mmap_lock + page lock should prevent this */
 		goto abort;
 	if (!pml)
@@ -1586,6 +1586,11 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 	else if (ptl != pml)
 		spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
 
+	/* pmd entry may be changed by others */
+	if (unlikely(IS_ENABLED(CONFIG_PT_RECLAIM) && !pml &&
+		     !pmd_same(pgt_pmd, pmdp_get_lockless(pmd))))
+		goto abort;
+
 	/* step 2: clear page table and adjust rmap */
 	for (i = 0, addr = haddr, pte = start_pte;
 	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
@@ -1633,6 +1638,12 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 		pml = pmd_lock(mm, pmd);
 		if (ptl != pml)
 			spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+
+		if (unlikely(IS_ENABLED(CONFIG_PT_RECLAIM) &&
+			     !pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
+			spin_unlock(ptl);
+			goto unlock;
+		}
 	}
 	pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
 	pmdp_get_lockless_sync();
@@ -1660,6 +1671,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 	}
 	if (start_pte)
 		pte_unmap_unlock(start_pte, ptl);
+unlock:
 	if (pml && pml != ptl)
 		spin_unlock(pml);
 	if (notified)
@@ -1719,6 +1731,14 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 			mmu_notifier_invalidate_range_start(&range);
 
 			pml = pmd_lock(mm, pmd);
+#ifdef CONFIG_PT_RECLAIM
+			/* check if the pmd is still valid */
+			if (check_pmd_still_valid(mm, addr, pmd) != SCAN_SUCCEED) {
+				spin_unlock(pml);
+				mmu_notifier_invalidate_range_end(&range);
+				continue;
+			}
+#endif
 			ptl = pte_lockptr(mm, pmd);
 			if (ptl != pml)
 				spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
diff --git a/mm/memory.c b/mm/memory.c
index 09db2c97cc5c..b07d63767d93 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -423,6 +423,7 @@ void pmd_install(struct mm_struct *mm, pmd_t *pmd, unsigned long addr,
 	spinlock_t *ptl = pmd_lock(mm, pmd);
 
 	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
+		arch_flush_tlb_before_set_pte_page(mm, addr);
 		mm_inc_nr_ptes(mm);
 		/*
 		 * Ensure all pte setup (eg. pte page lock and page clearing) are
@@ -1931,6 +1932,7 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
 	 * could have been expanded for hugetlb pmd sharing.
 	 */
 	unmap_single_vma(&tlb, vma, address, end, details, false);
+	try_to_reclaim_pgtables(&tlb, vma, address, end, details);
 	mmu_notifier_invalidate_range_end(&range);
 	tlb_finish_mmu(&tlb);
 	hugetlb_zap_end(vma, details);
diff --git a/mm/pt_reclaim.c b/mm/pt_reclaim.c
new file mode 100644
index 000000000000..e375e7f2059f
--- /dev/null
+++ b/mm/pt_reclaim.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/pagewalk.h>
+#include <linux/hugetlb.h>
+#include <asm-generic/tlb.h>
+#include <asm/pgalloc.h>
+
+#include "internal.h"
+
+/*
+ * Locking:
+ * - already held the mmap read lock to traverse the pgtable
+ * - use pmd lock for clearing pmd entry
+ * - use pte lock for checking empty PTE page, and release it after clearing
+ *   pmd entry, then we can capture the changed pmd in pte_offset_map_lock()
+ *   etc after holding this pte lock. Thanks to this, we don't need to hold the
+ *   rmap-related locks.
+ * - users of pte_offset_map_lock() etc all expect the PTE page to be stable by
+ *   using rcu lock, so PTE pages should be freed by RCU.
+ */
+static int reclaim_pgtables_pmd_entry(pmd_t *pmd, unsigned long addr,
+				      unsigned long next, struct mm_walk *walk)
+{
+	struct mm_struct *mm = walk->mm;
+	struct mmu_gather *tlb = walk->private;
+	pte_t *start_pte, *pte;
+	pmd_t pmdval;
+	spinlock_t *pml = NULL, *ptl;
+	int i;
+
+	start_pte = pte_offset_map_nolock(mm, pmd, &pmdval, addr, &ptl);
+	if (!start_pte)
+		return 0;
+
+	pml = pmd_lock(mm, pmd);
+	if (ptl != pml)
+		spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+
+	if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd))))
+		goto out_ptl;
+
+	/* Check if it is empty PTE page */
+	for (i = 0, pte = start_pte; i < PTRS_PER_PTE; i++, pte++) {
+		if (!pte_none(ptep_get(pte)))
+			goto out_ptl;
+	}
+	pte_unmap(start_pte);
+
+	pmd_clear(pmd);
+	if (ptl != pml)
+		spin_unlock(ptl);
+	spin_unlock(pml);
+
+	/*
+	 * NOTE:
+	 * In order to reuse mmu_gather to batch flush tlb and free PTE pages,
+	 * here tlb is not flushed before pmd lock is unlocked. This may
+	 * result in the following two situations:
+	 *
+	 * 1) Userland can trigger page fault and fill a huge page, which will
+	 *    cause the existence of small size TLB and huge TLB for the same
+	 *    address.
+	 *
+	 * 2) Userland can also trigger page fault and fill a PTE page, which
+	 *    will cause the existence of two small size TLBs, but the PTE
+	 *    pages they map are different.
+	 *
+	 * Some CPUs do not allow these; to solve this, we can define
+	 * arch_flush_tlb_before_set_{huge|pte}_page to detect this case and
+	 * flush TLB before filling a huge page or a PTE page in page fault
+	 * path.
+	 */
+	pte_free_tlb(tlb, pmd_pgtable(pmdval), addr);
+	mm_dec_nr_ptes(mm);
+
+	return 0;
+
+out_ptl:
+	pte_unmap_unlock(start_pte, ptl);
+	if (pml != ptl)
+		spin_unlock(pml);
+
+	return 0;
+}
+
+static const struct mm_walk_ops reclaim_pgtables_walk_ops = {
+	.pmd_entry = reclaim_pgtables_pmd_entry,
+	.walk_lock = PGWALK_RDLOCK,
+};
+
+void try_to_reclaim_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
+			     unsigned long start_addr, unsigned long end_addr,
+			     struct zap_details *details)
+{
+	unsigned long start = max(vma->vm_start, start_addr);
+	unsigned long end;
+
+	if (start >= vma->vm_end)
+		return;
+	end = min(vma->vm_end, end_addr);
+	if (end <= vma->vm_start)
+		return;
+
+	/* Skip hugetlb case */
+	if (is_vm_hugetlb_page(vma))
+		return;
+
+	/* Leave this to the THP path to handle */
+	if (vma->vm_flags & VM_HUGEPAGE)
+		return;
+
+	/* userfaultfd_wp case may reinstall the pte entry, also skip */
+	if (userfaultfd_wp(vma))
+		return;
+
+	/*
+	 * For private file mapping, the COW-ed page is an anon page, and it
+	 * will not be zapped. For simplicity, skip all writable private
+	 * file mapping cases.
+	 */
+	if (details && !vma_is_anonymous(vma) &&
+	    !(vma->vm_flags & VM_MAYSHARE) &&
+	    (vma->vm_flags & VM_WRITE))
+		return;
+
+	start = ALIGN(start, PMD_SIZE);
+	end = ALIGN_DOWN(end, PMD_SIZE);
+	if (end - start < PMD_SIZE)
+		return;
+
+	walk_page_range_vma(vma, start, end, &reclaim_pgtables_walk_ops, tlb);
+}
-- 
2.20.1
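
For reference, the "touch 2M then MADV_DONTNEED 2M" loop from the commit
message could look roughly like the standalone userspace sketch below. Only
the 2M chunk size and the 1024 * 25 chunk count (the 50G mapping) are taken
from the snippet above; the single pass instead of while (1), the 2M
alignment of the buffer, the show_vmpte() helper reading /proc/self/status
and the minimal error handling are illustrative additions, not part of the
patch.

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>

#define CHUNK		(2UL << 20)	/* 2M per chunk, as in the snippet */
#define NR_CHUNKS	(1024UL * 25)	/* 25 * 1024 chunks == 50G of VIRT */

/* Print the VmPTE line of /proc/self/status (illustrative helper). */
static void show_vmpte(void)
{
	char line[128];
	FILE *f = fopen("/proc/self/status", "r");

	if (!f)
		return;
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "VmPTE", 5))
			fputs(line, stdout);
	fclose(f);
}

int main(void)
{
	size_t len = CHUNK * NR_CHUNKS + CHUNK;
	char *map, *buf;
	unsigned long i;

	map = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/*
	 * Align the working area to 2M so that each chunk is covered by
	 * exactly one PTE page and each MADV_DONTNEED spans a whole PMD
	 * range (try_to_reclaim_pgtables() skips partial PMD ranges).
	 */
	buf = (char *)(((uintptr_t)map + CHUNK - 1) & ~(CHUNK - 1));

	for (i = 0; i < NR_CHUNKS; i++) {
		char *chunk = buf + i * CHUNK;

		memset(chunk, 1, CHUNK);		/* touch 2M memory     */
		madvise(chunk, CHUNK, MADV_DONTNEED);	/* ...and zap it again */
	}

	show_vmpte();
	return 0;
}

With this patch applied, the VmPTE value printed at the end should be on
the order of the "after" column in the table above; without it, on the
order of the "before" column.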