From: Alexander Zhu <alexlzhu@xxxxxx>

Avoid remapping subpages that are found to contain only zeros when they
are freed in split_huge_page(). Such zero subpages are not remapped,
except in the userfaultfd case: there we remap to the shared read-only
zero page, similar to what is done by KSM.

Signed-off-by: Alexander Zhu <alexlzhu@xxxxxx>
---
v1 to v2
-Only trigger the unmap_clean/zap in split_huge_page on anonymous THPs.
 We cannot zap zero pages for file THPs.

RFC to v1
-Added support to map to the read-only zero page when splitting a THP
 registered with userfaultfd.
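Below is a rough, hypothetical test program sketching how the effect can
be observed from userspace. It is illustrative only and not part of this
patch: it assumes 2MB THPs and a kernel built with CONFIG_DEBUG_FS, and
drives the split through the pre-existing
/sys/kernel/debug/split_huge_pages interface.

/*
 * Illustrative sketch, not part of this patch: make one subpage of an
 * otherwise zero-filled THP non-zero, force a split via the existing
 * split_huge_pages debugfs interface, and watch RSS drop as the zero
 * subpages are freed rather than remapped. Assumes 2MB THPs and
 * CONFIG_DEBUG_FS; error handling is omitted for brevity.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define THP_SIZE	(2UL << 20)

static long rss_kb(void)
{
	long size, resident;
	FILE *f = fopen("/proc/self/statm", "r");

	fscanf(f, "%ld %ld", &size, &resident);
	fclose(f);
	return resident * (sysconf(_SC_PAGESIZE) / 1024);
}

int main(void)
{
	char cmd[128];
	int fd;
	/* over-allocate so the buffer can be aligned to a THP boundary */
	char *map = mmap(NULL, 2 * THP_SIZE, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char *buf = (char *)(((unsigned long)map + THP_SIZE - 1) &
			     ~(THP_SIZE - 1));

	madvise(buf, THP_SIZE, MADV_HUGEPAGE);
	memset(buf, 0, THP_SIZE);	/* fault in the THP, all zeros */
	buf[0] = 1;			/* only the first subpage is non-zero */
	printf("RSS before split: %ld kB\n", rss_kb());

	/* "<pid>,<vaddr_start>,<vaddr_end>" splits THPs in that range */
	snprintf(cmd, sizeof(cmd), "%d,0x%lx,0x%lx", getpid(),
		 (unsigned long)buf, (unsigned long)buf + THP_SIZE);
	fd = open("/sys/kernel/debug/split_huge_pages", O_WRONLY);
	write(fd, cmd, strlen(cmd));
	close(fd);

	/* with this change, only the non-zero subpage should stay resident */
	printf("RSS after split:  %ld kB\n", rss_kb());
	return 0;
}

Without this change the two RSS values should match; with it, the second
should be smaller by roughly THP_SIZE minus one base page, with the
THP_SPLIT_UNMAP vm event counted once per freed subpage
(THP_SPLIT_REMAP_READONLY_ZERO_PAGE instead for userfaultfd-armed VMAs).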
 include/linux/rmap.h |  2 +-
 mm/huge_memory.c     |  8 ++---
 mm/migrate.c         | 73 +++++++++++++++++++++++++++++++++++++++-----
 mm/migrate_device.c  |  4 +--
 4 files changed, 73 insertions(+), 14 deletions(-)

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index bd3504d11b15..3f83bbcf1333 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -428,7 +428,7 @@ int folio_mkclean(struct folio *);
 int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
 		      struct vm_area_struct *vma);
 
-void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);
+void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked, bool unmap_clean);
 
 int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 363218e8a22a..f68a353e0adf 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2373,7 +2373,7 @@ static void unmap_folio(struct folio *folio)
 	try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);
 }
 
-static void remap_page(struct folio *folio, unsigned long nr)
+static void remap_page(struct folio *folio, unsigned long nr, bool unmap_clean)
 {
 	int i = 0;
 
@@ -2381,7 +2381,7 @@ static void remap_page(struct folio *folio, unsigned long nr)
 	if (!folio_test_anon(folio))
 		return;
 	for (;;) {
-		remove_migration_ptes(folio, folio, true);
+		remove_migration_ptes(folio, folio, true, unmap_clean);
 		i += folio_nr_pages(folio);
 		if (i >= nr)
 			break;
@@ -2560,7 +2560,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 	}
 	local_irq_enable();
 
-	remap_page(folio, nr);
+	remap_page(folio, nr, PageAnon(head));
 
 	if (PageSwapCache(head)) {
 		swp_entry_t entry = { .val = page_private(head) };
@@ -2789,7 +2789,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 		if (mapping)
 			xas_unlock(&xas);
 		local_irq_enable();
-		remap_page(folio, folio_nr_pages(folio));
+		remap_page(folio, folio_nr_pages(folio), false);
 		ret = -EBUSY;
 	}
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 1379e1912772..bc96a084d925 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -30,6 +30,7 @@
 #include <linux/writeback.h>
 #include <linux/mempolicy.h>
 #include <linux/vmalloc.h>
+#include <linux/vm_event_item.h>
 #include <linux/security.h>
 #include <linux/backing-dev.h>
 #include <linux/compaction.h>
@@ -168,13 +169,62 @@ void putback_movable_pages(struct list_head *l)
 	}
 }
 
+static bool try_to_unmap_clean(struct page_vma_mapped_walk *pvmw, struct page *page)
+{
+	void *addr;
+	bool dirty;
+	pte_t newpte;
+
+	VM_BUG_ON_PAGE(PageCompound(page), page);
+	VM_BUG_ON_PAGE(!PageAnon(page), page);
+	VM_BUG_ON_PAGE(!PageLocked(page), page);
+	VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
+
+	if (PageMlocked(page) || (pvmw->vma->vm_flags & VM_LOCKED))
+		return false;
+
+	/*
+	 * The pmd entry mapping the old THP was flushed and the pte mapping
+	 * this subpage is now non-present, so the subpage is inaccessible.
+	 * We don't need to remap it if it contains only zeros.
+	 */
+	addr = kmap_local_page(page);
+	dirty = memchr_inv(addr, 0, PAGE_SIZE);
+	kunmap_local(addr);
+
+	if (dirty)
+		return false;
+
+	pte_clear_not_present_full(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, false);
+
+	if (userfaultfd_armed(pvmw->vma)) {
+		newpte = pte_mkspecial(pfn_pte(page_to_pfn(ZERO_PAGE(pvmw->address)),
+					pvmw->vma->vm_page_prot));
+		ptep_clear_flush(pvmw->vma, pvmw->address, pvmw->pte);
+		set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, newpte);
+		dec_mm_counter(pvmw->vma->vm_mm, MM_ANONPAGES);
+		count_vm_event(THP_SPLIT_REMAP_READONLY_ZERO_PAGE);
+		return true;
+	}
+
+	dec_mm_counter(pvmw->vma->vm_mm, mm_counter(page));
+	count_vm_event(THP_SPLIT_UNMAP);
+	return true;
+}
+
+struct rmap_walk_arg {
+	struct folio *folio;
+	bool unmap_clean;
+};
+
 /*
  * Restore a potential migration pte to a working pte entry
  */
 static bool remove_migration_pte(struct folio *folio,
-		struct vm_area_struct *vma, unsigned long addr, void *old)
+		struct vm_area_struct *vma, unsigned long addr, void *arg)
 {
-	DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
+	struct rmap_walk_arg *rmap_walk_arg = arg;
+	DEFINE_FOLIO_VMA_WALK(pvmw, rmap_walk_arg->folio, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
 
 	while (page_vma_mapped_walk(&pvmw)) {
 		rmap_t rmap_flags = RMAP_NONE;
@@ -197,6 +247,8 @@ static bool remove_migration_pte(struct folio *folio,
 			continue;
 		}
 #endif
+		if (rmap_walk_arg->unmap_clean && try_to_unmap_clean(&pvmw, new))
+			continue;
 
 		folio_get(folio);
 		pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
@@ -272,13 +324,20 @@ static bool remove_migration_pte(struct folio *folio,
  * Get rid of all migration entries and replace them by
  * references to the indicated page.
  */
-void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked)
+void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked, bool unmap_clean)
 {
+	struct rmap_walk_arg rmap_walk_arg = {
+		.folio = src,
+		.unmap_clean = unmap_clean,
+	};
+
 	struct rmap_walk_control rwc = {
 		.rmap_one = remove_migration_pte,
-		.arg = src,
+		.arg = &rmap_walk_arg,
 	};
 
+	VM_BUG_ON_FOLIO(unmap_clean && src != dst, src);
+
 	if (locked)
 		rmap_walk_locked(dst, &rwc);
 	else
@@ -872,7 +931,7 @@ static int writeout(struct address_space *mapping, struct folio *folio)
 	 * At this point we know that the migration attempt cannot
 	 * be successful.
 	 */
-	remove_migration_ptes(folio, folio, false);
+	remove_migration_ptes(folio, folio, false, false);
 
 	rc = mapping->a_ops->writepage(&folio->page, &wbc);
 
@@ -1128,7 +1187,7 @@ static int __unmap_and_move(struct folio *src, struct folio *dst,
 
 	if (page_was_mapped)
 		remove_migration_ptes(src,
-			rc == MIGRATEPAGE_SUCCESS ? dst : src, false);
+			rc == MIGRATEPAGE_SUCCESS ? dst : src, false, false);
 
 out_unlock_both:
 	folio_unlock(dst);
@@ -1338,7 +1397,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
 
 	if (page_was_mapped)
 		remove_migration_ptes(src,
-			rc == MIGRATEPAGE_SUCCESS ? dst : src, false);
+			rc == MIGRATEPAGE_SUCCESS ? dst : src, false, false);
 
 unlock_put_anon:
 	folio_unlock(dst);

diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 6fa682eef7a0..6508a083d7fd 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -421,7 +421,7 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
 			continue;
 
 		folio = page_folio(page);
-		remove_migration_ptes(folio, folio, false);
+		remove_migration_ptes(folio, folio, false, false);
 
 		src_pfns[i] = 0;
 		folio_unlock(folio);
@@ -847,7 +847,7 @@ void migrate_device_finalize(unsigned long *src_pfns,
 		src = page_folio(page);
 		dst = page_folio(newpage);
 
-		remove_migration_ptes(src, dst, false);
+		remove_migration_ptes(src, dst, false, false);
 		folio_unlock(src);
 
 		if (is_zone_device_page(page))
-- 
2.30.2