On Thu, Feb 29, 2024 at 10:34 AM Yu Zhao <yuzhao@xxxxxxxxxx> wrote:
>
> HVO can be one of the perks for heavy THP users like it is for hugeTLB
> users. For example, if such a user uses 60% of physical memory for 2MB
> THPs, THP HVO can reduce the struct page overhead by half (60% * 7/8
> ~= 50%).
>
> ZONE_NOMERGE considerably simplifies the implementation of HVO for
> THPs, since THPs from it cannot be split or merged and thus do not
> require any correctness-related operations on tail pages beyond the
> second one.
>
> If a THP is mapped by PTEs, two optimization-related operations on its
> tail pages, i.e., _mapcount and PG_anon_exclusive, can be binned to
> track a group of pages, e.g., eight pages per group for 2MB THPs. The
> estimation, as the copying cost incurred during shattering, is also by
> design, since mapping by PTEs is another discouraged behavior.

I'm confused by this. Can you please elaborate a little on how _mapcount
and PG_anon_exclusive are binned?

For _mapcount, IIUC, when inc'ing a subpage's mapcount you actually inc
a counter shared by its group of 8 pages (assuming 2MB THPs and 4KB base
pages, so 64 groups of 8 pages each), right? But then how can you tell
whether each of the 8 pages in a group has mapcount 1 or one page in the
group is mapped 8 times? Or does this actually not matter, i.e., we
don't even care to distinguish the two cases?

For PG_anon_exclusive, if one page has it set, does that mean the other
7 pages in its group have it set too?
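To make sure I'm reading hvo_map_range() correctly, here is the
arithmetic I have in mind, as a stand-alone user-space toy rather than
the kernel code (it assumes 2MB THPs, 4KB base pages and a 64-byte
struct page, so hvo_order_size() == 32KB, a stride of 8, and, if I read
HVO_MOD right, the 64 group counters living in the 64 struct pages that
HVO keeps materialized):

  /* Toy model of the _mapcount binning as I understand it. */
  #include <stdio.h>

  #define NR_SUBPAGES 512 /* 2MB / 4KB */
  #define STRIDE      8   /* hvo_order_size(9) / PAGE_SIZE */

  /* one counter per group; ignoring the -1 bias of _mapcount */
  static int mapcount[NR_SUBPAGES / STRIDE];

  static void map_subpage(int i)
  {
          mapcount[i / STRIDE]++; /* 8 subpages share one counter */
  }

  int main(void)
  {
          int i;

          for (i = 0; i < 8; i++) /* map subpages 0..7 once each */
                  map_subpage(i);
          for (i = 0; i < 8; i++) /* map subpage 8 eight times */
                  map_subpage(8);

          /* both groups end up with the same counter value, 8 */
          printf("group 0: %d, group 1: %d\n", mapcount[0], mapcount[1]);
          return 0;
  }

If that matches your intent, then the two cases above are
indistinguishable by design and a per-subpage mapcount is only ever an
estimate?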
>
> Signed-off-by: Yu Zhao <yuzhao@xxxxxxxxxx>
> ---
>  include/linux/mm.h     | 140 ++++++++++++++++++++++++++++++++++++++
>  include/linux/mmzone.h |   1 +
>  include/linux/rmap.h   |   4 ++
>  init/main.c            |   1 +
>  mm/gup.c               |   3 +-
>  mm/huge_memory.c       |   2 +
>  mm/hugetlb_vmemmap.c   |   2 +-
>  mm/internal.h          |   9 ---
>  mm/memory.c            |  11 +--
>  mm/page_alloc.c        | 151 ++++++++++++++++++++++++++++++++++++++++-
>  mm/rmap.c              |  17 ++++-
>  mm/vmstat.c            |   2 +
>  12 files changed, 323 insertions(+), 20 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index f5a97dec5169..d7014fc35cca 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1196,6 +1196,138 @@ static inline void page_mapcount_reset(struct page *page)
>          atomic_set(&(page)->_mapcount, -1);
>  }
>
> +#define HVO_MOD (PAGE_SIZE / sizeof(struct page))
> +
> +static inline int hvo_order_size(int order)
> +{
> +        if (PAGE_SIZE % sizeof(struct page) || !is_power_of_2(HVO_MOD))
> +                return 0;
> +
> +        return (1 << order) * sizeof(struct page);
> +}
> +
> +static inline bool page_hvo_suitable(struct page *head, int order)
> +{
> +        VM_WARN_ON_ONCE_PAGE(!test_bit(PG_head, &head->flags), head);
> +
> +        if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key))
> +                return false;
> +
> +        return page_zonenum(head) == ZONE_NOMERGE &&
> +               IS_ALIGNED((unsigned long)head, PAGE_SIZE) &&
> +               hvo_order_size(order) > PAGE_SIZE;
> +}
> +
> +static inline bool folio_hvo_suitable(struct folio *folio)
> +{
> +        return folio_test_large(folio) && page_hvo_suitable(&folio->page, folio_order(folio));
> +}
> +
> +static inline bool page_is_hvo(struct page *head, int order)
> +{
> +        return page_hvo_suitable(head, order) && test_bit(PG_head, &head[HVO_MOD].flags);
> +}
> +
> +static inline bool folio_is_hvo(struct folio *folio)
> +{
> +        return folio_test_large(folio) && page_is_hvo(&folio->page, folio_order(folio));
> +}
> +
> +/*
> + * If a 16GB hugetlb folio were mapped by PTEs of all of its 4kB pages,
> + * its nr_pages_mapped would be 0x400000: choose the ENTIRELY_MAPPED bit
> + * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE). Hugetlb currently
> + * leaves nr_pages_mapped at 0, but avoid surprise if it participates later.
> + */
> +#define ENTIRELY_MAPPED         0x800000
> +#define FOLIO_PAGES_MAPPED      (ENTIRELY_MAPPED - 1)
> +
> +static inline int hvo_range_mapcount(struct folio *folio, struct page *page, int nr_pages, int *ret)
> +{
> +        int i, next, end;
> +        int stride = hvo_order_size(folio_order(folio)) / PAGE_SIZE;
> +
> +        if (!folio_is_hvo(folio))
> +                return false;
> +
> +        *ret = folio_entire_mapcount(folio);
> +
> +        for (i = folio_page_idx(folio, page), end = i + nr_pages; i != end; i = next) {
> +                next = min(end, round_down(i + stride, stride));
> +
> +                page = folio_page(folio, i / stride);
> +                *ret += atomic_read(&page->_mapcount) + 1;
> +        }
> +
> +        return true;
> +}
> +
> +static inline bool hvo_map_range(struct folio *folio, struct page *page, int nr_pages, int *ret)
> +{
> +        int i, next, end;
> +        int stride = hvo_order_size(folio_order(folio)) / PAGE_SIZE;
> +
> +        if (!folio_is_hvo(folio))
> +                return false;
> +
> +        *ret = 0;
> +
> +        for (i = folio_page_idx(folio, page), end = i + nr_pages; i != end; i = next) {
> +                next = min(end, round_down(i + stride, stride));
> +
> +                page = folio_page(folio, i / stride);
> +                if (atomic_add_return(next - i, &page->_mapcount) == next - i - 1)
> +                        *ret += stride;
> +        }
> +
> +        if (atomic_add_return(*ret, &folio->_nr_pages_mapped) >= ENTIRELY_MAPPED)
> +                *ret = 0;
> +
> +        return true;
> +}
> +
> +static inline bool hvo_unmap_range(struct folio *folio, struct page *page, int nr_pages, int *ret)
> +{
> +        int i, next, end;
> +        int stride = hvo_order_size(folio_order(folio)) / PAGE_SIZE;
> +
> +        if (!folio_is_hvo(folio))
> +                return false;
> +
> +        *ret = 0;
> +
> +        for (i = folio_page_idx(folio, page), end = i + nr_pages; i != end; i = next) {
> +                next = min(end, round_down(i + stride, stride));
> +
> +                page = folio_page(folio, i / stride);
> +                if (atomic_sub_return(next - i, &page->_mapcount) == -1)
> +                        *ret += stride;
> +        }
> +
> +        if (atomic_sub_return(*ret, &folio->_nr_pages_mapped) >= ENTIRELY_MAPPED)
> +                *ret = 0;
> +
> +        return true;
> +}
> +
> +static inline bool hvo_dup_range(struct folio *folio, struct page *page, int nr_pages)
> +{
> +        int i, next, end;
> +        int stride = hvo_order_size(folio_order(folio)) / PAGE_SIZE;
> +
> +        if (!folio_is_hvo(folio))
> +                return false;
> +
> +        for (i = folio_page_idx(folio, page), end = i + nr_pages; i != end; i = next) {
> +                next = min(end, round_down(i + stride, stride));
> +
> +                page = folio_page(folio, i / stride);
> +                atomic_add(next - i, &page->_mapcount);
> +        }
> +
> +        return true;
> +}
> +
>  /**
>   * page_mapcount() - Number of times this precise page is mapped.
>   * @page: The page.
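Also, if I follow hvo_map_range() above: PTE-mapping a single 4KB
subpage of a previously unmapped group takes that group's _mapcount
from -1 to 0, so *ret is bumped by a whole stride, e.g.

  hvo_map_range(folio, page, 1, &nr)   ->  nr == 8
  folio->_nr_pages_mapped              +=  8

and IIUC the caller then adjusts NR_ANON_MAPPED and friends by 8 even
though only one page was actually mapped. Is that the "estimation ...
by design" mentioned in the commit message, or am I misreading the
accounting?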
> @@ -1212,6 +1344,9 @@ static inline int page_mapcount(struct page *page)
>  {
>          int mapcount = atomic_read(&page->_mapcount) + 1;
>
> +        if (hvo_range_mapcount(page_folio(page), page, 1, &mapcount))
> +                return mapcount;
> +
>          if (unlikely(PageCompound(page)))
>                  mapcount += folio_entire_mapcount(page_folio(page));
>
> @@ -3094,6 +3229,11 @@ static inline void pagetable_pud_dtor(struct ptdesc *ptdesc)
>
>  extern void __init pagecache_init(void);
>  extern void free_initmem(void);
> +extern void free_vmemmap(void);
> +extern int vmemmap_remap_free(unsigned long start, unsigned long end,
> +                              unsigned long reuse,
> +                              struct list_head *vmemmap_pages,
> +                              unsigned long flags);
>
>  /*
>   * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK)
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 532218167bba..00e4bb6c8533 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -916,6 +916,7 @@ struct zone {
>  #ifdef CONFIG_CMA
>          unsigned long cma_pages;
>  #endif
> +        atomic_long_t hvo_freed;
>
>          const char *name;
>
> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> index b7944a833668..d058c4cb3c96 100644
> --- a/include/linux/rmap.h
> +++ b/include/linux/rmap.h
> @@ -322,6 +322,8 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio,
>
>          switch (level) {
>          case RMAP_LEVEL_PTE:
> +                if (hvo_dup_range(folio, page, nr_pages))
> +                        break;
>                  do {
>                          atomic_inc(&page->_mapcount);
>                  } while (page++, --nr_pages > 0);
> @@ -401,6 +403,8 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
>                          if (PageAnonExclusive(page + i))
>                                  return -EBUSY;
>                  }
> +                if (hvo_dup_range(folio, page, nr_pages))
> +                        break;
>                  do {
>                          if (PageAnonExclusive(page))
>                                  ClearPageAnonExclusive(page);
> diff --git a/init/main.c b/init/main.c
> index e24b0780fdff..74003495db32 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -1448,6 +1448,7 @@ static int __ref kernel_init(void *unused)
>          kgdb_free_init_mem();
>          exit_boot_config();
>          free_initmem();
> +        free_vmemmap();
>          mark_readonly();
>
>          /*
> diff --git a/mm/gup.c b/mm/gup.c
> index df83182ec72d..f3df0078505b 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -57,7 +57,7 @@ static inline void sanity_check_pinned_pages(struct page **pages,
>                          continue;
>                  if (!folio_test_large(folio) || folio_test_hugetlb(folio))
>                          VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page), page);
> -                else
> +                else if (!folio_is_hvo(folio) || !folio_nr_pages_mapped(folio))
>                          /* Either a PTE-mapped or a PMD-mapped THP. */
>                          VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page) &&
>                                         !PageAnonExclusive(page), page);
> @@ -645,6 +645,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
>          }
>
>          VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
> +                       !folio_is_hvo(page_folio(page)) &&
>                         !PageAnonExclusive(page), page);
>
>          /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 62d2254bc51c..9e7e5d587a5c 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2535,6 +2535,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>                   *
>                   * See folio_try_share_anon_rmap_pmd(): invalidate PMD first.
>                   */
> +                if (folio_is_hvo(folio))
> +                        ClearPageAnonExclusive(page);
>                  anon_exclusive = PageAnonExclusive(page);
>                  if (freeze && anon_exclusive &&
>                      folio_try_share_anon_rmap_pmd(folio, page))
> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> index da177e49d956..9f43d900e83c 100644
> --- a/mm/hugetlb_vmemmap.c
> +++ b/mm/hugetlb_vmemmap.c
> @@ -310,7 +310,7 @@ static int vmemmap_remap_split(unsigned long start, unsigned long end,
>   *
>   * Return: %0 on success, negative error code otherwise.
>   */
> -static int vmemmap_remap_free(unsigned long start, unsigned long end,
> +int vmemmap_remap_free(unsigned long start, unsigned long end,
>                                unsigned long reuse,
>                                struct list_head *vmemmap_pages,
>                                unsigned long flags)
> diff --git a/mm/internal.h b/mm/internal.h
> index ac1d27468899..871c6eeb78b8 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -52,15 +52,6 @@ struct folio_batch;
>
>  void page_writeback_init(void);
>
> -/*
> - * If a 16GB hugetlb folio were mapped by PTEs of all of its 4kB pages,
> - * its nr_pages_mapped would be 0x400000: choose the ENTIRELY_MAPPED bit
> - * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE). Hugetlb currently
> - * leaves nr_pages_mapped at 0, but avoid surprise if it participates later.
> - */
> -#define ENTIRELY_MAPPED         0x800000
> -#define FOLIO_PAGES_MAPPED      (ENTIRELY_MAPPED - 1)
> -
>  /*
>   * Flags passed to __show_mem() and show_free_areas() to suppress output in
>   * various contexts.
> diff --git a/mm/memory.c b/mm/memory.c
> index 0bfc8b007c01..db389f1d776d 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3047,8 +3047,8 @@ static inline void wp_page_reuse(struct vm_fault *vmf, struct folio *folio)
>          VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE));
>
>          if (folio) {
> -                VM_BUG_ON(folio_test_anon(folio) &&
> -                          !PageAnonExclusive(vmf->page));
> +                VM_BUG_ON_PAGE(folio_test_anon(folio) && !folio_is_hvo(folio) &&
> +                               !PageAnonExclusive(vmf->page), vmf->page);
>                  /*
>                   * Clear the folio's cpupid information as the existing
>                   * information potentially belongs to a now completely
> @@ -3502,7 +3502,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
>           */
>          if (folio && folio_test_anon(folio) &&
>              (PageAnonExclusive(vmf->page) || wp_can_reuse_anon_folio(folio, vma))) {
> -                if (!PageAnonExclusive(vmf->page))
> +                if (!folio_is_hvo(folio) && !PageAnonExclusive(vmf->page))
>                          SetPageAnonExclusive(vmf->page);
>                  if (unlikely(unshare)) {
>                          pte_unmap_unlock(vmf->pte, vmf->ptl);
> @@ -4100,8 +4100,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>                                          rmap_flags);
>          }
>
> -        VM_BUG_ON(!folio_test_anon(folio) ||
> -                  (pte_write(pte) && !PageAnonExclusive(page)));
> +        VM_BUG_ON_PAGE(!folio_test_anon(folio) ||
> +                       (pte_write(pte) && !folio_is_hvo(folio) && !PageAnonExclusive(page)),
> +                       page);
>          set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
>          arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index dd843fb04f78..5f8c6583a191 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -53,6 +53,7 @@
>  #include <linux/khugepaged.h>
>  #include <linux/delayacct.h>
>  #include <linux/cacheinfo.h>
> +#include <linux/bootmem_info.h>
>  #include <asm/div64.h>
>  #include "internal.h"
>  #include "shuffle.h"
> @@ -585,6 +586,10 @@ void prep_compound_page(struct page *page, unsigned int order)
>          int nr_pages = 1 << order;
>
>          __SetPageHead(page);
> +
> +        if (page_is_hvo(page, order))
> +                nr_pages = HVO_MOD;
> +
>          for (i = 1; i < nr_pages; i++)
>                  prep_compound_tail(page, i);
>
> @@ -1124,10 +1129,15 @@ static __always_inline bool free_pages_prepare(struct page *page,
>           */
>          if (unlikely(order)) {
>                  int i;
> +                int nr_pages = 1 << order;
>
> -                if (compound)
> +                if (compound) {
> +                        if (page_is_hvo(page, order))
> +                                nr_pages = HVO_MOD;
>                          page[1].flags &= ~PAGE_FLAGS_SECOND;
> -                for (i = 1; i < (1 << order); i++) {
> +                }
> +
> +                for (i = 1; i < nr_pages; i++) {
>                          if (compound)
>                                  bad += free_tail_page_prepare(page, page + i);
>                          if (is_check_pages_enabled()) {
> @@ -1547,6 +1557,141 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
>          page_table_check_alloc(page, order);
>  }
>
> +static void prep_hvo_page(struct page *head, int order)
> +{
> +        LIST_HEAD(list);
> +        struct page *page, *next;
> +        int freed = 0;
> +        unsigned long start = (unsigned long)head;
> +        unsigned long end = start + hvo_order_size(order);
> +
> +        if (page_zonenum(head) != ZONE_NOMERGE)
> +                return;
> +
> +        if (WARN_ON_ONCE(order != page_zone(head)->order)) {
> +                bad_page(head, "invalid page order");
> +                return;
> +        }
> +
> +        if (!page_hvo_suitable(head, order) || page_is_hvo(head, order))
> +                return;
> +
> +        vmemmap_remap_free(start + PAGE_SIZE, end, start, &list, 0);
> +
> +        list_for_each_entry_safe(page, next, &list, lru) {
> +                if (PageReserved(page))
> +                        free_bootmem_page(page);
> +                else
> +                        __free_page(page);
> +                freed++;
> +        }
> +
> +        atomic_long_add(freed, &page_zone(head)->hvo_freed);
> +}
> +
> +static void prep_nomerge_zone(struct zone *zone, enum migratetype type)
> +{
> +        int order;
> +        unsigned long flags;
> +
> +        spin_lock_irqsave(&zone->lock, flags);
> +
> +        for (order = MAX_PAGE_ORDER; order > zone->order; order--) {
> +                struct page *page;
> +                int split = 0;
> +                struct free_area *area = zone->free_area + order;
> +
> +                while ((page = get_page_from_free_area(area, type))) {
> +                        del_page_from_free_list(page, zone, order);
> +                        expand(zone, page, zone->order, order, type);
> +                        set_buddy_order(page, zone->order);
> +                        add_to_free_list(page, zone, zone->order, type);
> +                        split++;
> +                }
> +
> +                pr_info(" HVO: order %d split %d\n", order, split);
> +        }
> +
> +        spin_unlock_irqrestore(&zone->lock, flags);
> +}
> +
> +static void hvo_nomerge_zone(struct zone *zone, enum migratetype type)
> +{
> +        LIST_HEAD(old);
> +        LIST_HEAD(new);
> +        int nomem, freed;
> +        unsigned long flags;
> +        struct list_head list;
> +        struct page *page, *next;
> +        struct free_area *area = zone->free_area + zone->order;
> +again:
> +        nomem = freed = 0;
> +        INIT_LIST_HEAD(&list);
> +
> +        spin_lock_irqsave(&zone->lock, flags);
> +        list_splice_init(area->free_list + type, &old);
> +        spin_unlock_irqrestore(&zone->lock, flags);
> +
> +        list_for_each_entry_safe(page, next, &old, buddy_list) {
> +                unsigned long start = (unsigned long)page;
> +                unsigned long end = start + hvo_order_size(zone->order);
> +
> +                if (WARN_ON_ONCE(!IS_ALIGNED(start, PAGE_SIZE)))
> +                        continue;
> +
> +                if (vmemmap_remap_free(start + PAGE_SIZE, end, start, &list, 0))
> +                        nomem++;
> +        }
> +
> +        list_for_each_entry_safe(page, next, &list, lru) {
> +                if (PageReserved(page))
> +                        free_bootmem_page(page);
> +                else
> +                        __free_page(page);
> +                freed++;
> +        }
> +
> +        list_splice_init(&old, &new);
> +        atomic_long_add(freed, &zone->hvo_freed);
> +
> +        pr_info(" HVO: nomem %d freed %d\n", nomem, freed);
> +
> +        if (!list_empty(area->free_list + type))
> +                goto again;
> +
> +        spin_lock_irqsave(&zone->lock, flags);
> +        list_splice(&new, area->free_list + type);
> +        spin_unlock_irqrestore(&zone->lock, flags);
> +}
> +
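Just to check my math here against the "60% * 7/8 ~= 50%" figure above
(assuming 4KB base pages and a 64-byte struct page, i.e. HVO_MOD == 64,
and that I'm reading vmemmap_remap_free(start + PAGE_SIZE, end, start,
...) right):

  vmemmap per 2MB THP:  512 * 64B = 32KB = 8 pages
  kept by HVO:          1 page (the one holding the head and the other
                        63 struct pages used as group counters)
  freed by HVO:         7 pages, i.e. 7/8 of the struct page overhead

  with 60% of memory in such THPs: 60% * 7/8 ~= 52.5%, i.e. roughly
  half of the total struct page overhead, matching the cover letter.

So hvo_freed should come out to about 7 pages per 2MB block in
ZONE_NOMERGE?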
> +static bool zone_hvo_suitable(struct zone *zone)
> +{
> +        if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key))
> +                return false;
> +
> +        return zone_idx(zone) == ZONE_NOMERGE && hvo_order_size(zone->order) > PAGE_SIZE;
> +}
> +
> +void free_vmemmap(void)
> +{
> +        struct zone *zone;
> +
> +        static_branch_inc(&hugetlb_optimize_vmemmap_key);
> +
> +        for_each_populated_zone(zone) {
> +                if (!zone_hvo_suitable(zone))
> +                        continue;
> +
> +                pr_info("Freeing vmemmap of node %d zone %s\n",
> +                        zone_to_nid(zone), zone->name);
> +
> +                prep_nomerge_zone(zone, MIGRATE_MOVABLE);
> +                hvo_nomerge_zone(zone, MIGRATE_MOVABLE);
> +
> +                cond_resched();
> +        }
> +}
> +
>  static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
>                                                          unsigned int alloc_flags)
>  {
> @@ -1565,6 +1710,8 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags
>                  set_page_pfmemalloc(page);
>          else
>                  clear_page_pfmemalloc(page);
> +
> +        prep_hvo_page(page, order);
>  }
>
>  /*
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 0ddb28c52961..d339bf489230 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1143,6 +1143,10 @@ int folio_total_mapcount(struct folio *folio)
>          /* In the common case, avoid the loop when no pages mapped by PTE */
>          if (folio_nr_pages_mapped(folio) == 0)
>                  return mapcount;
> +
> +        if (hvo_range_mapcount(folio, &folio->page, folio_nr_pages(folio), &mapcount))
> +                return mapcount;
> +
>          /*
>           * Add all the PTE mappings of those pages mapped by PTE.
>           * Limit the loop to folio_nr_pages_mapped()?
> @@ -1168,6 +1172,8 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
>
>          switch (level) {
>          case RMAP_LEVEL_PTE:
> +                if (hvo_map_range(folio, page, nr_pages, &nr))
> +                        break;
>                  do {
>                          first = atomic_inc_and_test(&page->_mapcount);
>                          if (first && folio_test_large(folio)) {
> @@ -1314,6 +1320,8 @@ static __always_inline void __folio_add_anon_rmap(struct folio *folio,
>          if (flags & RMAP_EXCLUSIVE) {
>                  switch (level) {
>                  case RMAP_LEVEL_PTE:
> +                        if (folio_is_hvo(folio))
> +                                break;
>                          for (i = 0; i < nr_pages; i++)
>                                  SetPageAnonExclusive(page + i);
>                          break;
> @@ -1421,6 +1429,9 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
>          } else if (!folio_test_pmd_mappable(folio)) {
>                  int i;
>
> +                if (hvo_map_range(folio, &folio->page, nr, &nr))
> +                        goto done;
> +
>                  for (i = 0; i < nr; i++) {
>                          struct page *page = folio_page(folio, i);
>
> @@ -1437,7 +1448,7 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
>                  SetPageAnonExclusive(&folio->page);
>                  __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr);
>          }
> -
> +done:
>          __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr);
>  }
>
> @@ -1510,6 +1521,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
>
>          switch (level) {
>          case RMAP_LEVEL_PTE:
> +                if (hvo_unmap_range(folio, page, nr_pages, &nr))
> +                        break;
>                  do {
>                          last = atomic_add_negative(-1, &page->_mapcount);
>                          if (last && folio_test_large(folio)) {
> @@ -2212,7 +2225,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
>                                  break;
>                          }
>                          VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) &&
> -                                       !anon_exclusive, subpage);
> +                                       !folio_is_hvo(folio) && !anon_exclusive, subpage);
>
>                          /* See folio_try_share_anon_rmap_pte(): clear PTE first.
>                           */
>                          if (folio_test_hugetlb(folio)) {
> diff --git a/mm/vmstat.c b/mm/vmstat.c
> index ff2114452334..f51f3b872270 100644
> --- a/mm/vmstat.c
> +++ b/mm/vmstat.c
> @@ -1704,6 +1704,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
>                     "\n        present  %lu"
>                     "\n        managed  %lu"
>                     "\n        cma      %lu"
> +                   "\n        hvo freed %lu"
>                     "\n        order    %u",
>                     zone_page_state(zone, NR_FREE_PAGES),
>                     zone->watermark_boost,
> @@ -1714,6 +1715,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
>                     zone->present_pages,
>                     zone_managed_pages(zone),
>                     zone_cma_pages(zone),
> +                   atomic_long_read(&zone->hvo_freed),
>                     zone->order);
>
>          seq_printf(m,
> --
> 2.44.0.rc1.240.g4c46232300-goog
>
>