On 05/10/2023 13:05, Daniel Gomez wrote:
> On Fri, Sep 29, 2023 at 12:44:16PM +0100, Ryan Roberts wrote:
>
> Hi Ryan,
>> Introduce the logic to allow THP to be configured (through the new
>> anon_orders interface we just added) to allocate large folios to back
>> anonymous memory, which are smaller than PMD-size (for example order-2,
>> order-3, order-4, etc).
>>
>> These THPs continue to be PTE-mapped, but in many cases can still
>> provide similar benefits to traditional PMD-sized THP: Page faults are
>> significantly reduced (by a factor of e.g. 4, 8, 16, etc. depending on
>> the configured order), but latency spikes are much less prominent
>> because the size of each page isn't as huge as the PMD-sized variant and
>> there is less memory to clear in each page fault. The number of per-page
>> operations (e.g. ref counting, rmap management, lru list management) is
>> also significantly reduced since those ops now become per-folio.
>>
>> Some architectures also employ TLB compression mechanisms to squeeze
>> more entries in when a set of PTEs are virtually and physically
>> contiguous and appropriately aligned. In this case, TLB misses will
>> occur less often.
>>
>> The new behaviour is disabled by default because anon_orders defaults
>> to only enabling PMD-order, but can be enabled at runtime by writing to
>> anon_orders (see documentation in previous commit). The long term aim
>> is to default anon_orders to include suitable lower orders, but there
>> are some risks around internal fragmentation that need to be better
>> understood first.
>>
>> Signed-off-by: Ryan Roberts <ryan.roberts@xxxxxxx>
>> ---
>>  Documentation/admin-guide/mm/transhuge.rst |   9 +-
>>  include/linux/huge_mm.h                    |   6 +-
>>  mm/memory.c                                | 108 +++++++++++++++++++--
>>  3 files changed, 111 insertions(+), 12 deletions(-)
>>
>> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
>> index 9f954e73a4ca..732c3b2f4ba8 100644
>> --- a/Documentation/admin-guide/mm/transhuge.rst
>> +++ b/Documentation/admin-guide/mm/transhuge.rst
>> @@ -353,7 +353,9 @@ anonymous transparent huge pages, it is necessary to read
>>  ``/proc/PID/smaps`` and count the AnonHugePages and AnonHugePteMap
>>  fields for each mapping. Note that in both cases, AnonHugePages refers
>>  only to PMD-mapped THPs. AnonHugePteMap refers to THPs that are mapped
>> -using PTEs.
>> +using PTEs. This includes all THPs whose order is smaller than
>> +PMD-order, as well as any PMD-order THPs that happen to be PTE-mapped
>> +for other reasons.
>>
>>  The number of file transparent huge pages mapped to userspace is available
>>  by reading ShmemPmdMapped and ShmemHugePages fields in ``/proc/meminfo``.
>> @@ -367,6 +369,11 @@ frequently will incur overhead.
>>  There are a number of counters in ``/proc/vmstat`` that may be used to
>>  monitor how successfully the system is providing huge pages for use.
>>
>> +.. note::
>> +   Currently the below counters only record events relating to
>> +   PMD-order THPs. Events relating to smaller order THPs are not
>> +   included.
>> +
>>  thp_fault_alloc
>>  	is incremented every time a huge page is successfully
>>  	allocated to handle a page fault.
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index 2e7c338229a6..c4860476a1f5 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -68,9 +68,11 @@ extern struct kobj_attribute shmem_enabled_attr;
>>  #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
>>
>>  /*
>> - * Mask of all large folio orders supported for anonymous THP.
>> + * Mask of all large folio orders supported for anonymous THP; all orders up to
>> + * and including PMD_ORDER, except order-0 (which is not "huge") and order-1
>> + * (which is a limitation of the THP implementation).
>>   */
>> -#define THP_ORDERS_ALL_ANON	BIT(PMD_ORDER)
>> +#define THP_ORDERS_ALL_ANON	((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
>>
>>  /*
>>   * Mask of all large folio orders supported for file THP.
>> diff --git a/mm/memory.c b/mm/memory.c
>> index b5b82fc8e164..92ed9c782dc9 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -4059,6 +4059,87 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>  	return ret;
>>  }
>>
>> +static bool vmf_pte_range_changed(struct vm_fault *vmf, int nr_pages)
>> +{
>> +	int i;
>> +
>> +	if (nr_pages == 1)
>> +		return vmf_pte_changed(vmf);
>> +
>> +	for (i = 0; i < nr_pages; i++) {
>> +		if (!pte_none(ptep_get_lockless(vmf->pte + i)))
>> +			return true;
>> +	}
>> +
>> +	return false;
>> +}
>> +
>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> +static struct folio *alloc_anon_folio(struct vm_fault *vmf)
>> +{
>> +	gfp_t gfp;
>> +	pte_t *pte;
>> +	unsigned long addr;
>> +	struct folio *folio;
>> +	struct vm_area_struct *vma = vmf->vma;
>> +	unsigned int orders;
>> +	int order;
>> +
>> +	/*
>> +	 * If uffd is active for the vma we need per-page fault fidelity to
>> +	 * maintain the uffd semantics.
>> +	 */
>> +	if (userfaultfd_armed(vma))
>> +		goto fallback;
>> +
>> +	/*
>> +	 * Get a list of all the (large) orders below PMD_ORDER that are enabled
>> +	 * for this vma. Then filter out the orders that can't be allocated over
>> +	 * the faulting address and still be fully contained in the vma.
>> +	 */
>> +	orders = hugepage_vma_check(vma, vma->vm_flags, false, true, true,
>> +				    BIT(PMD_ORDER) - 1);
>> +	orders = transhuge_vma_suitable(vma, vmf->address, orders);
>> +
>> +	if (!orders)
>> +		goto fallback;
>> +
>> +	pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
>> +	if (!pte)
>> +		return ERR_PTR(-EAGAIN);
>> +
>> +	order = first_order(orders);
>> +	while (orders) {
>> +		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>> +		vmf->pte = pte + pte_index(addr);
>> +		if (!vmf_pte_range_changed(vmf, 1 << order))
>> +			break;
>> +		order = next_order(&orders, order);
>> +	}
>> +
>> +	vmf->pte = NULL;
>> +	pte_unmap(pte);
>> +
>> +	gfp = vma_thp_gfp_mask(vma);
>> +
>> +	while (orders) {
>> +		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>> +		folio = vma_alloc_folio(gfp, order, vma, addr, true);
>
> I was checking your series and noticed the hugepage flag. I think
> you've changed it from v1 -> v2 from being false to true when orders >= 2,
> but I'm not sure about the reasoning. Is this because of your statement
> in the cover letter [1]?

That hugepage flag is spec'ed as follows:

 * @hugepage: For hugepages try only the preferred node if possible.

The intent of passing true for orders higher than 0 is that we would
prefer to allocate a smaller order folio that is on the preferred node
than a higher order folio that is not on the preferred node. The
assumption is that the ongoing cost of accessing the memory on the
non-preferred node will outweigh the benefit of allocating it as a
high order folio.
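To make that concrete, below is a minimal userspace model of the second
loop in alloc_anon_folio() above. It is not the kernel code:
try_alloc_on_preferred_node() is a made-up stand-in for
vma_alloc_folio(gfp, order, vma, addr, true), and the first_order() /
next_order() helpers only encode the assumed "highest enabled order
first, then next lower" semantics of the real helpers in the series.
The point is just that a failed attempt at a high order drops to a
smaller enabled order rather than taking a large folio off the
preferred node.

/*
 * Minimal userspace model of the order-fallback behaviour described
 * above. NOT kernel code: try_alloc_on_preferred_node() is hypothetical.
 */
#include <stdbool.h>
#include <stdio.h>

#define BIT(n) (1UL << (n))

/* Pretend only order-3 and below can be satisfied on the preferred node. */
static bool try_alloc_on_preferred_node(int order)
{
	return order <= 3;
}

/* Highest set bit in the enabled-orders mask. */
static int first_order(unsigned long orders)
{
	int order;

	for (order = 63; order >= 0; order--)
		if (orders & BIT(order))
			return order;
	return -1;
}

/* Clear the failed order and move to the next largest enabled one. */
static int next_order(unsigned long *orders, int order)
{
	*orders &= ~BIT(order);
	return first_order(*orders);
}

int main(void)
{
	unsigned long orders = BIT(4) | BIT(3) | BIT(2);	/* enabled orders */
	int order = first_order(orders);

	while (order >= 0) {
		if (try_alloc_on_preferred_node(order)) {
			printf("allocated order-%d on the preferred node\n", order);
			return 0;
		}
		order = next_order(&orders, order);
	}

	printf("fell back to a single order-0 page\n");
	return 0;
}

With orders {4, 3, 2} enabled and order-4 unavailable on the preferred
node, this prints "allocated order-3 on the preferred node", i.e. the
smaller on-node folio wins over a bigger off-node one.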
Thanks,
Ryan

>
> [1] cover letter snippet:
>
> "to implement variable order, large folios for anonymous memory.
> (previously called ..., but now exposed as an extension to THP;
> "small-order THP")"
>
> Thanks,
> Daniel
>
>> +		if (folio) {
>> +			clear_huge_page(&folio->page, addr, 1 << order);
>> +			return folio;
>> +		}
>> +		order = next_order(&orders, order);
>> +	}
>> +
>> +fallback:
>> +	return vma_alloc_zeroed_movable_folio(vma, vmf->address);
>> +}
>> +#else
>> +#define alloc_anon_folio(vmf) \
>> +		vma_alloc_zeroed_movable_folio((vmf)->vma, (vmf)->address)
>> +#endif
>> +
>>  /*
>>   * We enter with non-exclusive mmap_lock (to exclude vma changes,
>>   * but allow concurrent faults), and pte mapped but not yet locked.
>> @@ -4066,6 +4147,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>   */
>>  static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>>  {
>> +	int i;
>> +	int nr_pages = 1;
>> +	unsigned long addr = vmf->address;
>>  	bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
>>  	struct vm_area_struct *vma = vmf->vma;
>>  	struct folio *folio;
>> @@ -4110,10 +4194,15 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>>  	/* Allocate our own private page. */
>>  	if (unlikely(anon_vma_prepare(vma)))
>>  		goto oom;
>> -	folio = vma_alloc_zeroed_movable_folio(vma, vmf->address);
>> +	folio = alloc_anon_folio(vmf);
>> +	if (IS_ERR(folio))
>> +		return 0;
>>  	if (!folio)
>>  		goto oom;
>>
>> +	nr_pages = folio_nr_pages(folio);
>> +	addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
>> +
>>  	if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
>>  		goto oom_free_page;
>>  	folio_throttle_swaprate(folio, GFP_KERNEL);
>> @@ -4130,12 +4219,12 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>>  	if (vma->vm_flags & VM_WRITE)
>>  		entry = pte_mkwrite(pte_mkdirty(entry), vma);
>>
>> -	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
>> -			&vmf->ptl);
>> +	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
>>  	if (!vmf->pte)
>>  		goto release;
>> -	if (vmf_pte_changed(vmf)) {
>> -		update_mmu_tlb(vma, vmf->address, vmf->pte);
>> +	if (vmf_pte_range_changed(vmf, nr_pages)) {
>> +		for (i = 0; i < nr_pages; i++)
>> +			update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i);
>>  		goto release;
>>  	}
>>
>> @@ -4150,16 +4239,17 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>>  		return handle_userfault(vmf, VM_UFFD_MISSING);
>>  	}
>>
>> -	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
>> -	folio_add_new_anon_rmap(folio, vma, vmf->address);
>> +	folio_ref_add(folio, nr_pages - 1);
>> +	add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
>> +	folio_add_new_anon_rmap(folio, vma, addr);
>>  	folio_add_lru_vma(folio, vma);
>>  setpte:
>>  	if (uffd_wp)
>>  		entry = pte_mkuffd_wp(entry);
>> -	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
>> +	set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages);
>>
>>  	/* No need to invalidate - it was non-present before */
>> -	update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
>> +	update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages);
>>  unlock:
>>  	if (vmf->pte)
>>  		pte_unmap_unlock(vmf->pte, vmf->ptl);
>> --
>> 2.25.1
>>
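As a side note on the THP_ORDERS_ALL_ANON change in the huge_mm.h hunk
above: assuming a 4K base page size where PMD_ORDER == 9 (the PMD_ORDER
value here is an assumption, not taken from the patch), the new mask
enables every order from 2 up to and including 9. A quick userspace
check of the same expression:

#include <stdio.h>

#define BIT(n)		(1UL << (n))
#define PMD_ORDER	9	/* assumption: 4K base pages, 2M PMD */

/* Same expression as the patch: orders 2..PMD_ORDER inclusive. */
#define THP_ORDERS_ALL_ANON	((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))

int main(void)
{
	unsigned long orders = THP_ORDERS_ALL_ANON;
	int order;

	printf("mask = %#lx\n", orders);	/* prints 0x3fc */
	for (order = 0; order <= PMD_ORDER; order++)
		if (orders & BIT(order))
			printf("order-%d -> %lu KiB folio\n", order, 4UL << order);
	return 0;
}

That is 16 KiB folios up to the 2 MiB PMD size, with order-0 and
order-1 excluded as the comment in the patch explains.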