Hi Baolin, On Mon, May 13, 2024 at 1:08 PM Baolin Wang <baolin.wang@xxxxxxxxxxxxxxxxx> wrote: > > Commit 19eaf44954df adds multi-size THP (mTHP) for anonymous pages, which > can allow THP to be configured through the sysfs interface located at > '/sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled'. > > However, the anonymous shared pages will ignore the anonymous mTHP rule > configured through the sysfs interface, and can only use the PMD-mapped > THP, which is not reasonable. Users expect to apply the mTHP rule for > all anonymous pages, including the anonymous shared pages, in order to > enjoy the benefits of mTHP. For example, lower latency than PMD-mapped THP, > smaller memory bloat than PMD-mapped THP, contiguous PTEs on ARM architecture > to reduce TLB misses, etc. > > The primary strategy is similar to supporting anonymous mTHP. Introduce > a new interface '/sys/kernel/mm/transparent_hugepage/hugepage-XXkb/shmem_enabled', > which can have all the same values as the top-level > '/sys/kernel/mm/transparent_hugepage/shmem_enabled', while adding an > additional "inherit" option. By default all sizes will be set to "never" > except PMD size, which is set to "inherit". This ensures backward compatibility > with the top-level anonymous shmem enabled setting, while also allowing > independent control of the anonymous shmem enabled setting for each mTHP size. 
> > Signed-off-by: Baolin Wang <baolin.wang@xxxxxxxxxxxxxxxxx> > --- > include/linux/huge_mm.h | 10 +++ > mm/shmem.c | 179 +++++++++++++++++++++++++++++++++------- > 2 files changed, 161 insertions(+), 28 deletions(-) > > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h > index 1fce6fee7766..b5339210268d 100644 > --- a/include/linux/huge_mm.h > +++ b/include/linux/huge_mm.h > @@ -583,6 +583,16 @@ static inline bool thp_migration_supported(void) > { > return false; > } > + > +static inline int highest_order(unsigned long orders) > +{ > + return 0; > +} > + > +static inline int next_order(unsigned long *orders, int prev) > +{ > + return 0; > +} > #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ > > static inline int split_folio_to_list_to_order(struct folio *folio, > diff --git a/mm/shmem.c b/mm/shmem.c > index 59cc26d44344..b50ddf013e37 100644 > --- a/mm/shmem.c > +++ b/mm/shmem.c > @@ -1611,6 +1611,106 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) > return result; > } > > +#ifdef CONFIG_TRANSPARENT_HUGEPAGE > +static unsigned long anon_shmem_allowable_huge_orders(struct inode *inode, > + struct vm_area_struct *vma, pgoff_t index, > + bool global_huge) > +{ > + unsigned long mask = READ_ONCE(huge_anon_shmem_orders_always); > + unsigned long within_size_orders = READ_ONCE(huge_anon_shmem_orders_within_size); > + unsigned long vm_flags = vma->vm_flags; > + /* > + * Check all the (large) orders below HPAGE_PMD_ORDER + 1 that > + * are enabled for this vma. > + */ > + unsigned long orders = BIT(PMD_ORDER + 1) - 1; > + loff_t i_size; > + int order; > + > + if ((vm_flags & VM_NOHUGEPAGE) || > + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) > + return 0; > + > + /* If the hardware/firmware marked hugepage support disabled. */ > + if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED)) > + return 0; > + > + /* > + * Following the 'deny' semantics of the top level, force the huge > + * option off from all mounts. 
> + */ > + if (shmem_huge == SHMEM_HUGE_DENY) > + return 0; > + /* > + * Only allow inherit orders if the top-level value is 'force', which > + * means non-PMD sized THP can not override 'huge' mount option now. > + */ > + if (shmem_huge == SHMEM_HUGE_FORCE) > + return READ_ONCE(huge_anon_shmem_orders_inherit); > + > + /* Allow mTHP that will be fully within i_size. */ > + order = highest_order(within_size_orders); > + while (within_size_orders) { > + index = round_up(index + 1, order); > + i_size = round_up(i_size_read(inode), PAGE_SIZE); > + if (i_size >> PAGE_SHIFT >= index) { > + mask |= within_size_orders; > + break; > + } > + > + order = next_order(&within_size_orders, order); > + } > + > + if (vm_flags & VM_HUGEPAGE) > + mask |= READ_ONCE(huge_anon_shmem_orders_madvise); > + > + if (global_huge) > + mask |= READ_ONCE(huge_anon_shmem_orders_inherit); > + > + return orders & mask; > +} > + > +static unsigned long anon_shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf, > + struct address_space *mapping, pgoff_t index, > + unsigned long orders) > +{ > + struct vm_area_struct *vma = vmf->vma; > + unsigned long pages; > + int order; > + > + orders = thp_vma_suitable_orders(vma, vmf->address, orders); > + if (!orders) > + return 0; > + > + /* Find the highest order that can add into the page cache */ > + order = highest_order(orders); > + while (orders) { > + pages = 1UL << order; > + index = round_down(index, pages); > + if (!xa_find(&mapping->i_pages, &index, > + index + pages - 1, XA_PRESENT)) > + break; > + order = next_order(&orders, order); > + } > + > + return orders; > +} > +#else > +static unsigned long anon_shmem_allowable_huge_orders(struct inode *inode, > + struct vm_area_struct *vma, pgoff_t index, > + bool global_huge) > +{ > + return 0; > +} > + > +static unsigned long anon_shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf, > + struct address_space *mapping, pgoff_t index, > + unsigned long orders) > +{ > + return 0; > +} 
> +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ > + > static struct folio *shmem_alloc_hugefolio(gfp_t gfp, > struct shmem_inode_info *info, pgoff_t index, int order) > { > @@ -1639,38 +1739,55 @@ static struct folio *shmem_alloc_folio(gfp_t gfp, > return (struct folio *)page; > } > > -static struct folio *shmem_alloc_and_add_folio(gfp_t gfp, > - struct inode *inode, pgoff_t index, > - struct mm_struct *fault_mm, bool huge) > +static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, > + gfp_t gfp, struct inode *inode, pgoff_t index, > + struct mm_struct *fault_mm, bool huge, unsigned long orders) IMO, it might be cleaner to drop the huge parameter and just set 'orders' as BIT(HPAGE_PMD_ORDER), then we only do the 'orders' check :) Likely: if (orders > 0) { if (vma && vma_is_anon_shmem(vma)) { ... } else if (orders & BIT(HPAGE_PMD_ORDER)) { ... } } > { > struct address_space *mapping = inode->i_mapping; > struct shmem_inode_info *info = SHMEM_I(inode); > - struct folio *folio; > + struct vm_area_struct *vma = vmf ? vmf->vma : NULL; > + unsigned long suitable_orders; > + struct folio *folio = NULL; > long pages; > - int error; > + int error, order; > > if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) > huge = false; Currently, if THP is disabled, 'huge' will fall back to order-0, but 'orders' does not, IIUC. How about we make both consistent if THP is disabled? if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { huge = false; orders = 0; } Thanks, Lance > > - if (huge) { > - pages = HPAGE_PMD_NR; > - index = round_down(index, HPAGE_PMD_NR); > + if (huge || orders > 0) { > + if (vma && vma_is_anon_shmem(vma) && orders) { > + suitable_orders = anon_shmem_suitable_orders(inode, vmf, > + mapping, index, orders); > + } else { > + pages = HPAGE_PMD_NR; > + suitable_orders = BIT(HPAGE_PMD_ORDER); > + index = round_down(index, HPAGE_PMD_NR); > > - /* > - * Check for conflict before waiting on a huge allocation. 
> - * Conflict might be that a huge page has just been allocated > - * and added to page cache by a racing thread, or that there > - * is already at least one small page in the huge extent. > - * Be careful to retry when appropriate, but not forever! > - * Elsewhere -EEXIST would be the right code, but not here. > - */ > - if (xa_find(&mapping->i_pages, &index, > + /* > + * Check for conflict before waiting on a huge allocation. > + * Conflict might be that a huge page has just been allocated > + * and added to page cache by a racing thread, or that there > + * is already at least one small page in the huge extent. > + * Be careful to retry when appropriate, but not forever! > + * Elsewhere -EEXIST would be the right code, but not here. > + */ > + if (xa_find(&mapping->i_pages, &index, > index + HPAGE_PMD_NR - 1, XA_PRESENT)) > - return ERR_PTR(-E2BIG); > + return ERR_PTR(-E2BIG); > + } > > - folio = shmem_alloc_hugefolio(gfp, info, index, HPAGE_PMD_ORDER); > - if (!folio && pages == HPAGE_PMD_NR) > - count_vm_event(THP_FILE_FALLBACK); > + order = highest_order(suitable_orders); > + while (suitable_orders) { > + pages = 1 << order; > + index = round_down(index, pages); > + folio = shmem_alloc_hugefolio(gfp, info, index, order); > + if (folio) > + goto allocated; > + > + if (pages == HPAGE_PMD_NR) > + count_vm_event(THP_FILE_FALLBACK); > + order = next_order(&suitable_orders, order); > + } > } else { > pages = 1; > folio = shmem_alloc_folio(gfp, info, index); > @@ -1678,6 +1795,7 @@ static struct folio *shmem_alloc_and_add_folio(gfp_t gfp, > if (!folio) > return ERR_PTR(-ENOMEM); > > +allocated: > __folio_set_locked(folio); > __folio_set_swapbacked(folio); > > @@ -1972,7 +2090,8 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, > struct mm_struct *fault_mm; > struct folio *folio; > int error; > - bool alloced; > + bool alloced, huge; > + unsigned long orders = 0; > > if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping))) > return -EINVAL; > @@ 
-2044,14 +2163,18 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, > return 0; > } > > - if (shmem_is_huge(inode, index, false, fault_mm, > - vma ? vma->vm_flags : 0)) { > + huge = shmem_is_huge(inode, index, false, fault_mm, > + vma ? vma->vm_flags : 0); > + /* Find hugepage orders that are allowed for anonymous shmem. */ > + if (vma && vma_is_anon_shmem(vma)) > + orders = anon_shmem_allowable_huge_orders(inode, vma, index, huge); > + if (huge || orders > 0) { > gfp_t huge_gfp; > > huge_gfp = vma_thp_gfp_mask(vma); > huge_gfp = limit_gfp_mask(huge_gfp, gfp); > - folio = shmem_alloc_and_add_folio(huge_gfp, > - inode, index, fault_mm, true); > + folio = shmem_alloc_and_add_folio(vmf, huge_gfp, > + inode, index, fault_mm, true, orders); > if (!IS_ERR(folio)) { > if (folio_test_pmd_mappable(folio)) > count_vm_event(THP_FILE_ALLOC); > @@ -2061,7 +2184,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, > goto repeat; > } > > - folio = shmem_alloc_and_add_folio(gfp, inode, index, fault_mm, false); > + folio = shmem_alloc_and_add_folio(vmf, gfp, inode, index, fault_mm, false, 0); > if (IS_ERR(folio)) { > error = PTR_ERR(folio); > if (error == -EEXIST) > @@ -2072,7 +2195,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, > > alloced: > alloced = true; > - if (folio_test_pmd_mappable(folio) && > + if (folio_test_large(folio) && > DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < > folio_next_index(folio) - 1) { > struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); > -- > 2.39.3 >