On Wed, Jul 6, 2022 at 5:06 PM Zach O'Keefe <zokeefe@xxxxxxxxxx> wrote:
>
> MADV_COLLAPSE is not coupled to the kernel-oriented sysfs THP settings[1].
>
> hugepage_vma_check() is the authority on determining if a VMA is eligible
> for THP allocation/collapse, and currently enforces the sysfs THP settings.
> Add a flag to disable these checks. For now, only apply this arg to anon
> and file, which use /sys/kernel/transparent_hugepage/enabled. We can
> expand this to shmem, which uses
> /sys/kernel/transparent_hugepage/shmem_enabled, later.
>
> Use this flag in collapse_pte_mapped_thp() where previously the VMA flags
> passed to hugepage_vma_check() were OR'd with VM_HUGEPAGE to elide the
> VM_HUGEPAGE check in "madvise" THP mode. Prior to "mm: khugepaged: check
> THP flag in hugepage_vma_check()", this check also didn't check "never" THP
> mode. As such, this restores the previous behavior of
> collapse_pte_mapped_thp() where sysfs THP settings are ignored. See
> comment in code for justification why this is OK.
>
> [1] https://lore.kernel.org/linux-mm/CAAa6QmQxay1_=Pmt8oCX2-Va18t44FV-Vs-WsQt_6+qBks4nZA@xxxxxxxxxxxxxx/
>
> Signed-off-by: Zach O'Keefe <zokeefe@xxxxxxxxxx>

Reviewed-by: Yang Shi <shy828301@xxxxxxxxx>

> ---
>  fs/proc/task_mmu.c      |  2 +-
>  include/linux/huge_mm.h |  9 ++++-----
>  mm/huge_memory.c        | 14 ++++++--------
>  mm/khugepaged.c         | 25 ++++++++++++++-----------
>  mm/memory.c             |  4 ++--
>  5 files changed, 27 insertions(+), 27 deletions(-)
>
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 34d292cec79a..f8cd58846a28 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -866,7 +866,7 @@ static int show_smap(struct seq_file *m, void *v)
>          __show_smap(m, &mss, false);
>
>          seq_printf(m, "THPeligible: %d\n",
> -                   hugepage_vma_check(vma, vma->vm_flags, true, false));
> +                   hugepage_vma_check(vma, vma->vm_flags, true, false, true));
>
>          if (arch_pkeys_enabled())
>                  seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 37f2f11a6d7e..00312fc251c1 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -168,9 +168,8 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
>                 !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
>  }
>
> -bool hugepage_vma_check(struct vm_area_struct *vma,
> -                        unsigned long vm_flags,
> -                        bool smaps, bool in_pf);
> +bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
> +                        bool smaps, bool in_pf, bool enforce_sysfs);
>
>  #define transparent_hugepage_use_zero_page()                            \
>          (transparent_hugepage_flags &                                   \
> @@ -321,8 +320,8 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
>  }
>
>  static inline bool hugepage_vma_check(struct vm_area_struct *vma,
> -                                      unsigned long vm_flags,
> -                                      bool smaps, bool in_pf)
> +                                      unsigned long vm_flags, bool smaps,
> +                                      bool in_pf, bool enforce_sysfs)
>  {
>          return false;
>  }
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index da300ce9dedb..4fbe43dc1568 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -69,9 +69,8 @@ static atomic_t huge_zero_refcount;
>  struct page *huge_zero_page __read_mostly;
>  unsigned long huge_zero_pfn __read_mostly = ~0UL;
>
> -bool hugepage_vma_check(struct vm_area_struct *vma,
> -                        unsigned long vm_flags,
> -                        bool smaps, bool in_pf)
> +bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
> +                        bool smaps, bool in_pf, bool enforce_sysfs)
>  {
>          if (!vma->vm_mm)                /* vdso */
>                  return false;
> @@ -120,11 +119,10 @@ bool hugepage_vma_check(struct vm_area_struct *vma,
>          if (!in_pf && shmem_file(vma->vm_file))
>                  return shmem_huge_enabled(vma);
>
> -        if (!hugepage_flags_enabled())
> -                return false;
> -
> -        /* THP settings require madvise. */
> -        if (!(vm_flags & VM_HUGEPAGE) && !hugepage_flags_always())
> +        /* Enforce sysfs THP requirements as necessary */
> +        if (enforce_sysfs &&
> +            (!hugepage_flags_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
> +                                           !hugepage_flags_always())))
>                  return false;
>
>          /* Only regular file is valid */
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index d89056d8cbad..b0e20db3f805 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -478,7 +478,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
>  {
>          if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
>              hugepage_flags_enabled()) {
> -                if (hugepage_vma_check(vma, vm_flags, false, false))
> +                if (hugepage_vma_check(vma, vm_flags, false, false, true))
>                          __khugepaged_enter(vma->vm_mm);
>          }
>  }
> @@ -844,7 +844,8 @@ static bool khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
>   */
>
>  static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
> -                                   struct vm_area_struct **vmap)
> +                                   struct vm_area_struct **vmap,
> +                                   struct collapse_control *cc)
>  {
>          struct vm_area_struct *vma;
>
> @@ -855,7 +856,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
>          if (!vma)
>                  return SCAN_VMA_NULL;
>
> -        if (!hugepage_vma_check(vma, vma->vm_flags, false, false))
> +        if (!hugepage_vma_check(vma, vma->vm_flags, false, false,
> +                                cc->is_khugepaged))
>                  return SCAN_VMA_CHECK;
>          /*
>           * Anon VMA expected, the address may be unmapped then
> @@ -974,7 +976,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>                  goto out_nolock;
>
>          mmap_read_lock(mm);
> -        result = hugepage_vma_revalidate(mm, address, &vma);
> +        result = hugepage_vma_revalidate(mm, address, &vma, cc);
>          if (result != SCAN_SUCCEED) {
>                  mmap_read_unlock(mm);
>                  goto out_nolock;
> @@ -1006,7 +1008,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>           * handled by the anon_vma lock + PG_lock.
>           */
>          mmap_write_lock(mm);
> -        result = hugepage_vma_revalidate(mm, address, &vma);
> +        result = hugepage_vma_revalidate(mm, address, &vma, cc);
>          if (result != SCAN_SUCCEED)
>                  goto out_up_write;
>          /* check if the pmd is still valid */
> @@ -1350,12 +1352,13 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
>                  return;
>
>          /*
> -         * This vm_flags may not have VM_HUGEPAGE if the page was not
> -         * collapsed by this mm. But we can still collapse if the page is
> -         * the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check()
> -         * will not fail the vma for missing VM_HUGEPAGE
> +         * If we are here, we've succeeded in replacing all the native pages
> +         * in the page cache with a single hugepage. If a mm were to fault-in
> +         * this memory (mapped by a suitably aligned VMA), we'd get the hugepage
> +         * and map it by a PMD, regardless of sysfs THP settings. As such, let's
> +         * analogously elide sysfs THP settings here.
>           */
> -        if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE, false, false))
> +        if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
>                  return;
>
>          /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
> @@ -2042,7 +2045,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
>                          progress++;
>                          break;
>                  }
> -                if (!hugepage_vma_check(vma, vma->vm_flags, false, false)) {
> +                if (!hugepage_vma_check(vma, vma->vm_flags, false, false, true)) {
>  skip:
>                          progress++;
>                          continue;
> diff --git a/mm/memory.c b/mm/memory.c
> index 8917bea2f0bc..96cd776e84f1 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -5001,7 +5001,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
>                  return VM_FAULT_OOM;
>  retry_pud:
>          if (pud_none(*vmf.pud) &&
> -            hugepage_vma_check(vma, vm_flags, false, true)) {
> +            hugepage_vma_check(vma, vm_flags, false, true, true)) {
>                  ret = create_huge_pud(&vmf);
>                  if (!(ret & VM_FAULT_FALLBACK))
>                          return ret;
> @@ -5035,7 +5035,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
>                  goto retry_pud;
>
>          if (pmd_none(*vmf.pmd) &&
> -            hugepage_vma_check(vma, vm_flags, false, true)) {
> +            hugepage_vma_check(vma, vm_flags, false, true, true)) {
>                  ret = create_huge_pmd(&vmf);
>                  if (!(ret & VM_FAULT_FALLBACK))
>                          return ret;
> --
> 2.37.0.rc0.161.g10f37bed90-goog
>
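
For anyone who wants to poke at the decoupled behavior from userspace, here is a minimal sketch of driving a synchronous collapse with madvise(MADV_COLLAPSE), the series' entry point. It is illustrative only, not part of the patch: the fallback value 25 for MADV_COLLAPSE matches what this series proposes and is an assumption if your uapi headers predate it, as is the 2MiB PMD size (x86-64).

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE 25        /* assumption: uapi value proposed by this series */
#endif

int main(void)
{
        const size_t hpage = 2UL << 20; /* assumed PMD size: 2MiB on x86-64 */
        uint8_t *raw, *buf;

        /* Over-allocate so a PMD-aligned 2MiB window is guaranteed to fit. */
        raw = mmap(NULL, 2 * hpage, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (raw == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        buf = (uint8_t *)(((uintptr_t)raw + hpage - 1) & ~(hpage - 1));

        memset(buf, 1, hpage);  /* fault in the native pages first */

        /*
         * Request a synchronous collapse of the region. Per the commit
         * message above, this path is not coupled to the sysfs THP
         * settings the way khugepaged is.
         */
        if (madvise(buf, hpage, MADV_COLLAPSE))
                perror("madvise(MADV_COLLAPSE)");
        return 0;
}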
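
The reworked conditional in hugepage_vma_check() folds the two old sysfs checks into one expression gated on enforce_sysfs, and it is easy to convince yourself of the equivalence with a tiny userspace model of just that gate. Everything below is illustrative: gate(), the boolean parameters standing in for hugepage_flags_enabled()/hugepage_flags_always(), and the VM_HUGEPAGE stand-in bit are all hypothetical names, not kernel code.

#include <stdbool.h>
#include <stdio.h>

#define VM_HUGEPAGE 0x1UL       /* stand-in bit; not the real kernel value */

/*
 * Mirrors the patched check: reject when
 *   enforce_sysfs && (!enabled || (!(vm_flags & VM_HUGEPAGE) && !always))
 * and returns "allowed". With enforce_sysfs=true this is exactly the two
 * checks it replaces; with enforce_sysfs=false both are skipped.
 */
static bool gate(bool enforce_sysfs, bool enabled, bool always,
                 unsigned long vm_flags)
{
        return !(enforce_sysfs &&
                 (!enabled || (!(vm_flags & VM_HUGEPAGE) && !always)));
}

int main(void)
{
        /* "madvise" mode (enabled, not always): the VMA needs VM_HUGEPAGE... */
        printf("%d\n", gate(true, true, false, 0));             /* 0: rejected */
        printf("%d\n", gate(true, true, false, VM_HUGEPAGE));   /* 1: allowed */
        /* ...unless the caller opts out of sysfs enforcement entirely,
         * as collapse_pte_mapped_thp() now does. */
        printf("%d\n", gate(false, false, false, 0));           /* 1: allowed */
        return 0;
}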