On Jul 11 13:57, Yang Shi wrote:
> On Wed, Jul 6, 2022 at 5:06 PM Zach O'Keefe <zokeefe@xxxxxxxxxx> wrote:
> >
> > MADV_COLLAPSE is not coupled to the kernel-oriented sysfs THP settings[1].
> >
> > hugepage_vma_check() is the authority on determining if a VMA is eligible
> > for THP allocation/collapse, and currently enforces the sysfs THP settings.
> > Add a flag to disable these checks. For now, only apply this arg to anon
> > and file, which use /sys/kernel/transparent_hugepage/enabled. We can
> > expand this to shmem, which uses
> > /sys/kernel/transparent_hugepage/shmem_enabled, later.
> >
> > Use this flag in collapse_pte_mapped_thp() where previously the VMA flags
> > passed to hugepage_vma_check() were OR'd with VM_HUGEPAGE to elide the
> > VM_HUGEPAGE check in "madvise" THP mode. Prior to "mm: khugepaged: check
> > THP flag in hugepage_vma_check()", this check also didn't check "never"
> > THP mode. As such, this restores the previous behavior of
> > collapse_pte_mapped_thp() where sysfs THP settings are ignored. See
> > comment in code for justification why this is OK.
> >
> > [1] https://lore.kernel.org/linux-mm/CAAa6QmQxay1_=Pmt8oCX2-Va18t44FV-Vs-WsQt_6+qBks4nZA@xxxxxxxxxxxxxx/
> >
> > Signed-off-by: Zach O'Keefe <zokeefe@xxxxxxxxxx>
>
> Reviewed-by: Yang Shi <shy828301@xxxxxxxxx>

Thanks for the review!

Best,
Zach

> > ---
> >  fs/proc/task_mmu.c      |  2 +-
> >  include/linux/huge_mm.h |  9 ++++-----
> >  mm/huge_memory.c        | 14 ++++++--------
> >  mm/khugepaged.c         | 25 ++++++++++++++-----------
> >  mm/memory.c             |  4 ++--
> >  5 files changed, 27 insertions(+), 27 deletions(-)
> >
> > diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> > index 34d292cec79a..f8cd58846a28 100644
> > --- a/fs/proc/task_mmu.c
> > +++ b/fs/proc/task_mmu.c
> > @@ -866,7 +866,7 @@ static int show_smap(struct seq_file *m, void *v)
> >  	__show_smap(m, &mss, false);
> >
> >  	seq_printf(m, "THPeligible:    %d\n",
> > -		   hugepage_vma_check(vma, vma->vm_flags, true, false));
> > +		   hugepage_vma_check(vma, vma->vm_flags, true, false, true));
> >
> >  	if (arch_pkeys_enabled())
> >  		seq_printf(m, "ProtectionKey:  %8u\n", vma_pkey(vma));
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index 37f2f11a6d7e..00312fc251c1 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -168,9 +168,8 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
> >  	       !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
> >  }
> >
> > -bool hugepage_vma_check(struct vm_area_struct *vma,
> > -			unsigned long vm_flags,
> > -			bool smaps, bool in_pf);
> > +bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
> > +			bool smaps, bool in_pf, bool enforce_sysfs);
> >
> >  #define transparent_hugepage_use_zero_page()				\
> >  	(transparent_hugepage_flags &					\
> > @@ -321,8 +320,8 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
> >  }
> >
> >  static inline bool hugepage_vma_check(struct vm_area_struct *vma,
> > -				      unsigned long vm_flags,
> > -				      bool smaps, bool in_pf)
> > +				      unsigned long vm_flags, bool smaps,
> > +				      bool in_pf, bool enforce_sysfs)
> >  {
> >  	return false;
> >  }
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index da300ce9dedb..4fbe43dc1568 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -69,9 +69,8 @@ static atomic_t huge_zero_refcount;
> >  struct page *huge_zero_page __read_mostly;
> >  unsigned long huge_zero_pfn __read_mostly = ~0UL;
> >
> > -bool hugepage_vma_check(struct vm_area_struct *vma,
> > -			unsigned long vm_flags,
> > -			bool smaps, bool in_pf)
> > +bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
> > +			bool smaps, bool in_pf, bool enforce_sysfs)
> >  {
> >  	if (!vma->vm_mm)		/* vdso */
> >  		return false;
> > @@ -120,11 +119,10 @@ bool hugepage_vma_check(struct vm_area_struct *vma,
> >  	if (!in_pf && shmem_file(vma->vm_file))
> >  		return shmem_huge_enabled(vma);
> >
> > -	if (!hugepage_flags_enabled())
> > -		return false;
> > -
> > -	/* THP settings require madvise. */
> > -	if (!(vm_flags & VM_HUGEPAGE) && !hugepage_flags_always())
> > +	/* Enforce sysfs THP requirements as necessary */
> > +	if (enforce_sysfs &&
> > +	    (!hugepage_flags_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
> > +					   !hugepage_flags_always())))
> >  		return false;
> >
> >  	/* Only regular file is valid */
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index d89056d8cbad..b0e20db3f805 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -478,7 +478,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
> >  {
> >  	if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
> >  	    hugepage_flags_enabled()) {
> > -		if (hugepage_vma_check(vma, vm_flags, false, false))
> > +		if (hugepage_vma_check(vma, vm_flags, false, false, true))
> >  			__khugepaged_enter(vma->vm_mm);
> >  	}
> >  }
> > @@ -844,7 +844,8 @@ static bool khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
> >   */
> >
> >  static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
> > -				   struct vm_area_struct **vmap)
> > +				   struct vm_area_struct **vmap,
> > +				   struct collapse_control *cc)
> >  {
> >  	struct vm_area_struct *vma;
> >
> > @@ -855,7 +856,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
> >  	if (!vma)
> >  		return SCAN_VMA_NULL;
> >
> > -	if (!hugepage_vma_check(vma, vma->vm_flags, false, false))
> > +	if (!hugepage_vma_check(vma, vma->vm_flags, false, false,
> > +				cc->is_khugepaged))
> >  		return SCAN_VMA_CHECK;
> >  	/*
> >  	 * Anon VMA expected, the address may be unmapped then
> > @@ -974,7 +976,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >  		goto out_nolock;
> >
> >  	mmap_read_lock(mm);
> > -	result = hugepage_vma_revalidate(mm, address, &vma);
> > +	result = hugepage_vma_revalidate(mm, address, &vma, cc);
> >  	if (result != SCAN_SUCCEED) {
> >  		mmap_read_unlock(mm);
> >  		goto out_nolock;
> > @@ -1006,7 +1008,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >  	 * handled by the anon_vma lock + PG_lock.
> >  	 */
> >  	mmap_write_lock(mm);
> > -	result = hugepage_vma_revalidate(mm, address, &vma);
> > +	result = hugepage_vma_revalidate(mm, address, &vma, cc);
> >  	if (result != SCAN_SUCCEED)
> >  		goto out_up_write;
> >  	/* check if the pmd is still valid */
> > @@ -1350,12 +1352,13 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
> >  		return;
> >
> >  	/*
> > -	 * This vm_flags may not have VM_HUGEPAGE if the page was not
> > -	 * collapsed by this mm. But we can still collapse if the page is
> > -	 * the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check()
> > -	 * will not fail the vma for missing VM_HUGEPAGE
> > +	 * If we are here, we've succeeded in replacing all the native pages
> > +	 * in the page cache with a single hugepage. If a mm were to fault-in
> > +	 * this memory (mapped by a suitably aligned VMA), we'd get the hugepage
> > +	 * and map it by a PMD, regardless of sysfs THP settings. As such, let's
> > +	 * analogously elide sysfs THP settings here.
> >  	 */
> > -	if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE, false, false))
> > +	if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
> >  		return;
> >
> >  	/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
> > @@ -2042,7 +2045,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> >  			progress++;
> >  			break;
> >  		}
> > -		if (!hugepage_vma_check(vma, vma->vm_flags, false, false)) {
> > +		if (!hugepage_vma_check(vma, vma->vm_flags, false, false, true)) {
> >  skip:
> >  			progress++;
> >  			continue;
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 8917bea2f0bc..96cd776e84f1 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -5001,7 +5001,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
> >  		return VM_FAULT_OOM;
> >  retry_pud:
> >  	if (pud_none(*vmf.pud) &&
> > -	    hugepage_vma_check(vma, vm_flags, false, true)) {
> > +	    hugepage_vma_check(vma, vm_flags, false, true, true)) {
> >  		ret = create_huge_pud(&vmf);
> >  		if (!(ret & VM_FAULT_FALLBACK))
> >  			return ret;
> > @@ -5035,7 +5035,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
> >  		goto retry_pud;
> >
> >  	if (pmd_none(*vmf.pmd) &&
> > -	    hugepage_vma_check(vma, vm_flags, false, true)) {
> > +	    hugepage_vma_check(vma, vm_flags, false, true, true)) {
> >  		ret = create_huge_pmd(&vmf);
> >  		if (!(ret & VM_FAULT_FALLBACK))
> >  			return ret;
> > --
> > 2.37.0.rc0.161.g10f37bed90-goog
> >
> >
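
P.S. For anyone who wants to poke at the user-facing side once the full
series (including the patch adding madvise(MADV_COLLAPSE)) is applied,
below is a minimal userspace sketch; it is illustrative only and not part
of this patch. It assumes an x86-64-style 2MiB PMD size, and it hardcodes
the MADV_COLLAPSE uapi value as a fallback for headers that predate the
series. Because that path ends up calling hugepage_vma_check() with
enforce_sysfs=false, the request below should be considered eligible even
when /sys/kernel/mm/transparent_hugepage/enabled is "madvise" or "never"
and the VMA was never marked VM_HUGEPAGE:

#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE 25	/* uapi value; fallback for older headers */
#endif

int main(void)
{
	const size_t pmd_size = 2UL << 20;	/* assumed 2MiB PMD (x86-64) */
	long page_size = sysconf(_SC_PAGESIZE);
	size_t off;
	char *map, *aligned;

	/* Over-allocate so a PMD-aligned region can be carved out. */
	map = mmap(NULL, 2 * pmd_size, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	aligned = (char *)(((uintptr_t)map + pmd_size - 1) & ~(pmd_size - 1));

	/* Fault in every base page so there is something to collapse. */
	for (off = 0; off < pmd_size; off += page_size)
		aligned[off] = 1;

	/*
	 * Synchronous, caller-directed collapse. With enforce_sysfs=false
	 * on this path, eligibility no longer depends on the sysfs
	 * "enabled" setting or on VM_HUGEPAGE; the remaining per-VMA
	 * suitability checks still apply.
	 */
	if (madvise(aligned, pmd_size, MADV_COLLAPSE))
		perror("madvise(MADV_COLLAPSE)");

	return 0;
}

The asymmetry is the point: khugepaged still enforces sysfs policy
(hugepage_vma_revalidate() forwards cc->is_khugepaged, which is true for
it), so background collapse keeps honoring the admin's settings, while
collapse_pte_mapped_thp() and, later in the series, madvise(MADV_COLLAPSE)
opt out.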