On Fri, Jun 3, 2022 at 5:40 PM Zach O'Keefe <zokeefe@xxxxxxxxxx> wrote: > > Add enforce_thp_enabled flag to struct collapse_control that allows context > to ignore constraints imposed by /sys/kernel/transparent_hugepage/enabled. > > This flag is set in khugepaged collapse context to preserve existing > khugepaged behavior. > > This flag will be used (unset) when introducing madvise collapse > context since the desired THP semantics of MADV_COLLAPSE aren't coupled > to sysfs THP settings. Most notably, for the purpose of eventual > madvise_collapse(2) support, this allows userspace to trigger THP collapse > on behalf of another process, without adding support to meddle with > the VMA flags of said process, or change sysfs THP settings. > > For now, limit this flag to /sys/kernel/transparent_hugepage/enabled, > but it can be expanded to include > /sys/kernel/transparent_hugepage/shmem_enabled later. > > Link: https://lore.kernel.org/linux-mm/CAAa6QmQxay1_=Pmt8oCX2-Va18t44FV-Vs-WsQt_6+qBks4nZA@xxxxxxxxxxxxxx/ > > Signed-off-by: Zach O'Keefe <zokeefe@xxxxxxxxxx> Looks good to me. Reviewed-by: Yang Shi <shy828301@xxxxxxxxx> Just a reminder: I just posted the series https://lore.kernel.org/linux-mm/20220606214414.736109-1-shy828301@xxxxxxxxx/T/#m5dae2dfa4b247f3b3903951dd3a1f0978a927e16, which changed some logic in hugepage_vma_check(). If your series gets in after it, you may need some additional tweaks to disregard the sysfs THP settings. 
> --- > mm/khugepaged.c | 34 +++++++++++++++++++++++++++------- > 1 file changed, 27 insertions(+), 7 deletions(-) > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c > index c3589b3e238d..4ad04f552347 100644 > --- a/mm/khugepaged.c > +++ b/mm/khugepaged.c > @@ -94,6 +94,11 @@ struct collapse_control { > */ > bool enforce_page_heuristics; > > + /* Enforce constraints of > + * /sys/kernel/mm/transparent_hugepage/enabled > + */ > + bool enforce_thp_enabled; > + > /* Num pages scanned per node */ > int node_load[MAX_NUMNODES]; > > @@ -893,10 +898,12 @@ static bool khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) > */ > > static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, > - struct vm_area_struct **vmap) > + struct vm_area_struct **vmap, > + struct collapse_control *cc) > { > struct vm_area_struct *vma; > unsigned long hstart, hend; > + unsigned long vma_flags; > > if (unlikely(khugepaged_test_exit(mm))) > return SCAN_ANY_PROCESS; > @@ -909,7 +916,18 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, > hend = vma->vm_end & HPAGE_PMD_MASK; > if (address < hstart || address + HPAGE_PMD_SIZE > hend) > return SCAN_ADDRESS_RANGE; > - if (!hugepage_vma_check(vma, vma->vm_flags)) > + > + /* > + * If !cc->enforce_thp_enabled, set VM_HUGEPAGE so that > + * hugepage_vma_check() can pass even if > + * TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG is set (i.e. "madvise" mode). > + * Note that hugepage_vma_check() doesn't enforce that > + * TRANSPARENT_HUGEPAGE_FLAG or TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG > + * must be set (i.e. "never" mode). > + */ > + vma_flags = cc->enforce_thp_enabled ? 
vma->vm_flags > + : vma->vm_flags | VM_HUGEPAGE; > + if (!hugepage_vma_check(vma, vma_flags)) > return SCAN_VMA_CHECK; > /* Anon VMA expected */ > if (!vma->anon_vma || !vma_is_anonymous(vma)) > @@ -953,7 +971,8 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm, > static bool __collapse_huge_page_swapin(struct mm_struct *mm, > struct vm_area_struct *vma, > unsigned long haddr, pmd_t *pmd, > - int referenced) > + int referenced, > + struct collapse_control *cc) > { > int swapped_in = 0; > vm_fault_t ret = 0; > @@ -980,7 +999,7 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, > /* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */ > if (ret & VM_FAULT_RETRY) { > mmap_read_lock(mm); > - if (hugepage_vma_revalidate(mm, haddr, &vma)) { > + if (hugepage_vma_revalidate(mm, haddr, &vma, cc)) { > /* vma is no longer available, don't continue to swapin */ > trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); > return false; > @@ -1047,7 +1066,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, > goto out_nolock; > > mmap_read_lock(mm); > - result = hugepage_vma_revalidate(mm, address, &vma); > + result = hugepage_vma_revalidate(mm, address, &vma, cc); > if (result) { > mmap_read_unlock(mm); > goto out_nolock; > @@ -1066,7 +1085,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, > * Continuing to collapse causes inconsistency. > */ > if (unmapped && !__collapse_huge_page_swapin(mm, vma, address, > - pmd, referenced)) { > + pmd, referenced, cc)) { > mmap_read_unlock(mm); > goto out_nolock; > } > @@ -1078,7 +1097,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, > * handled by the anon_vma lock + PG_lock. 
> */ > mmap_write_lock(mm); > - result = hugepage_vma_revalidate(mm, address, &vma); > + result = hugepage_vma_revalidate(mm, address, &vma, cc); > if (result) > goto out_up_write; > /* check if the pmd is still valid */ > @@ -2277,6 +2296,7 @@ static int khugepaged(void *none) > struct mm_slot *mm_slot; > struct collapse_control cc = { > .enforce_page_heuristics = true, > + .enforce_thp_enabled = true, > .last_target_node = NUMA_NO_NODE, > /* .gfp set later */ > }; > -- > 2.36.1.255.ge46751e96f-goog >