The idea of hugepage collapse in process context was previously
introduced by David Rientjes on linux-mm[1].

Introduce a new madvise mode, MADV_COLLAPSE, that allows users to
request a synchronous collapse of memory.

The benefits of this approach are:

* CPU is charged to the process that wants to spend the cycles for
  the THP
* avoids the unpredictable timing of khugepaged collapse
* allows a flexible separation of sync userspace and async khugepaged
  THP collapse policies

Immediate users of this new functionality include:

* malloc implementations that manage memory in hugepage-sized chunks,
  but sometimes subrelease memory back to the system in native-sized
  chunks via MADV_DONTNEED, zapping the pmd.  Later, when the memory
  is hot, the implementation can madvise(MADV_COLLAPSE) to re-back
  the memory with THPs to regain TLB performance.
* immediately backing executable text with hugepages.  The current
  support provided by CONFIG_READ_ONLY_THP_FOR_FS may take too long
  on a large system.

To keep patches digestible, introduce MADV_COLLAPSE in a few stages.

Add plumbing to the existing madvise infrastructure and populate the
uapi header files, leaving the actual madvise(MADV_COLLAPSE) handler
stubbed out.  Only privately-mapped anon memory is supported for now.

[1] https://lore.kernel.org/linux-mm/d098c392-273a-36a4-1a29-59731cdf5d3d@xxxxxxxxxx/

Signed-off-by: Zach O'Keefe <zokeefe@xxxxxxxxxx>
---
 include/linux/huge_mm.h                | 12 +++++++
 include/uapi/asm-generic/mman-common.h |  2 ++
 mm/khugepaged.c                        | 46 ++++++++++++++++++++++++++
 mm/madvise.c                           |  5 +++
 4 files changed, 65 insertions(+)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index fd905b0b2c71..407b63ab4185 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -226,6 +226,9 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
 
 int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags,
 		     int advice);
+int madvise_collapse(struct vm_area_struct *vma,
+		     struct vm_area_struct **prev,
+		     unsigned long start, unsigned long end);
 void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start,
 			   unsigned long end, long adjust_next);
 spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma);
@@ -383,6 +386,15 @@ static inline int hugepage_madvise(struct vm_area_struct *vma,
 	BUG();
 	return 0;
 }
+
+static inline int madvise_collapse(struct vm_area_struct *vma,
+				   struct vm_area_struct **prev,
+				   unsigned long start, unsigned long end)
+{
+	BUG();
+	return 0;
+}
+
 static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
 					 unsigned long start,
 					 unsigned long end,
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 6c1aa92a92e4..6ce1f1ceb432 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -77,6 +77,8 @@
 
 #define MADV_DONTNEED_LOCKED	24	/* like DONTNEED, but drop locked pages too */
 
+#define MADV_COLLAPSE	25		/* Synchronous hugepage collapse */
+
 /* compatibility flags */
 #define MAP_FILE	0
 
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 12ae765c5c32..ca1e523086ed 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2519,3 +2519,49 @@ void khugepaged_min_free_kbytes_update(void)
 	set_recommended_min_free_kbytes();
 	mutex_unlock(&khugepaged_mutex);
 }
+
+/*
+ * Returns 0 if successfully able to collapse range into THPs (or range already
+ * backed by THPs). Due to implementation detail, THPs collapsed here may be
+ * split again before this function returns.
+ */
+static int _madvise_collapse(struct mm_struct *mm,
+			     struct vm_area_struct *vma,
+			     struct vm_area_struct **prev,
+			     unsigned long start,
+			     unsigned long end, gfp_t gfp,
+			     struct collapse_control *cc)
+{
+	/* Implemented in later patch */
+	return -ENOSYS;
+}
+
+int madvise_collapse(struct vm_area_struct *vma,
+		     struct vm_area_struct **prev, unsigned long start,
+		     unsigned long end)
+{
+	struct collapse_control cc;
+	gfp_t gfp;
+	int error;
+	struct mm_struct *mm = vma->vm_mm;
+
+	/* Requested to hold mmap_lock in read */
+	mmap_assert_locked(mm);
+
+	mmgrab(mm);
+	collapse_control_init(&cc, /* enforce_pte_scan_limits= */ false);
+	gfp = vma_thp_gfp_mask(vma);
+	lru_add_drain(); /* lru_add_drain_all() too heavy here */
+	error = _madvise_collapse(mm, vma, prev, start, end, gfp, &cc);
+	mmap_assert_locked(mm);
+	mmdrop(mm);
+
+	/*
+	 * madvise() returns EAGAIN if kernel resources are temporarily
+	 * unavailable.
+	 */
+	if (error == -ENOMEM)
+		error = -EAGAIN;
+
+	return error;
+}
diff --git a/mm/madvise.c b/mm/madvise.c
index 5b6d796e55de..292aa017c150 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -58,6 +58,7 @@ static int madvise_need_mmap_write(int behavior)
 	case MADV_FREE:
 	case MADV_POPULATE_READ:
 	case MADV_POPULATE_WRITE:
+	case MADV_COLLAPSE:
 		return 0;
 	default:
 		/* be safe, default to 1. list exceptions explicitly */
@@ -1046,6 +1047,8 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
 		if (error)
 			goto out;
 		break;
+	case MADV_COLLAPSE:
+		return madvise_collapse(vma, prev, start, end);
 	}
 
 	anon_name = anon_vma_name(vma);
@@ -1139,6 +1142,7 @@ madvise_behavior_valid(int behavior)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	case MADV_HUGEPAGE:
 	case MADV_NOHUGEPAGE:
+	case MADV_COLLAPSE:
 #endif
 	case MADV_DONTDUMP:
 	case MADV_DODUMP:
@@ -1328,6 +1332,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *		transparent huge pages so the existing pages will not be
 *		coalesced into THP and new pages will not be allocated as THP.
+*  MADV_COLLAPSE - synchronously coalesce pages into new THP.
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *		from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
--
2.35.1.616.g0bdcbb4464-goog
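
For anyone wanting to experiment, a minimal, hypothetical userspace
sketch of the malloc-style usage described in the commit message. It is
not part of this patch; it assumes the MADV_COLLAPSE value proposed
here (25), 2MiB PMD-sized THPs, and a libc <sys/mman.h> that does not
yet define the flag:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE 25	/* value proposed by this patch */
#endif

#define HPAGE_SIZE (2UL << 20)	/* assumes 2MiB PMD-sized THPs */

int main(void)
{
	/* Over-allocate so we can carve out a PMD-aligned hugepage region. */
	size_t len = 2 * HPAGE_SIZE;
	char *raw = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (raw == MAP_FAILED)
		return 1;
	char *buf = (char *)(((uintptr_t)raw + HPAGE_SIZE - 1) &
			     ~(HPAGE_SIZE - 1));

	memset(buf, 1, HPAGE_SIZE);	/* fault in; may be THP-backed */

	/* Subrelease one native-sized chunk, zapping the pmd. */
	madvise(buf, 4096, MADV_DONTNEED);

	memset(buf, 2, HPAGE_SIZE);	/* memory is hot again */

	/* Request a synchronous re-collapse of the range. */
	if (madvise(buf, HPAGE_SIZE, MADV_COLLAPSE)) {
		if (errno == EINVAL || errno == ENOSYS)
			fprintf(stderr, "MADV_COLLAPSE not supported\n");
		else if (errno == EAGAIN)
			fprintf(stderr, "resources temporarily unavailable\n");
	}

	munmap(raw, len);
	return 0;
}

On a kernel with only this patch applied, the madvise(MADV_COLLAPSE)
call fails with ENOSYS from the stubbed-out handler; on kernels without
the series it fails with EINVAL, and EAGAIN corresponds to the
-ENOMEM to -EAGAIN translation in madvise_collapse() above.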