On Sun, Jan 12, 2025 at 4:23 AM Dev Jain <dev.jain@xxxxxxx> wrote:
>
>
>
> On 11/01/25 3:18 am, Nico Pache wrote:
> > On Fri, Jan 10, 2025 at 2:06 AM Dev Jain <dev.jain@xxxxxxx> wrote:
> >>
> >>
> >>
> >> On 09/01/25 5:01 am, Nico Pache wrote:
> >>> khugepaged scans PMD ranges for potential collapse to a hugepage. To add
> >>> mTHP support we use this scan to instead record chunks of fully utilized
> >>> sections of the PMD.
> >>>
> >>> create a bitmap to represent a PMD in order MTHP_MIN_ORDER chunks.
> >>> by default we will set this to order 3. The reasoning is that for 4K 512
> >>> PMD size this results in a 64 bit bitmap which has some optimizations.
> >>> For other arches like ARM64 64K, we can set a larger order if needed.
> >>>
> >>> khugepaged_scan_bitmap uses a stack struct to recursively scan a bitmap
> >>> that represents chunks of fully utilized regions. We can then determine
> >>> what mTHP size fits best and in the following patch, we set this bitmap
> >>> while scanning the PMD.
> >>>
> >>> max_ptes_none is used as a scale to determine how "full" an order must
> >>> be before being considered for collapse.
> >>>
> >>> Signed-off-by: Nico Pache <npache@xxxxxxxxxx>
> >>> ---
> >>> include/linux/khugepaged.h | 4 +-
> >>> mm/khugepaged.c | 129 +++++++++++++++++++++++++++++++++++--
> >>> 2 files changed, 126 insertions(+), 7 deletions(-)
> >>>
> >>> diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
> >>> index 1f46046080f5..31cff8aeec4a 100644
> >>> --- a/include/linux/khugepaged.h
> >>> +++ b/include/linux/khugepaged.h
> >>> @@ -1,7 +1,9 @@
> >>> /* SPDX-License-Identifier: GPL-2.0 */
> >>> #ifndef _LINUX_KHUGEPAGED_H
> >>> #define _LINUX_KHUGEPAGED_H
> >>> -
> >>
> >> Nit: I don't think this line needs to be deleted.
> >>
> >>> +#define MIN_MTHP_ORDER 3
> >>> +#define MIN_MTHP_NR (1<<MIN_MTHP_ORDER)
> >>
> >> Nit: Insert a space: (1 << MIN_MTHP_ORDER)
> >>
> >>> +#define MTHP_BITMAP_SIZE (1<<(HPAGE_PMD_ORDER - MIN_MTHP_ORDER))
> >>> extern unsigned int khugepaged_max_ptes_none __read_mostly;
> >>> #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> >>> extern struct attribute_group khugepaged_attr_group;
> >>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> >>> index 9eb161b04ee4..de1dc6ea3c71 100644
> >>> --- a/mm/khugepaged.c
> >>> +++ b/mm/khugepaged.c
> >>> @@ -94,6 +94,11 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
> >>>
> >>> static struct kmem_cache *mm_slot_cache __ro_after_init;
> >>>
> >>> +struct scan_bit_state {
> >>> + u8 order;
> >>> + u8 offset;
> >>> +};
> >>> +
> >>> struct collapse_control {
> >>> bool is_khugepaged;
> >>>
> >>> @@ -102,6 +107,15 @@ struct collapse_control {
> >>>
> >>> /* nodemask for allocation fallback */
> >>> nodemask_t alloc_nmask;
> >>> +
> >>> + /* bitmap used to collapse mTHP sizes. 1bit = order MIN_MTHP_ORDER mTHP */
> >>> + unsigned long *mthp_bitmap;
> >>> + unsigned long *mthp_bitmap_temp;
> >>> + struct scan_bit_state *mthp_bitmap_stack;
> >>> +};
> >>> +
> >>> +struct collapse_control khugepaged_collapse_control = {
> >>> + .is_khugepaged = true,
> >>> };
> >>>
> >>> /**
> >>> @@ -389,6 +403,25 @@ int __init khugepaged_init(void)
> >>> if (!mm_slot_cache)
> >>> return -ENOMEM;
> >>>
> >>> + /*
> >>> + * allocate the bitmaps dynamically since MTHP_BITMAP_SIZE is not known at
> >>> + * compile time for some architectures.
> >>> + */
> >>> + khugepaged_collapse_control.mthp_bitmap = kmalloc_array(
> >>> + BITS_TO_LONGS(MTHP_BITMAP_SIZE), sizeof(unsigned long), GFP_KERNEL);
> >>> + if (!khugepaged_collapse_control.mthp_bitmap)
> >>> + return -ENOMEM;
> >>> +
> >>> + khugepaged_collapse_control.mthp_bitmap_temp = kmalloc_array(
> >>> + BITS_TO_LONGS(MTHP_BITMAP_SIZE), sizeof(unsigned long), GFP_KERNEL);
> >>> + if (!khugepaged_collapse_control.mthp_bitmap_temp)
> >>> + return -ENOMEM;
> >>> +
> >>> + khugepaged_collapse_control.mthp_bitmap_stack = kmalloc_array(
> >>> + MTHP_BITMAP_SIZE, sizeof(struct scan_bit_state), GFP_KERNEL);
> >>> + if (!khugepaged_collapse_control.mthp_bitmap_stack)
> >>> + return -ENOMEM;
> >>> +
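
As an aside, in case it helps review, here is roughly how the sizing from
the commit message works out (numbers assume a 4K base page size, so
HPAGE_PMD_ORDER == 9; other configurations will differ):

	/* one bit covers an order MIN_MTHP_ORDER (2^3 == 8 PTE) chunk */
	MIN_MTHP_ORDER   == 3
	MIN_MTHP_NR      == 1 << 3 == 8
	/* 512 PTEs per PMD / 8 PTEs per bit == 64 bits */
	MTHP_BITMAP_SIZE == 1 << (9 - 3) == 64
	/* so BITS_TO_LONGS(MTHP_BITMAP_SIZE) == 1 on 64-bit, and each of the
	 * bitmaps allocated above is a single unsigned long, which is the
	 * "some optimizations" the commit message refers to.
	 */
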
> >>> khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
> >>> khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
> >>> khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
> >>> @@ -400,6 +433,9 @@ int __init khugepaged_init(void)
> >>> void __init khugepaged_destroy(void)
> >>> {
> >>> kmem_cache_destroy(mm_slot_cache);
> >>> + kfree(khugepaged_collapse_control.mthp_bitmap);
> >>> + kfree(khugepaged_collapse_control.mthp_bitmap_temp);
> >>> + kfree(khugepaged_collapse_control.mthp_bitmap_stack);
> >>> }
> >>>
> >>> static inline int khugepaged_test_exit(struct mm_struct *mm)
> >>> @@ -850,10 +886,6 @@ static void khugepaged_alloc_sleep(void)
> >>> remove_wait_queue(&khugepaged_wait, &wait);
> >>> }
> >>>
> >>> -struct collapse_control khugepaged_collapse_control = {
> >>> - .is_khugepaged = true,
> >>> -};
> >>> -
> >>> static bool khugepaged_scan_abort(int nid, struct collapse_control *cc)
> >>> {
> >>> int i;
> >>> @@ -1102,7 +1134,8 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
> >>>
> >>> static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >>> int referenced, int unmapped,
> >>> - struct collapse_control *cc)
> >>> + struct collapse_control *cc, bool *mmap_locked,
> >>> + int order, int offset)
> >>> {
> >>> LIST_HEAD(compound_pagelist);
> >>> pmd_t *pmd, _pmd;
> >>> @@ -1115,6 +1148,11 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >>> struct mmu_notifier_range range;
> >>> VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> >>>
> >>> + /* if collapsing mTHPs we may have already released the read_lock, and
> >>> + * need to reaquire it to keep the proper locking order.
> >>> + */
> >>> + if (!*mmap_locked)
> >>> + mmap_read_lock(mm);
> >>
> >> There is no need to take the read lock again, because we drop it just
> >> after this.
> >
> > collapse_huge_page expects the mmap_lock to already be taken, and it
> > returns with it unlocked. If we are collapsing multiple mTHPs under
> > the same PMD, then I think we need to reacquire the lock before
> > calling unlock on it.
>
> I cannot figure out a potential place where we drop the lock before
> entering collapse_huge_page(). In any case, wouldn't this be better:

Let's say we are collapsing two 1024kB mTHPs in a single PMD region. We
call collapse_huge_page on the first mTHP, and during the collapse the
lock is dropped. When the second mTHP collapse is attempted, the lock
has already been dropped.

> if (*mmap_locked)
>         mmap_read_unlock(mm);
>
> Basically, instead of putting the if condition around the lock, you do
> it around the unlock?

Yeah that seems much cleaner, I'll give it a try, thanks!
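
Roughly what I have in mind for the next version, so we are talking about
the same thing (untested sketch, exact placement may still change):

	/* collapse_huge_page() can now be entered without the read lock held
	 * when collapsing several mTHPs under the same PMD, so only drop the
	 * lock if we actually hold it.
	 */
	if (*mmap_locked) {
		mmap_read_unlock(mm);
		*mmap_locked = false;
	}

That would replace the unconditional mmap_read_unlock() before the
allocation and let me drop the re-lock at the top of collapse_huge_page().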
>
> >
> >>
> >>> /*
> >>> * Before allocating the hugepage, release the mmap_lock read lock.
> >>> * The allocation can take potentially a long time if it involves
> >>> @@ -1122,6 +1160,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >>> * that. We will recheck the vma after taking it again in write mode.
> >>> */
> >>> mmap_read_unlock(mm);
> >>> + *mmap_locked = false;
> >>>
> >>> result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
> >>> if (result != SCAN_SUCCEED)
> >>> @@ -1256,12 +1295,71 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >>> out_up_write:
> >>> mmap_write_unlock(mm);
> >>> out_nolock:
> >>> + *mmap_locked = false;
> >>> if (folio)
> >>> folio_put(folio);
> >>> trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
> >>> return result;
> >>> }
> >>>
> >>> +// Recursive function to consume the bitmap
> >>> +static int khugepaged_scan_bitmap(struct mm_struct *mm, unsigned long address,
> >>> + int referenced, int unmapped, struct collapse_control *cc,
> >>> + bool *mmap_locked, unsigned long enabled_orders)
> >>> +{
> >>> + u8 order, offset;
> >>> + int num_chunks;
> >>> + int bits_set, max_percent, threshold_bits;
> >>> + int next_order, mid_offset;
> >>> + int top = -1;
> >>> + int collapsed = 0;
> >>> + int ret;
> >>> + struct scan_bit_state state;
> >>> +
> >>> + cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
> >>> + { HPAGE_PMD_ORDER - MIN_MTHP_ORDER, 0 };
> >>> +
> >>> + while (top >= 0) {
> >>> + state = cc->mthp_bitmap_stack[top--];
> >>> + order = state.order;
> >>> + offset = state.offset;
> >>> + num_chunks = 1 << order;
> >>> + // Skip mTHP orders that are not enabled
> >>> + if (!(enabled_orders >> (order + MIN_MTHP_ORDER)) & 1)
> >>> + goto next;
> >>> +
> >>> + // copy the relavant section to a new bitmap
> >>> + bitmap_shift_right(cc->mthp_bitmap_temp, cc->mthp_bitmap, offset,
> >>> + MTHP_BITMAP_SIZE);
> >>> +
> >>> + bits_set = bitmap_weight(cc->mthp_bitmap_temp, num_chunks);
> >>> +
> >>> + // Check if the region is "almost full" based on the threshold
> >>> + max_percent = ((HPAGE_PMD_NR - khugepaged_max_ptes_none - 1) * 100)
> >>> + / (HPAGE_PMD_NR - 1);
> >>> + threshold_bits = (max_percent * num_chunks) / 100;
> >>> +
> >>> + if (bits_set >= threshold_bits) {
> >>> + ret = collapse_huge_page(mm, address, referenced, unmapped, cc,
> >>> + mmap_locked, order + MIN_MTHP_ORDER, offset * MIN_MTHP_NR);
> >>> + if (ret == SCAN_SUCCEED)
> >>> + collapsed += (1 << (order + MIN_MTHP_ORDER));
> >>> + continue;
> >>> + }
> >>> +
> >>> +next:
> >>> + if (order > 0) {
> >>> + next_order = order - 1;
> >>> + mid_offset = offset + (num_chunks / 2);
> >>> + cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
> >>> + { next_order, mid_offset };
> >>> + cc->mthp_bitmap_stack[++top] = (struct scan_bit_state)
> >>> + { next_order, offset };
> >>> + }
> >>> + }
> >>> + return collapsed;
> >>> +}
> >>> +
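
A note on the threshold calculation above, since the arithmetic is a bit
terse. Worked through with the defaults (assuming 4K pages, so
HPAGE_PMD_NR == 512 and khugepaged's default max_ptes_none == 511):

	max_percent    = ((512 - 511 - 1) * 100) / (512 - 1) == 0;
	threshold_bits = (0 * num_chunks) / 100 == 0;
	/* any chunk with at least one bit set qualifies for collapse */

and with max_ptes_none == 0:

	max_percent    = ((512 - 0 - 1) * 100) / (512 - 1) == 100;
	threshold_bits = num_chunks;
	/* only fully populated regions qualify */

so max_ptes_none scales between "collapse anything partially used" and
"collapse only fully utilized regions", as the commit message describes.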
> >>> static int khugepaged_scan_pmd(struct mm_struct *mm,
> >>> struct vm_area_struct *vma,
> >>> unsigned long address, bool *mmap_locked,
> >>> @@ -1430,7 +1528,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
> >>> pte_unmap_unlock(pte, ptl);
> >>> if (result == SCAN_SUCCEED) {
> >>> result = collapse_huge_page(mm, address, referenced,
> >>> - unmapped, cc);
> >>> + unmapped, cc, mmap_locked, HPAGE_PMD_ORDER, 0);
> >>> /* collapse_huge_page will return with the mmap_lock released */
> >>> *mmap_locked = false;
> >>> }
> >>> @@ -2767,6 +2865,21 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
> >>> return -ENOMEM;
> >>> cc->is_khugepaged = false;
> >>>
> >>> + cc->mthp_bitmap = kmalloc_array(
> >>> + BITS_TO_LONGS(MTHP_BITMAP_SIZE), sizeof(unsigned long), GFP_KERNEL);
> >>> + if (!cc->mthp_bitmap)
> >>> + return -ENOMEM;
> >>> +
> >>> + cc->mthp_bitmap_temp = kmalloc_array(
> >>> + BITS_TO_LONGS(MTHP_BITMAP_SIZE), sizeof(unsigned long), GFP_KERNEL);
> >>> + if (!cc->mthp_bitmap_temp)
> >>> + return -ENOMEM;
> >>> +
> >>> + cc->mthp_bitmap_stack = kmalloc_array(
> >>> + MTHP_BITMAP_SIZE, sizeof(struct scan_bit_state), GFP_KERNEL);
> >>> + if (!cc->mthp_bitmap_stack)
> >>> + return -ENOMEM;
> >>> +
> >>> mmgrab(mm);
> >>> lru_add_drain_all();
> >>>
> >>> @@ -2831,8 +2944,12 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
> >>> out_nolock:
> >>> mmap_assert_locked(mm);
> >>> mmdrop(mm);
> >>> + kfree(cc->mthp_bitmap);
> >>> + kfree(cc->mthp_bitmap_temp);
> >>> + kfree(cc->mthp_bitmap_stack);
> >>> kfree(cc);
> >>>
> >>> +
> >>> return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0
> >>> : madvise_collapse_errno(last_fail);
> >>> }
> >>
> >
>