On Tue, Mar 8, 2022 at 1:35 PM Zach O'Keefe <zokeefe@xxxxxxxxxx> wrote:
>
> Introduce the main madvise collapse batched logic, including the overall
> locking strategy. Stubs for individual batched actions, such as
> scanning pmds in batch, have been stubbed out, and will be added later
> in the series.
>
> Note the main benefit from doing all this work in a batched manner is
> that __madvise__collapse_pmd_batch() (stubbed out) can be called inside
> a single mmap_lock write.

I don't get why this is preferred. Isn't it preferable to minimize the
scope of the write mmap_lock? If you batch a large number of PMDs,
MADV_COLLAPSE may end up holding the write mmap_lock for a long time, and
that doesn't seem like it would scale.

>
> Per-batch data is stored in a struct madvise_collapse_data array, with
> an entry for each pmd to collapse, and is shared between the various
> *_batch actions. This allows for partial success of collapsing a range
> of pmds - we continue as long as some pmds can be successfully
> collapsed.
>
> A "success" here, is where all pmds can be (or already are) collapsed.
> On failure, the caller will need to verify what, if any, partial
> successes occurred via smaps or otherwise.

A further question is why you have to batch at all. My first guess was
that you wanted a binary result: either all valid PMDs get collapsed or
none do. But it seems partial collapse is fine, so I don't see why
batching is needed. One side effect, off the top of my head: you may
preallocate a lot of huge pages, and then the actual collapse gets
blocked (for example, it can't get the mmap_lock or the ptl) while the
system is under memory pressure, yet those preallocated huge pages can't
be reclaimed at all. Could you please elaborate on why you didn't go
with a non-batch approach? (I put a rough sketch of what I mean right
below the quoted changelog.)

>
> Also note that, where possible, if collapse fails for a particular pmd
> after a hugepage has already been allocated, said hugepage is kept on a
> per-node free list for the purpose of backing subsequent pmd collapses.
> All unused hugepages are returned before _madvise_collapse() returns.
>
> Note that bisect at this patch won't break; madvise(MADV_COLLAPSE) will
> return -1 always.
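To make the non-batch question above concrete, this is roughly the shape I
had in mind inside _madvise_collapse(). collapse_single_pmd() below is a
made-up placeholder for "scan + allocate + swapin + collapse of one PMD",
not something that exists in this series; the point is only that the write
lock is taken around one PMD at a time:

        unsigned long addr;
        int collapsed = 0;

        /*
         * Hypothetical non-batched loop, one PMD per iteration.
         * collapse_single_pmd() is a placeholder, not code from this series.
         */
        for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
                cond_resched();
                mmap_read_lock(mm);
                if (hugepage_vma_revalidate(mm, addr, VM_NOHUGEPAGE, &vma)) {
                        mmap_read_unlock(mm);
                        break;
                }
                /*
                 * Scan, allocate and swap in under mmap_lock read, then take
                 * mmap_lock write only for the PTE -> PMD conversion of this
                 * one PMD, dropping it again before the next iteration.
                 */
                if (collapse_single_pmd(mm, vma, addr, gfp, cc) == SCAN_SUCCEED)
                        collapsed++;
        }

Partial success falls out naturally, the write lock is never held for more
than one PMD's worth of work, and at most one preallocated huge page is in
flight at any time.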
>
> Signed-off-by: Zach O'Keefe <zokeefe@xxxxxxxxxx>
> ---
>  mm/khugepaged.c | 279 ++++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 273 insertions(+), 6 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index ca1e523086ed..ea53c706602e 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -86,6 +86,9 @@ static struct kmem_cache *mm_slot_cache __read_mostly;
>  #define MAX_PTE_MAPPED_THP 8
>
>  struct collapse_control {
> +        /* Used by MADV_COLLAPSE batch collapse */
> +        struct list_head free_hpages[MAX_NUMNODES];
> +
>          /* Respect khugepaged_max_ptes_[none|swap|shared] */
>          bool enforce_pte_scan_limits;
>
> @@ -99,8 +102,13 @@ struct collapse_control {
>  static void collapse_control_init(struct collapse_control *cc,
>                                    bool enforce_pte_scan_limits)
>  {
> +        int i;
> +
>          cc->enforce_pte_scan_limits = enforce_pte_scan_limits;
>          cc->last_target_node = NUMA_NO_NODE;
> +
> +        for (i = 0; i < MAX_NUMNODES; ++i)
> +                INIT_LIST_HEAD(cc->free_hpages + i);
>  }
>
>  /**
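BTW, just to check my reading of the free_hpages[] lifecycle described in
the changelog (the helper below is made up, only to spell out my
understanding, it is not in this patch): a huge page preallocated for a PMD
whose collapse failed gets parked on its node's list, a later PMD in the
range can pick it up instead of allocating a fresh one, and whatever is
still on the lists is uncharged and freed before _madvise_collapse()
returns. Something like:

        /* Hypothetical helper: reuse a parked huge page for this node, if any. */
        static struct page *cc_get_free_hpage(struct collapse_control *cc, int node)
        {
                struct page *hpage;

                if (list_empty(&cc->free_hpages[node]))
                        return NULL;    /* caller falls back to a fresh allocation */

                hpage = list_first_entry(&cc->free_hpages[node], struct page, lru);
                list_del(&hpage->lru);
                return hpage;
        }

If that reading is right, then between batches those pages are charged to
the memcg but invisible to reclaim, which is exactly the scenario I was
worried about above.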
> @@ -1033,7 +1041,7 @@ static pmd_t *find_pmd_or_thp_or_none(struct mm_struct *mm,
>          /* See comments in pmd_none_or_trans_huge_or_clear_bad() */
>          barrier();
>  #endif
> -        if (!pmd_present(pmde) || !pmd_none(pmde)) {
> +        if (!pmd_present(pmde) || pmd_none(pmde)) {
>                  *result = SCAN_PMD_NULL;
>                  return NULL;
>          } else if (pmd_trans_huge(pmde)) {
> @@ -1054,12 +1062,16 @@ static pmd_t *find_pmd_or_thp_or_none(struct mm_struct *mm,
>  static bool __collapse_huge_page_swapin(struct mm_struct *mm,
>                                          struct vm_area_struct *vma,
>                                          unsigned long haddr, pmd_t *pmd,
> -                                        int referenced)
> +                                        int referenced,
> +                                        unsigned long vm_flags_ignored,
> +                                        bool *mmap_lock_dropped)
>  {
>          int swapped_in = 0;
>          vm_fault_t ret = 0;
>          unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE);
>
> +        if (mmap_lock_dropped)
> +                *mmap_lock_dropped = false;
>          for (address = haddr; address < end; address += PAGE_SIZE) {
>                  struct vm_fault vmf = {
>                          .vma = vma,
> @@ -1080,8 +1092,10 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
>
>                  /* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */
>                  if (ret & VM_FAULT_RETRY) {
> +                        if (mmap_lock_dropped)
> +                                *mmap_lock_dropped = true;
>                          mmap_read_lock(mm);
> -                        if (hugepage_vma_revalidate(mm, haddr, VM_NONE, &vma)) {
> +                        if (hugepage_vma_revalidate(mm, haddr, vm_flags_ignored, &vma)) {
>                                  /* vma is no longer available, don't continue to swapin */
>                                  trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
>                                  return false;
> @@ -1256,7 +1270,8 @@ static void khugepaged_collapse_huge_page(struct mm_struct *mm,
>           * Continuing to collapse causes inconsistency.
>           */
>          if (unmapped && !__collapse_huge_page_swapin(mm, vma, address,
> -                                                     pmd, referenced)) {
> +                                                     pmd, referenced, VM_NONE,
> +                                                     NULL)) {
>                  mmap_read_unlock(mm);
>                  goto out_nolock;
>          }
> @@ -2520,6 +2535,128 @@ void khugepaged_min_free_kbytes_update(void)
>          mutex_unlock(&khugepaged_mutex);
>  }
>
> +struct madvise_collapse_data {
> +        struct page *hpage; /* Preallocated THP */
> +        bool continue_collapse; /* Should we attempt / continue collapse? */
> +
> +        struct scan_pmd_result scan_result;
> +        pmd_t *pmd;
> +};
> +
> +static int
> +madvise_collapse_vma_revalidate_pmd_count(struct mm_struct *mm,
> +                                          unsigned long address, int nr,
> +                                          struct vm_area_struct **vmap)
> +{
> +        /* madvise_collapse() ignores MADV_NOHUGEPAGE */
> +        return hugepage_vma_revalidate_pmd_count(mm, address, nr, VM_NOHUGEPAGE,
> +                                                 vmap);
> +}
> +
> +/*
> + * Scan pmd to see which we can collapse, and to determine node to allocate on.
> + *
> + * Must be called with mmap_lock in read, and returns with the lock held in
> + * read. Does not drop the lock.
> + *
> + * Set batch_data[i]->continue_collapse to false for any pmd that can't be
> + * collapsed.
> + *
> + * Return the number of existing THPs in batch.
> + */
> +static int
> +__madvise_collapse_scan_pmd_batch(struct mm_struct *mm,
> +                                  struct vm_area_struct *vma,
> +                                  unsigned long batch_start,
> +                                  struct madvise_collapse_data *batch_data,
> +                                  int batch_size,
> +                                  struct collapse_control *cc)
> +{
> +        /* Implemented in later patch */
> +        return 0;
> +}
> +
> +/*
> + * Preallocate and charge huge page for each pmd in the batch, store the
> + * new page in batch_data[i]->hpage.
> + *
> + * Return the number of huge pages allocated.
> + */
> +static int
> +__madvise_collapse_prealloc_hpages_batch(struct mm_struct *mm,
> +                                         gfp_t gfp,
> +                                         int node,
> +                                         struct madvise_collapse_data *batch_data,
> +                                         int batch_size,
> +                                         struct collapse_control *cc)
> +{
> +        /* Implemented in later patch */
> +        return 0;
> +}
> +
> +/*
> + * Do swapin for all ranges in batch, returns true iff successful.
> + *
> + * Called with mmap_lock held in read, and returns with it held in read.
> + * Might drop the lock.
> + *
> + * Set batch_data[i]->continue_collapse to false for any pmd that can't be
> + * collapsed. Else, set batch_data[i]->pmd to the found pmd.
> + */
> +static bool
> +__madvise_collapse_swapin_pmd_batch(struct mm_struct *mm,
> +                                    int node,
> +                                    unsigned long batch_start,
> +                                    struct madvise_collapse_data *batch_data,
> +                                    int batch_size,
> +                                    struct collapse_control *cc)
> +
> +{
> +        /* Implemented in later patch */
> +        return true;
> +}
> +
> +/*
> + * Do the collapse operation. Return number of THPs collapsed successfully.
> + *
> + * Called with mmap_lock held in write, and returns with it held. Does not
> + * drop the lock.
> + */
> +static int
> +__madvise_collapse_pmd_batch(struct mm_struct *mm,
> +                             unsigned long batch_start,
> +                             int batch_size,
> +                             struct madvise_collapse_data *batch_data,
> +                             int node,
> +                             struct collapse_control *cc)
> +{
> +        /* Implemented in later patch */
> +        return 0;
> +}
> +
> +static bool continue_collapse(struct madvise_collapse_data *batch_data,
> +                              int batch_size)
> +{
> +        int i;
> +
> +        for (i = 0; i < batch_size; ++i)
> +                if (batch_data[i].continue_collapse)
> +                        return true;
> +        return false;
> +}
> +
> +static bool madvise_transparent_hugepage_enabled(struct vm_area_struct *vma)
> +{
> +        if (vma_is_anonymous(vma))
> +                /* madvise_collapse() ignores MADV_NOHUGEPAGE */
> +                return __transparent_hugepage_enabled(vma, vma->vm_flags &
> +                                                      ~VM_NOHUGEPAGE);
> +        /* TODO: Support file-backed memory */
> +        return false;
> +}
> +
> +#define MADVISE_COLLAPSE_BATCH_SIZE 8
> +
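To put a rough number on the write-lock concern from my first comment:
with MADVISE_COLLAPSE_BATCH_SIZE == 8 and 2M huge pages (assuming x86-64
with 4K base pages, i.e. 512 PTEs per PMD), a single
__madvise_collapse_pmd_batch() call can convert up to

        8 PMDs * 512 PTEs/PMD = 4096 PTE entries (16M of address space)

under one mmap_write_lock(), and a large MADV_COLLAPSE range repeats that
batch after batch. That back-to-back exclusive hold is the scaling pattern
I was getting at.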
>  /*
>   * Returns 0 if successfully able to collapse range into THPs (or range already
>   * backed by THPs). Due to implementation detail, THPs collapsed here may be
> @@ -2532,8 +2669,138 @@ static int _madvise_collapse(struct mm_struct *mm,
>                               unsigned long end, gfp_t gfp,
>                               struct collapse_control *cc)
>  {
> -        /* Implemented in later patch */
> -        return -ENOSYS;
> +        unsigned long hstart, hend, batch_addr;
> +        int ret = -EINVAL, collapsed = 0, nr_hpages = 0, i;
> +        struct madvise_collapse_data batch_data[MADVISE_COLLAPSE_BATCH_SIZE];
> +
> +        mmap_assert_locked(mm);
> +        BUG_ON(vma->vm_start > start);
> +        BUG_ON(vma->vm_end < end);
> +        VM_BUG_ON_MM(atomic_read(&mm->mm_users) == 0, mm);
> +
> +        hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
> +        hend = end & HPAGE_PMD_MASK;
> +        nr_hpages = (hend - hstart) >> HPAGE_PMD_SHIFT;
> +        if (hstart >= hend)
> +                goto out;
> +
> +        if (!madvise_transparent_hugepage_enabled(vma))
> +                goto out;
> +
> +        /*
> +         * Request might cover multiple hugepages. Strategy is to batch
> +         * allocation and collapse operations so that we do more work while
> +         * mmap_lock is held exclusively.
> +         *
> +         * While processing batch, mmap_lock is locked/unlocked many times for
> +         * the supplied VMA. It's possible that the original VMA is split while
> +         * lock was dropped. If in the context of the (possibly new) VMA, THP
> +         * collapse is possible, we continue.
> +         */
> +        for (batch_addr = hstart;
> +             batch_addr < hend;
> +             batch_addr += HPAGE_PMD_SIZE * MADVISE_COLLAPSE_BATCH_SIZE) {
> +                int node, batch_size;
> +                int thps; /* Number of existing THPs in range */
> +
> +                batch_size = (hend - batch_addr) >> HPAGE_PMD_SHIFT;
> +                batch_size = min_t(int, batch_size,
> +                                   MADVISE_COLLAPSE_BATCH_SIZE);
> +
> +                BUG_ON(batch_size <= 0);
> +                memset(batch_data, 0, sizeof(batch_data));
> +                cond_resched();
> +                VM_BUG_ON_MM(atomic_read(&mm->mm_users) == 0, mm);
> +
> +                /*
> +                 * If first batch, we still hold mmap_lock from madvise
> +                 * call and haven't dropped it since checking the VMA. Else,
> +                 * we've dropped the lock and we need to revalidate.
> +                 */
> +                if (batch_addr != hstart) {
> +                        mmap_read_lock(mm);
> +                        if (madvise_collapse_vma_revalidate_pmd_count(mm,
> +                                                                      batch_addr,
> +                                                                      batch_size,
> +                                                                      &vma))
> +                                goto loop_unlock_break;
> +                }
> +
> +                mmap_assert_locked(mm);
> +
> +                thps = __madvise_collapse_scan_pmd_batch(mm, vma, batch_addr,
> +                                                         batch_data, batch_size,
> +                                                         cc);
> +                mmap_read_unlock(mm);
> +
> +                /* Count existing THPs as-if we collapsed them */
> +                collapsed += thps;
> +                if (thps == batch_size || !continue_collapse(batch_data,
> +                                                             batch_size))
> +                        continue;
> +
> +                node = find_target_node(cc);
> +                if (!__madvise_collapse_prealloc_hpages_batch(mm, gfp, node,
> +                                                              batch_data,
> +                                                              batch_size, cc)) {
> +                        /* No more THPs available - so give up */
> +                        ret = -ENOMEM;
> +                        break;
> +                }
> +
> +                mmap_read_lock(mm);
> +                if (!__madvise_collapse_swapin_pmd_batch(mm, node, batch_addr,
> +                                                         batch_data, batch_size,
> +                                                         cc))
> +                        goto loop_unlock_break;
> +                mmap_read_unlock(mm);
> +                mmap_write_lock(mm);
> +                collapsed += __madvise_collapse_pmd_batch(mm,
> +                                batch_addr, batch_size, batch_data,
> +                                node, cc);
> +                mmap_write_unlock(mm);
> +
> +                for (i = 0; i < batch_size; ++i) {
> +                        struct page *page = batch_data[i].hpage;
> +
> +                        if (page && !IS_ERR(page)) {
> +                                list_add_tail(&page->lru,
> +                                              &cc->free_hpages[node]);
> +                                batch_data[i].hpage = NULL;
> +                        }
> +                }
> +                /* mmap_lock is unlocked here */
> +                continue;
> +loop_unlock_break:
> +                mmap_read_unlock(mm);
> +                break;
> +        }
> +        /* mmap_lock is unlocked here */
> +
> +        for (i = 0; i < MADVISE_COLLAPSE_BATCH_SIZE; ++i) {
> +                struct page *page = batch_data[i].hpage;
> +
> +                if (page && !IS_ERR(page)) {
> +                        mem_cgroup_uncharge(page_folio(page));
> +                        put_page(page);
> +                }
> +        }
> +        for (i = 0; i < MAX_NUMNODES; ++i) {
> +                struct page *page, *tmp;
> +
> +                list_for_each_entry_safe(page, tmp, cc->free_hpages + i, lru) {
> +                        list_del(&page->lru);
> +                        mem_cgroup_uncharge(page_folio(page));
> +                        put_page(page);
> +                }
> +        }
> +        ret = collapsed == nr_hpages ? 0 : -1;
> +        vma = NULL; /* tell sys_madvise we dropped mmap_lock */
> +        mmap_read_lock(mm); /* sys_madvise expects us to have mmap_lock */
> +out:
> +        *prev = vma; /* we didn't drop mmap_lock, so this holds */
> +
> +        return ret;
>  }
>
>  int madvise_collapse(struct vm_area_struct *vma,
> --
> 2.35.1.616.g0bdcbb4464-goog
>