Add implementations for the following batch actions:

scan_pmd:
	Iterate over the batch and scan each pmd for eligibility. Note
	that this function is called with mmap_lock in read, and does
	not drop it before returning.

	If a batch entry fails, the ->continue_collapse field of its
	madvise_collapse_data is set to 'false' so that later _batch
	actions know to ignore it.

	Return the number of THPs already in the batch, which is needed
	by _madvise_collapse() to determine the overall "success"
	criteria (all pmds either collapsed successfully, or already
	THP-backed).

prealloc_hpages:
	Iterate over the batch and allocate / charge hugepages. Before
	allocating a new page, check the local free hugepage list.
	Similarly, if, after allocating a hugepage, charging the memcg
	fails, save the hugepage on a local free list for future use.

swapin_pmd:
	Iterate over the batch and attempt to swap in pages that are
	currently swapped out. Called with mmap_lock in read, and
	returns with it held; however, it might drop and reacquire the
	lock internally.

	Specifically, __collapse_huge_page_swapin() might drop +
	reacquire the mmap_lock. When it does so, it only revalidates
	the vma/address for a single pmd. Since we need to revalidate
	the vma for the entire region covered in the batch, we need to
	be notified when the lock is dropped so that we can perform the
	required revalidation. As such, add an argument to
	__collapse_huge_page_swapin() to notify the caller when
	mmap_lock is dropped.

collapse_pmd:
	Iterate over the batch and perform the actual collapse for each
	pmd. Note that this is done while holding the mmap_lock in
	write for the entire batch action.

An illustrative sketch of how these actions compose follows the
diffstat below.

Signed-off-by: Zach O'Keefe <zokeefe@xxxxxxxxxx>
---
 mm/khugepaged.c | 153 +++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 145 insertions(+), 8 deletions(-)
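Note for reviewers (illustrative only, not part of the patch): a rough
sketch of how the four batch actions above are assumed to compose,
following the locking rules and "success" criteria described in the
changelog. Argument lists are guesses assembled from the partial
signatures visible in this diff; the gfp/node plumbing and the driver
function itself are hypothetical.

	/* Hypothetical driver loop -- for illustration only. */
	static bool madvise_collapse_batch_sketch(struct mm_struct *mm,
			struct vm_area_struct *vma, unsigned long batch_start,
			struct madvise_collapse_data *batch_data,
			int batch_size, gfp_t gfp, int node,
			struct collapse_control *cc)
	{
		int thps, collapsed;

		mmap_read_lock(mm);
		/* (1) Scan; failures clear ->continue_collapse. */
		thps = __madvise_collapse_scan_pmd_batch(mm, vma, batch_start,
				batch_data, batch_size, cc);
		/* (2) Allocate + charge hugepages, reusing cc->free_hpages[node]. */
		__madvise_collapse_prealloc_hpages_batch(mm, gfp, node,
				batch_data, batch_size, cc);
		/* (3) Swap in; may drop/reacquire mmap_lock internally. */
		if (!__madvise_collapse_swapin_pmd_batch(mm, node, batch_start,
				batch_data, batch_size, cc)) {
			mmap_read_unlock(mm);
			return false;
		}
		mmap_read_unlock(mm);

		/* (4) Collapse with mmap_lock in write across the whole batch. */
		mmap_write_lock(mm);
		collapsed = __madvise_collapse_pmd_batch(mm, batch_start,
				batch_size, batch_data, node, cc);
		mmap_write_unlock(mm);

		/* "Success": every pmd collapsed or already THP-backed. */
		return thps + collapsed == batch_size;
	}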
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index ea53c706602e..e8156f15a3da 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2572,8 +2572,23 @@ __madvise_collapse_scan_pmd_batch(struct mm_struct *mm,
 				  int batch_size,
 				  struct collapse_control *cc)
 {
-	/* Implemented in later patch */
-	return 0;
+	unsigned long addr, i;
+	int thps = 0;
+
+	mmap_assert_locked(mm);
+
+	for (addr = batch_start, i = 0; i < batch_size;
+	     addr += HPAGE_PMD_SIZE, ++i) {
+		struct madvise_collapse_data *data = batch_data + i;
+
+		scan_pmd(mm, vma, addr, cc, &data->scan_result);
+		data->continue_collapse =
+				data->scan_result.result == SCAN_SUCCEED;
+		if (data->scan_result.result == SCAN_PAGE_COMPOUND)
+			++thps;
+	}
+	mmap_assert_locked(mm);
+	return thps;
 }
 
 /*
@@ -2590,8 +2605,39 @@ __madvise_collapse_prealloc_hpages_batch(struct mm_struct *mm,
 					 int batch_size,
 					 struct collapse_control *cc)
 {
-	/* Implemented in later patch */
-	return 0;
+	int nr_hpages = 0;
+	int i;
+
+	for (i = 0; i < batch_size; ++i) {
+		struct madvise_collapse_data *data = batch_data + i;
+
+		if (!data->continue_collapse)
+			continue;
+
+		if (!list_empty(&cc->free_hpages[node])) {
+			data->hpage = list_first_entry(&cc->free_hpages[node],
+						       struct page, lru);
+			list_del(&data->hpage->lru);
+		} else {
+			data->hpage = __alloc_pages_node(node, gfp,
+							 HPAGE_PMD_ORDER);
+			if (unlikely(!data->hpage))
+				break;
+
+			prep_transhuge_page(data->hpage);
+
+			if (unlikely(mem_cgroup_charge(page_folio(data->hpage),
+						       mm, gfp))) {
+				/* No use reusing page, so give it back */
+				put_page(data->hpage);
+				data->hpage = NULL;
+				data->continue_collapse = false;
+				break;
+			}
+		}
+		++nr_hpages;
+	}
+	return nr_hpages;
 }
 
 /*
@@ -2612,8 +2658,67 @@ __madvise_collapse_swapin_pmd_batch(struct mm_struct *mm,
 				    int batch_size,
 				    struct collapse_control *cc)
 {
-	/* Implemented in later patch */
-	return true;
+	unsigned long addr;
+	int i;
+	bool ret = true;
+
+	/*
+	 * This function is called with mmap_lock held, and returns with it
+	 * held. However, __collapse_huge_page_swapin() may internally drop
+	 * and reacquire the lock. When it does, it only revalidates the
+	 * single pmd provided to it. We need to know when it drops the lock
+	 * so that we can revalidate the batch of pmds we are operating on.
+	 *
+	 * Initially setting this to 'true' because the caller just locked
+	 * mmap_lock and so we need to revalidate before doing anything else.
+	 */
+	bool need_revalidate_pmd_count = true;
+
+	for (addr = batch_start, i = 0;
+	     i < batch_size;
+	     addr += HPAGE_PMD_SIZE, ++i) {
+		struct vm_area_struct *vma;
+		struct madvise_collapse_data *data = batch_data + i;
+
+		mmap_assert_locked(mm);
+
+		/*
+		 * We might have dropped the lock during the previous
+		 * iteration. It's acceptable to exit this function without
+		 * revalidating the vma since the caller immediately unlocks
+		 * mmap_lock anyway.
+		 */
+		if (!data->continue_collapse)
+			continue;
+
+		if (need_revalidate_pmd_count) {
+			if (madvise_collapse_vma_revalidate_pmd_count(mm,
+								      batch_start,
+								      batch_size,
+								      &vma)) {
+				ret = false;
+				break;
+			}
+			need_revalidate_pmd_count = false;
+		}
+
+		data->pmd = mm_find_pmd(mm, addr);
+
+		if (!data->pmd ||
+		    (data->scan_result.unmapped &&
+		     !__collapse_huge_page_swapin(mm, vma, addr, data->pmd,
+						  VM_NOHUGEPAGE,
+						  data->scan_result.referenced,
+						  &need_revalidate_pmd_count))) {
+			/* Hold on to the THP until we know we don't need it. */
+			data->continue_collapse = false;
+			list_add_tail(&data->hpage->lru,
+				      &cc->free_hpages[node]);
+			data->hpage = NULL;
+		}
+	}
+	mmap_assert_locked(mm);
+	return ret;
 }
 
 /*
@@ -2630,8 +2735,40 @@ __madvise_collapse_pmd_batch(struct mm_struct *mm,
 			     int node,
 			     struct collapse_control *cc)
 {
-	/* Implemented in later patch */
-	return 0;
+	unsigned long addr;
+	struct vm_area_struct *vma;
+	int i, ret = 0;
+
+	mmap_assert_write_locked(mm);
+
+	if (madvise_collapse_vma_revalidate_pmd_count(mm, batch_start,
+						      batch_size, &vma))
+		goto out;
+
+	for (addr = batch_start, i = 0;
+	     i < batch_size;
+	     addr += HPAGE_PMD_SIZE, ++i) {
+		int result;
+		struct madvise_collapse_data *data = batch_data + i;
+
+		if (!data->continue_collapse ||
+		    (mm_find_pmd(mm, addr) != data->pmd))
+			continue;
+
+		result = __do_collapse_huge_page(mm, vma, addr, data->pmd,
+						 data->hpage,
+						 cc->enforce_pte_scan_limits,
+						 NULL);
+
+		if (result == SCAN_SUCCEED)
+			++ret;
+		else
+			list_add_tail(&data->hpage->lru,
+				      &cc->free_hpages[node]);
+		data->hpage = NULL;
+	}
+out:
+	return ret;
 }
 
 static bool continue_collapse(struct madvise_collapse_data *batch_data,
-- 
2.35.1.616.g0bdcbb4464-goog
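Postscript (editorial, not part of the patch): the new argument to
__collapse_huge_page_swapin() is an instance of the common out-parameter
pattern for reporting a lock drop. A minimal generic sketch of just that
pattern, with hypothetical helper names:

	/*
	 * Sketch of the notification pattern only: the callee sets
	 * *lock_dropped whenever it releases and reacquires mmap_lock,
	 * telling the caller that its earlier validation of the whole
	 * batch is stale and must be redone before it is trusted again.
	 */
	static bool swapin_one_pmd_sketch(struct mm_struct *mm,
					  bool *lock_dropped)
	{
		if (must_block_for_swapin(mm)) {	/* hypothetical */
			mmap_read_unlock(mm);
			do_blocking_swapin();		/* hypothetical */
			mmap_read_lock(mm);
			*lock_dropped = true;	/* caller revalidates batch */
		}
		return true;
	}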