On Sun, May 21, 2023 at 10:24 PM Hugh Dickins <hughd@xxxxxxxxxx> wrote: > > __collapse_huge_page_swapin(): don't drop the map after every pte, it > only has to be dropped by do_swap_page(); give up if pte_offset_map() > fails; trace_mm_collapse_huge_page_swapin() at the end, with result; > fix comment on returned result; fix vmf.pgoff, though it's not used. > > collapse_huge_page(): use pte_offset_map_lock() on the _pmd returned > from clearing; allow failure, but it should be impossible there. > hpage_collapse_scan_pmd() and collapse_pte_mapped_thp() allow for > pte_offset_map_lock() failure. > > Signed-off-by: Hugh Dickins <hughd@xxxxxxxxxx> Reviewed-by: Yang Shi <shy828301@xxxxxxxxx> A nit below: > --- > mm/khugepaged.c | 72 +++++++++++++++++++++++++++++++++---------------- > 1 file changed, 49 insertions(+), 23 deletions(-) > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c > index 732f9ac393fc..49cfa7cdfe93 100644 > --- a/mm/khugepaged.c > +++ b/mm/khugepaged.c > @@ -993,9 +993,8 @@ static int check_pmd_still_valid(struct mm_struct *mm, > * Only done if hpage_collapse_scan_pmd believes it is worthwhile. > * > * Called and returns without pte mapped or spinlocks held. > - * Note that if false is returned, mmap_lock will be released. > + * Returns result: if not SCAN_SUCCEED, mmap_lock has been released. 
> */ > - > static int __collapse_huge_page_swapin(struct mm_struct *mm, > struct vm_area_struct *vma, > unsigned long haddr, pmd_t *pmd, > @@ -1004,23 +1003,35 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm, > int swapped_in = 0; > vm_fault_t ret = 0; > unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE); > + int result; > + pte_t *pte = NULL; > > for (address = haddr; address < end; address += PAGE_SIZE) { > struct vm_fault vmf = { > .vma = vma, > .address = address, > - .pgoff = linear_page_index(vma, haddr), > + .pgoff = linear_page_index(vma, address), > .flags = FAULT_FLAG_ALLOW_RETRY, > .pmd = pmd, > }; > > - vmf.pte = pte_offset_map(pmd, address); > - vmf.orig_pte = *vmf.pte; > - if (!is_swap_pte(vmf.orig_pte)) { > - pte_unmap(vmf.pte); > - continue; > + if (!pte++) { > + pte = pte_offset_map(pmd, address); > + if (!pte) { > + mmap_read_unlock(mm); > + result = SCAN_PMD_NULL; > + goto out; > + } > } > + > + vmf.orig_pte = *pte; > + if (!is_swap_pte(vmf.orig_pte)) > + continue; > + > + vmf.pte = pte; > ret = do_swap_page(&vmf); > + /* Which unmaps pte (after perhaps re-checking the entry) */ > + pte = NULL; > > /* > * do_swap_page returns VM_FAULT_RETRY with released mmap_lock. > @@ -1029,24 +1040,29 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm, > * resulting in later failure. > */ > if (ret & VM_FAULT_RETRY) { > - trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); > /* Likely, but not guaranteed, that page lock failed */ > - return SCAN_PAGE_LOCK; > + result = SCAN_PAGE_LOCK; With per-VMA locks, this may no longer be true, at least not until per-VMA locking supports swap faults. It may be better to use a more general failure code, for example, SCAN_FAIL. In any case, you don't have to change it in your patch; I can send a follow-up patch once this series has landed in mm-unstable. 
> + goto out; > } > if (ret & VM_FAULT_ERROR) { > mmap_read_unlock(mm); > - trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); > - return SCAN_FAIL; > + result = SCAN_FAIL; > + goto out; > } > swapped_in++; > } > > + if (pte) > + pte_unmap(pte); > + > /* Drain LRU add pagevec to remove extra pin on the swapped in pages */ > if (swapped_in) > lru_add_drain(); > > - trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1); > - return SCAN_SUCCEED; > + result = SCAN_SUCCEED; > +out: > + trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result); > + return result; > } > > static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm, > @@ -1146,9 +1162,6 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, > address + HPAGE_PMD_SIZE); > mmu_notifier_invalidate_range_start(&range); > > - pte = pte_offset_map(pmd, address); > - pte_ptl = pte_lockptr(mm, pmd); > - > pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ > /* > * This removes any huge TLB entry from the CPU so we won't allow > @@ -1163,13 +1176,18 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, > mmu_notifier_invalidate_range_end(&range); > tlb_remove_table_sync_one(); > > - spin_lock(pte_ptl); > - result = __collapse_huge_page_isolate(vma, address, pte, cc, > - &compound_pagelist); > - spin_unlock(pte_ptl); > + pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl); > + if (pte) { > + result = __collapse_huge_page_isolate(vma, address, pte, cc, > + &compound_pagelist); > + spin_unlock(pte_ptl); > + } else { > + result = SCAN_PMD_NULL; > + } > > if (unlikely(result != SCAN_SUCCEED)) { > - pte_unmap(pte); > + if (pte) > + pte_unmap(pte); > spin_lock(pmd_ptl); > BUG_ON(!pmd_none(*pmd)); > /* > @@ -1253,6 +1271,11 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, > memset(cc->node_load, 0, sizeof(cc->node_load)); > nodes_clear(cc->alloc_nmask); > pte = pte_offset_map_lock(mm, pmd, address, &ptl); 
> + if (!pte) { > + result = SCAN_PMD_NULL; > + goto out; > + } > + > for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR; > _pte++, _address += PAGE_SIZE) { > pte_t pteval = *_pte; > @@ -1622,8 +1645,10 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, > * lockless_pages_from_mm() and the hardware page walker can access page > * tables while all the high-level locks are held in write mode. > */ > - start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); > result = SCAN_FAIL; > + start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); > + if (!start_pte) > + goto drop_immap; > > /* step 1: check all mapped PTEs are to the right huge page */ > for (i = 0, addr = haddr, pte = start_pte; > @@ -1697,6 +1722,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, > > abort: > pte_unmap_unlock(start_pte, ptl); > +drop_immap: > i_mmap_unlock_write(vma->vm_file->f_mapping); > goto drop_hpage; > } > -- > 2.35.3 >