The patch titled
     Subject: mm/khugepaged: fix GUP-fast interaction by freeing ptes via mmu_gather
has been added to the -mm mm-hotfixes-unstable branch.  Its filename is
     mm-khugepaged-fix-gup-fast-interaction-by-freeing-ptes-via-mmu_gather.patch

This patch will shortly appear at
     https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patches/mm-khugepaged-fix-gup-fast-interaction-by-freeing-ptes-via-mmu_gather.patch

This patch will later appear in the mm-hotfixes-unstable branch at
    git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Before you just go and hit "reply", please:
   a) Consider who else should be cc'ed
   b) Prefer to cc a suitable mailing list as well
   c) Ideally: find the original patch on the mailing list and do a
      reply-to-all to that, adding suitable additional cc's

*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***

The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days

------------------------------------------------------
From: Jann Horn <jannh@xxxxxxxxxx>
Subject: mm/khugepaged: fix GUP-fast interaction by freeing ptes via mmu_gather
Date: Wed, 23 Nov 2022 17:56:51 +0100

Since commit 70cbc3cc78a99 ("mm: gup: fix the fast GUP race against THP
collapse"), the lockless_pages_from_mm() fastpath rechecks the pmd_t to
ensure that the page table was not removed by khugepaged in between.

However, lockless_pages_from_mm() still requires that the page table is
not concurrently freed.  We could provide this guarantee in khugepaged by
using some variant of pte_free() with appropriate delay; but such a
helper doesn't really exist outside the mmu_gather infrastructure.

To avoid having to wire up a new codepath for freeing page tables that
might have been in use in the past, fix the issue by letting khugepaged
deposit a fresh page table (if required) instead of depositing the
existing page table, and free the old page table via mmu_gather.
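
The core of that approach is the standard mmu_gather sequence; as a
minimal sketch, using only calls that also appear in the diff below (the
helper name free_pte_via_mmu_gather() is made up for illustration and is
not something the patch adds), it boils down to:

#include <linux/mm.h>
#include <linux/huge_mm.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>

/*
 * Illustrative only: detach the page table at @pmdp and free it through
 * mmu_gather, so the actual free is deferred to tlb_finish_mmu() instead
 * of happening immediately via a bare pte_free().
 */
static void free_pte_via_mmu_gather(struct mm_struct *mm,
				    unsigned long addr, pmd_t *pmdp)
{
	struct mmu_gather tlb;
	pmd_t pmd;

	tlb_gather_mmu(&tlb, mm);
	pmd = READ_ONCE(*pmdp);		/* remember the old entry */
	pmd_clear(pmdp);		/* unhook the page table */
	tlb_flush_pte_range(&tlb, addr, HPAGE_PMD_SIZE);
	pte_free_tlb(&tlb, pmd_pgtable(pmd), addr);	/* queue the free */
	tlb_finish_mmu(&tlb);		/* flush TLBs, then free */
}

Because the free is batched in the mmu_gather and only carried out by
tlb_finish_mmu(), it comes with the delay that a plain pte_free() lacks,
which is the guarantee the changelog above relies on.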
Link: https://lkml.kernel.org/r/20221123165652.2204925-4-jannh@xxxxxxxxxx
Fixes: ba76149f47d8 ("thp: khugepaged")
Signed-off-by: Jann Horn <jannh@xxxxxxxxxx>
Cc: David Hildenbrand <david@xxxxxxxxxx>
Cc: John Hubbard <jhubbard@xxxxxxxxxx>
Cc: Mike Kravetz <mike.kravetz@xxxxxxxxxx>
Cc: Peter Xu <peterx@xxxxxxxxxx>
Cc: Yang Shi <shy828301@xxxxxxxxx>
Cc: <stable@xxxxxxxxxxxxxxx>
Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
---

 mm/khugepaged.c |   47 +++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 37 insertions(+), 10 deletions(-)

--- a/mm/khugepaged.c~mm-khugepaged-fix-gup-fast-interaction-by-freeing-ptes-via-mmu_gather
+++ a/mm/khugepaged.c
@@ -975,6 +975,8 @@ static int collapse_huge_page(struct mm_
 	int result = SCAN_FAIL;
 	struct vm_area_struct *vma;
 	struct mmu_notifier_range range;
+	struct mmu_gather tlb;
+	pgtable_t deposit_table = NULL;
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
@@ -989,6 +991,11 @@ static int collapse_huge_page(struct mm_
 	result = alloc_charge_hpage(&hpage, mm, cc);
 	if (result != SCAN_SUCCEED)
 		goto out_nolock;
+	deposit_table = pte_alloc_one(mm);
+	if (!deposit_table) {
+		result = SCAN_FAIL;
+		goto out_nolock;
+	}
 
 	mmap_read_lock(mm);
 	result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
@@ -1041,12 +1048,12 @@ static int collapse_huge_page(struct mm_
 
 	pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
 	/*
-	 * This removes any huge TLB entry from the CPU so we won't allow
-	 * huge and small TLB entries for the same virtual address to
-	 * avoid the risk of CPU bugs in that area.
-	 *
-	 * Parallel fast GUP is fine since fast GUP will back off when
-	 * it detects PMD is changed.
+	 * Unlink the page table from the PMD and do a TLB flush.
+	 * This ensures that the CPUs can't write to the old pages anymore by
+	 * the time __collapse_huge_page_copy() copies their contents, and it
+	 * allows __collapse_huge_page_copy() to free the old pages.
+	 * This also prevents lockless_pages_from_mm() from grabbing references
+	 * on the old pages from here on.
 	 */
 	_pmd = pmdp_collapse_flush(vma, address, pmd);
 	spin_unlock(pmd_ptl);
@@ -1090,6 +1097,16 @@ static int collapse_huge_page(struct mm_
 	__SetPageUptodate(hpage);
 	pgtable = pmd_pgtable(_pmd);
 
+	/*
+	 * Discard the old page table.
+	 * The TLB flush that's implied here is redundant, but hard to avoid
+	 * with the current API.
+	 */
+	tlb_gather_mmu(&tlb, mm);
+	tlb_flush_pte_range(&tlb, address, HPAGE_PMD_SIZE);
+	pte_free_tlb(&tlb, pgtable, address);
+	tlb_finish_mmu(&tlb);
+
 	_pmd = mk_huge_pmd(hpage, vma->vm_page_prot);
 	_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
 
@@ -1097,7 +1114,8 @@ static int collapse_huge_page(struct mm_
 	BUG_ON(!pmd_none(*pmd));
 	page_add_new_anon_rmap(hpage, vma, address);
 	lru_cache_add_inactive_or_unevictable(hpage, vma);
-	pgtable_trans_huge_deposit(mm, pmd, pgtable);
+	pgtable_trans_huge_deposit(mm, pmd, deposit_table);
+	deposit_table = NULL;
 	set_pmd_at(mm, address, pmd, _pmd);
 	update_mmu_cache_pmd(vma, address, pmd);
 	spin_unlock(pmd_ptl);
@@ -1112,6 +1130,8 @@ out_nolock:
 		mem_cgroup_uncharge(page_folio(hpage));
 		put_page(hpage);
 	}
+	if (deposit_table)
+		pte_free(mm, deposit_table);
 	trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
 	return result;
 }
@@ -1393,11 +1413,14 @@ static int set_huge_pmd(struct vm_area_s
  * The mmap lock together with this VMA's rmap locks covers all paths towards
  * the page table entries we're messing with here, except for hardware page
  * table walks and lockless_pages_from_mm().
+ *
+ * This function is similar to free_pte_range().
  */
 static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
 				  unsigned long addr, pmd_t *pmdp)
 {
 	pmd_t pmd;
+	struct mmu_gather tlb;
 
 	mmap_assert_write_locked(mm);
 	if (vma->vm_file)
@@ -1408,11 +1431,15 @@ static void collapse_and_free_pmd(struct
 	 */
 	if (vma->anon_vma)
 		lockdep_assert_held_write(&vma->anon_vma->root->rwsem);
+	page_table_check_pte_clear_range(mm, addr, pmd);
 
-	pmd = pmdp_collapse_flush(vma, addr, pmdp);
+	tlb_gather_mmu(&tlb, mm);
+	pmd = READ_ONCE(*pmdp);
+	pmd_clear(pmdp);
+	tlb_flush_pte_range(&tlb, addr, HPAGE_PMD_SIZE);
+	pte_free_tlb(&tlb, pmd_pgtable(pmd), addr);
+	tlb_finish_mmu(&tlb);
 	mm_dec_nr_ptes(mm);
-	page_table_check_pte_clear_range(mm, addr, pmd);
-	pte_free(mm, pmd_pgtable(pmd));
 }
 
 /**
_

Patches currently in -mm which might be from jannh@xxxxxxxxxx are

mm-khugepaged-take-the-right-locks-for-page-table-retraction.patch
mmu_gather-use-macro-arguments-more-carefully.patch
mm-khugepaged-fix-gup-fast-interaction-by-freeing-ptes-via-mmu_gather.patch
mm-khugepaged-invoke-mmu-notifiers-in-shmem-file-collapse-paths.patch