On Mon, 7 Jun 2021, Aneesh Kumar K.V wrote:

> CPU 1                          CPU 2                                   CPU 3
>
> mremap(old_addr, new_addr)     page_shrinker/try_to_unmap_one
>
> mmap_write_lock_killable()
>
>                                addr = old_addr
>                                lock(pte_ptl)
> lock(pmd_ptl)
> pmd = *old_pmd
> pmd_clear(old_pmd)
> flush_tlb_range(old_addr)
>
> *new_pmd = pmd
>                                                                        *new_addr = 10; and fills
>                                                                        TLB with new addr
>                                                                        and old pfn
>
> unlock(pmd_ptl)
>                                ptep_clear_flush()
>                                old pfn is free.
>                                                                        Stale TLB entry
>
> Fix this race by holding pmd lock in pageout. This still doesn't handle the race
> between MOVE_PUD and pageout.
>
> Fixes: 2c91bd4a4e2e ("mm: speed up mremap by 20x on large regions")
> Link: https://lore.kernel.org/linux-mm/CAHk-=wgXVR04eBNtxQfevontWnP6FDm+oj5vauQXP3S-huwbPw@xxxxxxxxxxxxxx
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxx>

This seems very wrong to me, to require another level of locking in
the rmap lookup, just to fix some new pagetable games in mremap.

But Linus asked "Am I missing something?": neither of you have mentioned
mremap's take_rmap_locks(), so I hope that already meets your need.  And
if it needs to be called more often than before (see "need_rmap_locks"),
that's probably okay.

Hugh

> ---
>  include/linux/rmap.h |  9 ++++++---
>  mm/page_vma_mapped.c | 36 ++++++++++++++++++------------------
>  2 files changed, 24 insertions(+), 21 deletions(-)
>
> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> index def5c62c93b3..272ab0c2b60b 100644
> --- a/include/linux/rmap.h
> +++ b/include/linux/rmap.h
> @@ -207,7 +207,8 @@ struct page_vma_mapped_walk {
>  	unsigned long address;
>  	pmd_t *pmd;
>  	pte_t *pte;
> -	spinlock_t *ptl;
> +	spinlock_t *pte_ptl;
> +	spinlock_t *pmd_ptl;
>  	unsigned int flags;
>  };
>
> @@ -216,8 +217,10 @@ static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
>  	/* HugeTLB pte is set to the relevant page table entry without pte_mapped. */
>  	if (pvmw->pte && !PageHuge(pvmw->page))
>  		pte_unmap(pvmw->pte);
> -	if (pvmw->ptl)
> -		spin_unlock(pvmw->ptl);
> +	if (pvmw->pte_ptl)
> +		spin_unlock(pvmw->pte_ptl);
> +	if (pvmw->pmd_ptl)
> +		spin_unlock(pvmw->pmd_ptl);
>  }
>
>  bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
> diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
> index 2cf01d933f13..87a2c94c7e27 100644
> --- a/mm/page_vma_mapped.c
> +++ b/mm/page_vma_mapped.c
> @@ -47,8 +47,10 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw)
>  			return false;
>  		}
>  	}
> -	pvmw->ptl = pte_lockptr(pvmw->vma->vm_mm, pvmw->pmd);
> -	spin_lock(pvmw->ptl);
> +	if (USE_SPLIT_PTE_PTLOCKS) {
> +		pvmw->pte_ptl = pte_lockptr(pvmw->vma->vm_mm, pvmw->pmd);
> +		spin_lock(pvmw->pte_ptl);
> +	}
>  	return true;
>  }
>
> @@ -162,8 +164,8 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>  		if (!pvmw->pte)
>  			return false;
>
> -		pvmw->ptl = huge_pte_lockptr(page_hstate(page), mm, pvmw->pte);
> -		spin_lock(pvmw->ptl);
> +		pvmw->pte_ptl = huge_pte_lockptr(page_hstate(page), mm, pvmw->pte);
> +		spin_lock(pvmw->pte_ptl);
>  		if (!check_pte(pvmw))
>  			return not_found(pvmw);
>  		return true;
> @@ -179,6 +181,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>  	if (!pud_present(*pud))
>  		return false;
>  	pvmw->pmd = pmd_offset(pud, pvmw->address);
> +	pvmw->pmd_ptl = pmd_lock(mm, pvmw->pmd);
>  	/*
>  	 * Make sure the pmd value isn't cached in a register by the
>  	 * compiler and used as a stale value after we've observed a
> @@ -186,7 +189,6 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>  	 */
>  	pmde = READ_ONCE(*pvmw->pmd);
>  	if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) {
> -		pvmw->ptl = pmd_lock(mm, pvmw->pmd);
>  		if (likely(pmd_trans_huge(*pvmw->pmd))) {
>  			if (pvmw->flags & PVMW_MIGRATION)
>  				return not_found(pvmw);
> @@ -206,14 +208,10 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>  			}
>  		}
>  		return not_found(pvmw);
> -	} else {
> -		/* THP pmd was split under us: handle on pte level */
> -		spin_unlock(pvmw->ptl);
> -		pvmw->ptl = NULL;
>  	}
> -	} else if (!pmd_present(pmde)) {
> -		return false;
> -	}
> +	} else if (!pmd_present(pmde))
> +		return not_found(pvmw);
> +
>  	if (!map_pte(pvmw))
>  		goto next_pte;
>  	while (1) {
> @@ -233,19 +231,21 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>  			/* Did we cross page table boundary? */
>  			if (pvmw->address % PMD_SIZE == 0) {
>  				pte_unmap(pvmw->pte);
> -				if (pvmw->ptl) {
> -					spin_unlock(pvmw->ptl);
> -					pvmw->ptl = NULL;
> +				if (pvmw->pte_ptl) {
> +					spin_unlock(pvmw->pte_ptl);
> +					pvmw->pte_ptl = NULL;
>  				}
> +				spin_unlock(pvmw->pmd_ptl);
> +				pvmw->pmd_ptl = NULL;
>  				goto restart;
>  			} else {
>  				pvmw->pte++;
>  			}
>  		} while (pte_none(*pvmw->pte));
>
> -		if (!pvmw->ptl) {
> -			pvmw->ptl = pte_lockptr(mm, pvmw->pmd);
> -			spin_lock(pvmw->ptl);
> +		if (USE_SPLIT_PTE_PTLOCKS && !pvmw->pte_ptl) {
> +			pvmw->pte_ptl = pte_lockptr(mm, pvmw->pmd);
> +			spin_lock(pvmw->pte_ptl);
>  		}
>  	}
> }
> --
> 2.31.1
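P.S. for anyone reading along who doesn't have mm/mremap.c paged in: a
minimal sketch of the pieces referred to above, paraphrased from memory
rather than copied from the tree (and move_pgt_entry_sketch() below is
only an illustrative stand-in for the real move_pgt_entry()), so check
the source for the exact details:

	#include <linux/fs.h>	/* i_mmap_lock_write() */
	#include <linux/mm.h>
	#include <linux/rmap.h>	/* anon_vma_lock_write() */

	/* Serialize against rmap walkers (try_to_unmap_one() and friends). */
	static void take_rmap_locks(struct vm_area_struct *vma)
	{
		if (vma->vm_file)
			i_mmap_lock_write(vma->vm_file->f_mapping);
		if (vma->anon_vma)
			anon_vma_lock_write(vma->anon_vma);
	}

	static void drop_rmap_locks(struct vm_area_struct *vma)
	{
		if (vma->anon_vma)
			anon_vma_unlock_write(vma->anon_vma);
		if (vma->vm_file)
			i_mmap_unlock_write(vma->vm_file->f_mapping);
	}

	/*
	 * Illustrative stand-in for move_pgt_entry(): the PMD/PUD move is
	 * bracketed by the rmap locks, but today only when need_rmap_locks
	 * is true (copy_vma() sets it, roughly, when the new vma may be
	 * visited before the old one in rmap traversal order).
	 */
	static bool move_pgt_entry_sketch(struct vm_area_struct *vma,
					  bool need_rmap_locks)
	{
		bool moved = false;

		if (need_rmap_locks)
			take_rmap_locks(vma);

		/* ... clear the old PMD/PUD, flush TLB, install the new entry ... */

		if (need_rmap_locks)
			drop_rmap_locks(vma);

		return moved;
	}

Taking those locks unconditionally around the PMD/PUD moves is the
"called more often than before" case mentioned above, and it keeps the
serialization on mremap's side rather than adding a lock level to the
rmap walk.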