On Thu, Jul 25, 2024 at 08:39:54PM +0200, David Hildenbrand wrote:
> pte_lockptr() is the only *_lockptr() function that doesn't consume
> what would be expected: it consumes a pmd_t pointer instead of a pte_t
> pointer.
>
> Let's change that. The two callers in pgtable-generic.c are easily
> adjusted. Adjust khugepaged.c:retract_page_tables() to simply do a
> pte_offset_map_nolock() to obtain the lock, even though we won't actually
> be traversing the page table.
>
> This makes the code more similar to the other variants and avoids other
> hacks to make the new pte_lockptr() version happy. pte_lockptr() users
> reside now only in pgtable-generic.c.
>
> Maybe, using pte_offset_map_nolock() is the right thing to do because
> the PTE table could have been removed in the meantime? At least it sounds
> more future proof if we ever have other means of page table reclaim.

I think it can't change, because anyone who wants to race against this
should try to take the pmd lock first (which was held already)?

I wonder if an open-coded "ptlock_ptr(page_ptdesc(pmd_page(*pmd)))" would
be nicer here, but only if my understanding is correct.

Thanks,

>
> It's not quite clear if holding the PTE table lock is really required:
> what if someone else obtains the lock just after we unlock it? But we'll
> leave that as is for now, maybe there are good reasons.
>
> This is a preparation for adapting hugetlb page table locking logic to
> take the same locks as core-mm page table walkers would.
>
> Signed-off-by: David Hildenbrand <david@xxxxxxxxxx>
> ---
>  include/linux/mm.h   |  7 ++++---
>  mm/khugepaged.c      | 21 +++++++++++++++------
>  mm/pgtable-generic.c |  4 ++--
>  3 files changed, 21 insertions(+), 11 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 2c6ccf088c7be..0472a5090b180 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2873,9 +2873,10 @@ static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
>  }
>  #endif /* ALLOC_SPLIT_PTLOCKS */
>
> -static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
> +static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pte_t *pte)
>  {
> -        return ptlock_ptr(page_ptdesc(pmd_page(*pmd)));
> +        /* PTE page tables don't currently exceed a single page. */
> +        return ptlock_ptr(virt_to_ptdesc(pte));
>  }
>
>  static inline bool ptlock_init(struct ptdesc *ptdesc)
> @@ -2898,7 +2899,7 @@ static inline bool ptlock_init(struct ptdesc *ptdesc)
>  /*
>   * We use mm->page_table_lock to guard all pagetable pages of the mm.
>   */
> -static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
> +static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pte_t *pte)
>  {
>          return &mm->page_table_lock;
>  }
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index cdd1d8655a76b..f3b3db1046155 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1697,12 +1697,13 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>          i_mmap_lock_read(mapping);
>          vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
>                  struct mmu_notifier_range range;
> +                bool retracted = false;
>                  struct mm_struct *mm;
>                  unsigned long addr;
>                  pmd_t *pmd, pgt_pmd;
>                  spinlock_t *pml;
>                  spinlock_t *ptl;
> -                bool skipped_uffd = false;
> +                pte_t *pte;
>
>                  /*
>                   * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
> @@ -1739,9 +1740,17 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>                  mmu_notifier_invalidate_range_start(&range);
>
>                  pml = pmd_lock(mm, pmd);
> -                ptl = pte_lockptr(mm, pmd);
> +
> +                /*
> +                 * No need to check the PTE table content, but we'll grab the
> +                 * PTE table lock while we zap it.
> +                 */
> +                pte = pte_offset_map_nolock(mm, pmd, addr, &ptl);
> +                if (!pte)
> +                        goto unlock_pmd;
>                  if (ptl != pml)
>                          spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
> +                pte_unmap(pte);
>
>                  /*
>                   * Huge page lock is still held, so normally the page table
> @@ -1752,20 +1761,20 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>                   * repeating the anon_vma check protects from one category,
>                   * and repeating the userfaultfd_wp() check from another.
>                   */
> -                if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) {
> -                        skipped_uffd = true;
> -                } else {
> +                if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) {
>                          pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
>                          pmdp_get_lockless_sync();
> +                        retracted = true;
>                  }
>
>                  if (ptl != pml)
>                          spin_unlock(ptl);
> +unlock_pmd:
>                  spin_unlock(pml);
>
>                  mmu_notifier_invalidate_range_end(&range);
>
> -                if (!skipped_uffd) {
> +                if (retracted) {
>                          mm_dec_nr_ptes(mm);
>                          page_table_check_pte_clear_range(mm, addr, pgt_pmd);
>                          pte_free_defer(mm, pmd_pgtable(pgt_pmd));
> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
> index a78a4adf711ac..13a7705df3f87 100644
> --- a/mm/pgtable-generic.c
> +++ b/mm/pgtable-generic.c
> @@ -313,7 +313,7 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
>
>          pte = __pte_offset_map(pmd, addr, &pmdval);
>          if (likely(pte))
> -                *ptlp = pte_lockptr(mm, &pmdval);
> +                *ptlp = pte_lockptr(mm, pte);
>          return pte;
>  }
>
> @@ -371,7 +371,7 @@ pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
>          pte = __pte_offset_map(pmd, addr, &pmdval);
>          if (unlikely(!pte))
>                  return pte;
> -        ptl = pte_lockptr(mm, &pmdval);
> +        ptl = pte_lockptr(mm, pte);
>          spin_lock(ptl);
>          if (likely(pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
>                  *ptlp = ptl;
> --
> 2.45.2
>

--
Peter Xu
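
For reference, an illustrative sketch (not code from the patch) of the
open-coded alternative suggested above for retract_page_tables(). It assumes
the pmd lock is taken first, as in the existing code, so the PTE table
referenced by *pmd cannot be freed while its split ptlock is looked up:

        pml = pmd_lock(mm, pmd);
        /*
         * Open-coded equivalent of the old pte_lockptr(mm, pmd); safe only
         * because pml is held, which keeps the PTE table in place.
         */
        ptl = ptlock_ptr(page_ptdesc(pmd_page(*pmd)));
        if (ptl != pml)
                spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);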