On 25.07.2024 20:39, David Hildenbrand wrote: > pte_lockptr() is the only *_lockptr() function that doesn't consume > what would be expected: it consumes a pmd_t pointer instead of a pte_t > pointer. > > Let's change that. The two callers in pgtable-generic.c are easily > adjusted. Adjust khugepaged.c:retract_page_tables() to simply do a > pte_offset_map_nolock() to obtain the lock, even though we won't actually > be traversing the page table. > > This makes the code more similar to the other variants and avoids other > hacks to make the new pte_lockptr() version happy. pte_lockptr() users > reside now only in pgtable-generic.c. > > Maybe, using pte_offset_map_nolock() is the right thing to do because > the PTE table could have been removed in the meantime? At least it sounds > more future proof if we ever have other means of page table reclaim. > > It's not quite clear if holding the PTE table lock is really required: > what if someone else obtains the lock just after we unlock it? But we'll > leave that as is for now, maybe there are good reasons. > > This is a preparation for adapting hugetlb page table locking logic to > take the same locks as core-mm page table walkers would. > > Signed-off-by: David Hildenbrand <david@xxxxxxxxxx> This patch landed in today's linux-next as commit e98970a1d2d4 ("mm: let pte_lockptr() consume a pte_t pointer"). Unfortunately it causes the following issue on most of my ARM 32bit based test boards: Run /sbin/init as init process 8<--- cut here --- Unable to handle kernel NULL pointer dereference at virtual address 00000010 when read [00000010] *pgd=00000000 Internal error: Oops: 5 [#1] PREEMPT SMP ARM Modules linked in: CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.10.0-12930-ge98970a1d2d4 #15137 Hardware name: Samsung Exynos (Flattened Device Tree) PC is at __lock_acquire+0x20/0x1604 LR is at lock_acquire+0x21c/0x394 pc : [<c01990e4>] lr : [<c019b2e8>] psr: 20000093 sp : f082dc80 ip : 00000001 fp : c1d20000 r10: 00000000 r9 : c13096c8 r8 : 00000001 r7 : 00000000 r6 : c1d20000 r5 : c1278594 r4 : c1278594 r3 : 00000000 r2 : 00000000 r1 : 00000000 r0 : 00000010 Flags: nzCv IRQs off FIQs on Mode SVC_32 ISA ARM Segment none Control: 10c5387d Table: 4000404a DAC: 00000051 ... Process swapper/0 (pid: 1, stack limit = 0x(ptrval)) Stack: (0xf082dc80 to 0xf082e000) ... Call trace: __lock_acquire from lock_acquire+0x21c/0x394 lock_acquire from _raw_spin_lock+0x3c/0x4c _raw_spin_lock from __pte_offset_map_lock+0x64/0x118 __pte_offset_map_lock from handle_mm_fault+0xee0/0x13c8 handle_mm_fault from __get_user_pages+0x18c/0x848 __get_user_pages from get_user_pages_remote+0xdc/0x474 get_user_pages_remote from get_arg_page+0x90/0x2a4 get_arg_page from copy_string_kernel+0x14c/0x184 copy_string_kernel from kernel_execve+0x94/0x174 kernel_execve from try_to_run_init_process+0xc/0x3c try_to_run_init_process from kernel_init+0xcc/0x12c kernel_init from ret_from_fork+0x14/0x28 Exception stack(0xf082dfb0 to 0xf082dff8) ... ---[ end trace 0000000000000000 ]--- note: swapper/0[1] exited with irqs disabled note: swapper/0[1] exited with preempt_count 1 Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b ---[ end Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b ]--- Reverting $subject on top of next-20240730 fixes/hides this issue. > --- > include/linux/mm.h | 7 ++++--- > mm/khugepaged.c | 21 +++++++++++++++------ > mm/pgtable-generic.c | 4 ++-- > 3 files changed, 21 insertions(+), 11 deletions(-) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 2c6ccf088c7be..0472a5090b180 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -2873,9 +2873,10 @@ static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc) > } > #endif /* ALLOC_SPLIT_PTLOCKS */ > > -static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) > +static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pte_t *pte) > { > - return ptlock_ptr(page_ptdesc(pmd_page(*pmd))); > + /* PTE page tables don't currently exceed a single page. */ > + return ptlock_ptr(virt_to_ptdesc(pte)); > } > > static inline bool ptlock_init(struct ptdesc *ptdesc) > @@ -2898,7 +2899,7 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) > /* > * We use mm->page_table_lock to guard all pagetable pages of the mm. > */ > -static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) > +static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pte_t *pte) > { > return &mm->page_table_lock; > } > diff --git a/mm/khugepaged.c b/mm/khugepaged.c > index cdd1d8655a76b..f3b3db1046155 100644 > --- a/mm/khugepaged.c > +++ b/mm/khugepaged.c > @@ -1697,12 +1697,13 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) > i_mmap_lock_read(mapping); > vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { > struct mmu_notifier_range range; > + bool retracted = false; > struct mm_struct *mm; > unsigned long addr; > pmd_t *pmd, pgt_pmd; > spinlock_t *pml; > spinlock_t *ptl; > - bool skipped_uffd = false; > + pte_t *pte; > > /* > * Check vma->anon_vma to exclude MAP_PRIVATE mappings that > @@ -1739,9 +1740,17 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) > mmu_notifier_invalidate_range_start(&range); > > pml = pmd_lock(mm, pmd); > - ptl = pte_lockptr(mm, pmd); > + > + /* > + * No need to check the PTE table content, but we'll grab the > + * PTE table lock while we zap it. > + */ > + pte = pte_offset_map_nolock(mm, pmd, addr, &ptl); > + if (!pte) > + goto unlock_pmd; > if (ptl != pml) > spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); > + pte_unmap(pte); > > /* > * Huge page lock is still held, so normally the page table > @@ -1752,20 +1761,20 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) > * repeating the anon_vma check protects from one category, > * and repeating the userfaultfd_wp() check from another. > */ > - if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) { > - skipped_uffd = true; > - } else { > + if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) { > pgt_pmd = pmdp_collapse_flush(vma, addr, pmd); > pmdp_get_lockless_sync(); > + retracted = true; > } > > if (ptl != pml) > spin_unlock(ptl); > +unlock_pmd: > spin_unlock(pml); > > mmu_notifier_invalidate_range_end(&range); > > - if (!skipped_uffd) { > + if (retracted) { > mm_dec_nr_ptes(mm); > page_table_check_pte_clear_range(mm, addr, pgt_pmd); > pte_free_defer(mm, pmd_pgtable(pgt_pmd)); > diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c > index a78a4adf711ac..13a7705df3f87 100644 > --- a/mm/pgtable-generic.c > +++ b/mm/pgtable-generic.c > @@ -313,7 +313,7 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd, > > pte = __pte_offset_map(pmd, addr, &pmdval); > if (likely(pte)) > - *ptlp = pte_lockptr(mm, &pmdval); > + *ptlp = pte_lockptr(mm, pte); > return pte; > } > > @@ -371,7 +371,7 @@ pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, > pte = __pte_offset_map(pmd, addr, &pmdval); > if (unlikely(!pte)) > return pte; > - ptl = pte_lockptr(mm, &pmdval); > + ptl = pte_lockptr(mm, pte); > spin_lock(ptl); > if (likely(pmd_same(pmdval, pmdp_get_lockless(pmd)))) { > *ptlp = ptl; Best regards -- Marek Szyprowski, PhD Samsung R&D Institute Poland