On Mon 17-07-17 15:39:51, Zi Yan wrote: > From: Zi Yan <zi.yan@xxxxxxxxxxxxxx> > > If one of callers of page migration starts to handle thp, > memory management code start to see pmd migration entry, so we need > to prepare for it before enabling. This patch changes various code > point which checks the status of given pmds in order to prevent race > between thp migration and the pmd-related works. I am sorry to nitpick on the changelog but the patch is scary large and it would deserve much better description. What are those "various code point" and how do you "prevent race". How can we double check that none of them were missed? > ChangeLog v1 -> v2: > - introduce pmd_related() (I know the naming is not good, but can't > think up no better name. Any suggesntion is welcomed.) > > Signed-off-by: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx> > > ChangeLog v2 -> v3: > - add is_swap_pmd() > - a pmd entry should be pmd pointing to pte pages, is_swap_pmd(), > pmd_trans_huge(), pmd_devmap(), or pmd_none() > - pmd_none_or_trans_huge_or_clear_bad() and pmd_trans_unstable() return > true on pmd_migration_entry, so that migration entries are not > treated as pmd page table entries. > > ChangeLog v4 -> v5: > - add explanation in pmd_none_or_trans_huge_or_clear_bad() to state > the equivalence of !pmd_present() and is_pmd_migration_entry() > - fix migration entry wait deadlock code (from v1) in follow_page_mask() > - remove unnecessary code (from v1) in follow_trans_huge_pmd() > - use is_swap_pmd() instead of !pmd_present() for pmd migration entry, > so it will not be confused with pmd_none() > - change author information > > ChangeLog v5 -> v7 > - use macro to disable the code when thp migration is not enabled > > ChangeLog v7 -> v8 > - remove not used code in do_huge_pmd_wp_page() > - copy the comment from change_pte_range() on downgrading > write migration entry to read to change_huge_pmd() > > ChangeLog v8 -> v9 > - fix VM_BUG_ON()s that trigger false alarms. > > Signed-off-by: Zi Yan <zi.yan@xxxxxxxxxxxxxx> > Cc: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx> > --- > fs/proc/task_mmu.c | 32 +++++++++++++-------- > include/asm-generic/pgtable.h | 18 +++++++++++- > include/linux/huge_mm.h | 14 ++++++++-- > mm/gup.c | 22 +++++++++++++-- > mm/huge_memory.c | 65 +++++++++++++++++++++++++++++++++++++++---- > mm/memcontrol.c | 5 ++++ > mm/memory.c | 12 ++++++-- > mm/mprotect.c | 4 +-- > mm/mremap.c | 2 +- > 9 files changed, 147 insertions(+), 27 deletions(-) > > diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c > index b836fd61ed87..0f17a7cccb41 100644 > --- a/fs/proc/task_mmu.c > +++ b/fs/proc/task_mmu.c > @@ -596,7 +596,8 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, > > ptl = pmd_trans_huge_lock(pmd, vma); > if (ptl) { > - smaps_pmd_entry(pmd, addr, walk); > + if (pmd_present(*pmd)) > + smaps_pmd_entry(pmd, addr, walk); > spin_unlock(ptl); > return 0; > } > @@ -938,6 +939,9 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, > goto out; > } > > + if (!pmd_present(*pmd)) > + goto out; > + > page = pmd_page(*pmd); > > /* Clear accessed and referenced bits. */ > @@ -1217,27 +1221,33 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, > if (ptl) { > u64 flags = 0, frame = 0; > pmd_t pmd = *pmdp; > + struct page *page = NULL; > > if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd)) > flags |= PM_SOFT_DIRTY; > > - /* > - * Currently pmd for thp is always present because thp > - * can not be swapped-out, migrated, or HWPOISONed > - * (split in such cases instead.) > - * This if-check is just to prepare for future implementation. > - */ > if (pmd_present(pmd)) { > - struct page *page = pmd_page(pmd); > - > - if (page_mapcount(page) == 1) > - flags |= PM_MMAP_EXCLUSIVE; > + page = pmd_page(pmd); > > flags |= PM_PRESENT; > if (pm->show_pfn) > frame = pmd_pfn(pmd) + > ((addr & ~PMD_MASK) >> PAGE_SHIFT); > } > +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION > + else if (is_swap_pmd(pmd)) { > + swp_entry_t entry = pmd_to_swp_entry(pmd); > + > + frame = swp_type(entry) | > + (swp_offset(entry) << MAX_SWAPFILES_SHIFT); > + flags |= PM_SWAP; > + VM_BUG_ON(!is_pmd_migration_entry(pmd)); > + page = migration_entry_to_page(entry); > + } > +#endif > + > + if (page && page_mapcount(page) == 1) > + flags |= PM_MMAP_EXCLUSIVE; > > for (; addr != end; addr += PAGE_SIZE) { > pagemap_entry_t pme = make_pme(frame, flags); > diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h > index 7dfa767dc680..8937d51c2834 100644 > --- a/include/asm-generic/pgtable.h > +++ b/include/asm-generic/pgtable.h > @@ -834,7 +834,23 @@ static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) > #ifdef CONFIG_TRANSPARENT_HUGEPAGE > barrier(); > #endif > - if (pmd_none(pmdval) || pmd_trans_huge(pmdval)) > + /* > + * !pmd_present() checks for pmd migration entries > + * > + * The complete check uses is_pmd_migration_entry() in linux/swapops.h > + * But using that requires moving current function and pmd_trans_unstable() > + * to linux/swapops.h to resovle dependency, which is too much code move. > + * > + * !pmd_present() is equivalent to is_pmd_migration_entry() currently, > + * because !pmd_present() pages can only be under migration not swapped > + * out. > + * > + * pmd_none() is preseved for future condition checks on pmd migration > + * entries and not confusing with this function name, although it is > + * redundant with !pmd_present(). > + */ > + if (pmd_none(pmdval) || pmd_trans_huge(pmdval) || > + (IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION) && !pmd_present(pmdval))) > return 1; > if (unlikely(pmd_bad(pmdval))) { > pmd_clear_bad(pmd); > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h > index d8f35a0865dc..14bc21c2ee7f 100644 > --- a/include/linux/huge_mm.h > +++ b/include/linux/huge_mm.h > @@ -147,7 +147,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, > #define split_huge_pmd(__vma, __pmd, __address) \ > do { \ > pmd_t *____pmd = (__pmd); \ > - if (pmd_trans_huge(*____pmd) \ > + if (is_swap_pmd(*____pmd) || pmd_trans_huge(*____pmd) \ > || pmd_devmap(*____pmd)) \ > __split_huge_pmd(__vma, __pmd, __address, \ > false, NULL); \ > @@ -178,12 +178,18 @@ extern spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, > struct vm_area_struct *vma); > extern spinlock_t *__pud_trans_huge_lock(pud_t *pud, > struct vm_area_struct *vma); > + > +static inline int is_swap_pmd(pmd_t pmd) > +{ > + return !pmd_none(pmd) && !pmd_present(pmd); > +} > + > /* mmap_sem must be held on entry */ > static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, > struct vm_area_struct *vma) > { > VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma); > - if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) > + if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) > return __pmd_trans_huge_lock(pmd, vma); > else > return NULL; > @@ -299,6 +305,10 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, > long adjust_next) > { > } > +static inline int is_swap_pmd(pmd_t pmd) > +{ > + return 0; > +} > static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, > struct vm_area_struct *vma) > { > diff --git a/mm/gup.c b/mm/gup.c > index 23f01c40c88f..d81dd886d5d9 100644 > --- a/mm/gup.c > +++ b/mm/gup.c > @@ -234,6 +234,16 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, > return page; > return no_page_table(vma, flags); > } > +retry: > + if (!pmd_present(*pmd)) { > + if (likely(!(flags & FOLL_MIGRATION))) > + return no_page_table(vma, flags); > + VM_BUG_ON(thp_migration_supported() && > + !is_pmd_migration_entry(*pmd)); > + if (is_pmd_migration_entry(*pmd)) > + pmd_migration_entry_wait(mm, pmd); > + goto retry; > + } > if (pmd_devmap(*pmd)) { > ptl = pmd_lock(mm, pmd); > page = follow_devmap_pmd(vma, address, pmd, flags); > @@ -247,7 +257,15 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, > if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) > return no_page_table(vma, flags); > > +retry_locked: > ptl = pmd_lock(mm, pmd); > + if (unlikely(!pmd_present(*pmd))) { > + spin_unlock(ptl); > + if (likely(!(flags & FOLL_MIGRATION))) > + return no_page_table(vma, flags); > + pmd_migration_entry_wait(mm, pmd); > + goto retry_locked; > + } > if (unlikely(!pmd_trans_huge(*pmd))) { > spin_unlock(ptl); > return follow_page_pte(vma, address, pmd, flags); > @@ -424,7 +442,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, > pud = pud_offset(p4d, address); > BUG_ON(pud_none(*pud)); > pmd = pmd_offset(pud, address); > - if (pmd_none(*pmd)) > + if (!pmd_present(*pmd)) > return -EFAULT; > VM_BUG_ON(pmd_trans_huge(*pmd)); > pte = pte_offset_map(pmd, address); > @@ -1534,7 +1552,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, > pmd_t pmd = READ_ONCE(*pmdp); > > next = pmd_addr_end(addr, end); > - if (pmd_none(pmd)) > + if (!pmd_present(pmd)) > return 0; > > if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) { > diff --git a/mm/huge_memory.c b/mm/huge_memory.c > index 9668f8cb8317..dc7830e4993f 100644 > --- a/mm/huge_memory.c > +++ b/mm/huge_memory.c > @@ -914,6 +914,23 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, > > ret = -EAGAIN; > pmd = *src_pmd; > + > +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION > + if (unlikely(is_swap_pmd(pmd))) { > + swp_entry_t entry = pmd_to_swp_entry(pmd); > + > + VM_BUG_ON(!is_pmd_migration_entry(pmd)); > + if (is_write_migration_entry(entry)) { > + make_migration_entry_read(&entry); > + pmd = swp_entry_to_pmd(entry); > + set_pmd_at(src_mm, addr, src_pmd, pmd); > + } > + set_pmd_at(dst_mm, addr, dst_pmd, pmd); > + ret = 0; > + goto out_unlock; > + } > +#endif > + > if (unlikely(!pmd_trans_huge(pmd))) { > pte_free(dst_mm, pgtable); > goto out_unlock; > @@ -1556,6 +1573,12 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, > if (is_huge_zero_pmd(orig_pmd)) > goto out; > > + if (unlikely(!pmd_present(orig_pmd))) { > + VM_BUG_ON(thp_migration_supported() && > + !is_pmd_migration_entry(orig_pmd)); > + goto out; > + } > + > page = pmd_page(orig_pmd); > /* > * If other processes are mapping this page, we couldn't discard > @@ -1767,6 +1790,25 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, > preserve_write = prot_numa && pmd_write(*pmd); > ret = 1; > > +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION > + if (is_swap_pmd(*pmd)) { > + swp_entry_t entry = pmd_to_swp_entry(*pmd); > + > + VM_BUG_ON(!is_pmd_migration_entry(*pmd)); > + if (is_write_migration_entry(entry)) { > + pmd_t newpmd; > + /* > + * A protection check is difficult so > + * just be safe and disable write > + */ > + make_migration_entry_read(&entry); > + newpmd = swp_entry_to_pmd(entry); > + set_pmd_at(mm, addr, pmd, newpmd); > + } > + goto unlock; > + } > +#endif > + > /* > * Avoid trapping faults against the zero page. The read-only > * data is likely to be read-cached on the local CPU and > @@ -1832,7 +1874,8 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) > { > spinlock_t *ptl; > ptl = pmd_lock(vma->vm_mm, pmd); > - if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd))) > + if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || > + pmd_devmap(*pmd))) > return ptl; > spin_unlock(ptl); > return NULL; > @@ -1950,14 +1993,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, > struct page *page; > pgtable_t pgtable; > pmd_t _pmd; > - bool young, write, dirty, soft_dirty; > + bool young, write, dirty, soft_dirty, pmd_migration = false; > unsigned long addr; > int i; > > VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); > VM_BUG_ON_VMA(vma->vm_start > haddr, vma); > VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); > - VM_BUG_ON(!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)); > + VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd) > + && !pmd_devmap(*pmd)); > > count_vm_event(THP_SPLIT_PMD); > > @@ -1982,7 +2026,16 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, > return __split_huge_zero_page_pmd(vma, haddr, pmd); > } > > - page = pmd_page(*pmd); > +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION > + pmd_migration = is_pmd_migration_entry(*pmd); > + if (pmd_migration) { > + swp_entry_t entry; > + > + entry = pmd_to_swp_entry(*pmd); > + page = pfn_to_page(swp_offset(entry)); > + } else > +#endif > + page = pmd_page(*pmd); > VM_BUG_ON_PAGE(!page_count(page), page); > page_ref_add(page, HPAGE_PMD_NR - 1); > write = pmd_write(*pmd); > @@ -2001,7 +2054,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, > * transferred to avoid any possibility of altering > * permissions across VMAs. > */ > - if (freeze) { > + if (freeze || pmd_migration) { > swp_entry_t swp_entry; > swp_entry = make_migration_entry(page + i, write); > entry = swp_entry_to_pte(swp_entry); > @@ -2100,7 +2153,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, > page = pmd_page(*pmd); > if (PageMlocked(page)) > clear_page_mlock(page); > - } else if (!pmd_devmap(*pmd)) > + } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd))) > goto out; > __split_huge_pmd_locked(vma, pmd, haddr, freeze); > out: > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index 544d47e5cbbd..8d9e9c13fe4f 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -4638,6 +4638,11 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, > struct page *page = NULL; > enum mc_target_type ret = MC_TARGET_NONE; > > + if (unlikely(is_swap_pmd(pmd))) { > + VM_BUG_ON(thp_migration_supported() && > + !is_pmd_migration_entry(pmd)); > + return ret; > + } > page = pmd_page(pmd); > VM_BUG_ON_PAGE(!page || !PageHead(page), page); > if (!(mc.flags & MOVE_ANON)) > diff --git a/mm/memory.c b/mm/memory.c > index 0e517be91a89..000d54dc1c68 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -1036,7 +1036,8 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src > src_pmd = pmd_offset(src_pud, addr); > do { > next = pmd_addr_end(addr, end); > - if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) { > + if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd) > + || pmd_devmap(*src_pmd)) { > int err; > VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma); > err = copy_huge_pmd(dst_mm, src_mm, > @@ -1296,7 +1297,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, > pmd = pmd_offset(pud, addr); > do { > next = pmd_addr_end(addr, end); > - if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { > + if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { > if (next - addr != HPAGE_PMD_SIZE) { > VM_BUG_ON_VMA(vma_is_anonymous(vma) && > !rwsem_is_locked(&tlb->mm->mmap_sem), vma); > @@ -3804,6 +3805,13 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, > pmd_t orig_pmd = *vmf.pmd; > > barrier(); > + if (unlikely(is_swap_pmd(orig_pmd))) { > + VM_BUG_ON(thp_migration_supported() && > + !is_pmd_migration_entry(orig_pmd)); > + if (is_pmd_migration_entry(orig_pmd)) > + pmd_migration_entry_wait(mm, vmf.pmd); > + return 0; > + } > if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { > if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) > return do_huge_pmd_numa_page(&vmf, orig_pmd); > diff --git a/mm/mprotect.c b/mm/mprotect.c > index 1a8c9ca83e48..d60a1eedcc54 100644 > --- a/mm/mprotect.c > +++ b/mm/mprotect.c > @@ -148,7 +148,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, > unsigned long this_pages; > > next = pmd_addr_end(addr, end); > - if (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd) > + if (!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd) && !pmd_devmap(*pmd) > && pmd_none_or_clear_bad(pmd)) > continue; > > @@ -158,7 +158,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, > mmu_notifier_invalidate_range_start(mm, mni_start, end); > } > > - if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { > + if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { > if (next - addr != HPAGE_PMD_SIZE) { > __split_huge_pmd(vma, pmd, addr, false, NULL); > } else { > diff --git a/mm/mremap.c b/mm/mremap.c > index cd8a1b199ef9..1c49b9fb994a 100644 > --- a/mm/mremap.c > +++ b/mm/mremap.c > @@ -222,7 +222,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, > new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); > if (!new_pmd) > break; > - if (pmd_trans_huge(*old_pmd)) { > + if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd)) { > if (extent == HPAGE_PMD_SIZE) { > bool moved; > /* See comment in move_ptes() */ > -- > 2.11.0 -- Michal Hocko SUSE Labs -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>