# sorry for late reply ... On Fri, Nov 18, 2016 at 02:56:24AM +0300, Kirill A. Shutemov wrote: > On Tue, Nov 08, 2016 at 08:31:52AM +0900, Naoya Horiguchi wrote: > > If one of callers of page migration starts to handle thp, memory management code > > start to see pmd migration entry, so we need to prepare for it before enabling. > > This patch changes various code point which checks the status of given pmds in > > order to prevent race between thp migration and the pmd-related works. > > > > Signed-off-by: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx> > > --- > > ChangeLog v1 -> v2: > > - introduce pmd_related() (I know the naming is not good, but can't think up > > no better name. Any suggesntion is welcomed.) > > --- > > arch/x86/mm/gup.c | 4 +-- > > fs/proc/task_mmu.c | 23 +++++++------ > > include/linux/huge_mm.h | 9 ++++- > > mm/gup.c | 10 ++++-- > > mm/huge_memory.c | 88 ++++++++++++++++++++++++++++++++++++++++--------- > > mm/madvise.c | 2 +- > > mm/memcontrol.c | 2 ++ > > mm/memory.c | 6 +++- > > mm/mprotect.c | 2 ++ > > mm/mremap.c | 2 +- > > 10 files changed, 114 insertions(+), 34 deletions(-) > > > > diff --git v4.9-rc2-mmotm-2016-10-27-18-27/arch/x86/mm/gup.c v4.9-rc2-mmotm-2016-10-27-18-27_patched/arch/x86/mm/gup.c > > index 0d4fb3e..78a153d 100644 > > --- v4.9-rc2-mmotm-2016-10-27-18-27/arch/x86/mm/gup.c > > +++ v4.9-rc2-mmotm-2016-10-27-18-27_patched/arch/x86/mm/gup.c > > @@ -222,9 +222,9 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, > > pmd_t pmd = *pmdp; > > > > next = pmd_addr_end(addr, end); > > - if (pmd_none(pmd)) > > + if (!pmd_present(pmd)) > > return 0; > > - if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) { > > + if (unlikely(pmd_large(pmd))) { > > /* > > * NUMA hinting faults need to be handled in the GUP > > * slowpath for accounting purposes and so that they > > diff --git v4.9-rc2-mmotm-2016-10-27-18-27/fs/proc/task_mmu.c v4.9-rc2-mmotm-2016-10-27-18-27_patched/fs/proc/task_mmu.c > > index 35b92d8..c1f9cf4 100644 > > --- v4.9-rc2-mmotm-2016-10-27-18-27/fs/proc/task_mmu.c > > +++ v4.9-rc2-mmotm-2016-10-27-18-27_patched/fs/proc/task_mmu.c > > @@ -596,7 +596,8 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, > > > > ptl = pmd_trans_huge_lock(pmd, vma); > > if (ptl) { > > - smaps_pmd_entry(pmd, addr, walk); > > + if (pmd_present(*pmd)) > > + smaps_pmd_entry(pmd, addr, walk); > > spin_unlock(ptl); > > return 0; > > } > > @@ -929,6 +930,9 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, > > goto out; > > } > > > > + if (!pmd_present(*pmd)) > > + goto out; > > + > > Hm. Looks like clear_soft_dirty_pmd() should handle !present. It doesn't. > > Ah.. Found it in 08/12. > > > page = pmd_page(*pmd); > > > > /* Clear accessed and referenced bits. */ > > @@ -1208,19 +1212,18 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, > > if (ptl) { > > u64 flags = 0, frame = 0; > > pmd_t pmd = *pmdp; > > + struct page *page; > > > > if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd)) > > flags |= PM_SOFT_DIRTY; > > > > - /* > > - * Currently pmd for thp is always present because thp > > - * can not be swapped-out, migrated, or HWPOISONed > > - * (split in such cases instead.) > > - * This if-check is just to prepare for future implementation. > > - */ > > - if (pmd_present(pmd)) { > > - struct page *page = pmd_page(pmd); > > - > > + if (is_pmd_migration_entry(pmd)) { > > + swp_entry_t entry = pmd_to_swp_entry(pmd); > > + frame = swp_type(entry) | > > + (swp_offset(entry) << MAX_SWAPFILES_SHIFT); > > + page = migration_entry_to_page(entry); > > + } else if (pmd_present(pmd)) { > > + page = pmd_page(pmd); > > if (page_mapcount(page) == 1) > > flags |= PM_MMAP_EXCLUSIVE; > > > > diff --git v4.9-rc2-mmotm-2016-10-27-18-27/include/linux/huge_mm.h v4.9-rc2-mmotm-2016-10-27-18-27_patched/include/linux/huge_mm.h > > index fcbca51..3c252cd 100644 > > --- v4.9-rc2-mmotm-2016-10-27-18-27/include/linux/huge_mm.h > > +++ v4.9-rc2-mmotm-2016-10-27-18-27_patched/include/linux/huge_mm.h > > @@ -125,12 +125,19 @@ extern void vma_adjust_trans_huge(struct vm_area_struct *vma, > > long adjust_next); > > extern spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, > > struct vm_area_struct *vma); > > + > > +static inline int pmd_related(pmd_t pmd) > > +{ > > + return !pmd_none(pmd) && > > + (!pmd_present(pmd) || pmd_trans_huge(pmd) || pmd_devmap(pmd)); > > +} > > + > > I would rather create is_swap_pmd() -- (!none && !present) and leave the > reset open-codded. OK, I do this. > > > /* mmap_sem must be held on entry */ > > static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, > > struct vm_area_struct *vma) > > { > > VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma); > > - if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) > > + if (pmd_related(*pmd)) > > return __pmd_trans_huge_lock(pmd, vma); > > else > > return NULL; > > diff --git v4.9-rc2-mmotm-2016-10-27-18-27/mm/gup.c v4.9-rc2-mmotm-2016-10-27-18-27_patched/mm/gup.c > > index e50178c..2dc4978 100644 > > --- v4.9-rc2-mmotm-2016-10-27-18-27/mm/gup.c > > +++ v4.9-rc2-mmotm-2016-10-27-18-27_patched/mm/gup.c > > @@ -267,6 +267,8 @@ struct page *follow_page_mask(struct vm_area_struct *vma, > > } > > if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) > > return no_page_table(vma, flags); > > + if (!pmd_present(*pmd)) > > + return no_page_table(vma, flags); > > Don't we need FOLL_MIGRATION, like on pte side? That's better, applied. > > if (pmd_devmap(*pmd)) { > > ptl = pmd_lock(mm, pmd); > > page = follow_devmap_pmd(vma, address, pmd, flags); > > @@ -278,6 +280,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma, > > return follow_page_pte(vma, address, pmd, flags); > > > > ptl = pmd_lock(mm, pmd); > > + if (unlikely(!pmd_present(*pmd))) { > > + spin_unlock(ptl); > > + return no_page_table(vma, flags); > > + } > > Ditto. > > > if (unlikely(!pmd_trans_huge(*pmd))) { > > spin_unlock(ptl); > > return follow_page_pte(vma, address, pmd, flags); > > @@ -333,7 +339,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, > > pud = pud_offset(pgd, address); > > BUG_ON(pud_none(*pud)); > > pmd = pmd_offset(pud, address); > > - if (pmd_none(*pmd)) > > + if (!pmd_present(*pmd)) > > return -EFAULT; > > VM_BUG_ON(pmd_trans_huge(*pmd)); > > pte = pte_offset_map(pmd, address); > > @@ -1357,7 +1363,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, > > pmd_t pmd = READ_ONCE(*pmdp); > > > > next = pmd_addr_end(addr, end); > > - if (pmd_none(pmd)) > > + if (!pmd_present(pmd)) > > return 0; > > > > if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) { > > diff --git v4.9-rc2-mmotm-2016-10-27-18-27/mm/huge_memory.c v4.9-rc2-mmotm-2016-10-27-18-27_patched/mm/huge_memory.c > > index b3022b3..4e9090c 100644 > > --- v4.9-rc2-mmotm-2016-10-27-18-27/mm/huge_memory.c > > +++ v4.9-rc2-mmotm-2016-10-27-18-27_patched/mm/huge_memory.c > > @@ -825,6 +825,20 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, > > > > ret = -EAGAIN; > > pmd = *src_pmd; > > + > > + if (unlikely(is_pmd_migration_entry(pmd))) { > > + swp_entry_t entry = pmd_to_swp_entry(pmd); > > + > > + if (is_write_migration_entry(entry)) { > > + make_migration_entry_read(&entry); > > + pmd = swp_entry_to_pmd(entry); > > + set_pmd_at(src_mm, addr, src_pmd, pmd); > > + } > > I think we should put at least WARN_ONCE() in 'else' here. We don't want > to miss such places when swap will be supported (or other swap entry type). I guess that you mean 'else' branch of outer if-block, right? > > + set_pmd_at(dst_mm, addr, dst_pmd, pmd); > > + ret = 0; > > + goto out_unlock; > > + } Maybe inserting the below here seems OK. WARN_ONCE(!pmd_present(pmd), "Unknown non-present format on pmd.\n"); > > + > > if (unlikely(!pmd_trans_huge(pmd))) { > > pte_free(dst_mm, pgtable); > > goto out_unlock; > > @@ -1013,6 +1027,9 @@ int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd) > > if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) > > goto out_unlock; > > > > + if (unlikely(!pmd_present(orig_pmd))) > > + goto out_unlock; > > + > > page = pmd_page(orig_pmd); > > VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page); > > /* > > @@ -1137,7 +1154,14 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, > > if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) > > goto out; > > > > - page = pmd_page(*pmd); > > + if (is_pmd_migration_entry(*pmd)) { > > + swp_entry_t entry; > > + entry = pmd_to_swp_entry(*pmd); > > + page = pfn_to_page(swp_offset(entry)); > > + if (!is_migration_entry(entry)) > > + goto out; > > follow_page_pte() does different thing: wait for page to be migrated and > retry. Any reason you don't do the same? No, just my self-check wasn't enough. > > + } else > > + page = pmd_page(*pmd); > > VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page); > > if (flags & FOLL_TOUCH) > > touch_pmd(vma, addr, pmd); > > @@ -1332,6 +1356,9 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, > > if (is_huge_zero_pmd(orig_pmd)) > > goto out; > > > > + if (unlikely(!pmd_present(orig_pmd))) > > + goto out; > > + > > page = pmd_page(orig_pmd); > > /* > > * If other processes are mapping this page, we couldn't discard > > @@ -1410,20 +1437,35 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, > > tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); > > } else { > > struct page *page = pmd_page(orig_pmd); > > - page_remove_rmap(page, true); > > - VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); > > - VM_BUG_ON_PAGE(!PageHead(page), page); > > - if (PageAnon(page)) { > > - pgtable_t pgtable; > > - pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); > > - pte_free(tlb->mm, pgtable); > > - atomic_long_dec(&tlb->mm->nr_ptes); > > - add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); > > + int migration = 0; > > + > > + if (!is_pmd_migration_entry(orig_pmd)) { > > + page_remove_rmap(page, true); > > + VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); > > + VM_BUG_ON_PAGE(!PageHead(page), page); > > + if (PageAnon(page)) { > > + pgtable_t pgtable; > > + pgtable = pgtable_trans_huge_withdraw(tlb->mm, > > + pmd); > > + pte_free(tlb->mm, pgtable); > > + atomic_long_dec(&tlb->mm->nr_ptes); > > + add_mm_counter(tlb->mm, MM_ANONPAGES, > > + -HPAGE_PMD_NR); > > + } else { > > + add_mm_counter(tlb->mm, MM_FILEPAGES, > > + -HPAGE_PMD_NR); > > + } > > } else { > > - add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR); > > + swp_entry_t entry; > > + > > + entry = pmd_to_swp_entry(orig_pmd); > > + free_swap_and_cache(entry); /* waring in failure? */ > > + add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); > > + migration = 1; > > } > > spin_unlock(ptl); > > - tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE); > > + if (!migration) > > + tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE); > > } > > return 1; > > } > > @@ -1496,14 +1538,27 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, > > bool preserve_write = prot_numa && pmd_write(*pmd); > > ret = 1; > > > > + if (!pmd_present(*pmd)) > > + goto unlock; > > /* > > * Avoid trapping faults against the zero page. The read-only > > * data is likely to be read-cached on the local CPU and > > * local/remote hits to the zero page are not interesting. > > */ > > - if (prot_numa && is_huge_zero_pmd(*pmd)) { > > - spin_unlock(ptl); > > - return ret; > > + if (prot_numa && is_huge_zero_pmd(*pmd)) > > + goto unlock; > > + > > + if (is_pmd_migration_entry(*pmd)) { > > Hm? But we filtered out !present above? the !present check must come after this if-block with WARN_ONCE(). > > + swp_entry_t entry = pmd_to_swp_entry(*pmd); > > + > > + if (is_write_migration_entry(entry)) { > > + pmd_t newpmd; > > + > > + make_migration_entry_read(&entry); > > + newpmd = swp_entry_to_pmd(entry); > > + set_pmd_at(mm, addr, pmd, newpmd); > > + } > > + goto unlock; > > } > > > > if (!prot_numa || !pmd_protnone(*pmd)) { > > @@ -1516,6 +1571,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, > > BUG_ON(vma_is_anonymous(vma) && !preserve_write && > > pmd_write(entry)); > > } > > +unlock: > > spin_unlock(ptl); > > } > > > > @@ -1532,7 +1588,7 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) > > { > > spinlock_t *ptl; > > ptl = pmd_lock(vma->vm_mm, pmd); > > - if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd))) > > + if (likely(pmd_related(*pmd))) > > return ptl; > > spin_unlock(ptl); > > return NULL; > > diff --git v4.9-rc2-mmotm-2016-10-27-18-27/mm/madvise.c v4.9-rc2-mmotm-2016-10-27-18-27_patched/mm/madvise.c > > index 0e3828e..eaa2b02 100644 > > --- v4.9-rc2-mmotm-2016-10-27-18-27/mm/madvise.c > > +++ v4.9-rc2-mmotm-2016-10-27-18-27_patched/mm/madvise.c > > @@ -274,7 +274,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, > > unsigned long next; > > > > next = pmd_addr_end(addr, end); > > - if (pmd_trans_huge(*pmd)) > > + if (pmd_related(*pmd)) > > I don't see a point going for madvise_free_huge_pmd(), just to fall off on > !present inside. Sorry, this code was wrong. I should've done like below simply: + if (!pmd_present(*pmd)) + return 0; if (pmd_trans_huge(*pmd)) ... > And is it safe for devmap? I'm not sure, so let's keep it as-is. Thanks, Naoya Horiguchi > > > if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next)) > > goto next; > > > > diff --git v4.9-rc2-mmotm-2016-10-27-18-27/mm/memcontrol.c v4.9-rc2-mmotm-2016-10-27-18-27_patched/mm/memcontrol.c > > index 91dfc7c..ebc2c42 100644 > > --- v4.9-rc2-mmotm-2016-10-27-18-27/mm/memcontrol.c > > +++ v4.9-rc2-mmotm-2016-10-27-18-27_patched/mm/memcontrol.c > > @@ -4635,6 +4635,8 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, > > struct page *page = NULL; > > enum mc_target_type ret = MC_TARGET_NONE; > > > > + if (unlikely(!pmd_present(pmd))) > > + return ret; > > page = pmd_page(pmd); > > VM_BUG_ON_PAGE(!page || !PageHead(page), page); > > if (!(mc.flags & MOVE_ANON)) > > diff --git v4.9-rc2-mmotm-2016-10-27-18-27/mm/memory.c v4.9-rc2-mmotm-2016-10-27-18-27_patched/mm/memory.c > > index 94b5e2c..33fa439 100644 > > --- v4.9-rc2-mmotm-2016-10-27-18-27/mm/memory.c > > +++ v4.9-rc2-mmotm-2016-10-27-18-27_patched/mm/memory.c > > @@ -999,7 +999,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src > > src_pmd = pmd_offset(src_pud, addr); > > do { > > next = pmd_addr_end(addr, end); > > - if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) { > > + if (pmd_related(*src_pmd)) { > > int err; > > VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); > > err = copy_huge_pmd(dst_mm, src_mm, > > @@ -3591,6 +3591,10 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, > > int ret; > > > > barrier(); > > + if (unlikely(is_pmd_migration_entry(orig_pmd))) { > > + pmd_migration_entry_wait(mm, fe.pmd); > > + return 0; > > + } > > if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { > > if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) > > return do_huge_pmd_numa_page(&fe, orig_pmd); > > diff --git v4.9-rc2-mmotm-2016-10-27-18-27/mm/mprotect.c v4.9-rc2-mmotm-2016-10-27-18-27_patched/mm/mprotect.c > > index c5ba2aa..81186e3 100644 > > --- v4.9-rc2-mmotm-2016-10-27-18-27/mm/mprotect.c > > +++ v4.9-rc2-mmotm-2016-10-27-18-27_patched/mm/mprotect.c > > @@ -164,6 +164,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, > > unsigned long this_pages; > > > > next = pmd_addr_end(addr, end); > > + if (!pmd_present(*pmd)) > > + continue; > > if (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd) > > && pmd_none_or_clear_bad(pmd)) > > continue; > > diff --git v4.9-rc2-mmotm-2016-10-27-18-27/mm/mremap.c v4.9-rc2-mmotm-2016-10-27-18-27_patched/mm/mremap.c > > index da22ad2..a94a698 100644 > > --- v4.9-rc2-mmotm-2016-10-27-18-27/mm/mremap.c > > +++ v4.9-rc2-mmotm-2016-10-27-18-27_patched/mm/mremap.c > > @@ -194,7 +194,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, > > new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); > > if (!new_pmd) > > break; > > - if (pmd_trans_huge(*old_pmd)) { > > + if (pmd_related(*old_pmd)) { > > if (extent == HPAGE_PMD_SIZE) { > > bool moved; > > /* See comment in move_ptes() */ > > -- > > 2.7.0 > > > > -- > > To unsubscribe, send a message with 'unsubscribe linux-mm' in > > the body to majordomo@xxxxxxxxx. For more info on Linux MM, > > see: http://www.linux-mm.org/ . > > Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a> > > -- > Kirill A. Shutemov > -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href