From: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx> If one of callers of page migration starts to handle thp, memory management code start to see pmd migration entry, so we need to prepare for it before enabling. This patch changes various code point which checks the status of given pmds in order to prevent race between thp migration and the pmd-related works. ChangeLog v1 -> v2: - introduce pmd_related() (I know the naming is not good, but can't think up no better name. Any suggesntion is welcomed.) Signed-off-by: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx> ChangeLog v2 -> v3: - add is_swap_pmd() - a pmd entry should be is_swap_pmd(), pmd_trans_huge(), pmd_devmap(), or pmd_none() - use pmdp_huge_clear_flush() instead of pmdp_huge_get_and_clear() - flush_cache_range() while set_pmd_migration_entry() - pmd_none_or_trans_huge_or_clear_bad() and pmd_trans_unstable() return true on pmd_migration_entry, so that migration entries are not treated as pmd page table entries. Signed-off-by: Zi Yan <zi.yan@xxxxxxxxxxxxxx> --- arch/x86/mm/gup.c | 4 +-- fs/proc/task_mmu.c | 22 ++++++++----- include/asm-generic/pgtable.h | 71 ---------------------------------------- include/linux/huge_mm.h | 21 ++++++++++-- include/linux/swapops.h | 74 +++++++++++++++++++++++++++++++++++++++++ mm/gup.c | 20 ++++++++++-- mm/huge_memory.c | 76 ++++++++++++++++++++++++++++++++++++------- mm/madvise.c | 2 ++ mm/memcontrol.c | 2 ++ mm/memory.c | 9 +++-- mm/memory_hotplug.c | 13 +++++++- mm/mempolicy.c | 1 + mm/mprotect.c | 6 ++-- mm/mremap.c | 2 +- mm/pagewalk.c | 2 ++ 15 files changed, 221 insertions(+), 104 deletions(-) diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 0d4fb3ebbbac..78a153d90064 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -222,9 +222,9 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, pmd_t pmd = *pmdp; next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) + if (!pmd_present(pmd)) return 0; - if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) { + if (unlikely(pmd_large(pmd))) { /* * NUMA hinting faults need to be handled in the GUP * slowpath for accounting purposes and so that they diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6c07c7813b26..1e64d6898c68 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -596,7 +596,8 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { - smaps_pmd_entry(pmd, addr, walk); + if (pmd_present(*pmd)) + smaps_pmd_entry(pmd, addr, walk); spin_unlock(ptl); return 0; } @@ -929,6 +930,9 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, goto out; } + if (!pmd_present(*pmd)) + goto out; + page = pmd_page(*pmd); /* Clear accessed and referenced bits. */ @@ -1208,19 +1212,19 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, if (ptl) { u64 flags = 0, frame = 0; pmd_t pmd = *pmdp; + struct page *page; if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd)) flags |= PM_SOFT_DIRTY; - /* - * Currently pmd for thp is always present because thp - * can not be swapped-out, migrated, or HWPOISONed - * (split in such cases instead.) - * This if-check is just to prepare for future implementation. - */ - if (pmd_present(pmd)) { - struct page *page = pmd_page(pmd); + if (is_pmd_migration_entry(pmd)) { + swp_entry_t entry = pmd_to_swp_entry(pmd); + frame = swp_type(entry) | + (swp_offset(entry) << MAX_SWAPFILES_SHIFT); + page = migration_entry_to_page(entry); + } else if (pmd_present(pmd)) { + page = pmd_page(pmd); if (page_mapcount(page) == 1) flags |= PM_MMAP_EXCLUSIVE; diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index b71a431ed649..6cf9e9b5a7be 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -726,77 +726,6 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp) #ifndef arch_needs_pgtable_deposit #define arch_needs_pgtable_deposit() (false) #endif -/* - * This function is meant to be used by sites walking pagetables with - * the mmap_sem hold in read mode to protect against MADV_DONTNEED and - * transhuge page faults. MADV_DONTNEED can convert a transhuge pmd - * into a null pmd and the transhuge page fault can convert a null pmd - * into an hugepmd or into a regular pmd (if the hugepage allocation - * fails). While holding the mmap_sem in read mode the pmd becomes - * stable and stops changing under us only if it's not null and not a - * transhuge pmd. When those races occurs and this function makes a - * difference vs the standard pmd_none_or_clear_bad, the result is - * undefined so behaving like if the pmd was none is safe (because it - * can return none anyway). The compiler level barrier() is critically - * important to compute the two checks atomically on the same pmdval. - * - * For 32bit kernels with a 64bit large pmd_t this automatically takes - * care of reading the pmd atomically to avoid SMP race conditions - * against pmd_populate() when the mmap_sem is hold for reading by the - * caller (a special atomic read not done by "gcc" as in the generic - * version above, is also needed when THP is disabled because the page - * fault can populate the pmd from under us). - */ -static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) -{ - pmd_t pmdval = pmd_read_atomic(pmd); - /* - * The barrier will stabilize the pmdval in a register or on - * the stack so that it will stop changing under the code. - * - * When CONFIG_TRANSPARENT_HUGEPAGE=y on x86 32bit PAE, - * pmd_read_atomic is allowed to return a not atomic pmdval - * (for example pointing to an hugepage that has never been - * mapped in the pmd). The below checks will only care about - * the low part of the pmd with 32bit PAE x86 anyway, with the - * exception of pmd_none(). So the important thing is that if - * the low part of the pmd is found null, the high part will - * be also null or the pmd_none() check below would be - * confused. - */ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - barrier(); -#endif - if (pmd_none(pmdval) || pmd_trans_huge(pmdval)) - return 1; - if (unlikely(pmd_bad(pmdval))) { - pmd_clear_bad(pmd); - return 1; - } - return 0; -} - -/* - * This is a noop if Transparent Hugepage Support is not built into - * the kernel. Otherwise it is equivalent to - * pmd_none_or_trans_huge_or_clear_bad(), and shall only be called in - * places that already verified the pmd is not none and they want to - * walk ptes while holding the mmap sem in read mode (write mode don't - * need this). If THP is not enabled, the pmd can't go away under the - * code even if MADV_DONTNEED runs, but if THP is enabled we need to - * run a pmd_trans_unstable before walking the ptes after - * split_huge_page_pmd returns (because it may have run when the pmd - * become null, but then a page fault can map in a THP and not a - * regular page). - */ -static inline int pmd_trans_unstable(pmd_t *pmd) -{ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - return pmd_none_or_trans_huge_or_clear_bad(pmd); -#else - return 0; -#endif -} #ifndef CONFIG_NUMA_BALANCING /* diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 83a8d42f9d55..c2e5a4eab84a 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -131,7 +131,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, #define split_huge_pmd(__vma, __pmd, __address) \ do { \ pmd_t *____pmd = (__pmd); \ - if (pmd_trans_huge(*____pmd) \ + if (is_swap_pmd(*____pmd) || pmd_trans_huge(*____pmd) \ || pmd_devmap(*____pmd)) \ __split_huge_pmd(__vma, __pmd, __address, \ false, NULL); \ @@ -162,12 +162,18 @@ extern spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma); extern spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma); + +static inline int is_swap_pmd(pmd_t pmd) +{ + return !pmd_none(pmd) && !pmd_present(pmd); +} + /* mmap_sem must be held on entry */ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma); - if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) + if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) return __pmd_trans_huge_lock(pmd, vma); else return NULL; @@ -192,6 +198,12 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, int flags); struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, int flags); +static inline int hpage_order(struct page *page) +{ + if (unlikely(PageTransHuge(page))) + return HPAGE_PMD_ORDER; + return 0; +} extern int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd); @@ -232,6 +244,7 @@ static inline bool thp_migration_supported(void) #define HPAGE_PUD_SIZE ({ BUILD_BUG(); 0; }) #define hpage_nr_pages(x) 1 +#define hpage_order(x) 0 #define transparent_hugepage_enabled(__vma) 0 @@ -274,6 +287,10 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, long adjust_next) { } +static inline int is_swap_pmd(pmd_t pmd) +{ + return 0; +} static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 6625bea13869..50e4aa7e7ff9 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -229,6 +229,80 @@ static inline int is_pmd_migration_entry(pmd_t pmd) } #endif +/* + * This function is meant to be used by sites walking pagetables with + * the mmap_sem hold in read mode to protect against MADV_DONTNEED and + * transhuge page faults. MADV_DONTNEED can convert a transhuge pmd + * into a null pmd and the transhuge page fault can convert a null pmd + * into an hugepmd or into a regular pmd (if the hugepage allocation + * fails). While holding the mmap_sem in read mode the pmd becomes + * stable and stops changing under us only if it's not null and not a + * transhuge pmd. When those races occurs and this function makes a + * difference vs the standard pmd_none_or_clear_bad, the result is + * undefined so behaving like if the pmd was none is safe (because it + * can return none anyway). The compiler level barrier() is critically + * important to compute the two checks atomically on the same pmdval. + * + * For 32bit kernels with a 64bit large pmd_t this automatically takes + * care of reading the pmd atomically to avoid SMP race conditions + * against pmd_populate() when the mmap_sem is hold for reading by the + * caller (a special atomic read not done by "gcc" as in the generic + * version above, is also needed when THP is disabled because the page + * fault can populate the pmd from under us). + */ +static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) +{ + pmd_t pmdval = pmd_read_atomic(pmd); + /* + * The barrier will stabilize the pmdval in a register or on + * the stack so that it will stop changing under the code. + * + * When CONFIG_TRANSPARENT_HUGEPAGE=y on x86 32bit PAE, + * pmd_read_atomic is allowed to return a not atomic pmdval + * (for example pointing to an hugepage that has never been + * mapped in the pmd). The below checks will only care about + * the low part of the pmd with 32bit PAE x86 anyway, with the + * exception of pmd_none(). So the important thing is that if + * the low part of the pmd is found null, the high part will + * be also null or the pmd_none() check below would be + * confused. + */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + barrier(); +#endif + if (pmd_none(pmdval) || pmd_trans_huge(pmdval) + || is_pmd_migration_entry(pmdval)) + return 1; + if (unlikely(pmd_bad(pmdval))) { + pmd_clear_bad(pmd); + return 1; + } + return 0; +} + +/* + * This is a noop if Transparent Hugepage Support is not built into + * the kernel. Otherwise it is equivalent to + * pmd_none_or_trans_huge_or_clear_bad(), and shall only be called in + * places that already verified the pmd is not none and they want to + * walk ptes while holding the mmap sem in read mode (write mode don't + * need this). If THP is not enabled, the pmd can't go away under the + * code even if MADV_DONTNEED runs, but if THP is enabled we need to + * run a pmd_trans_unstable before walking the ptes after + * split_huge_page_pmd returns (because it may have run when the pmd + * become null, but then a page fault can map in a THP and not a + * regular page). + */ +static inline int pmd_trans_unstable(pmd_t *pmd) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + return pmd_none_or_trans_huge_or_clear_bad(pmd); +#else + return 0; +#endif +} + + #ifdef CONFIG_MEMORY_FAILURE extern atomic_long_t num_poisoned_pages __read_mostly; diff --git a/mm/gup.c b/mm/gup.c index 1e67461b2733..82e0304e5d29 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -274,6 +274,13 @@ struct page *follow_page_mask(struct vm_area_struct *vma, } if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) return no_page_table(vma, flags); + if (!pmd_present(*pmd)) { +retry: + if (likely(!(flags & FOLL_MIGRATION))) + return no_page_table(vma, flags); + pmd_migration_entry_wait(mm, pmd); + goto retry; + } if (pmd_devmap(*pmd)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags); @@ -285,6 +292,15 @@ struct page *follow_page_mask(struct vm_area_struct *vma, return follow_page_pte(vma, address, pmd, flags); ptl = pmd_lock(mm, pmd); + if (unlikely(!pmd_present(*pmd))) { +retry_locked: + if (likely(!(flags & FOLL_MIGRATION))) { + spin_unlock(ptl); + return no_page_table(vma, flags); + } + pmd_migration_entry_wait(mm, pmd); + goto retry_locked; + } if (unlikely(!pmd_trans_huge(*pmd))) { spin_unlock(ptl); return follow_page_pte(vma, address, pmd, flags); @@ -340,7 +356,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, pud = pud_offset(pgd, address); BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) + if (!pmd_present(*pmd)) return -EFAULT; VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map(pmd, address); @@ -1368,7 +1384,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, pmd_t pmd = READ_ONCE(*pmdp); next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) + if (!pmd_present(pmd)) return 0; if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index fd54bbdc16cf..4ac923539372 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -897,6 +897,21 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, ret = -EAGAIN; pmd = *src_pmd; + + if (unlikely(is_pmd_migration_entry(pmd))) { + swp_entry_t entry = pmd_to_swp_entry(pmd); + + if (is_write_migration_entry(entry)) { + make_migration_entry_read(&entry); + pmd = swp_entry_to_pmd(entry); + set_pmd_at(src_mm, addr, src_pmd, pmd); + } + set_pmd_at(dst_mm, addr, dst_pmd, pmd); + ret = 0; + goto out_unlock; + } + WARN_ONCE(!pmd_present(pmd), "Uknown non-present format on pmd.\n"); + if (unlikely(!pmd_trans_huge(pmd))) { pte_free(dst_mm, pgtable); goto out_unlock; @@ -1203,6 +1218,9 @@ int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) goto out_unlock; + if (unlikely(!pmd_present(orig_pmd))) + goto out_unlock; + page = pmd_page(orig_pmd); VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page); /* @@ -1337,7 +1355,15 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) goto out; - page = pmd_page(*pmd); + if (is_pmd_migration_entry(*pmd)) { + swp_entry_t entry; + + entry = pmd_to_swp_entry(*pmd); + page = pfn_to_page(swp_offset(entry)); + if (!is_migration_entry(entry)) + goto out; + } else + page = pmd_page(*pmd); VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page); if (flags & FOLL_TOUCH) touch_pmd(vma, addr, pmd); @@ -1533,6 +1559,9 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (is_huge_zero_pmd(orig_pmd)) goto out; + if (unlikely(!pmd_present(orig_pmd))) + goto out; + page = pmd_page(orig_pmd); /* * If other processes are mapping this page, we couldn't discard @@ -1659,7 +1688,8 @@ int __zap_huge_pmd_locked(struct mmu_gather *tlb, struct vm_area_struct *vma, free_swap_and_cache(entry); /* waring in failure? */ migration = 1; } - tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE); + if (!migration) + tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE); } return 1; @@ -1775,10 +1805,22 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, * data is likely to be read-cached on the local CPU and * local/remote hits to the zero page are not interesting. */ - if (prot_numa && is_huge_zero_pmd(*pmd)) { - spin_unlock(ptl); - return ret; - } + if (prot_numa && is_huge_zero_pmd(*pmd)) + goto unlock; + + if (is_pmd_migration_entry(*pmd)) { + swp_entry_t entry = pmd_to_swp_entry(*pmd); + + if (is_write_migration_entry(entry)) { + pmd_t newpmd; + + make_migration_entry_read(&entry); + newpmd = swp_entry_to_pmd(entry); + set_pmd_at(mm, addr, pmd, newpmd); + } + goto unlock; + } else if (!pmd_present(*pmd)) + WARN_ONCE(1, "Uknown non-present format on pmd.\n"); if (!prot_numa || !pmd_protnone(*pmd)) { entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd); @@ -1790,6 +1832,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry)); } +unlock: spin_unlock(ptl); } @@ -1806,7 +1849,8 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { spinlock_t *ptl; ptl = pmd_lock(vma->vm_mm, pmd); - if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd))) + if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || + pmd_devmap(*pmd))) return ptl; spin_unlock(ptl); return NULL; @@ -1924,7 +1968,7 @@ void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, struct page *page; pgtable_t pgtable; pmd_t _pmd; - bool young, write, dirty, soft_dirty; + bool young, write, dirty, soft_dirty, pmd_migration; unsigned long addr; int i; unsigned long haddr = address & HPAGE_PMD_MASK; @@ -1932,7 +1976,8 @@ void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); VM_BUG_ON_VMA(vma->vm_start > haddr, vma); VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); - VM_BUG_ON(!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)); + VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd) + && !pmd_devmap(*pmd)); count_vm_event(THP_SPLIT_PMD); @@ -1960,7 +2005,14 @@ void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, goto out; } - page = pmd_page(*pmd); + pmd_migration = is_pmd_migration_entry(*pmd); + if (pmd_migration) { + swp_entry_t entry; + + entry = pmd_to_swp_entry(*pmd); + page = pfn_to_page(swp_offset(entry)); + } else + page = pmd_page(*pmd); VM_BUG_ON_PAGE(!page_count(page), page); page_ref_add(page, HPAGE_PMD_NR - 1); write = pmd_write(*pmd); @@ -1979,7 +2031,7 @@ void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, * transferred to avoid any possibility of altering * permissions across VMAs. */ - if (freeze) { + if (freeze || pmd_migration) { swp_entry_t swp_entry; swp_entry = make_migration_entry(page + i, write); entry = swp_entry_to_pte(swp_entry); @@ -2077,7 +2129,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, page = pmd_page(*pmd); if (PageMlocked(page)) clear_page_mlock(page); - } else if (!pmd_devmap(*pmd)) + } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd))) goto out; __split_huge_pmd_locked(vma, pmd, address, freeze); out: diff --git a/mm/madvise.c b/mm/madvise.c index e424a06e9f2b..0497a502351f 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -310,6 +310,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, unsigned long next; next = pmd_addr_end(addr, end); + if (!pmd_present(*pmd)) + return 0; if (pmd_trans_huge(*pmd)) if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next)) goto next; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 44fb1e80701a..09bce3f0d622 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4633,6 +4633,8 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, struct page *page = NULL; enum mc_target_type ret = MC_TARGET_NONE; + if (unlikely(!pmd_present(pmd))) + return ret; page = pmd_page(pmd); VM_BUG_ON_PAGE(!page || !PageHead(page), page); if (!(mc.flags & MOVE_ANON)) diff --git a/mm/memory.c b/mm/memory.c index 7cfdd5208ef5..bf10b19e02d3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -999,7 +999,8 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); - if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) { + if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd) + || pmd_devmap(*src_pmd)) { int err; VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma); err = copy_huge_pmd(dst_mm, src_mm, @@ -1240,7 +1241,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, ptl = pmd_lock(vma->vm_mm, pmd); do { next = pmd_addr_end(addr, end); - if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { + if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) { VM_BUG_ON_VMA(vma_is_anonymous(vma) && !rwsem_is_locked(&tlb->mm->mmap_sem), vma); @@ -3697,6 +3698,10 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, pmd_t orig_pmd = *vmf.pmd; barrier(); + if (unlikely(is_pmd_migration_entry(orig_pmd))) { + pmd_migration_entry_wait(mm, vmf.pmd); + return 0; + } if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { vmf.flags |= FAULT_FLAG_SIZE_PMD; if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 19b460acb5e1..9cb4c83151a8 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1570,6 +1570,7 @@ static struct page *new_node_page(struct page *page, unsigned long private, int nid = page_to_nid(page); nodemask_t nmask = node_states[N_MEMORY]; struct page *new_page = NULL; + unsigned int order = 0; /* * TODO: allocate a destination hugepage from a nearest neighbor node, @@ -1580,6 +1581,11 @@ static struct page *new_node_page(struct page *page, unsigned long private, return alloc_huge_page_node(page_hstate(compound_head(page)), next_node_in(nid, nmask)); + if (thp_migration_supported() && PageTransHuge(page)) { + order = hpage_order(page); + gfp_mask |= GFP_TRANSHUGE; + } + node_clear(nid, nmask); if (PageHighMem(page) @@ -1593,6 +1599,9 @@ static struct page *new_node_page(struct page *page, unsigned long private, new_page = __alloc_pages(gfp_mask, 0, node_zonelist(nid, gfp_mask)); + if (new_page && order == hpage_order(page)) + prep_transhuge_page(new_page); + return new_page; } @@ -1622,7 +1631,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (isolate_huge_page(page, &source)) move_pages -= 1 << compound_order(head); continue; - } + } else if (thp_migration_supported() && PageTransHuge(page)) + pfn = page_to_pfn(compound_head(page)) + + hpage_nr_pages(page) - 1; if (!get_page_unless_zero(page)) continue; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 5cc6a99918ab..021ff13b9a7a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -94,6 +94,7 @@ #include <linux/mm_inline.h> #include <linux/mmu_notifier.h> #include <linux/printk.h> +#include <linux/swapops.h> #include <asm/tlbflush.h> #include <asm/uaccess.h> diff --git a/mm/mprotect.c b/mm/mprotect.c index 98acf7d5cef2..bfbe66798a7a 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -150,7 +150,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, unsigned long this_pages; next = pmd_addr_end(addr, end); - if (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd) + if (!pmd_present(*pmd)) + continue; + if (!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd) && !pmd_devmap(*pmd) && pmd_none_or_clear_bad(pmd)) continue; @@ -160,7 +162,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(mm, mni_start, end); } - if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { + if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) { __split_huge_pmd(vma, pmd, addr, false, NULL); if (pmd_trans_unstable(pmd)) diff --git a/mm/mremap.c b/mm/mremap.c index 8233b0105c82..5d537ce12adc 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -213,7 +213,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); if (!new_pmd) break; - if (pmd_trans_huge(*old_pmd)) { + if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd)) { if (extent == HPAGE_PMD_SIZE) { bool moved; /* See comment in move_ptes() */ diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 03761577ae86..114fc2b5a370 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -2,6 +2,8 @@ #include <linux/highmem.h> #include <linux/sched.h> #include <linux/hugetlb.h> +#include <linux/swap.h> +#include <linux/swapops.h> static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) -- 2.11.0 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>