On Mon, Nov 14, 2016 at 02:45:03PM +0300, Kirill A. Shutemov wrote: > On Tue, Nov 08, 2016 at 08:31:50AM +0900, Naoya Horiguchi wrote: > > This patch prepares thp migration's core code. These code will be open when > > unmap_and_move() stops unconditionally splitting thp and get_new_page() starts > > to allocate destination thps. > > > > Signed-off-by: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx> > > --- > > ChangeLog v1 -> v2: > > - support pte-mapped thp, doubly-mapped thp > > --- > > arch/x86/include/asm/pgtable_64.h | 2 + > > include/linux/swapops.h | 61 +++++++++++++++ > > mm/huge_memory.c | 154 ++++++++++++++++++++++++++++++++++++++ > > mm/migrate.c | 44 ++++++++++- > > mm/pgtable-generic.c | 3 +- > > 5 files changed, 262 insertions(+), 2 deletions(-) > > > > diff --git v4.9-rc2-mmotm-2016-10-27-18-27/arch/x86/include/asm/pgtable_64.h v4.9-rc2-mmotm-2016-10-27-18-27_patched/arch/x86/include/asm/pgtable_64.h > > index 1cc82ec..3a1b48e 100644 > > --- v4.9-rc2-mmotm-2016-10-27-18-27/arch/x86/include/asm/pgtable_64.h > > +++ v4.9-rc2-mmotm-2016-10-27-18-27_patched/arch/x86/include/asm/pgtable_64.h > > @@ -167,7 +167,9 @@ static inline int pgd_large(pgd_t pgd) { return 0; } > > ((type) << (SWP_TYPE_FIRST_BIT)) \ > > | ((offset) << SWP_OFFSET_FIRST_BIT) }) > > #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) > > +#define __pmd_to_swp_entry(pte) ((swp_entry_t) { pmd_val((pmd)) }) > > #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) > > +#define __swp_entry_to_pmd(x) ((pmd_t) { .pmd = (x).val }) > > > > extern int kern_addr_valid(unsigned long addr); > > extern void cleanup_highmap(void); > > diff --git v4.9-rc2-mmotm-2016-10-27-18-27/include/linux/swapops.h v4.9-rc2-mmotm-2016-10-27-18-27_patched/include/linux/swapops.h > > index 5c3a5f3..b6b22a2 100644 > > --- v4.9-rc2-mmotm-2016-10-27-18-27/include/linux/swapops.h > > +++ v4.9-rc2-mmotm-2016-10-27-18-27_patched/include/linux/swapops.h > > @@ -163,6 +163,67 @@ static inline int is_write_migration_entry(swp_entry_t entry) > > > > #endif > > > > +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION > > +extern void set_pmd_migration_entry(struct page *page, > > + struct vm_area_struct *vma, unsigned long address); > > + > > +extern int remove_migration_pmd(struct page *new, pmd_t *pmd, > > + struct vm_area_struct *vma, unsigned long addr, void *old); > > + > > +extern void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd); > > + > > +static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd) > > +{ > > + swp_entry_t arch_entry; > > + > > + arch_entry = __pmd_to_swp_entry(pmd); > > + return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); > > +} > > + > > +static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) > > +{ > > + swp_entry_t arch_entry; > > + > > + arch_entry = __swp_entry(swp_type(entry), swp_offset(entry)); > > + return __swp_entry_to_pmd(arch_entry); > > +} > > + > > +static inline int is_pmd_migration_entry(pmd_t pmd) > > +{ > > + return !pmd_present(pmd) && is_migration_entry(pmd_to_swp_entry(pmd)); > > +} > > +#else > > +static inline void set_pmd_migration_entry(struct page *page, > > + struct vm_area_struct *vma, unsigned long address) > > +{ > > VM_BUG()? Or BUILD_BUG()? These should be compiled out, so BUILD_BUG() seems better to me. 3 routines below will be done in the same manner. > > +} > > + > > +static inline int remove_migration_pmd(struct page *new, pmd_t *pmd, > > + struct vm_area_struct *vma, unsigned long addr, void *old) > > +{ > > + return 0; > > Ditto. > > > +} > > + > > +static inline void pmd_migration_entry_wait(struct mm_struct *m, pmd_t *p) { } > > + > > +static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd) > > +{ > > + return swp_entry(0, 0); > > Ditto. > > > +} > > + > > +static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) > > +{ > > + pmd_t pmd = {}; > > Ditto. > > > + return pmd; > > +} > > + > > +static inline int is_pmd_migration_entry(pmd_t pmd) > > +{ > > + return 0; > > +} > > +#endif > > + > > #ifdef CONFIG_MEMORY_FAILURE > > > > extern atomic_long_t num_poisoned_pages __read_mostly; > > diff --git v4.9-rc2-mmotm-2016-10-27-18-27/mm/huge_memory.c v4.9-rc2-mmotm-2016-10-27-18-27_patched/mm/huge_memory.c > > index 0509d17..b3022b3 100644 > > --- v4.9-rc2-mmotm-2016-10-27-18-27/mm/huge_memory.c > > +++ v4.9-rc2-mmotm-2016-10-27-18-27_patched/mm/huge_memory.c > > @@ -2310,3 +2310,157 @@ static int __init split_huge_pages_debugfs(void) > > } > > late_initcall(split_huge_pages_debugfs); > > #endif > > + > > +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION > > +void set_pmd_migration_entry(struct page *page, struct vm_area_struct *vma, > > + unsigned long addr) > > +{ > > + struct mm_struct *mm = vma->vm_mm; > > + pgd_t *pgd; > > + pud_t *pud; > > + pmd_t *pmd; > > + pmd_t pmdval; > > + swp_entry_t entry; > > + spinlock_t *ptl; > > + > > + pgd = pgd_offset(mm, addr); > > + if (!pgd_present(*pgd)) > > + return; > > + pud = pud_offset(pgd, addr); > > + if (!pud_present(*pud)) > > + return; > > + pmd = pmd_offset(pud, addr); > > + pmdval = *pmd; > > + barrier(); > > + if (!pmd_present(pmdval)) > > + return; > > + > > + mmu_notifier_invalidate_range_start(mm, addr, addr + HPAGE_PMD_SIZE); > > + if (pmd_trans_huge(pmdval)) { > > + pmd_t pmdswp; > > + > > + ptl = pmd_lock(mm, pmd); > > + if (!pmd_present(*pmd)) > > + goto unlock_pmd; > > + if (unlikely(!pmd_trans_huge(*pmd))) > > + goto unlock_pmd; > > Just check *pmd == pmdval? OK. > > > + if (pmd_page(*pmd) != page) > > + goto unlock_pmd; > > + > > + pmdval = pmdp_huge_get_and_clear(mm, addr, pmd); > > + if (pmd_dirty(pmdval)) > > + set_page_dirty(page); > > + entry = make_migration_entry(page, pmd_write(pmdval)); > > + pmdswp = swp_entry_to_pmd(entry); > > + pmdswp = pmd_mkhuge(pmdswp); > > + set_pmd_at(mm, addr, pmd, pmdswp); > > + page_remove_rmap(page, true); > > + put_page(page); > > +unlock_pmd: > > + spin_unlock(ptl); > > + } else { /* pte-mapped thp */ > > + pte_t *pte; > > + pte_t pteval; > > + struct page *tmp = compound_head(page); > > + unsigned long address = addr & HPAGE_PMD_MASK; > > + pte_t swp_pte; > > + int i; > > + > > + pte = pte_offset_map(pmd, address); > > + ptl = pte_lockptr(mm, pmd); > > + spin_lock(ptl); > > pte_offset_map_lock() ? Right. > > + for (i = 0; i < HPAGE_PMD_NR; i++, pte++, tmp++) { > > + if (!(pte_present(*pte) && > > + page_to_pfn(tmp) == pte_pfn(*pte))) > > if (!pte_present(*pte) || pte_page(*pte) != tmp) ? Yes, this is shorter/simpler. > > > + continue; > > + pteval = ptep_clear_flush(vma, address, pte); > > + if (pte_dirty(pteval)) > > + set_page_dirty(tmp); > > + entry = make_migration_entry(tmp, pte_write(pteval)); > > + swp_pte = swp_entry_to_pte(entry); > > + set_pte_at(mm, address, pte, swp_pte); > > + page_remove_rmap(tmp, false); > > + put_page(tmp); > > + } > > + pte_unmap_unlock(pte, ptl); > > + } > > + mmu_notifier_invalidate_range_end(mm, addr, addr + HPAGE_PMD_SIZE); > > + return; > > +} > > + > > +int remove_migration_pmd(struct page *new, pmd_t *pmd, > > + struct vm_area_struct *vma, unsigned long addr, void *old) > > +{ > > + struct mm_struct *mm = vma->vm_mm; > > + spinlock_t *ptl; > > + pmd_t pmde; > > + swp_entry_t entry; > > + > > + pmde = *pmd; > > + barrier(); > > + > > + if (!pmd_present(pmde)) { > > + if (is_migration_entry(pmd_to_swp_entry(pmde))) { > > if (!is_migration_entry(pmd_to_swp_entry(pmde))) > return SWAP_AGAIN; > > And one level less indentation below. OK. > > + unsigned long mmun_start = addr & HPAGE_PMD_MASK; > > + unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE; > > + > > + ptl = pmd_lock(mm, pmd); > > + entry = pmd_to_swp_entry(*pmd); > > + if (migration_entry_to_page(entry) != old) > > + goto unlock_ptl; > > + get_page(new); > > + pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot)); > > + if (is_write_migration_entry(entry)) > > + pmde = maybe_pmd_mkwrite(pmde, vma); > > + flush_cache_range(vma, mmun_start, mmun_end); > > + page_add_anon_rmap(new, vma, mmun_start, true); > > + pmdp_huge_clear_flush_notify(vma, mmun_start, pmd); > > + set_pmd_at(mm, mmun_start, pmd, pmde); > > + flush_tlb_range(vma, mmun_start, mmun_end); > > + if (vma->vm_flags & VM_LOCKED) > > + mlock_vma_page(new); > > + update_mmu_cache_pmd(vma, addr, pmd); > > +unlock_ptl: > > + spin_unlock(ptl); > > return SWAP_AGAIN; > > And one level less indentation below. > > > + } > > + } else { /* pte-mapped thp */ > > + pte_t *ptep; > > + pte_t pte; > > + int i; > > + struct page *tmpnew = compound_head(new); > > + struct page *tmpold = compound_head((struct page *)old); > > + unsigned long address = addr & HPAGE_PMD_MASK; > > + > > + ptep = pte_offset_map(pmd, addr); > > + ptl = pte_lockptr(mm, pmd); > > + spin_lock(ptl); > > pte_offset_map_lock() ? > > > + > > + for (i = 0; i < HPAGE_PMD_NR; > > + i++, ptep++, tmpnew++, tmpold++, address += PAGE_SIZE) { > > + pte = *ptep; > > + if (!is_swap_pte(pte)) > > + continue; > > + entry = pte_to_swp_entry(pte); > > + if (!is_migration_entry(entry) || > > + migration_entry_to_page(entry) != tmpold) > > + continue; > > + get_page(tmpnew); > > + pte = pte_mkold(mk_pte(tmpnew, > > + READ_ONCE(vma->vm_page_prot))); > > READ_ONCE()? Do we get here under mmap_sem, right? Some callers of page migration (mbind, move_pages, migrate_pages, cpuset) do get mmap_sem, but others (memory hotremove, soft offline) don't. For this part, I borrowed some code from remove_migration_pte() which was updated at the following commit: commit 6d2329f8872f23e46a19d240930571510ce525eb Author: Andrea Arcangeli <aarcange@xxxxxxxxxx> Date: Fri Oct 7 17:01:22 2016 -0700 mm: vm_page_prot: update with WRITE_ONCE/READ_ONCE Thank you for reviewing in detail! Naoya Horiguchi > > + if (pte_swp_soft_dirty(*ptep)) > > + pte = pte_mksoft_dirty(pte); > > + if (is_write_migration_entry(entry)) > > + pte = maybe_mkwrite(pte, vma); > > + flush_dcache_page(tmpnew); > > + set_pte_at(mm, address, ptep, pte); > > + if (PageAnon(new)) > > + page_add_anon_rmap(tmpnew, vma, address, false); > > + else > > + page_add_file_rmap(tmpnew, false); > > + update_mmu_cache(vma, address, ptep); > > + } > > + pte_unmap_unlock(ptep, ptl); > > + } > > + return SWAP_AGAIN; > > +} > > +#endif > > diff --git v4.9-rc2-mmotm-2016-10-27-18-27/mm/migrate.c v4.9-rc2-mmotm-2016-10-27-18-27_patched/mm/migrate.c > > index 66ce6b4..54f2eb6 100644 > > --- v4.9-rc2-mmotm-2016-10-27-18-27/mm/migrate.c > > +++ v4.9-rc2-mmotm-2016-10-27-18-27_patched/mm/migrate.c > > @@ -198,6 +198,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, > > { > > struct mm_struct *mm = vma->vm_mm; > > swp_entry_t entry; > > + pgd_t *pgd; > > + pud_t *pud; > > pmd_t *pmd; > > pte_t *ptep, pte; > > spinlock_t *ptl; > > @@ -208,10 +210,29 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, > > goto out; > > ptl = huge_pte_lockptr(hstate_vma(vma), mm, ptep); > > } else { > > - pmd = mm_find_pmd(mm, addr); > > + pmd_t pmde; > > + > > + pgd = pgd_offset(mm, addr); > > + if (!pgd_present(*pgd)) > > + goto out; > > + pud = pud_offset(pgd, addr); > > + if (!pud_present(*pud)) > > + goto out; > > + pmd = pmd_offset(pud, addr); > > if (!pmd) > > goto out; > > > > + if (PageTransCompound(new)) { > > + remove_migration_pmd(new, pmd, vma, addr, old); > > + goto out; > > + } > > + > > + pmde = *pmd; > > + barrier(); > > + > > + if (!pmd_present(pmde) || pmd_trans_huge(pmde)) > > + goto out; > > + > > ptep = pte_offset_map(pmd, addr); > > > > /* > > @@ -344,6 +365,27 @@ void migration_entry_wait_huge(struct vm_area_struct *vma, > > __migration_entry_wait(mm, pte, ptl); > > } > > > > +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION > > +void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) > > +{ > > + spinlock_t *ptl; > > + struct page *page; > > + > > + ptl = pmd_lock(mm, pmd); > > + if (!is_pmd_migration_entry(*pmd)) > > + goto unlock; > > + page = migration_entry_to_page(pmd_to_swp_entry(*pmd)); > > + if (!get_page_unless_zero(page)) > > + goto unlock; > > + spin_unlock(ptl); > > + wait_on_page_locked(page); > > + put_page(page); > > + return; > > +unlock: > > + spin_unlock(ptl); > > +} > > +#endif > > + > > #ifdef CONFIG_BLOCK > > /* Returns true if all buffers are successfully locked */ > > static bool buffer_migrate_lock_buffers(struct buffer_head *head, > > diff --git v4.9-rc2-mmotm-2016-10-27-18-27/mm/pgtable-generic.c v4.9-rc2-mmotm-2016-10-27-18-27_patched/mm/pgtable-generic.c > > index 71c5f91..6012343 100644 > > --- v4.9-rc2-mmotm-2016-10-27-18-27/mm/pgtable-generic.c > > +++ v4.9-rc2-mmotm-2016-10-27-18-27_patched/mm/pgtable-generic.c > > @@ -118,7 +118,8 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, > > { > > pmd_t pmd; > > VM_BUG_ON(address & ~HPAGE_PMD_MASK); > > - VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); > > + VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) && > > + !pmd_devmap(*pmdp)); > > pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); > > flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); > > return pmd; > > -- > > 2.7.0 > > > > -- > Kirill A. Shutemov > -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href