On Fri, Dec 07, 2012 at 10:23:47AM +0000, Mel Gorman wrote:
>Note: This is very heavily based on a patch from Peter Zijlstra with
> fixes from Ingo Molnar, Hugh Dickins and Johannes Weiner. That patch
> put a lot of migration logic into mm/huge_memory.c where it does
> not belong. This version instead tries to share some of the migration
> logic with migrate_misplaced_page. However, it should be noted
> that now migrate.c is doing more with the pagetable manipulation
> than is preferred. The end result is barely recognisable so as
> before, the signed-offs had to be removed but will be re-added if
> the original authors are ok with it.
>
>Add THP migration for the NUMA working set scanning fault case.
>
>It uses the page lock to serialize. No migration pte dance is
>necessary because the pte is already unmapped when we decide
>to migrate.
>
>[dhillf@xxxxxxxxx: Fix memory leak on isolation failure]
>[dhillf@xxxxxxxxx: Fix transfer of last_nid information]
>Signed-off-by: Mel Gorman <mgorman@xxxxxxx>
>---
> include/linux/migrate.h | 15 +++
> mm/huge_memory.c | 59 ++++++++----
> mm/internal.h | 7 +-
> mm/memcontrol.c | 7 +-
> mm/migrate.c | 231 ++++++++++++++++++++++++++++++++++++++---------
> 5 files changed, 255 insertions(+), 64 deletions(-)
>
>diff --git a/include/linux/migrate.h b/include/linux/migrate.h
>index 6229177..ed5a6c5 100644
>--- a/include/linux/migrate.h
>+++ b/include/linux/migrate.h
>@@ -79,6 +79,12 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
> extern int migrate_misplaced_page(struct page *page, int node);
> extern int migrate_misplaced_page(struct page *page, int node);
> extern bool migrate_ratelimited(int node);
>+extern int migrate_misplaced_transhuge_page(struct mm_struct *mm,
>+ struct vm_area_struct *vma,
>+ pmd_t *pmd, pmd_t entry,
>+ unsigned long address,
>+ struct page *page, int node);
>+
> #else
> static inline int migrate_misplaced_page(struct page *page, int node)
> {
>@@ -88,6 +94,15 @@ static inline bool migrate_ratelimited(int node)
> {
> return false;
> }
>+
>+static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm,
>+ struct vm_area_struct *vma,
>+ pmd_t *pmd, pmd_t entry,
>+ unsigned long address,
>+ struct page *page, int node)
>+{
>+ return -EAGAIN;
>+}
> #endif /* CONFIG_BALANCE_NUMA */
>
> #endif /* _LINUX_MIGRATE_H */
>diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>index 1327a03..61b66f8 100644
>--- a/mm/huge_memory.c
>+++ b/mm/huge_memory.c
>@@ -600,7 +600,7 @@ out:
> }
> __setup("transparent_hugepage=", setup_transparent_hugepage);
>
>-static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
>+pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
> {
> if (likely(vma->vm_flags & VM_WRITE))
> pmd = pmd_mkwrite(pmd);
>@@ -1022,10 +1022,12 @@ out:
> int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
> unsigned long addr, pmd_t pmd, pmd_t *pmdp)
> {
>- struct page *page = NULL;
>+ struct page *page;
> unsigned long haddr = addr & HPAGE_PMD_MASK;
> int target_nid;
> int current_nid = -1;
>+ bool migrated;
>+ bool page_locked = false;
>
> spin_lock(&mm->page_table_lock);
> if (unlikely(!pmd_same(pmd, *pmdp)))
>@@ -1033,42 +1035,61 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
>
> page = pmd_page(pmd);
> get_page(page);
>- spin_unlock(&mm->page_table_lock);
> current_nid = page_to_nid(page);
> count_vm_numa_event(NUMA_HINT_FAULTS);
> if (current_nid == numa_node_id())
> count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
>
> target_nid = mpol_misplaced(page, vma, haddr);
>- if (target_nid == -1)
>+ if (target_nid == -1) {
>+ put_page(page);
> goto clear_pmdnuma;
>+ }
>
>- /*
>- * Due to lacking code to migrate thp pages, we'll split
>- * (which preserves the special PROT_NONE) and re-take the
>- * fault on the normal pages.
>- */
>- split_huge_page(page);
>- put_page(page);
>-
>- return 0;
>+ /* Acquire the page lock to serialise THP migrations */
>+ spin_unlock(&mm->page_table_lock);
>+ lock_page(page);
>+ page_locked = true;
>
>-clear_pmdnuma:
>+ /* Confirm the PTE did not change while locked */
> spin_lock(&mm->page_table_lock);
>- if (unlikely(!pmd_same(pmd, *pmdp)))
>+ if (unlikely(!pmd_same(pmd, *pmdp))) {
>+ unlock_page(page);
>+ put_page(page);
> goto out_unlock;
>+ }
>+ spin_unlock(&mm->page_table_lock);
>+
>+ /* Migrate the THP to the requested node */
>+ migrated = migrate_misplaced_transhuge_page(mm, vma,
>+ pmdp, pmd, addr,
>+ page, target_nid);
>+ if (migrated)
>+ current_nid = target_nid;
>+ else {
>+ spin_lock(&mm->page_table_lock);
>+ if (unlikely(!pmd_same(pmd, *pmdp))) {
>+ unlock_page(page);
>+ goto out_unlock;
>+ }
>+ goto clear_pmdnuma;
>+ }
>+
>+ task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
>+ return 0;
>
>+clear_pmdnuma:
> pmd = pmd_mknonnuma(pmd);
> set_pmd_at(mm, haddr, pmdp, pmd);
> VM_BUG_ON(pmd_numa(*pmdp));
> update_mmu_cache_pmd(vma, addr, pmdp);
>+ if (page_locked)
>+ unlock_page(page);
>
> out_unlock:
> spin_unlock(&mm->page_table_lock);
>- if (page) {
>- put_page(page);
>- task_numa_fault(numa_node_id(), HPAGE_PMD_NR, false);
>- }
>+ if (current_nid != -1)
>+ task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
> return 0;
> }
>
>diff --git a/mm/internal.h b/mm/internal.h
>index a4fa284..7e60ac8 100644
>--- a/mm/internal.h
>+++ b/mm/internal.h
>@@ -212,15 +212,18 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
> {
> if (TestClearPageMlocked(page)) {
> unsigned long flags;
>+ int nr_pages = hpage_nr_pages(page);
>
> local_irq_save(flags);
>- __dec_zone_page_state(page, NR_MLOCK);
>+ __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
> SetPageMlocked(newpage);
>- __inc_zone_page_state(newpage, NR_MLOCK);
>+ __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages);
> local_irq_restore(flags);
> }
> }
>
>+extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
>+
> #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> extern unsigned long vma_address(struct page *page,
> struct vm_area_struct *vma);
>diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>index dd39ba0..d97af96 100644
>--- a/mm/memcontrol.c
>+++ b/mm/memcontrol.c
>@@ -3288,15 +3288,18 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
> struct mem_cgroup **memcgp)
> {
> struct mem_cgroup *memcg = NULL;
>+ unsigned int nr_pages = 1;
> struct page_cgroup *pc;
> enum charge_type ctype;
>
> *memcgp = NULL;
>
>- VM_BUG_ON(PageTransHuge(page));
> if (mem_cgroup_disabled())
> return;
>
>+ if (PageTransHuge(page))
>+ nr_pages <<= compound_order(page);
>+
> pc = lookup_page_cgroup(page);
> lock_page_cgroup(pc);
> if (PageCgroupUsed(pc)) {
>@@ -3358,7 +3361,7 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
> * charged to the res_counter since we plan on replacing the
> * old one and only one page is going to be left afterwards.
> */
>- __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);
>+ __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
> }
>
> /* remove redundant charge if migration failed*/
>diff --git a/mm/migrate.c b/mm/migrate.c
>index 6bc9745..4b1b239 100644
>--- a/mm/migrate.c
>+++ b/mm/migrate.c
>@@ -410,7 +410,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
> */
> void migrate_page_copy(struct page *newpage, struct page *page)
> {
>- if (PageHuge(page))
>+ if (PageHuge(page) || PageTransHuge(page))
> copy_huge_page(newpage, page);
> else
> copy_highpage(newpage, page);
>@@ -1491,25 +1491,10 @@ bool migrate_ratelimited(int node)
> return true;
> }
>
>-/*
>- * Attempt to migrate a misplaced page to the specified destination
>- * node. Caller is expected to have an elevated reference count on
>- * the page that will be dropped by this function before returning.
>- */
>-int migrate_misplaced_page(struct page *page, int node)
>+/* Returns true if the node is migrate rate-limited after the update */
>+bool numamigrate_update_ratelimit(pg_data_t *pgdat)
> {
>- pg_data_t *pgdat = NODE_DATA(node);
>- int isolated = 0;
>- LIST_HEAD(migratepages);
>-
>- /*
>- * Don't migrate pages that are mapped in multiple processes.
>- * TODO: Handle false sharing detection instead of this hammer
>- */
>- if (page_mapcount(page) != 1) {
>- put_page(page);
>- goto out;
>- }
>+ bool rate_limited = false;
>
> /*
> * Rate-limit the amount of data that is being migrated to a node.
>@@ -1522,13 +1507,18 @@ int migrate_misplaced_page(struct page *page, int node)
> pgdat->balancenuma_migrate_next_window = jiffies +
> msecs_to_jiffies(migrate_interval_millisecs);
> }
>- if (pgdat->balancenuma_migrate_nr_pages > ratelimit_pages) {
>- spin_unlock(&pgdat->balancenuma_migrate_lock);
>- put_page(page);
>- goto out;
>- }
>- pgdat->balancenuma_migrate_nr_pages++;
>+ if (pgdat->balancenuma_migrate_nr_pages > ratelimit_pages)
>+ rate_limited = true;
>+ else
>+ pgdat->balancenuma_migrate_nr_pages++;
> spin_unlock(&pgdat->balancenuma_migrate_lock);
>+
>+ return rate_limited;
>+}
>+
>+int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
>+{
>+ int ret = 0;
>
> /* Avoid migrating to a node that is nearly full */
> if (migrate_balanced_pgdat(pgdat, 1)) {

Hi Mel Gorman,

This nr_migrate_pages = 1 parameter is not correct. Since balancenuma also
supports THP in this patchset, the parameter should be 1 << compound_order(page)
instead of 1.
I'd rather change it to something like:

+ unsigned int nr_pages = 1;
+
+ if (PageTransHuge(page)) {
+ nr_pages <<= compound_order(page);
+ VM_BUG_ON(!PageTransHuge(page));
+ }

 /* Avoid migrating to a node that is nearly full */
- if (migrate_balanced_pgdat(pgdat, 1)) {
+ if (migrate_balanced_pgdat(pgdat, nr_pages)) {

(A standalone sketch of this arithmetic is appended after the quoted patch below.)

Regards,
Wanpeng Li

>@@ -1536,13 +1526,18 @@ int migrate_misplaced_page(struct page *page, int node)
>
> if (isolate_lru_page(page)) {
> put_page(page);
>- goto out;
>+ return 0;
> }
>- isolated = 1;
>
>+ /* Page is isolated */
>+ ret = 1;
> page_lru = page_is_file_cache(page);
>- inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru);
>- list_add(&page->lru, &migratepages);
>+ if (!PageTransHuge(page))
>+ inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru);
>+ else
>+ mod_zone_page_state(page_zone(page),
>+ NR_ISOLATED_ANON + page_lru,
>+ HPAGE_PMD_NR);
> }
>
> /*
>@@ -1555,23 +1550,177 @@ int migrate_misplaced_page(struct page *page, int node)
> */
> put_page(page);
>
>- if (isolated) {
>- int nr_remaining;
>-
>- nr_remaining = migrate_pages(&migratepages,
>- alloc_misplaced_dst_page,
>- node, false, MIGRATE_ASYNC,
>- MR_NUMA_MISPLACED);
>- if (nr_remaining) {
>- putback_lru_pages(&migratepages);
>- isolated = 0;
>- } else
>- count_vm_numa_event(NUMA_PAGE_MIGRATE);
>+ return ret;
>+}
>+
>+/*
>+ * Attempt to migrate a misplaced page to the specified destination
>+ * node. Caller is expected to have an elevated reference count on
>+ * the page that will be dropped by this function before returning.
>+ */
>+int migrate_misplaced_page(struct page *page, int node)
>+{
>+ pg_data_t *pgdat = NODE_DATA(node);
>+ int isolated = 0;
>+ int nr_remaining;
>+ LIST_HEAD(migratepages);
>+
>+ /*
>+ * Don't migrate pages that are mapped in multiple processes.
>+ * TODO: Handle false sharing detection instead of this hammer
>+ */
>+ if (page_mapcount(page) != 1) {
>+ put_page(page);
>+ goto out;
> }
>+
>+ /*
>+ * Rate-limit the amount of data that is being migrated to a node.
>+ * Optimal placement is no good if the memory bus is saturated and
>+ * all the time is being spent migrating!
>+ */
>+ if (numamigrate_update_ratelimit(pgdat)) {
>+ put_page(page);
>+ goto out;
>+ }
>+
>+ isolated = numamigrate_isolate_page(pgdat, page);
>+ if (!isolated)
>+ goto out;
>+
>+ list_add(&page->lru, &migratepages);
>+ nr_remaining = migrate_pages(&migratepages,
>+ alloc_misplaced_dst_page,
>+ node, false, MIGRATE_ASYNC,
>+ MR_NUMA_MISPLACED);
>+ if (nr_remaining) {
>+ putback_lru_pages(&migratepages);
>+ isolated = 0;
>+ } else
>+ count_vm_numa_event(NUMA_PAGE_MIGRATE);
> BUG_ON(!list_empty(&migratepages));
> out:
> return isolated;
> }
>+
>+int migrate_misplaced_transhuge_page(struct mm_struct *mm,
>+ struct vm_area_struct *vma,
>+ pmd_t *pmd, pmd_t entry,
>+ unsigned long address,
>+ struct page *page, int node)
>+{
>+ unsigned long haddr = address & HPAGE_PMD_MASK;
>+ pg_data_t *pgdat = NODE_DATA(node);
>+ int isolated = 0;
>+ struct page *new_page = NULL;
>+ struct mem_cgroup *memcg = NULL;
>+ int page_lru = page_is_file_cache(page);
>+
>+ /*
>+ * Don't migrate pages that are mapped in multiple processes.
>+ * TODO: Handle false sharing detection instead of this hammer
>+ */
>+ if (page_mapcount(page) != 1)
>+ goto out_dropref;
>+
>+ /*
>+ * Rate-limit the amount of data that is being migrated to a node.
>+ * Optimal placement is no good if the memory bus is saturated and
>+ * all the time is being spent migrating!
>+ */
>+ if (numamigrate_update_ratelimit(pgdat))
>+ goto out_dropref;
>+
>+ new_page = alloc_pages_node(node,
>+ (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
>+ if (!new_page)
>+ goto out_dropref;
>+ page_xchg_last_nid(new_page, page_last_nid(page));
>+
>+ isolated = numamigrate_isolate_page(pgdat, page);
>+ if (!isolated) {
>+ put_page(new_page);
>+ goto out_keep_locked;
>+ }
>+
>+ /* Prepare a page as a migration target */
>+ __set_page_locked(new_page);
>+ SetPageSwapBacked(new_page);
>+
>+ /* anon mapping, we can simply copy page->mapping to the new page: */
>+ new_page->mapping = page->mapping;
>+ new_page->index = page->index;
>+ migrate_page_copy(new_page, page);
>+ WARN_ON(PageLRU(new_page));
>+
>+ /* Recheck the target PMD */
>+ spin_lock(&mm->page_table_lock);
>+ if (unlikely(!pmd_same(*pmd, entry))) {
>+ spin_unlock(&mm->page_table_lock);
>+
>+ /* Reverse changes made by migrate_page_copy() */
>+ if (TestClearPageActive(new_page))
>+ SetPageActive(page);
>+ if (TestClearPageUnevictable(new_page))
>+ SetPageUnevictable(page);
>+ mlock_migrate_page(page, new_page);
>+
>+ unlock_page(new_page);
>+ put_page(new_page); /* Free it */
>+
>+ unlock_page(page);
>+ putback_lru_page(page);
>+
>+ count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
>+ goto out;
>+ }
>+
>+ /*
>+ * Traditional migration needs to prepare the memcg charge
>+ * transaction early to prevent the old page from being
>+ * uncharged when installing migration entries. Here we can
>+ * save the potential rollback and start the charge transfer
>+ * only when migration is already known to end successfully.
>+ */
>+ mem_cgroup_prepare_migration(page, new_page, &memcg);
>+
>+ entry = mk_pmd(new_page, vma->vm_page_prot);
>+ entry = pmd_mknonnuma(entry);
>+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>+ entry = pmd_mkhuge(entry);
>+
>+ page_add_new_anon_rmap(new_page, vma, haddr);
>+
>+ set_pmd_at(mm, haddr, pmd, entry);
>+ update_mmu_cache_pmd(vma, address, entry);
>+ page_remove_rmap(page);
>+ /*
>+ * Finish the charge transaction under the page table lock to
>+ * prevent split_huge_page() from dividing up the charge
>+ * before it's fully transferred to the new page.
>+ */
>+ mem_cgroup_end_migration(memcg, page, new_page, true);
>+ spin_unlock(&mm->page_table_lock);
>+
>+ unlock_page(new_page);
>+ unlock_page(page);
>+ put_page(page); /* Drop the rmap reference */
>+ put_page(page); /* Drop the LRU isolation reference */
>+
>+ count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
>+ count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
>+
>+out:
>+ mod_zone_page_state(page_zone(page),
>+ NR_ISOLATED_ANON + page_lru,
>+ -HPAGE_PMD_NR);
>+ return isolated;
>+
>+out_dropref:
>+ put_page(page);
>+out_keep_locked:
>+ return 0;
>+}
> #endif /* CONFIG_BALANCE_NUMA */
>
> #endif /* CONFIG_NUMA */
>--
>1.7.9.2
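
As a side note on the nr_pages arithmetic discussed earlier in this mail: the
minimal standalone C sketch below only illustrates what 1 << compound_order(page)
works out to for a THP. It is not kernel code; PAGE_SHIFT and HPAGE_PMD_ORDER
are hard-coded to the usual x86-64/4K values as an assumption, and
nr_pages_for_order() is a made-up helper mirroring that shift.

/*
 * Standalone sketch: the 4K base page size and the 2M THP order are
 * assumptions, and nr_pages_for_order() simply mirrors what
 * "1 << compound_order(page)" computes in the kernel.
 */
#include <stdio.h>

#define PAGE_SHIFT      12   /* assumed 4K base pages */
#define HPAGE_PMD_ORDER 9    /* assumed 2M THP on x86-64 */

/* Number of base pages backing a compound page of the given order. */
static unsigned int nr_pages_for_order(unsigned int order)
{
	return 1u << order;
}

int main(void)
{
	unsigned int base = nr_pages_for_order(0);               /* ordinary page */
	unsigned int thp  = nr_pages_for_order(HPAGE_PMD_ORDER); /* transhuge page */

	printf("order 0 -> %u page,  %u KB\n", base, base << (PAGE_SHIFT - 10));
	printf("order %d -> %u pages, %u KB\n", HPAGE_PMD_ORDER, thp,
	       thp << (PAGE_SHIFT - 10));

	/*
	 * A THP covers 512 base pages here, which is why asking
	 * migrate_balanced_pgdat() about room for 1 page understates the
	 * load in the transhuge case.
	 */
	return 0;
}

Compiled and run, this prints 1 page / 4 KB for an ordinary page and
512 pages / 2048 KB for a THP, which is the gap the review comment above is
pointing at.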