On Tue, 20 Nov 2012, Rik van Riel wrote:
> On 11/20/2012 08:54 PM, Andrew Theurer wrote:
> >
> > I can confirm single JVM JBB is working well for me.  I see a 30%
> > improvement over autoNUMA.  What I can't make sense of is some perf
> > stats (taken at 80 warehouses on 4 x WST-EX, 512GB memory):
>
> AutoNUMA does not have native THP migration, that may explain some
> of the difference.

When I made some fixes to the sched/numa native THP migration, I did
also try porting that (with Hannes's memcg fixes) to AutoNUMA.  Here's
the patch below: it appeared to be working just fine, but you might
find that it doesn't quite apply to whatever tree you're using.  I
started from 3.6 autonuma28fast in aa.git, but had folded in some of
the equally applicable TLB flush optimizations too.

There's also a little "Hack, remove after THP native migration"
retuning in mm/huge_memory.c which should probably be removed too.

No signoffs, but it's from work by Peter and Ingo and Hannes and Hugh.
---
 include/linux/huge_mm.h |    4 -
 mm/autonuma.c           |   59 +++++-----------
 mm/huge_memory.c        |  140 +++++++++++++++++++++++++++++++++-----
 mm/internal.h           |    5 -
 mm/memcontrol.c         |    7 +
 mm/memory.c             |    4 -
 mm/migrate.c            |    2
 7 files changed, 158 insertions(+), 63 deletions(-)

--- 306aa/include/linux/huge_mm.h	2012-11-04 10:21:30.965548793 -0800
+++ 306AA/include/linux/huge_mm.h	2012-11-04 17:14:32.232651475 -0800
@@ -11,8 +11,8 @@ extern int copy_huge_pmd(struct mm_struc
 extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
			       unsigned long address, pmd_t *pmd,
			       pmd_t orig_pmd);
-extern int huge_pmd_numa_fixup(struct mm_struct *mm, unsigned long addr,
-			       pmd_t pmd, pmd_t *pmdp);
+extern int huge_pmd_numa_fixup(struct mm_struct *mm, struct vm_area_struct *vma,
+			       unsigned long address, pmd_t *pmd, pmd_t orig_pmd);
 extern pgtable_t get_pmd_huge_pte(struct mm_struct *mm);
 extern struct page *follow_trans_huge_pmd(struct mm_struct *mm,
					  unsigned long addr,
--- 306aa/mm/autonuma.c	2012-11-04 10:22:13.993549816 -0800
+++ 306AA/mm/autonuma.c	2012-11-04 21:24:43.045622669 -0800
@@ -77,8 +77,7 @@ void autonuma_migrate_split_huge_page(st
 static int sync_isolate_migratepages(struct list_head *migratepages,
				     struct page *page,
-				     struct pglist_data *pgdat,
-				     bool *migrated)
+				     struct pglist_data *pgdat)
 {
	struct zone *zone;
	struct lruvec *lruvec;
@@ -129,19 +128,9 @@ static int sync_isolate_migratepages(str
	 * __isolate_lru_page successd, the page could be freed and
	 * reallocated out from under us. Thus our previous checks on
	 * the page, and the split_huge_page, would be worthless.
-	 *
-	 * We really only need to do this if "ret > 0" but it doesn't
-	 * hurt to do it unconditionally as nobody can reference
-	 * "page" anymore after this and so we can avoid an "if (ret >
-	 * 0)" branch here.
	 */
-	put_page(page);
-	/*
-	 * Tell the caller we already released its pin, to avoid a
-	 * double free.
-	 */
-	*migrated = true;
-
+	if (ret > 0)
+		put_page(page);
 out:
	return ret;
 }
@@ -215,13 +204,12 @@ static inline void autonuma_migrate_unlo
	spin_unlock(&NODE_DATA(nid)->autonuma_migrate_lock);
 }

-static bool autonuma_migrate_page(struct page *page, int dst_nid,
-				  int page_nid, bool *migrated)
+static bool autonuma_migrate_page(struct page *page, int dst_nid, int page_nid,
+				  int nr_pages)
 {
	int isolated = 0;
	LIST_HEAD(migratepages);
	struct pglist_data *pgdat = NODE_DATA(dst_nid);
-	int nr_pages = hpage_nr_pages(page);
	unsigned long autonuma_migrate_nr_pages = 0;

	autonuma_migrate_lock(dst_nid);
@@ -242,11 +230,12 @@ static bool autonuma_migrate_page(struct
		autonuma_printk("migrated %lu pages to node %d\n",
				autonuma_migrate_nr_pages, dst_nid);

-	if (autonuma_balance_pgdat(pgdat, nr_pages))
+	if (autonuma_balance_pgdat(pgdat, nr_pages)) {
+		if (nr_pages > 1)
+			return true;
		isolated = sync_isolate_migratepages(&migratepages,
-						     page, pgdat,
-						     migrated);
-
+						     page, pgdat);
+	}
	if (isolated) {
		int err;
		trace_numa_migratepages_begin(current, &migratepages,
@@ -381,15 +370,14 @@ static bool should_migrate_page(struct t
 static int numa_hinting_fault_memory_follow_cpu(struct task_struct *p,
						struct page *page,
						int this_nid, int page_nid,
-						bool *migrated)
+						int numpages)
 {
	if (!should_migrate_page(p, page, this_nid, page_nid))
		goto out;
	if (!PageLRU(page))
		goto out;
	if (this_nid != page_nid) {
-		if (autonuma_migrate_page(page, this_nid, page_nid,
-					  migrated))
+		if (autonuma_migrate_page(page, this_nid, page_nid, numpages))
			return this_nid;
	}
 out:
@@ -418,14 +406,17 @@ bool numa_hinting_fault(struct page *pag
		VM_BUG_ON(this_nid < 0);
		VM_BUG_ON(this_nid >= MAX_NUMNODES);
		access_nid = numa_hinting_fault_memory_follow_cpu(p, page,
-								  this_nid,
-								  page_nid,
-								  &migrated);
-		/* "page" has been already freed if "migrated" is true */
+							this_nid, page_nid, numpages);
		numa_hinting_fault_cpu_follow_memory(p, access_nid, numpages);
+		migrated = access_nid != page_nid;
	}

-	return migrated;
+	/* small page was already freed if migrated */
+	if (!migrated) {
+		put_page(page);
+		return false;
+	}
+	return true;
 }

 /* NUMA hinting page fault entry point for ptes */
@@ -434,7 +425,6 @@ int pte_numa_fixup(struct mm_struct *mm,
 {
	struct page *page;
	spinlock_t *ptl;
-	bool migrated;

	/*
	 * The "pte" at this point cannot be used safely without
@@ -455,9 +445,7 @@ int pte_numa_fixup(struct mm_struct *mm,
	get_page(page);
	pte_unmap_unlock(ptep, ptl);

-	migrated = numa_hinting_fault(page, 1);
-	if (!migrated)
-		put_page(page);
+	numa_hinting_fault(page, 1);

 out:
	return 0;
@@ -476,7 +464,6 @@ int pmd_numa_fixup(struct mm_struct *mm,
	spinlock_t *ptl;
	bool numa = false;
	struct vm_area_struct *vma;
-	bool migrated;

	spin_lock(&mm->page_table_lock);
	pmd = *pmdp;
@@ -521,9 +508,7 @@ int pmd_numa_fixup(struct mm_struct *mm,
		get_page(page);
		pte_unmap_unlock(pte, ptl);

-		migrated = numa_hinting_fault(page, 1);
-		if (!migrated)
-			put_page(page);
+		numa_hinting_fault(page, 1);

		pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	}
--- 306aa/mm/huge_memory.c	2012-11-04 15:32:28.512793096 -0800
+++ 306AA/mm/huge_memory.c	2012-11-04 22:21:14.112450390 -0800
@@ -17,6 +17,7 @@
 #include <linux/khugepaged.h>
 #include <linux/freezer.h>
 #include <linux/mman.h>
+#include <linux/migrate.h>
 #include <linux/autonuma.h>
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
@@ -1037,35 +1038,140 @@ out:

 #ifdef CONFIG_AUTONUMA
 /* NUMA hinting page fault entry point for trans huge pmds */
-int huge_pmd_numa_fixup(struct mm_struct *mm, unsigned long addr,
-			pmd_t pmd, pmd_t *pmdp)
+int huge_pmd_numa_fixup(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long address, pmd_t *pmd, pmd_t entry)
 {
+	unsigned long haddr = address & HPAGE_PMD_MASK;
+	struct mem_cgroup *memcg = NULL;
+	struct page *new_page;
	struct page *page;
-	bool migrated;

	spin_lock(&mm->page_table_lock);
-	if (unlikely(!pmd_same(pmd, *pmdp)))
-		goto out_unlock;
+	if (unlikely(!pmd_same(entry, *pmd)))
+		goto unlock;

-	page = pmd_page(pmd);
-	pmd = pmd_mknonnuma(pmd);
-	set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmdp, pmd);
-	VM_BUG_ON(pmd_numa(*pmdp));
-	if (unlikely(page_mapcount(page) != 1))
-		goto out_unlock;
+	page = pmd_page(entry);
+	/*
+	 * Do not migrate this page if it is mapped anywhere else.
+	 * Do not migrate this page if its count has been raised.
+	 * Our caller's down_read of mmap_sem excludes fork raising
+	 * mapcount; but recheck page count below whenever we take
+	 * page_table_lock - although it's unclear what pin we are
+	 * protecting against, since get_user_pages() or GUP fast
+	 * would have to fault it present before they could proceed.
+	 */
+	if (unlikely(page_count(page) != 1))
+		goto fixup;
	get_page(page);
	spin_unlock(&mm->page_table_lock);

-	migrated = numa_hinting_fault(page, HPAGE_PMD_NR);
-	if (!migrated)
-		put_page(page);
+	if (numa_hinting_fault(page, HPAGE_PMD_NR))
+		goto migrate;

-out:
+	spin_lock(&mm->page_table_lock);
+	if (unlikely(!pmd_same(entry, *pmd)))
+		goto unlock;
+fixup:
+	entry = pmd_mknonnuma(entry);
+	set_pmd_at(mm, haddr, pmd, entry);
+	VM_BUG_ON(pmd_numa(*pmd));
+	update_mmu_cache(vma, address, entry);
+
+unlock:
+	spin_unlock(&mm->page_table_lock);
	return 0;

-out_unlock:
+migrate:
+	lock_page(page);
+	spin_lock(&mm->page_table_lock);
+	if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) {
+		spin_unlock(&mm->page_table_lock);
+		unlock_page(page);
+		put_page(page);
+		return 0;
+	}
	spin_unlock(&mm->page_table_lock);
-	goto out;
+
+	new_page = alloc_pages_node(numa_node_id(),
+	    (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
+	if (!new_page)
+		goto alloc_fail;
+
+	if (isolate_lru_page(page)) {	/* Does an implicit get_page() */
+		put_page(new_page);
+		goto alloc_fail;
+	}
+
+	__set_page_locked(new_page);
+	SetPageSwapBacked(new_page);
+
+	/* anon mapping, we can simply copy page->mapping to the new page: */
+	new_page->mapping = page->mapping;
+	new_page->index = page->index;
+
+	migrate_page_copy(new_page, page);
+
+	WARN_ON(PageLRU(new_page));
+
+	spin_lock(&mm->page_table_lock);
+	if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 3)) {
+		spin_unlock(&mm->page_table_lock);
+
+		/* Reverse changes made by migrate_page_copy() */
+		if (TestClearPageActive(new_page))
+			SetPageActive(page);
+		if (TestClearPageUnevictable(new_page))
+			SetPageUnevictable(page);
+		mlock_migrate_page(page, new_page);
+
+		unlock_page(new_page);
+		put_page(new_page);		/* Free it */
+
+		unlock_page(page);
+		putback_lru_page(page);
+		put_page(page);			/* Drop the local reference */
+		return 0;
+	}
+	/*
+	 * Traditional migration needs to prepare the memcg charge
+	 * transaction early to prevent the old page from being
+	 * uncharged when installing migration entries.  Here we can
+	 * save the potential rollback and start the charge transfer
+	 * only when migration is already known to end successfully.
+	 */
+	mem_cgroup_prepare_migration(page, new_page, &memcg);
+
+	entry = mk_pmd(new_page, vma->vm_page_prot);
+	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+	entry = pmd_mkhuge(entry);
+
+	page_add_new_anon_rmap(new_page, vma, haddr);
+
+	set_pmd_at(mm, haddr, pmd, entry);
+	update_mmu_cache(vma, address, entry);
+	page_remove_rmap(page);
+	/*
+	 * Finish the charge transaction under the page table lock to
+	 * prevent split_huge_page() from dividing up the charge
+	 * before it's fully transferred to the new page.
+	 */
+	mem_cgroup_end_migration(memcg, page, new_page, true);
+	spin_unlock(&mm->page_table_lock);
+
+	unlock_page(new_page);
+	unlock_page(page);
+	put_page(page);			/* Drop the rmap reference */
+	put_page(page);			/* Drop the LRU isolation reference */
+	put_page(page);			/* Drop the local reference */
+	return 0;
+
+alloc_fail:
+	unlock_page(page);
+	put_page(page);
+	spin_lock(&mm->page_table_lock);
+	if (unlikely(!pmd_same(*pmd, entry)))
+		goto unlock;
+	goto fixup;
 }
 #endif
--- 306aa/mm/internal.h	2012-09-30 16:47:46.000000000 -0700
+++ 306AA/mm/internal.h	2012-11-04 16:16:21.760439246 -0800
@@ -216,11 +216,12 @@ static inline void mlock_migrate_page(st
 {
	if (TestClearPageMlocked(page)) {
		unsigned long flags;
+		int nr_pages = hpage_nr_pages(page);

		local_irq_save(flags);
-		__dec_zone_page_state(page, NR_MLOCK);
+		__mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
		SetPageMlocked(newpage);
-		__inc_zone_page_state(newpage, NR_MLOCK);
+		__mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages);
		local_irq_restore(flags);
	}
 }
--- 306aa/mm/memcontrol.c	2012-09-30 16:47:46.000000000 -0700
+++ 306AA/mm/memcontrol.c	2012-11-04 16:15:55.264437693 -0800
@@ -3261,15 +3261,18 @@ void mem_cgroup_prepare_migration(struct
				  struct mem_cgroup **memcgp)
 {
	struct mem_cgroup *memcg = NULL;
+	unsigned int nr_pages = 1;
	struct page_cgroup *pc;
	enum charge_type ctype;

	*memcgp = NULL;

-	VM_BUG_ON(PageTransHuge(page));
	if (mem_cgroup_disabled())
		return;

+	if (PageTransHuge(page))
+		nr_pages <<= compound_order(page);
+
	pc = lookup_page_cgroup(page);
	lock_page_cgroup(pc);
	if (PageCgroupUsed(pc)) {
@@ -3331,7 +3334,7 @@ void mem_cgroup_prepare_migration(struct
	 * charged to the res_counter since we plan on replacing the
	 * old one and only one page is going to be left afterwards.
	 */
-	__mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);
+	__mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
 }

 /* remove redundant charge if migration failed*/
--- 306aa/mm/memory.c	2012-11-04 10:21:34.181548869 -0800
+++ 306AA/mm/memory.c	2012-11-04 17:06:11.400620099 -0800
@@ -3546,8 +3546,8 @@ retry:
		barrier();
		if (pmd_trans_huge(orig_pmd)) {
			if (pmd_numa(*pmd))
-				return huge_pmd_numa_fixup(mm, address,
-							   orig_pmd, pmd);
+				return huge_pmd_numa_fixup(mm, vma, address,
+							   pmd, orig_pmd);
			if (flags & FAULT_FLAG_WRITE && !pmd_write(orig_pmd) &&
			    !pmd_trans_splitting(orig_pmd)) {
--- 306aa/mm/migrate.c	2012-09-30 16:47:46.000000000 -0700
+++ 306AA/mm/migrate.c	2012-11-04 17:10:13.084633509 -0800
@@ -407,7 +407,7 @@ int migrate_huge_page_move_mapping(struc
  */
 void migrate_page_copy(struct page *newpage, struct page *page)
 {
-	if (PageHuge(page))
+	if (PageHuge(page) || PageTransHuge(page))
		copy_huge_page(newpage, page);
	else
		copy_highpage(newpage, page);
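
For anyone who would rather skim than apply the diff: the heart of the
change is the migrate path added to huge_pmd_numa_fixup().  Below is my
condensed summary of those hunks, written as a C comment rather than as
code to apply anywhere; every name in it comes from the patch above.

/*
 * Condensed flow of the new huge_pmd_numa_fixup() migrate path:
 *
 *	lock_page(page), then recheck pmd_same(*pmd, entry) and
 *	page_count(page) under page_table_lock, backing out if we raced;
 *	new_page = alloc_pages_node(numa_node_id(),
 *		(GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
 *	isolate_lru_page(page), which takes its own page reference;
 *	copy page->mapping and page->index, then migrate_page_copy();
 *	retake page_table_lock and recheck pmd_same() and page_count();
 *	if we raced, undo the copied flags and free new_page;
 *	otherwise mem_cgroup_prepare_migration(page, new_page, &memcg),
 *	build the new pmd with mk_pmd()/pmd_mkdirty()/pmd_mkhuge(),
 *	page_add_new_anon_rmap(new_page), set_pmd_at(), update_mmu_cache(),
 *	page_remove_rmap(page), mem_cgroup_end_migration() under the lock,
 *	then drop the old page's rmap, LRU isolation and local references.
 */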