The !THP pte_numa code from the unified tree is not working very
well for me: I suspect it would work better with migration bandwidth
throttling in place, but without that (and in the form of my port to
the unified tree) it performs badly in a number of situations:

 - when for whatever reason the numa_pmd entry is not established yet
   and threads are hitting the 4K ptes then the pte lock can kill
   performance quickly:

    19.29%       process 1  [kernel.kallsyms]        [k] do_raw_spin_lock
                 |
                 --- do_raw_spin_lock
                    |
                    |--99.67%-- _raw_spin_lock
                    |          |
                    |          |--34.47%-- remove_migration_pte
                    |          |          rmap_walk
                    |          |          move_to_new_page
                    |          |          migrate_pages
                    |          |          migrate_misplaced_page_put
                    |          |          __do_numa_page.isra.56
                    |          |          handle_pte_fault
                    |          |          handle_mm_fault
                    |          |          __do_page_fault
                    |          |          do_page_fault
                    |          |          page_fault
                    |          |          __memset_sse2
                    |          |
                    |          |--34.32%-- __page_check_address
                    |          |          try_to_unmap_one
                    |          |          try_to_unmap_anon
                    |          |          try_to_unmap
                    |          |          migrate_pages
                    |          |          migrate_misplaced_page_put
                    |          |          __do_numa_page.isra.56
                    |          |          handle_pte_fault
                    |          |          handle_mm_fault
                    |          |          __do_page_fault
                    |          |          do_page_fault
                    |          |          page_fault
                    |          |          __memset_sse2
    [...]

 - even if the pmd entry is established we'd hit ptes in a loop while
   other CPUs do it too, seeing the migration ptes as they are being
   established and torn down - resulting in up to 1 million page faults
   per second on my test-system. Not a happy sight and you really don't
   want me to cite that profile here.

So import the 2M-EMU handling code from the v17 numa/core tree, which
was working reasonably well, and add a few other goodies as well:

 - let the first page of an emulated large page determine the target
   node - and also pass down the expected interleaving shift to
   mpol_misplaced(), for overload situations where one group of
   threads spans multiple nodes.

 - turn off the pmd clustering in change_protection() - because the
   2M-emu code works better at the moment. We can re-establish it if
   it's enhanced. I kept both variants for the time being, feedback
   is welcome on this issue.

 - instead of calling mpol_misplaced() 512 times per emulated hugepage,
   extract the cpupid operation from it. This results in measurably
   lower CPU overhead for this functionality.

4K-intense workloads are immediately much happier: 3-5K pagefaults/sec
on my 32-way test-box and a lot less migrations all around.
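
( Not part of the patch, just an illustration of the batching above:
  the range that one emulated-2M fault covers is simply the faulting
  pte's pmd span, clamped to the vma. A rough user-space sketch of
  that address arithmetic - the constants are the usual x86-64 values
  and 'struct vma_range'/'emu_2m_range' are made-up stand-ins, not
  kernel interfaces:

	#include <stdio.h>

	#define PAGE_SIZE	(1UL << 12)	/* 4K */
	#define PMD_SIZE	(1UL << 21)	/* one pmd maps 2M, i.e. 512 ptes */
	#define PMD_MASK	(~(PMD_SIZE - 1))

	/* Simplified stand-in for the vma bounds the kernel code uses: */
	struct vma_range { unsigned long vm_start, vm_end; };

	/* Range of 4K ptes that a single fault at 'addr0' would batch over: */
	static void emu_2m_range(const struct vma_range *vma, unsigned long addr0)
	{
		unsigned long pmd_start = addr0 & PMD_MASK;
		unsigned long start = pmd_start > vma->vm_start ? pmd_start : vma->vm_start;
		unsigned long end = pmd_start + PMD_SIZE < vma->vm_end ?
					pmd_start + PMD_SIZE : vma->vm_end;

		printf("fault at %#lx -> handle %lu ptes in [%#lx, %#lx)\n",
		       addr0, (end - start) / PAGE_SIZE, start, end);
	}

	int main(void)
	{
		struct vma_range vma = { 0x700000200000UL, 0x700000750000UL };

		emu_2m_range(&vma, 0x700000234000UL);	/* interior fault: full 512-pte pmd  */
		emu_2m_range(&vma, 0x700000712000UL);	/* last pmd: range clamped at vm_end */

		return 0;
	}

  The first call reports the full 512 ptes of its pmd, the second a
  truncated 336 - roughly the range the new do_numa_page() in the
  patch below iterates over in one go. )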
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx>
Cc: Rik van Riel <riel@xxxxxxxxxx>
Cc: Mel Gorman <mgorman@xxxxxxx>
Cc: Hugh Dickins <hughd@xxxxxxxxxx>
Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx>
---
 include/linux/mempolicy.h |   4 +-
 mm/huge_memory.c          |   2 +-
 mm/memory.c               | 153 +++++++++++++++++++++++++++++++++++-----------
 mm/mempolicy.c            |  13 +---
 mm/mprotect.c             |   4 +-
 5 files changed, 127 insertions(+), 49 deletions(-)

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index f44b7f3..8bb6ab5 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -161,7 +161,7 @@ static inline int vma_migratable(struct vm_area_struct *vma)
 	return 1;
 }
 
-extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long);
+extern int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr, int shift);
 
 #else
 
@@ -289,7 +289,7 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol,
 }
 
 static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
-				 unsigned long address)
+				 unsigned long address, int shift)
 {
 	return -1; /* no node preference */
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e6820aa..7c82f28 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1043,7 +1043,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (page_nid == numa_node_id())
 		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
 
-	target_nid = mpol_misplaced(page, vma, haddr);
+	target_nid = mpol_misplaced(page, vma, haddr, HPAGE_SHIFT);
 	if (target_nid == -1) {
 		put_page(page);
 		goto clear_pmdnuma;
diff --git a/mm/memory.c b/mm/memory.c
index 6ebfbbe..fc0026e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3455,6 +3455,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
 
+#ifdef CONFIG_NUMA_BALANCING
 static int numa_migration_target(struct page *page, struct vm_area_struct *vma,
 				 unsigned long addr, int page_nid)
 {
@@ -3462,57 +3463,50 @@ static int numa_migration_target(struct page *page, struct vm_area_struct *vma,
 	if (page_nid == numa_node_id())
 		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
 
-	return mpol_misplaced(page, vma, addr);
+	return mpol_misplaced(page, vma, addr, PAGE_SHIFT);
 }
 
-int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		 unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
+static int __do_numa_page(int target_nid, struct mm_struct *mm, struct vm_area_struct *vma,
+			  unsigned long addr, pte_t *ptep, pmd_t *pmd,
+			  unsigned int flags, pte_t pte, spinlock_t *ptl)
 {
 	struct page *page = NULL;
 	bool migrated = false;
-	spinlock_t *ptl;
-	int target_nid;
 	int last_cpupid;
 	int page_nid;
 
-	/*
-	 * The "pte" at this point cannot be used safely without
-	 * validation through pte_unmap_same(). It's of NUMA type but
-	 * the pfn may be screwed if the read is non atomic.
-	 *
-	 * ptep_modify_prot_start is not called as this is clearing
-	 * the _PAGE_NUMA bit and it is not really expected that there
-	 * would be concurrent hardware modifications to the PTE.
-	 */
-	ptl = pte_lockptr(mm, pmd);
-	spin_lock(ptl);
-	if (unlikely(!pte_same(*ptep, pte))) {
-		pte_unmap_unlock(ptep, ptl);
-		return 0;
-	}
-
+	/* Mark it non-NUMA first: */
 	pte = pte_mknonnuma(pte);
 	set_pte_at(mm, addr, ptep, pte);
 	update_mmu_cache(vma, addr, ptep);
 
 	page = vm_normal_page(vma, addr, pte);
-	if (!page) {
-		pte_unmap_unlock(ptep, ptl);
+	if (!page)
 		return 0;
-	}
 
 	page_nid = page_to_nid(page);
 	WARN_ON_ONCE(page_nid == -1);
 
-	/* Get it before mpol_misplaced() flips it: */
-	last_cpupid = page_last__cpupid(page);
+	/*
+	 * Propagate the last_cpupid access info, even though
+	 * the target_nid has already been established for
+	 * this NID range:
+	 */
+	{
+		int this_cpupid;
+		int this_cpu;
+		int this_node;
+
+		this_cpu = raw_smp_processor_id();
+		this_node = numa_node_id();
 
-	target_nid = numa_migration_target(page, vma, addr, page_nid);
-	if (target_nid == -1) {
-		pte_unmap_unlock(ptep, ptl);
-		goto out;
+		this_cpupid = cpu_pid_to_cpupid(this_cpu, current->pid);
+
+		last_cpupid = page_xchg_last_cpupid(page, this_cpupid);
 	}
-	WARN_ON_ONCE(target_nid == page_nid);
+
+	if (target_nid == -1 || target_nid == page_nid)
+		goto out;
 
 	/* Get a reference for migration: */
 	get_page(page);
@@ -3522,6 +3516,8 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	migrated = migrate_misplaced_page_put(page, target_nid); /* Drops the reference */
 	if (migrated)
 		page_nid = target_nid;
+
+	spin_lock(ptl);
 out:
 	/* Always account where the page currently is, physically: */
 	task_numa_fault(addr, page_nid, last_cpupid, 1, migrated);
@@ -3529,9 +3525,81 @@ out:
 	return 0;
 }
 
+/*
+ * Also fault over nearby ptes from within the same pmd and vma,
+ * in order to minimize the overhead from page fault exceptions:
+ */
+static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long addr0, pte_t *ptep0, pmd_t *pmd,
+			unsigned int flags, pte_t entry0)
+{
+	unsigned long addr0_pmd;
+	unsigned long addr_start;
+	unsigned long addr;
+	struct page *page0;
+	spinlock_t *ptl;
+	pte_t *ptep_start;
+	pte_t *ptep;
+	pte_t entry;
+	int target_nid;
+
+	WARN_ON_ONCE(addr0 < vma->vm_start || addr0 >= vma->vm_end);
+
+	addr0_pmd = addr0 & PMD_MASK;
+	addr_start = max(addr0_pmd, vma->vm_start);
+
+	ptep_start = pte_offset_map(pmd, addr_start);
+	ptl = pte_lockptr(mm, pmd);
+	spin_lock(ptl);
+
+	ptep = ptep_start+1;
+
+	/*
+	 * The first page of the range represents the NUMA
+	 * placement of the range. This way we get consistent
+	 * placement even if the faults themselves might hit
+	 * this area at different offsets:
+	 */
+	target_nid = -1;
+	entry = ACCESS_ONCE(*ptep_start);
+	if (pte_present(entry)) {
+		page0 = vm_normal_page(vma, addr_start, entry);
+		if (page0) {
+			target_nid = mpol_misplaced(page0, vma, addr_start, PMD_SHIFT);
+			if (target_nid == -1)
+				target_nid = page_to_nid(page0);
+		}
+		if (WARN_ON_ONCE(target_nid == -1))
+			target_nid = numa_node_id();
+	}
+
+	for (addr = addr_start+PAGE_SIZE; addr < vma->vm_end; addr += PAGE_SIZE, ptep++) {
+
+		if ((addr & PMD_MASK) != addr0_pmd)
+			break;
+
+		entry = ACCESS_ONCE(*ptep);
+
+		if (!pte_present(entry))
+			continue;
+		if (!pte_numa(entry))
+			continue;
+
+		__do_numa_page(target_nid, mm, vma, addr, ptep, pmd, flags, entry, ptl);
+	}
+
+	entry = ACCESS_ONCE(*ptep_start);
+	if (pte_present(entry) && pte_numa(entry))
+		__do_numa_page(target_nid, mm, vma, addr_start, ptep_start, pmd, flags, entry, ptl);
+
+	pte_unmap_unlock(ptep_start, ptl);
+
+	return 0;
+}
+
 /* NUMA hinting page fault entry point for regular pmds */
-int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		       unsigned long addr, pmd_t *pmdp)
+static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+			    unsigned long addr, pmd_t *pmdp)
 {
 	pmd_t pmd;
 	pte_t *pte, *orig_pte;
@@ -3558,6 +3626,7 @@ int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	VM_BUG_ON(offset >= PMD_SIZE);
 	orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
 	pte += offset >> PAGE_SHIFT;
+
 	for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
 		struct page *page;
 		int page_nid;
@@ -3581,6 +3650,9 @@ int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		if (pte_numa(pteval)) {
 			pteval = pte_mknonnuma(pteval);
 			set_pte_at(mm, addr, pte, pteval);
+		} else {
+			/* Should not happen */
+			WARN_ON_ONCE(1);
 		}
 		page = vm_normal_page(vma, addr, pteval);
 		if (unlikely(!page))
@@ -3621,6 +3693,19 @@ int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	return 0;
 }
+#else
+static inline int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+			       unsigned long addr0, pte_t *ptep0, pmd_t *pmd,
+			       unsigned int flags, pte_t entry0)
+{
+	return 0;
+}
+static inline int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+				   unsigned long addr, pmd_t *pmdp)
+{
+	return 0;
+}
+#endif
 
 /*
  * These routines also need to handle stuff like marking pages dirty
@@ -3661,7 +3746,7 @@ int handle_pte_fault(struct mm_struct *mm,
 	}
 
 	if (pte_numa(entry))
-		return do_numa_page(mm, vma, address, entry, pte, pmd);
+		return do_numa_page(mm, vma, address, pte, pmd, flags, entry);
 
 	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 6bb9fd0..128e2e7 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2339,7 +2339,7 @@ static void sp_free(struct sp_node *n)
  * Policy determination "mimics" alloc_page_vma().
  * Called from fault path where we know the vma and faulting address.
  */
-int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
+int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr, int shift)
 {
 	struct mempolicy *pol;
 	struct zone *zone;
@@ -2353,6 +2353,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 	BUG_ON(!vma);
 
 	pol = get_vma_policy(current, vma, addr);
+
 	if (!(pol->flags & MPOL_F_MOF))
 		goto out_keep_page;
 	if (task_numa_shared(current) < 0)
@@ -2360,23 +2361,13 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 
 	switch (pol->mode) {
 	case MPOL_INTERLEAVE:
-	{
-		int shift;
-
 		BUG_ON(addr >= vma->vm_end);
 		BUG_ON(addr < vma->vm_start);
 
-#ifdef CONFIG_HUGETLB_PAGE
-		if (transparent_hugepage_enabled(vma) || vma->vm_flags & VM_HUGETLB)
-			shift = HPAGE_SHIFT;
-		else
-#endif
-			shift = PAGE_SHIFT;
-
 		target_node = interleave_nid(pol, vma, addr, shift);
 		goto out_keep_page;
-	}
+
 	case MPOL_PREFERRED:
 		if (pol->flags & MPOL_F_LOCAL)
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 47335a9..b5be3f1 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -138,19 +138,21 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *
 		pages += change_pte_range(vma, pmd, addr, next, newprot,
 				dirty_accountable, prot_numa, &all_same_node);
 
+#ifdef CONFIG_NUMA_BALANCING
 		/*
 		 * If we are changing protections for NUMA hinting faults then
 		 * set pmd_numa if the examined pages were all on the same
 		 * node. This allows a regular PMD to be handled as one fault
 		 * and effectively batches the taking of the PTL
 		 */
-		if (prot_numa && all_same_node) {
+		if (prot_numa && all_same_node && 0) {
 			struct mm_struct *mm = vma->vm_mm;
 
 			spin_lock(&mm->page_table_lock);
 			set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd));
 			spin_unlock(&mm->page_table_lock);
 		}
+#endif
 	} while (pmd++, addr = next, addr != end);
 
 	return pages;
-- 
1.7.11.7
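
( A side note on the new 'shift' argument to mpol_misplaced(), again
  not part of the patch: for MPOL_INTERLEAVE the chosen node in effect
  cycles with the vma-relative offset taken at 'shift' granularity, so
  the PMD_SHIFT passed down by the emulated-2M path keeps all 512 ptes
  of a pmd range on one interleave node, while PAGE_SHIFT would spread
  them page by page. A toy user-space model - it ignores vm_pgoff and
  the policy nodemask, and toy_interleave_nid() is a made-up helper,
  not the kernel's interleave_nid():

	#include <stdio.h>

	#define PAGE_SHIFT	12
	#define PMD_SHIFT	21

	/* Toy model: node cycles with the vma-relative offset at 'shift' granularity: */
	static int toy_interleave_nid(unsigned long vm_start, unsigned long addr,
				      int shift, int nr_nodes)
	{
		return (int)(((addr - vm_start) >> shift) % nr_nodes);
	}

	int main(void)
	{
		unsigned long vm_start = 0x700000000000UL;
		unsigned long addr;
		int nr_nodes = 4;

		for (addr = vm_start; addr < vm_start + 4 * 4096; addr += 4096) {
			printf("%#lx: PAGE_SHIFT -> node %d, PMD_SHIFT -> node %d\n",
			       addr,
			       toy_interleave_nid(vm_start, addr, PAGE_SHIFT, nr_nodes),
			       toy_interleave_nid(vm_start, addr, PMD_SHIFT, nr_nodes));
		}

		return 0;
	}

  With PAGE_SHIFT the four neighbouring pages land on nodes 0..3, with
  PMD_SHIFT they all land on node 0 - the kind of consistent placement
  that letting the first page pick the target node relies on. )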