Commit-ID: d9f1eb87270494d889ded7fc3fc7fe8effb41e42 Gitweb: http://git.kernel.org/tip/d9f1eb87270494d889ded7fc3fc7fe8effb41e42 Author: Ingo Molnar <mingo@xxxxxxxxxx> AuthorDate: Mon, 3 Dec 2012 10:15:27 +0100 Committer: Ingo Molnar <mingo@xxxxxxxxxx> CommitDate: Mon, 3 Dec 2012 10:19:43 +0100 numa, mempolicy: Improve CONFIG_NUMA_BALANCING=y OOM behavior Zhouping Liu reported worse out-of-memory behavior with CONFIG_NUMA_BALANCING=y, compared to the mainline kernel. One reason for that change in behavior is that with typical applications the mainline kernel allocates memory essentially randomly, and leaves it where it was. "Random" placement is not the worst possible placement - in fact it's a pretty good placement strategy. It's definitely possible for a NUMA-aware kernel to do worse than that, and CONFIG_NUMA_BALANCING=y regressed because it's very opinionated about which node tasks should execute on and on which node they should allocate memory. One such problematic case is when a node has already used up most of its memory - in that case it's pointless trying to allocate even more memory from there. Doing so would trigger OOMs even though the system has more memory on other nodes. The migration code is already trying to be nice when allocating memory for NUMA purposes - extend this concept to mempolicy driven allocations as well. Expose migrate_balanced_pgdat() and use it. If all else fails, try just as hard as the old code would. Hopefully this improves behavior in memory allocation corner cases. [ migrate_balanced_pgdat() should probably be moved to mm/page_alloc.c and be renamed to balanced_pgdat() or so - but this patch tries to be minimalistic. 
] Reported-by: Zhouping Liu <zliu@xxxxxxxxxx> Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> Cc: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx> Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx> Cc: Rik van Riel <riel@xxxxxxxxxx> Cc: Mel Gorman <mgorman@xxxxxxx> Cc: Hugh Dickins <hughd@xxxxxxxxxx> Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx> --- kernel/sched/core.c | 2 +- mm/mempolicy.c | 93 ++++++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 78 insertions(+), 17 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 05d4e1d..26ab5ff 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1566,7 +1566,7 @@ static void __sched_fork(struct task_struct *p) p->ideal_cpu_curr = -1; atomic_set(&p->numa_policy.refcnt, 1); p->numa_policy.mode = MPOL_INTERLEAVE; - p->numa_policy.flags = 0; + p->numa_policy.flags = MPOL_F_MOF; p->numa_policy.v.preferred_node = 0; p->numa_policy.v.nodes = node_online_map; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0649679..42da0f2 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -115,7 +115,7 @@ enum zone_type policy_zone = 0; static struct mempolicy default_policy_local = { .refcnt = ATOMIC_INIT(1), /* never free it */ .mode = MPOL_PREFERRED, - .flags = MPOL_F_LOCAL, + .flags = MPOL_F_LOCAL | MPOL_F_MOF, }; static struct mempolicy *default_policy(void) @@ -1746,11 +1746,14 @@ unsigned slab_node(void) struct zonelist *zonelist; struct zone *zone; enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); + int node; + zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0]; (void)first_zones_zonelist(zonelist, highest_zoneidx, &policy->v.nodes, &zone); - return zone ? zone->node : numa_node_id(); + node = zone ? 
zone->node : numa_node_id(); + return node; } default: @@ -1960,6 +1963,66 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, return page; } +static struct page * +alloc_pages_nice(gfp_t gfp, int order, struct mempolicy *pol, int best_nid) +{ + struct zonelist *zl = policy_zonelist(gfp, pol, best_nid); +#ifdef CONFIG_NUMA_BALANCING + unsigned int pages = 1 << order; + gfp_t gfp_nice = gfp | GFP_THISNODE; +#endif + struct page *page = NULL; + nodemask_t *nodemask; + + nodemask = policy_nodemask(gfp, pol); + +#ifdef CONFIG_NUMA_BALANCING + /* Is our preferred node possible? */ + if (nodemask && !node_isset(best_nid, *nodemask)) + best_nid = find_first_bit(nodemask->bits, MAX_NUMNODES); + + if (migrate_balanced_pgdat(NODE_DATA(best_nid), pages)) { + page = alloc_pages_node(best_nid, gfp_nice, order); + if (page) + return page; + } + + /* + * For non-hard-bound tasks, see whether there's another node + * before trying harder: + */ + if (current->nr_cpus_allowed > 1) { + int nid; + + if (nodemask) { + int first_nid = find_first_bit(nodemask->bits, MAX_NUMNODES); + + page = alloc_pages_node(first_nid, gfp_nice, order); + if (page) + return page; + } + + /* + * Pick a less loaded node, if possible: + */ + for_each_node(nid) { + if (!migrate_balanced_pgdat(NODE_DATA(nid), pages)) + continue; + + page = alloc_pages_node(nid, gfp_nice, order); + if (page) + return page; + } + } +#endif + + /* If all failed then try the original plan: */ + if (!page) + page = __alloc_pages_nodemask(gfp, order, zl, nodemask); + + return page; +} + /** * alloc_pages_vma - Allocate a page for a VMA. 
* @@ -1988,8 +2051,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, int node) { struct mempolicy *pol; - struct zonelist *zl; - struct page *page; + struct page *page = NULL; unsigned int cpuset_mems_cookie; retry_cpuset: @@ -2007,13 +2069,12 @@ retry_cpuset: return page; } - zl = policy_zonelist(gfp, pol, node); if (unlikely(mpol_needs_cond_ref(pol))) { /* * slow path: ref counted shared policy */ - struct page *page = __alloc_pages_nodemask(gfp, order, - zl, policy_nodemask(gfp, pol)); + page = alloc_pages_nice(gfp, order, pol, node); + __mpol_put(pol); if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) goto retry_cpuset; @@ -2022,10 +2083,10 @@ retry_cpuset: /* * fast path: default or task policy */ - page = __alloc_pages_nodemask(gfp, order, zl, - policy_nodemask(gfp, pol)); + page = alloc_pages_nice(gfp, order, pol, node); if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) goto retry_cpuset; + return page; } @@ -2067,9 +2128,7 @@ retry_cpuset: if (pol->mode == MPOL_INTERLEAVE) page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); else - page = __alloc_pages_nodemask(gfp, order, - policy_zonelist(gfp, pol, numa_node_id()), - policy_nodemask(gfp, pol)); + page = alloc_pages_nice(gfp, order, pol, numa_node_id()); if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) goto retry_cpuset; @@ -2284,8 +2343,10 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long pol = get_vma_policy(current, vma, addr); if (!(pol->flags & MPOL_F_MOF)) - goto out; - + goto out_keep_page; + if (task_numa_shared(current) < 0) + goto out_keep_page; + switch (pol->mode) { case MPOL_INTERLEAVE: { @@ -2321,7 +2382,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long * If no allowed nodes, use current [!misplaced]. 
*/ if (node_isset(page_nid, pol->v.nodes)) - goto out; + goto out_keep_page; (void)first_zones_zonelist( node_zonelist(numa_node_id(), GFP_HIGHUSER), gfp_zone(GFP_HIGHUSER), @@ -2369,7 +2430,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long if (cpu_last_access == this_cpu) target_node = this_node; } -out: +out_keep_page: mpol_cond_put(pol); /* Page already at its ideal target node: */ -- To unsubscribe from this list: send the line "unsubscribe linux-tip-commits" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html