Commit-ID: d9f1eb87270494d889ded7fc3fc7fe8effb41e42 Gitweb: http://git.kernel.org/tip/d9f1eb87270494d889ded7fc3fc7fe8effb41e42 Author: Ingo Molnar <mingo@xxxxxxxxxx> AuthorDate: Mon, 3 Dec 2012 10:15:27 +0100 Committer: Ingo Molnar <mingo@xxxxxxxxxx> CommitDate: Mon, 3 Dec 2012 10:19:43 +0100 numa, mempolicy: Improve CONFIG_NUMA_BALANCING=y OOM behavior Zhouping Liu reported worse out-of-memory behavior with CONFIG_NUMA_BALANCING=y, compared to the mainline kernel. One reason for that change in behavior is that with typical applications the mainline kernel allocates memory essentially randomly, and leaves it where it was. "Random" placement is not the worst possible placement - in fact it's a pretty good placement strategy. It's definitely possible for a NUMA-aware kernel to do worse than that, and CONFIG_NUMA_BALANCING=y regressed because it's very opinionated about which node tasks should execute on and on which node they should allocate memory. One such problematic case is when a node has already used up most of its memory - in that case it's pointless trying to allocate even more memory from there. Doing so would trigger OOMs even though the system has more memory on other nodes. The migration code is already trying to be nice when allocating memory for NUMA purposes - extend this concept to mempolicy driven allocations as well. Expose migrate_balanced_pgdat() and use it. If all else fails, try just as hard as the old code would. Hopefully this improves behavior in memory allocation corner cases. [ migrate_balanced_pgdat() should probably be moved to mm/page_alloc.c and be renamed to balanced_pgdat() or so - but this patch tries to be minimalistic. 
] Reported-by: Zhouping Liu <zliu@xxxxxxxxxx> Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> Cc: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx> Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx> Cc: Rik van Riel <riel@xxxxxxxxxx> Cc: Mel Gorman <mgorman@xxxxxxx> Cc: Hugh Dickins <hughd@xxxxxxxxxx> Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx> --- kernel/sched/core.c | 2 +- mm/mempolicy.c | 93 ++++++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 78 insertions(+), 17 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 05d4e1d..26ab5ff 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1566,7 +1566,7 @@ static void __sched_fork(struct task_struct *p) p->ideal_cpu_curr = -1; atomic_set(&p->numa_policy.refcnt, 1); p->numa_policy.mode = MPOL_INTERLEAVE; - p->numa_policy.flags = 0; + p->numa_policy.flags = MPOL_F_MOF; p->numa_policy.v.preferred_node = 0; p->numa_policy.v.nodes = node_online_map; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0649679..42da0f2 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -115,7 +115,7 @@ enum zone_type policy_zone = 0; static struct mempolicy default_policy_local = { .refcnt = ATOMIC_INIT(1), /* never free it */ .mode = MPOL_PREFERRED, - .flags = MPOL_F_LOCAL, + .flags = MPOL_F_LOCAL | MPOL_F_MOF, }; static struct mempolicy *default_policy(void) @@ -1746,11 +1746,14 @@ unsigned slab_node(void) struct zonelist *zonelist; struct zone *zone; enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); + int node; + zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0]; (void)first_zones_zonelist(zonelist, highest_zoneidx, &policy->v.nodes, &zone); - return zone ? zone->node : numa_node_id(); + node = zone ? 
zone->node : numa_node_id(); + return node; } default: @@ -1960,6 +1963,66 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, return page; } +static struct page * +alloc_pages_nice(gfp_t gfp, int order, struct mempolicy *pol, int best_nid) +{ + struct zonelist *zl = policy_zonelist(gfp, pol, best_nid); +#ifdef CONFIG_NUMA_BALANCING + unsigned int pages = 1 << order; + gfp_t gfp_nice = gfp | GFP_THISNODE; +#endif + struct page *page = NULL; + nodemask_t *nodemask; + + nodemask = policy_nodemask(gfp, pol); + +#ifdef CONFIG_NUMA_BALANCING + /* Is our preferred node possible? */ + if (nodemask && !node_isset(best_nid, *nodemask)) + best_nid = find_first_bit(nodemask->bits, MAX_NUMNODES); + + if (migrate_balanced_pgdat(NODE_DATA(best_nid), pages)) { + page = alloc_pages_node(best_nid, gfp_nice, order); + if (page) + return page; + } + + /* + * For non-hard-bound tasks, see whether there's another node + * before trying harder: + */ + if (current->nr_cpus_allowed > 1) { + int nid; + + if (nodemask) { + int first_nid = find_first_bit(nodemask->bits, MAX_NUMNODES); + + page = alloc_pages_node(first_nid, gfp_nice, order); + if (page) + return page; + } + + /* + * Pick a less loaded node, if possible: + */ + for_each_node(nid) { + if (!migrate_balanced_pgdat(NODE_DATA(nid), pages)) + continue; + + page = alloc_pages_node(nid, gfp_nice, order); + if (page) + return page; + } + } +#endif + + /* If all failed then try the original plan: */ + if (!page) + page = __alloc_pages_nodemask(gfp, order, zl, nodemask); + + return page; +} + /** * alloc_pages_vma - Allocate a page for a VMA. 
* @@ -1988,8 +2051,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, int node) { struct mempolicy *pol; - struct zonelist *zl; - struct page *page; + struct page *page = NULL; unsigned int cpuset_mems_cookie; retry_cpuset: @@ -2007,13 +2069,12 @@ retry_cpuset: return page; } - zl = policy_zonelist(gfp, pol, node); if (unlikely(mpol_needs_cond_ref(pol))) { /* * slow path: ref counted shared policy */ - struct page *page = __alloc_pages_nodemask(gfp, order, - zl, policy_nodemask(gfp, pol)); + page = alloc_pages_nice(gfp, order, pol, node); + __mpol_put(pol); if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) goto retry_cpuset; @@ -2022,10 +2083,10 @@ retry_cpuset: /* * fast path: default or task policy */ - page = __alloc_pages_nodemask(gfp, order, zl, - policy_nodemask(gfp, pol)); + page = alloc_pages_nice(gfp, order, pol, node); if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) goto retry_cpuset; + return page; } @@ -2067,9 +2128,7 @@ retry_cpuset: if (pol->mode == MPOL_INTERLEAVE) page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); else - page = __alloc_pages_nodemask(gfp, order, - policy_zonelist(gfp, pol, numa_node_id()), - policy_nodemask(gfp, pol)); + page = alloc_pages_nice(gfp, order, pol, numa_node_id()); if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) goto retry_cpuset; @@ -2284,8 +2343,10 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long pol = get_vma_policy(current, vma, addr); if (!(pol->flags & MPOL_F_MOF)) - goto out; - + goto out_keep_page; + if (task_numa_shared(current) < 0) + goto out_keep_page; + switch (pol->mode) { case MPOL_INTERLEAVE: { @@ -2321,7 +2382,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long * If no allowed nodes, use current [!misplaced]. 
*/ if (node_isset(page_nid, pol->v.nodes)) - goto out; + goto out_keep_page; (void)first_zones_zonelist( node_zonelist(numa_node_id(), GFP_HIGHUSER), gfp_zone(GFP_HIGHUSER), @@ -2369,7 +2430,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long if (cpu_last_access == this_cpu) target_node = this_node; } -out: +out_keep_page: mpol_cond_put(pol); /* Page already at its ideal target node: */ -- To unsubscribe from this list: send the line "unsubscribe linux-tip-commits" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html