The patch titled oom-kill: fix NUMA constraint check with nodemask has been removed from the -mm tree. Its filename was oom-kill-fix-numa-consraint-check-with-nodemask-v42.patch This patch was dropped because it was merged into mainline or a subsystem tree The current -mm tree may be found at http://userweb.kernel.org/~akpm/mmotm/ ------------------------------------------------------ Subject: oom-kill: fix NUMA constraint check with nodemask From: KAMEZAWA Hiroyuki <kamezawa.hioryu@xxxxxxxxxxxxxx> Fix node-oriented allocation handling in oom-kill.c I myself think of this as a bugfix not as an ehnancement. In these days, things are changed as - alloc_pages() eats nodemask as its arguments, __alloc_pages_nodemask(). - mempolicy don't maintain its own private zonelists. (And cpuset doesn't use nodemask for __alloc_pages_nodemask()) So, current oom-killer's check function is wrong. This patch does - check nodemask, if nodemask && nodemask doesn't cover all node_states[N_HIGH_MEMORY], this is CONSTRAINT_MEMORY_POLICY. - Scan all zonelist under nodemask, if it hits cpuset's wall this faiulre is from cpuset. And - modifies the caller of out_of_memory not to call oom if __GFP_THISNODE. This doesn't change "current" behavior. If callers use __GFP_THISNODE it should handle "page allocation failure" by itself. - handle __GFP_NOFAIL+__GFP_THISNODE path. This is something like a FIXME but this gfpmask is not used now. [akpm@xxxxxxxxxxxxxxxxxxxx: coding-style fixes] Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hioryu@xxxxxxxxxxxxxx> Acked-by: David Rientjes <rientjes@xxxxxxxxxx> Cc: Daisuke Nishimura <nishimura@xxxxxxxxxxxxxxxxx> Cc: KOSAKI Motohiro <kosaki.motohiro@xxxxxxxxxxxxxx> Cc: Christoph Lameter <cl@xxxxxxxxxxxxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- drivers/char/sysrq.c | 2 - include/linux/oom.h | 4 ++- mm/oom_kill.c | 46 +++++++++++++++++++++++++++++------------ mm/page_alloc.c | 22 ++++++++++++++----- 4 files changed, 53 insertions(+), 21 deletions(-) diff -puN drivers/char/sysrq.c~oom-kill-fix-numa-consraint-check-with-nodemask-v42 drivers/char/sysrq.c --- a/drivers/char/sysrq.c~oom-kill-fix-numa-consraint-check-with-nodemask-v42 +++ a/drivers/char/sysrq.c @@ -339,7 +339,7 @@ static struct sysrq_key_op sysrq_term_op static void moom_callback(struct work_struct *ignored) { - out_of_memory(node_zonelist(0, GFP_KERNEL), GFP_KERNEL, 0); + out_of_memory(node_zonelist(0, GFP_KERNEL), GFP_KERNEL, 0, NULL); } static DECLARE_WORK(moom_work, moom_callback); diff -puN include/linux/oom.h~oom-kill-fix-numa-consraint-check-with-nodemask-v42 include/linux/oom.h --- a/include/linux/oom.h~oom-kill-fix-numa-consraint-check-with-nodemask-v42 +++ a/include/linux/oom.h @@ -10,6 +10,7 @@ #ifdef __KERNEL__ #include <linux/types.h> +#include <linux/nodemask.h> struct zonelist; struct notifier_block; @@ -26,7 +27,8 @@ enum oom_constraint { extern int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_flags); extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); -extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order); +extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, + int order, nodemask_t *mask); extern int register_oom_notifier(struct notifier_block *nb); extern int unregister_oom_notifier(struct notifier_block *nb); diff -puN mm/oom_kill.c~oom-kill-fix-numa-consraint-check-with-nodemask-v42 mm/oom_kill.c --- a/mm/oom_kill.c~oom-kill-fix-numa-consraint-check-with-nodemask-v42 +++ a/mm/oom_kill.c @@ -196,27 +196,46 @@ unsigned long badness(struct task_struct /* * Determine the type of allocation constraint. */ -static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist, - gfp_t gfp_mask) -{ #ifdef CONFIG_NUMA +static enum oom_constraint constrained_alloc(struct zonelist *zonelist, + gfp_t gfp_mask, nodemask_t *nodemask) +{ struct zone *zone; struct zoneref *z; enum zone_type high_zoneidx = gfp_zone(gfp_mask); - nodemask_t nodes = node_states[N_HIGH_MEMORY]; - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) - if (cpuset_zone_allowed_softwall(zone, gfp_mask)) - node_clear(zone_to_nid(zone), nodes); - else - return CONSTRAINT_CPUSET; + /* + * Reach here only when __GFP_NOFAIL is used. So, we should avoid + * to kill current.We have to random task kill in this case. + * Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now. + */ + if (gfp_mask & __GFP_THISNODE) + return CONSTRAINT_NONE; - if (!nodes_empty(nodes)) + /* + * The nodemask here is a nodemask passed to alloc_pages(). Now, + * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy + * feature. mempolicy is an only user of nodemask here. + * check mempolicy's nodemask contains all N_HIGH_MEMORY + */ + if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) return CONSTRAINT_MEMORY_POLICY; -#endif + + /* Check this allocation failure is caused by cpuset's wall function */ + for_each_zone_zonelist_nodemask(zone, z, zonelist, + high_zoneidx, nodemask) + if (!cpuset_zone_allowed_softwall(zone, gfp_mask)) + return CONSTRAINT_CPUSET; return CONSTRAINT_NONE; } +#else +static enum oom_constraint constrained_alloc(struct zonelist *zonelist, + gfp_t gfp_mask, nodemask_t *nodemask) +{ + return CONSTRAINT_NONE; +} +#endif /* * Simple selection loop. We chose the process with the highest @@ -613,7 +632,8 @@ rest_and_return: * OR try to be smart about which process to kill. Note that we * don't have to be perfect here, we just have to be good. */ -void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) +void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, + int order, nodemask_t *nodemask) { unsigned long freed = 0; enum oom_constraint constraint; @@ -632,7 +652,7 @@ void out_of_memory(struct zonelist *zone * Check if there were limitations on the allocation (only relevant for * NUMA) that may require different handling. */ - constraint = constrained_alloc(zonelist, gfp_mask); + constraint = constrained_alloc(zonelist, gfp_mask, nodemask); read_lock(&tasklist_lock); switch (constraint) { diff -puN mm/page_alloc.c~oom-kill-fix-numa-consraint-check-with-nodemask-v42 mm/page_alloc.c --- a/mm/page_alloc.c~oom-kill-fix-numa-consraint-check-with-nodemask-v42 +++ a/mm/page_alloc.c @@ -1654,12 +1654,22 @@ __alloc_pages_may_oom(gfp_t gfp_mask, un if (page) goto out; - /* The OOM killer will not help higher order allocs */ - if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL)) - goto out; - + if (!(gfp_mask & __GFP_NOFAIL)) { + /* The OOM killer will not help higher order allocs */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + goto out; + /* + * GFP_THISNODE contains __GFP_NORETRY and we never hit this. + * Sanity check for bare calls of __GFP_THISNODE, not real OOM. + * The caller should handle page allocation failure by itself if + * it specifies __GFP_THISNODE. + * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. + */ + if (gfp_mask & __GFP_THISNODE) + goto out; + } /* Exhausted what can be done so it's blamo time */ - out_of_memory(zonelist, gfp_mask, order); + out_of_memory(zonelist, gfp_mask, order, nodemask); out: clear_zonelist_oom(zonelist, gfp_mask); @@ -3123,7 +3133,7 @@ static int __cpuinit process_zones(int c if (percpu_pagelist_fraction) setup_pagelist_highmark(zone_pcp(zone, cpu), - (zone->present_pages / percpu_pagelist_fraction)); + (zone->present_pages / percpu_pagelist_fraction)); } return 0; _ Patches currently in -mm which might be from kamezawa.hioryu@xxxxxxxxxxxxxx are origin.patch -- To unsubscribe from this list: send the line "unsubscribe mm-commits" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html