When out_of_memory() is called consecutively, a last second allocation attempt made after selecting an OOM victim can sometimes succeed, because somebody (presumably previously killed OOM victims) might have managed to free memory while we were selecting an OOM victim, which can take quite some time; the setting of MMF_OOM_SKIP by exiting OOM victims is not serialized by oom_lock. Therefore, this patch moves the last second allocation attempt to after selecting an OOM victim. This patch is expected to considerably reduce the time window for potentially premature OOM killing. Signed-off-by: Tetsuo Handa <penguin-kernel@xxxxxxxxxxxxxxxxxxx> Suggested-by: Michal Hocko <mhocko@xxxxxxxx> Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx> Cc: Johannes Weiner <hannes@xxxxxxxxxxx> --- include/linux/oom.h | 13 +++++++++++++ mm/oom_kill.c | 14 ++++++++++++++ mm/page_alloc.c | 40 ++++++++++++++++++++++++---------------- 3 files changed, 51 insertions(+), 16 deletions(-) diff --git a/include/linux/oom.h b/include/linux/oom.h index 01c91d8..27cd36b 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -14,6 +14,8 @@ struct notifier_block; struct mem_cgroup; struct task_struct; +struct alloc_context; +struct page; /* * Details of the page allocation that triggered the oom killer that are used to @@ -38,6 +40,15 @@ struct oom_control { */ const int order; + /* Context for really last second allocation attempt. */ + const struct alloc_context *ac; + /* + * Set by the OOM killer if ac != NULL and last second allocation + * attempt succeeded. If ac != NULL, the caller must check for + * page != NULL. 
+ */ + struct page *page; + /* Used by oom implementation, do not set */ unsigned long totalpages; struct task_struct *chosen; @@ -102,6 +113,8 @@ extern unsigned long oom_badness(struct task_struct *p, extern struct task_struct *find_lock_task_mm(struct task_struct *p); +extern struct page *alloc_pages_before_oomkill(const struct oom_control *oc); + /* sysctls */ extern int sysctl_oom_dump_tasks; extern int sysctl_oom_kill_allocating_task; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 85eced9..cf6f19b 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1065,6 +1065,9 @@ bool out_of_memory(struct oom_control *oc) if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task && current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) && current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { + oc->page = alloc_pages_before_oomkill(oc); + if (oc->page) + return true; get_task_struct(current); oc->chosen = current; oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)"); @@ -1072,6 +1075,17 @@ bool out_of_memory(struct oom_control *oc) } select_bad_process(oc); + /* + * Try really last second allocation attempt after we selected an OOM + * victim, for somebody might have managed to free memory while we were + * selecting an OOM victim which can take quite some time. + */ + oc->page = alloc_pages_before_oomkill(oc); + if (oc->page) { + if (oc->chosen && oc->chosen != (void *)-1UL) + put_task_struct(oc->chosen); + return true; + } /* Found nothing?!?! Either we hang forever, or we panic. */ if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) { dump_header(oc, NULL); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 613814c..764f24c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3325,6 +3325,7 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) 
.memcg = NULL, .gfp_mask = gfp_mask, .order = order, + .ac = ac, }; struct page *page; @@ -3340,18 +3341,6 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) return NULL; } - /* - * This allocation attempt must not depend on __GFP_DIRECT_RECLAIM && - * !__GFP_NORETRY allocation which will never fail due to oom_lock - * already held. And since this allocation attempt does not sleep, - * there is no reason we must use high watermark here. - */ - page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) & - ~__GFP_DIRECT_RECLAIM, order, - ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac); - if (page) - goto out; - /* Coredumps can quickly deplete all memory reserves */ if (current->flags & PF_DUMPCORE) goto out; @@ -3386,16 +3375,18 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) goto out; /* Exhausted what can be done so it's blamo time */ - if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { + if (out_of_memory(&oc)) { + *did_some_progress = 1; + page = oc.page; + } else if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { *did_some_progress = 1; /* * Help non-failing allocations by giving them access to memory * reserves */ - if (gfp_mask & __GFP_NOFAIL) - page = __alloc_pages_cpuset_fallback(gfp_mask, order, - ALLOC_NO_WATERMARKS, ac); + page = __alloc_pages_cpuset_fallback(gfp_mask, order, + ALLOC_NO_WATERMARKS, ac); } out: mutex_unlock(&oom_lock); @@ -4155,6 +4146,23 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) return page; } +struct page *alloc_pages_before_oomkill(const struct oom_control *oc) +{ + /* + * This allocation attempt must not depend on __GFP_DIRECT_RECLAIM && + * !__GFP_NORETRY allocation which will never fail due to oom_lock + * already held. And since this allocation attempt does not sleep, + * there is no reason we must use high watermark here. 
+ */ + int alloc_flags = ALLOC_CPUSET | ALLOC_WMARK_HIGH; + gfp_t gfp_mask = oc->gfp_mask | __GFP_HARDWALL; + + if (!oc->ac) + return NULL; + gfp_mask &= ~__GFP_DIRECT_RECLAIM; + return get_page_from_freelist(gfp_mask, oc->order, alloc_flags, oc->ac); +} + static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid, nodemask_t *nodemask, struct alloc_context *ac, gfp_t *alloc_mask, -- 1.8.3.1 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href="mailto:dont@xxxxxxxxx">email@xxxxxxxxx</a>