On Wed, 11 Mar 2020, Tetsuo Handa wrote:

> > The patch certainly prevents unnecessary oom kills when there is a pending
> > victim that uncharges its memory between invoking the oom killer and
> > finding MMF_OOM_SKIP in the list of eligible tasks, and it's much more
> > common on systems with limited cpu cores.
>
> I think that it is dump_header() which currently spends enough time (due
> to synchronous printing) to make "the second memcg oom kill shows usage
> is >40MB below its limit of 100MB" happen. Shouldn't we call dump_header()
> and then do the last check and end with a "but did not kill anybody"
> message?
>

Lol, I actually did that for internal testing as well :)  I didn't like
how it spammed the kernel log and then basically said "just kidding,
nothing was oom killed."  But if you think this would be helpful I can
propose it as v2.  (A rough sketch of that variant is appended after the
patch.)
---
 include/linux/memcontrol.h |  7 +++++++
 mm/memcontrol.c            |  2 +-
 mm/oom_kill.c              | 15 ++++++++++++++-
 3 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -445,6 +445,8 @@ void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
 int mem_cgroup_scan_tasks(struct mem_cgroup *,
 			  int (*)(struct task_struct *, void *), void *);
 
+unsigned long mem_cgroup_margin(struct mem_cgroup *memcg);
+
 static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
 {
 	if (mem_cgroup_disabled())
@@ -945,6 +947,11 @@ static inline int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
 	return 0;
 }
 
+static inline unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
+{
+	return 0;
+}
+
 static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
 {
 	return 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1286,7 +1286,7 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
  * Returns the maximum amount of memory @mem can be charged with, in
  * pages.
  */
-static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
+unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
 {
 	unsigned long margin = 0;
 	unsigned long count;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -934,7 +934,6 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
 	mmdrop(mm);
 	put_task_struct(victim);
 }
-#undef K
 
 /*
  * Kill provided task unless it's secured by setting
@@ -982,6 +981,19 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 	 */
	oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);
 
+	/* One last check: do we *really* need to kill? */
+	if (is_memcg_oom(oc)) {
+		unsigned long margin = mem_cgroup_margin(oc->memcg);
+
+		if (unlikely(margin >= (1 << oc->order))) {
+			mem_cgroup_put(oom_group);	/* ref taken just above */
+			put_task_struct(victim);
+			pr_info("Suppressed oom kill, %lukB of memory can be charged\n",
+				K(margin));
+			return;
+		}
+	}
+
 	__oom_kill_process(victim, message);
 
 	/*
@@ -994,6 +1006,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 		mem_cgroup_put(oom_group);
 	}
 }
+#undef K
 
 /*
  * Determines whether the kernel must panic because of the panic_on_oom sysctl.
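
For reference, the v2 Tetsuo is suggesting would be roughly the fragment
below.  This is an untested sketch only, not part of this patch: it would
replace the last-minute check added above in oom_kill_process(), reusing
the existing dump_header() report helper from mm/oom_kill.c so the log
shows the state that led here before we decline to kill.

	/*
	 * v2 sketch: emit the usual oom report first, then bail out with
	 * an explicit message instead of killing if the margin check
	 * shows a kill is no longer necessary.
	 */
	if (is_memcg_oom(oc)) {
		unsigned long margin = mem_cgroup_margin(oc->memcg);

		if (unlikely(margin >= (1 << oc->order))) {
			dump_header(oc, victim);
			pr_info("Out of memory, but did not kill anybody: %lukB can still be charged\n",
				K(margin));
			mem_cgroup_put(oom_group);
			put_task_struct(victim);
			return;
		}
	}

This is exactly the log spam concern above: every suppressed kill would
still emit (and pay the synchronous printing cost of) the full
dump_header() output before saying nothing was killed.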