On Thu 30-11-17 15:28:20, Roman Gushchin wrote: > Traditionally, the OOM killer is operating on a process level. > Under oom conditions, it finds a process with the highest oom score > and kills it. > > This behavior doesn't suit well the system with many running > containers: > > 1) There is no fairness between containers. A small container with > few large processes will be chosen over a large one with huge > number of small processes. > > 2) Containers often do not expect that some random process inside > will be killed. In many cases much safer behavior is to kill > all tasks in the container. Traditionally, this was implemented > in userspace, but doing it in the kernel has some advantages, > especially in a case of a system-wide OOM. > > To address these issues, the cgroup-aware OOM killer is introduced. > > This patch introduces the core functionality: an ability to select > a memory cgroup as an OOM victim. Under OOM conditions the OOM killer > looks for the biggest leaf memory cgroup and kills the biggest > task belonging to it. > > The following patches will extend this functionality to consider > non-leaf memory cgroups as OOM victims, and also provide an ability > to kill all tasks belonging to the victim cgroup. > > The root cgroup is treated as a leaf memory cgroup, so it's score > is compared with other leaf memory cgroups. > Due to memcg statistics implementation a special approximation > is used for estimating oom_score of root memory cgroup: we sum > oom_score of the belonging processes (or, to be more precise, > tasks owning their mm structures). 
> > Signed-off-by: Roman Gushchin <guro@xxxxxx> > Cc: Michal Hocko <mhocko@xxxxxxxx> > Cc: Johannes Weiner <hannes@xxxxxxxxxxx> > Cc: Vladimir Davydov <vdavydov.dev@xxxxxxxxx> > Cc: Tetsuo Handa <penguin-kernel@xxxxxxxxxxxxxxxxxxx> > Cc: David Rientjes <rientjes@xxxxxxxxxx> > Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> > Cc: Tejun Heo <tj@xxxxxxxxxx> > Cc: kernel-team@xxxxxx > Cc: cgroups@xxxxxxxxxxxxxxx > Cc: linux-doc@xxxxxxxxxxxxxxx > Cc: linux-kernel@xxxxxxxxxxxxxxx > Cc: linux-mm@xxxxxxxxx I am not entirely happy that this patch enables the cgroup behavior unconditionally for cgroup v2 but a later patch fixes that up. I do not expect people are going to bisect oom workloads over these few commits so this should not be a big deal. Anyway I still _strongly_ believe that the new heuristic is not suitable for the default behavior and the opt-in is required. So my ack is under this condition. Acked-by: Michal Hocko <mhocko@xxxxxxxx> > --- > include/linux/memcontrol.h | 17 +++++ > include/linux/oom.h | 12 ++- > mm/memcontrol.c | 181 +++++++++++++++++++++++++++++++++++++++++++++ > mm/oom_kill.c | 84 +++++++++++++++------ > 4 files changed, 272 insertions(+), 22 deletions(-) > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h > index 882046863581..cb4db659a8b5 100644 > --- a/include/linux/memcontrol.h > +++ b/include/linux/memcontrol.h > @@ -35,6 +35,7 @@ struct mem_cgroup; > struct page; > struct mm_struct; > struct kmem_cache; > +struct oom_control; > > /* Cgroup-specific page state, on top of universal node page state */ > enum memcg_stat_item { > @@ -344,6 +345,11 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ > return css ? 
container_of(css, struct mem_cgroup, css) : NULL; > } > > +static inline void mem_cgroup_put(struct mem_cgroup *memcg) > +{ > + css_put(&memcg->css); > +} > + > #define mem_cgroup_from_counter(counter, member) \ > container_of(counter, struct mem_cgroup, member) > > @@ -482,6 +488,8 @@ static inline bool task_in_memcg_oom(struct task_struct *p) > > bool mem_cgroup_oom_synchronize(bool wait); > > +bool mem_cgroup_select_oom_victim(struct oom_control *oc); > + > #ifdef CONFIG_MEMCG_SWAP > extern int do_swap_account; > #endif > @@ -781,6 +789,10 @@ static inline bool task_in_mem_cgroup(struct task_struct *task, > return true; > } > > +static inline void mem_cgroup_put(struct mem_cgroup *memcg) > +{ > +} > + > static inline struct mem_cgroup * > mem_cgroup_iter(struct mem_cgroup *root, > struct mem_cgroup *prev, > @@ -973,6 +985,11 @@ static inline > void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) > { > } > + > +static inline bool mem_cgroup_select_oom_victim(struct oom_control *oc) > +{ > + return false; > +} > #endif /* CONFIG_MEMCG */ > > /* idx can be of type enum memcg_stat_item or node_stat_item */ > diff --git a/include/linux/oom.h b/include/linux/oom.h > index 27cd36b762b5..10f495c8454d 100644 > --- a/include/linux/oom.h > +++ b/include/linux/oom.h > @@ -10,6 +10,13 @@ > #include <linux/sched/coredump.h> /* MMF_* */ > #include <linux/mm.h> /* VM_FAULT* */ > > + > +/* > + * Special value returned by victim selection functions to indicate > + * that are inflight OOM victims. 
> + */ > +#define INFLIGHT_VICTIM ((void *)-1UL) > + > struct zonelist; > struct notifier_block; > struct mem_cgroup; > @@ -51,7 +58,8 @@ struct oom_control { > > /* Used by oom implementation, do not set */ > unsigned long totalpages; > - struct task_struct *chosen; > + struct task_struct *chosen_task; > + struct mem_cgroup *chosen_memcg; > unsigned long chosen_points; > }; > > @@ -115,6 +123,8 @@ extern struct task_struct *find_lock_task_mm(struct task_struct *p); > > extern struct page *alloc_pages_before_oomkill(const struct oom_control *oc); > > +extern int oom_evaluate_task(struct task_struct *task, void *arg); > + > /* sysctls */ > extern int sysctl_oom_dump_tasks; > extern int sysctl_oom_kill_allocating_task; > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index 55fbda60cef6..592ffb1c98a7 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -2664,6 +2664,187 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg) > return ret; > } > > +static long memcg_oom_badness(struct mem_cgroup *memcg, > + const nodemask_t *nodemask, > + unsigned long totalpages) > +{ > + long points = 0; > + int nid; > + pg_data_t *pgdat; > + > + for_each_node_state(nid, N_MEMORY) { > + if (nodemask && !node_isset(nid, *nodemask)) > + continue; > + > + points += mem_cgroup_node_nr_lru_pages(memcg, nid, > + LRU_ALL_ANON | BIT(LRU_UNEVICTABLE)); > + > + pgdat = NODE_DATA(nid); > + points += lruvec_page_state(mem_cgroup_lruvec(pgdat, memcg), > + NR_SLAB_UNRECLAIMABLE); > + } > + > + points += memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) / > + (PAGE_SIZE / 1024); > + points += memcg_page_state(memcg, MEMCG_SOCK); > + points += memcg_page_state(memcg, MEMCG_SWAP); > + > + return points; > +} > + > +/* > + * Checks if the given memcg is a valid OOM victim and returns a number, > + * which means the folowing: > + * -1: there are inflight OOM victim tasks, belonging to the memcg > + * 0: memcg is not eligible, e.g. 
all belonging tasks are protected > + * by oom_score_adj set to OOM_SCORE_ADJ_MIN > + * >0: memcg is eligible, and the returned value is an estimation > + * of the memory footprint > + */ > +static long oom_evaluate_memcg(struct mem_cgroup *memcg, > + const nodemask_t *nodemask, > + unsigned long totalpages) > +{ > + struct css_task_iter it; > + struct task_struct *task; > + int eligible = 0; > + > + /* > + * Root memory cgroup is a special case: > + * we don't have necessary stats to evaluate it exactly as > + * leaf memory cgroups, so we approximate it's oom_score > + * by summing oom_score of all belonging tasks, which are > + * owners of their mm structs. > + * > + * If there are inflight OOM victim tasks inside > + * the root memcg, we return -1. > + */ > + if (memcg == root_mem_cgroup) { > + struct css_task_iter it; > + struct task_struct *task; > + long score = 0; > + > + css_task_iter_start(&memcg->css, 0, &it); > + while ((task = css_task_iter_next(&it))) { > + if (tsk_is_oom_victim(task) && > + !test_bit(MMF_OOM_SKIP, > + &task->signal->oom_mm->flags)) { > + score = -1; > + break; > + } > + > + task_lock(task); > + if (!task->mm || task->mm->owner != task) { > + task_unlock(task); > + continue; > + } > + task_unlock(task); > + > + score += oom_badness(task, memcg, nodemask, > + totalpages); > + } > + css_task_iter_end(&it); > + > + return score; > + } > + > + /* > + * Memcg is OOM eligible if there are OOM killable tasks inside. > + * > + * We treat tasks with oom_score_adj set to OOM_SCORE_ADJ_MIN > + * as unkillable. > + * > + * If there are inflight OOM victim tasks inside the memcg, > + * we return -1. 
> + */ > + css_task_iter_start(&memcg->css, 0, &it); > + while ((task = css_task_iter_next(&it))) { > + if (!eligible && > + task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) > + eligible = 1; > + > + if (tsk_is_oom_victim(task) && > + !test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) { > + eligible = -1; > + break; > + } > + } > + css_task_iter_end(&it); > + > + if (eligible <= 0) > + return eligible; > + > + return memcg_oom_badness(memcg, nodemask, totalpages); > +} > + > +static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc) > +{ > + struct mem_cgroup *iter; > + > + oc->chosen_memcg = NULL; > + oc->chosen_points = 0; > + > + /* > + * The oom_score is calculated for leaf memory cgroups (including > + * the root memcg). > + */ > + rcu_read_lock(); > + for_each_mem_cgroup_tree(iter, root) { > + long score; > + > + if (memcg_has_children(iter) && iter != root_mem_cgroup) > + continue; > + > + score = oom_evaluate_memcg(iter, oc->nodemask, oc->totalpages); > + > + /* > + * Ignore empty and non-eligible memory cgroups. > + */ > + if (score == 0) > + continue; > + > + /* > + * If there are inflight OOM victims, we don't need > + * to look further for new victims. 
> + */ > + if (score == -1) { > + oc->chosen_memcg = INFLIGHT_VICTIM; > + mem_cgroup_iter_break(root, iter); > + break; > + } > + > + if (score > oc->chosen_points) { > + oc->chosen_points = score; > + oc->chosen_memcg = iter; > + } > + } > + > + if (oc->chosen_memcg && oc->chosen_memcg != INFLIGHT_VICTIM) > + css_get(&oc->chosen_memcg->css); > + > + rcu_read_unlock(); > +} > + > +bool mem_cgroup_select_oom_victim(struct oom_control *oc) > +{ > + struct mem_cgroup *root; > + > + if (mem_cgroup_disabled()) > + return false; > + > + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) > + return false; > + > + if (oc->memcg) > + root = oc->memcg; > + else > + root = root_mem_cgroup; > + > + select_victim_memcg(root, oc); > + > + return oc->chosen_memcg; > +} > + > /* > * Reclaims as many pages from the given memcg as possible. > * > diff --git a/mm/oom_kill.c b/mm/oom_kill.c > index f041534d77d3..bcfa92f29407 100644 > --- a/mm/oom_kill.c > +++ b/mm/oom_kill.c > @@ -309,7 +309,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) > return CONSTRAINT_NONE; > } > > -static int oom_evaluate_task(struct task_struct *task, void *arg) > +int oom_evaluate_task(struct task_struct *task, void *arg) > { > struct oom_control *oc = arg; > unsigned long points; > @@ -343,26 +343,26 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) > goto next; > > /* Prefer thread group leaders for display purposes */ > - if (points == oc->chosen_points && thread_group_leader(oc->chosen)) > + if (points == oc->chosen_points && thread_group_leader(oc->chosen_task)) > goto next; > select: > - if (oc->chosen) > - put_task_struct(oc->chosen); > + if (oc->chosen_task) > + put_task_struct(oc->chosen_task); > get_task_struct(task); > - oc->chosen = task; > + oc->chosen_task = task; > oc->chosen_points = points; > next: > return 0; > abort: > - if (oc->chosen) > - put_task_struct(oc->chosen); > - oc->chosen = (void *)-1UL; > + if (oc->chosen_task) > + 
put_task_struct(oc->chosen_task); > + oc->chosen_task = INFLIGHT_VICTIM; > return 1; > } > > /* > * Simple selection loop. We choose the process with the highest number of > - * 'points'. In case scan was aborted, oc->chosen is set to -1. > + * 'points'. In case scan was aborted, oc->chosen_task is set to -1. > */ > static void select_bad_process(struct oom_control *oc) > { > @@ -895,7 +895,7 @@ static void __oom_kill_process(struct task_struct *victim) > > static void oom_kill_process(struct oom_control *oc, const char *message) > { > - struct task_struct *p = oc->chosen; > + struct task_struct *p = oc->chosen_task; > unsigned int points = oc->chosen_points; > struct task_struct *victim = p; > struct task_struct *child; > @@ -956,6 +956,27 @@ static void oom_kill_process(struct oom_control *oc, const char *message) > __oom_kill_process(victim); > } > > +static bool oom_kill_memcg_victim(struct oom_control *oc) > +{ > + > + if (oc->chosen_memcg == NULL || oc->chosen_memcg == INFLIGHT_VICTIM) > + return oc->chosen_memcg; > + > + /* Kill a task in the chosen memcg with the biggest memory footprint */ > + oc->chosen_points = 0; > + oc->chosen_task = NULL; > + mem_cgroup_scan_tasks(oc->chosen_memcg, oom_evaluate_task, oc); > + > + if (oc->chosen_task == NULL || oc->chosen_task == INFLIGHT_VICTIM) > + goto out; > + > + __oom_kill_process(oc->chosen_task); > + > +out: > + mem_cgroup_put(oc->chosen_memcg); > + return oc->chosen_task; > +} > + > /* > * Determines whether the kernel must panic because of the panic_on_oom sysctl. 
> */ > @@ -1008,6 +1029,7 @@ bool out_of_memory(struct oom_control *oc) > { > unsigned long freed = 0; > enum oom_constraint constraint = CONSTRAINT_NONE; > + bool delay = false; /* if set, delay next allocation attempt */ > > if (oom_killer_disabled) > return false; > @@ -1055,11 +1077,26 @@ bool out_of_memory(struct oom_control *oc) > if (oc->page) > return true; > get_task_struct(current); > - oc->chosen = current; > + oc->chosen_task = current; > oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)"); > return true; > } > > + if (mem_cgroup_select_oom_victim(oc)) { > + oc->page = alloc_pages_before_oomkill(oc); > + if (oc->page) { > + if (oc->chosen_memcg && > + oc->chosen_memcg != INFLIGHT_VICTIM) > + mem_cgroup_put(oc->chosen_memcg); > + return true; > + } > + > + if (oom_kill_memcg_victim(oc)) { > + delay = true; > + goto out; > + } > + } > + > select_bad_process(oc); > /* > * Try really last second allocation attempt after we selected an OOM > @@ -1068,25 +1105,30 @@ bool out_of_memory(struct oom_control *oc) > */ > oc->page = alloc_pages_before_oomkill(oc); > if (oc->page) { > - if (oc->chosen && oc->chosen != (void *)-1UL) > - put_task_struct(oc->chosen); > + if (oc->chosen_task && oc->chosen_task != INFLIGHT_VICTIM) > + put_task_struct(oc->chosen_task); > return true; > } > /* Found nothing?!?! Either we hang forever, or we panic. */ > - if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) { > + if (!oc->chosen_task && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) { > dump_header(oc, NULL); > panic("Out of memory and no killable processes...\n"); > } > - if (oc->chosen && oc->chosen != (void *)-1UL) { > + if (oc->chosen_task && oc->chosen_task != INFLIGHT_VICTIM) { > oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" : > "Memory cgroup out of memory"); > - /* > - * Give the killed process a good chance to exit before trying > - * to allocate memory again. 
> - */ > - schedule_timeout_killable(1); > + delay = true; > } > - return !!oc->chosen; > + > +out: > + /* > + * Give the killed process a good chance to exit before trying > + * to allocate memory again. > + */ > + if (delay) > + schedule_timeout_killable(1); > + > + return !!oc->chosen_task; > } > > /* > -- > 2.14.3 > -- Michal Hocko SUSE Labs -- To unsubscribe from this list: send the line "unsubscribe cgroups" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html