Hi Roman, great work! This looks mostly good to me now. Below are some nitpicks concerning naming and code layout, but nothing major. On Mon, Aug 14, 2017 at 07:32:11PM +0100, Roman Gushchin wrote: > @@ -39,6 +39,7 @@ struct oom_control { > unsigned long totalpages; > struct task_struct *chosen; > unsigned long chosen_points; > + struct mem_cgroup *chosen_memcg; > }; Please rename 'chosen' to 'chosen_task' to make the distinction to chosen_memcg clearer. The ordering is a little weird too, with chosen_points in between. chosen_task chosen_memcg chosen_points ? > @@ -2639,6 +2639,181 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg) > return ret; > } > > +static long memcg_oom_badness(struct mem_cgroup *memcg, > + const nodemask_t *nodemask) > +{ > + long points = 0; > + int nid; > + > + for_each_node_state(nid, N_MEMORY) { > + if (nodemask && !node_isset(nid, *nodemask)) > + continue; > + > + points += mem_cgroup_node_nr_lru_pages(memcg, nid, > + LRU_ALL_ANON | BIT(LRU_UNEVICTABLE)); > + } > + > + points += memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) / > + (PAGE_SIZE / 1024); > + points += memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE); NR_SLAB_UNRECLAIMABLE is now accounted per-lruvec, which takes nodeness into account, and so would be more accurate here. You can get it with mem_cgroup_lruvec() and lruvec_page_state(). > + points += memcg_page_state(memcg, MEMCG_SOCK); > + points += memcg_page_state(memcg, MEMCG_SWAP); > + > + return points; > +} > + > +static long oom_evaluate_memcg(struct mem_cgroup *memcg, > + const nodemask_t *nodemask) > +{ > + struct css_task_iter it; > + struct task_struct *task; > + int elegible = 0; eligible > + > + css_task_iter_start(&memcg->css, 0, &it); > + while ((task = css_task_iter_next(&it))) { > + /* > + * If there are no tasks, or all tasks have oom_score_adj set > + * to OOM_SCORE_ADJ_MIN and oom_kill_all_tasks is not set, > + * don't select this memory cgroup. > + */ > + if (!elegible && > + (memcg->oom_kill_all_tasks || > + task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN)) > + elegible = 1; This is a little awkward to read. How about something like this: /* * When killing individual tasks, we respect OOM score adjustments: * at least one task in the group needs to be killable for the group * to be oomable. * * Also check that previous OOM kills have finished, and abort if * there are any pending OOM victims. */ oomable = memcg->oom_kill_all_tasks; while ((task = css_task_iter_next(&it))) { if (!oomable && task->signal_oom_score_adj != OOM_SCORE_ADJ_MIN) oomable = 1; > + if (tsk_is_oom_victim(task) && > + !test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) { > + elegible = -1; > + break; > + } > + } > + css_task_iter_end(&it); etc. > + > + return elegible > 0 ? memcg_oom_badness(memcg, nodemask) : elegible; I find these much easier to read if broken up, even if it's more LOC: if (eligible <= 0) return eligible; return memcg_oom_badness(memcg, nodemask); > +static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc) > +{ > + struct mem_cgroup *iter, *parent; > + > + for_each_mem_cgroup_tree(iter, root) { > + if (memcg_has_children(iter)) { > + iter->oom_score = 0; > + continue; > + } > + > + iter->oom_score = oom_evaluate_memcg(iter, oc->nodemask); > + if (iter->oom_score == -1) { Please add comments to document the special returns. Maybe #defines would be clearer, too. > + oc->chosen_memcg = (void *)-1UL; > + mem_cgroup_iter_break(root, iter); > + return; > + } > + > + if (!iter->oom_score) > + continue; Same here. Maybe a switch would be suitable to handle the abort/no-score cases. > + for (parent = parent_mem_cgroup(iter); parent && parent != root; > + parent = parent_mem_cgroup(parent)) > + parent->oom_score += iter->oom_score; > + } > + > + for (;;) { > + struct cgroup_subsys_state *css; > + struct mem_cgroup *memcg = NULL; > + long score = LONG_MIN; > + > + css_for_each_child(css, &root->css) { > + struct mem_cgroup *iter = mem_cgroup_from_css(css); > + > + if (iter->oom_score > score) { > + memcg = iter; > + score = iter->oom_score; > + } > + } > + > + if (!memcg) { > + if (oc->memcg && root == oc->memcg) { > + oc->chosen_memcg = oc->memcg; > + css_get(&oc->chosen_memcg->css); > + oc->chosen_points = oc->memcg->oom_score; > + } > + break; > + } > + > + if (memcg->oom_kill_all_tasks || !memcg_has_children(memcg)) { > + oc->chosen_memcg = memcg; > + css_get(&oc->chosen_memcg->css); > + oc->chosen_points = score; > + break; > + } > + > + root = memcg; > + } > +} > + > +static void select_victim_root_cgroup_task(struct oom_control *oc) > +{ > + struct css_task_iter it; > + struct task_struct *task; > + int ret = 0; > + > + css_task_iter_start(&root_mem_cgroup->css, 0, &it); > + while (!ret && (task = css_task_iter_next(&it))) > + ret = oom_evaluate_task(task, oc); > + css_task_iter_end(&it); > +} > + > +bool mem_cgroup_select_oom_victim(struct oom_control *oc) > +{ > + struct mem_cgroup *root = root_mem_cgroup; > + > + oc->chosen = NULL; > + oc->chosen_memcg = NULL; > + > + if (mem_cgroup_disabled()) > + return false; > + > + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) > + return false; > + > + if (oc->memcg) > + root = oc->memcg; > + > + rcu_read_lock(); > + > + select_victim_memcg(root, oc); > + if (oc->chosen_memcg == (void *)-1UL) { > + /* Existing OOM victims are found. */ > + rcu_read_unlock(); > + return true; > + } It would be good to format this branch like the block below, with a newline and the comment before the branch block rather than inside. That would also set apart the call to select_victim_memcg(), which is the main workhorse of this function. > + /* > + * For system-wide OOMs we should consider tasks in the root cgroup > + * with oom_score larger than oc->chosen_points. > + */ > + if (!oc->memcg) { > + select_victim_root_cgroup_task(oc); > + > + if (oc->chosen && oc->chosen_memcg) { > + /* > + * If we've decided to kill a task in the root memcg, > + * release chosen_memcg. > + */ > + css_put(&oc->chosen_memcg->css); > + oc->chosen_memcg = NULL; > + } > + } ^^ like this one. > + > + rcu_read_unlock(); > + > + return !!oc->chosen || !!oc->chosen_memcg; The !! are detrimental to readability and shouldn't be necessary. > @@ -5190,6 +5365,33 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, > return nbytes; > } > > +static int memory_oom_kill_all_tasks_show(struct seq_file *m, void *v) > +{ > + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); > + bool oom_kill_all_tasks = memcg->oom_kill_all_tasks; > + > + seq_printf(m, "%d\n", oom_kill_all_tasks); > + > + return 0; > +} > + > +static ssize_t memory_oom_kill_all_tasks_write(struct kernfs_open_file *of, > + char *buf, size_t nbytes, > + loff_t off) > +{ > + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); > + int oom_kill_all_tasks; > + int err; > + > + err = kstrtoint(strstrip(buf), 0, &oom_kill_all_tasks); > + if (err) > + return err; > + > + memcg->oom_kill_all_tasks = !!oom_kill_all_tasks; > + > + return nbytes; > +} > + > static int memory_events_show(struct seq_file *m, void *v) > { > struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); > @@ -5310,6 +5512,12 @@ static struct cftype memory_files[] = { > .write = memory_max_write, > }, > { > + .name = "oom_kill_all_tasks", > + .flags = CFTYPE_NOT_ON_ROOT, > + .seq_show = memory_oom_kill_all_tasks_show, > + .write = memory_oom_kill_all_tasks_write, > + }, This name is quite a mouthful and reminiscent of the awkward v1 interface names. It doesn't really go well with the v2 names. How about memory.oom_group? > + { > .name = "events", > .flags = CFTYPE_NOT_ON_ROOT, > .file_offset = offsetof(struct mem_cgroup, events_file), > diff --git a/mm/oom_kill.c b/mm/oom_kill.c > index 5c29a3dd591b..28e42a0d5eee 100644 > --- a/mm/oom_kill.c > +++ b/mm/oom_kill.c > @@ -288,7 +288,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) > return CONSTRAINT_NONE; > } > > -static int oom_evaluate_task(struct task_struct *task, void *arg) > +int oom_evaluate_task(struct task_struct *task, void *arg) > { > struct oom_control *oc = arg; > unsigned long points; > @@ -823,6 +823,9 @@ static void __oom_kill_process(struct task_struct *victim) > struct mm_struct *mm; > bool can_oom_reap = true; > > + if (is_global_init(victim) || (victim->flags & PF_KTHREAD)) > + return; > + > p = find_lock_task_mm(victim); > if (!p) { > put_task_struct(victim); > @@ -958,6 +961,60 @@ static void oom_kill_process(struct oom_control *oc, const char *message) > put_task_struct(victim); > } > > +static int oom_kill_memcg_member(struct task_struct *task, void *unused) > +{ > + if (!tsk_is_oom_victim(task)) > + __oom_kill_process(task); > + return 0; > +} > + > +static bool oom_kill_memcg_victim(struct oom_control *oc) > +{ > + static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, > + DEFAULT_RATELIMIT_BURST); > + > + if (oc->chosen) { > + if (oc->chosen != (void *)-1UL) { > + if (__ratelimit(&oom_rs)) > + dump_header(oc, oc->chosen); > + > + __oom_kill_process(oc->chosen); > + put_task_struct(oc->chosen); > + schedule_timeout_killable(1); > + } > + return true; > + > + } else if (oc->chosen_memcg) { > + if (oc->chosen_memcg == (void *)-1UL) > + return true; Can you format the above chosen == (void *)-1UL the same way? That makes it easier to see that it's checking the same thing. Thanks -- To unsubscribe from this list: send the line "unsubscribe linux-doc" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html