Re: [v5 2/4] mm, oom: cgroup-aware OOM killer

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi Roman,

great work! This looks mostly good to me now. Below are some nitpicks
concerning naming and code layout, but nothing major.

On Mon, Aug 14, 2017 at 07:32:11PM +0100, Roman Gushchin wrote:
> @@ -39,6 +39,7 @@ struct oom_control {
>  	unsigned long totalpages;
>  	struct task_struct *chosen;
>  	unsigned long chosen_points;
> +	struct mem_cgroup *chosen_memcg;
>  };

Please rename 'chosen' to 'chosen_task' to make the distinction to
chosen_memcg clearer.

The ordering is a little weird too, with chosen_points in between.

	chosen_task
	chosen_memcg
	chosen_points

?

> @@ -2639,6 +2639,181 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)
>  	return ret;
>  }
>  
> +static long memcg_oom_badness(struct mem_cgroup *memcg,
> +			      const nodemask_t *nodemask)
> +{
> +	long points = 0;
> +	int nid;
> +
> +	for_each_node_state(nid, N_MEMORY) {
> +		if (nodemask && !node_isset(nid, *nodemask))
> +			continue;
> +
> +		points += mem_cgroup_node_nr_lru_pages(memcg, nid,
> +				LRU_ALL_ANON | BIT(LRU_UNEVICTABLE));
> +	}
> +
> +	points += memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) /
> +		(PAGE_SIZE / 1024);
> +	points += memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE);

NR_SLAB_UNRECLAIMABLE is now accounted per-lruvec, which takes
nodeness into account, and so would be more accurate here.

You can get it with mem_cgroup_lruvec() and lruvec_page_state().

> +	points += memcg_page_state(memcg, MEMCG_SOCK);
> +	points += memcg_page_state(memcg, MEMCG_SWAP);
> +
> +	return points;
> +}
> +
> +static long oom_evaluate_memcg(struct mem_cgroup *memcg,
> +			       const nodemask_t *nodemask)
> +{
> +	struct css_task_iter it;
> +	struct task_struct *task;
> +	int elegible = 0;

eligible

> +
> +	css_task_iter_start(&memcg->css, 0, &it);
> +	while ((task = css_task_iter_next(&it))) {
> +		/*
> +		 * If there are no tasks, or all tasks have oom_score_adj set
> +		 * to OOM_SCORE_ADJ_MIN and oom_kill_all_tasks is not set,
> +		 * don't select this memory cgroup.
> +		 */
> +		if (!elegible &&
> +		    (memcg->oom_kill_all_tasks ||
> +		     task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN))
> +			elegible = 1;

This is a little awkward to read. How about something like this:

	/*
	 * When killing individual tasks, we respect OOM score adjustments:
	 * at least one task in the group needs to be killable for the group
	 * to be oomable.
	 *
	 * Also check that previous OOM kills have finished, and abort if
	 * there are any pending OOM victims.
	 */
	oomable = memcg->oom_kill_all_tasks;
	while ((task = css_task_iter_next(&it))) {
		if (!oomable && task->signal_oom_score_adj != OOM_SCORE_ADJ_MIN)
			oomable = 1;

> +		if (tsk_is_oom_victim(task) &&
> +		    !test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) {
> +			elegible = -1;
> +			break;
> +		}
> +	}
> +	css_task_iter_end(&it);

etc.

> +
> +	return elegible > 0 ? memcg_oom_badness(memcg, nodemask) : elegible;

I find these much easier to read if broken up, even if it's more LOC:

	if (eligible <= 0)
		return eligible;

	return memcg_oom_badness(memcg, nodemask);

> +static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc)
> +{
> +	struct mem_cgroup *iter, *parent;
> +
> +	for_each_mem_cgroup_tree(iter, root) {
> +		if (memcg_has_children(iter)) {
> +			iter->oom_score = 0;
> +			continue;
> +		}
> +
> +		iter->oom_score = oom_evaluate_memcg(iter, oc->nodemask);
> +		if (iter->oom_score == -1) {

Please add comments to document the special returns. Maybe #defines
would be clearer, too.

> +			oc->chosen_memcg = (void *)-1UL;
> +			mem_cgroup_iter_break(root, iter);
> +			return;
> +		}
> +
> +		if (!iter->oom_score)
> +			continue;

Same here.

Maybe a switch would be suitable to handle the abort/no-score cases.

> +		for (parent = parent_mem_cgroup(iter); parent && parent != root;
> +		     parent = parent_mem_cgroup(parent))
> +			parent->oom_score += iter->oom_score;
> +	}
> +
> +	for (;;) {
> +		struct cgroup_subsys_state *css;
> +		struct mem_cgroup *memcg = NULL;
> +		long score = LONG_MIN;
> +
> +		css_for_each_child(css, &root->css) {
> +			struct mem_cgroup *iter = mem_cgroup_from_css(css);
> +
> +			if (iter->oom_score > score) {
> +				memcg = iter;
> +				score = iter->oom_score;
> +			}
> +		}
> +
> +		if (!memcg) {
> +			if (oc->memcg && root == oc->memcg) {
> +				oc->chosen_memcg = oc->memcg;
> +				css_get(&oc->chosen_memcg->css);
> +				oc->chosen_points = oc->memcg->oom_score;
> +			}
> +			break;
> +		}
> +
> +		if (memcg->oom_kill_all_tasks || !memcg_has_children(memcg)) {
> +			oc->chosen_memcg = memcg;
> +			css_get(&oc->chosen_memcg->css);
> +			oc->chosen_points = score;
> +			break;
> +		}
> +
> +		root = memcg;
> +	}
> +}
> +
> +static void select_victim_root_cgroup_task(struct oom_control *oc)
> +{
> +	struct css_task_iter it;
> +	struct task_struct *task;
> +	int ret = 0;
> +
> +	css_task_iter_start(&root_mem_cgroup->css, 0, &it);
> +	while (!ret && (task = css_task_iter_next(&it)))
> +		ret = oom_evaluate_task(task, oc);
> +	css_task_iter_end(&it);
> +}
> +
> +bool mem_cgroup_select_oom_victim(struct oom_control *oc)
> +{
> +	struct mem_cgroup *root = root_mem_cgroup;
> +
> +	oc->chosen = NULL;
> +	oc->chosen_memcg = NULL;
> +
> +	if (mem_cgroup_disabled())
> +		return false;
> +
> +	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
> +		return false;
> +
> +	if (oc->memcg)
> +		root = oc->memcg;
> +
> +	rcu_read_lock();
> +
> +	select_victim_memcg(root, oc);
> +	if (oc->chosen_memcg == (void *)-1UL) {
> +		/* Existing OOM victims are found. */
> +		rcu_read_unlock();
> +		return true;
> +	}

It would be good to format this branch like the block below, with a
newline and the comment before the branch block rather than inside.

That would also set apart the call to select_victim_memcg(), which is
the main workhorse of this function.

> +	/*
> +	 * For system-wide OOMs we should consider tasks in the root cgroup
> +	 * with oom_score larger than oc->chosen_points.
> +	 */
> +	if (!oc->memcg) {
> +		select_victim_root_cgroup_task(oc);
> +
> +		if (oc->chosen && oc->chosen_memcg) {
> +			/*
> +			 * If we've decided to kill a task in the root memcg,
> +			 * release chosen_memcg.
> +			 */
> +			css_put(&oc->chosen_memcg->css);
> +			oc->chosen_memcg = NULL;
> +		}
> +	}

^^ like this one.

> +
> +	rcu_read_unlock();
> +
> +	return !!oc->chosen || !!oc->chosen_memcg;

The !! are detrimental to readability and shouldn't be necessary.

> @@ -5190,6 +5365,33 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
>  	return nbytes;
>  }
>  
> +static int memory_oom_kill_all_tasks_show(struct seq_file *m, void *v)
> +{
> +	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
> +	bool oom_kill_all_tasks = memcg->oom_kill_all_tasks;
> +
> +	seq_printf(m, "%d\n", oom_kill_all_tasks);
> +
> +	return 0;
> +}
> +
> +static ssize_t memory_oom_kill_all_tasks_write(struct kernfs_open_file *of,
> +					       char *buf, size_t nbytes,
> +					       loff_t off)
> +{
> +	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> +	int oom_kill_all_tasks;
> +	int err;
> +
> +	err = kstrtoint(strstrip(buf), 0, &oom_kill_all_tasks);
> +	if (err)
> +		return err;
> +
> +	memcg->oom_kill_all_tasks = !!oom_kill_all_tasks;
> +
> +	return nbytes;
> +}
> +
>  static int memory_events_show(struct seq_file *m, void *v)
>  {
>  	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
> @@ -5310,6 +5512,12 @@ static struct cftype memory_files[] = {
>  		.write = memory_max_write,
>  	},
>  	{
> +		.name = "oom_kill_all_tasks",
> +		.flags = CFTYPE_NOT_ON_ROOT,
> +		.seq_show = memory_oom_kill_all_tasks_show,
> +		.write = memory_oom_kill_all_tasks_write,
> +	},

This name is quite a mouthful and reminiscent of the awkward v1
interface names. It doesn't really go well with the v2 names.

How about memory.oom_group?

> +	{
>  		.name = "events",
>  		.flags = CFTYPE_NOT_ON_ROOT,
>  		.file_offset = offsetof(struct mem_cgroup, events_file),
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index 5c29a3dd591b..28e42a0d5eee 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -288,7 +288,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
>  	return CONSTRAINT_NONE;
>  }
>  
> -static int oom_evaluate_task(struct task_struct *task, void *arg)
> +int oom_evaluate_task(struct task_struct *task, void *arg)
>  {
>  	struct oom_control *oc = arg;
>  	unsigned long points;
> @@ -823,6 +823,9 @@ static void __oom_kill_process(struct task_struct *victim)
>  	struct mm_struct *mm;
>  	bool can_oom_reap = true;
>  
> +	if (is_global_init(victim) || (victim->flags & PF_KTHREAD))
> +		return;
> +
>  	p = find_lock_task_mm(victim);
>  	if (!p) {
>  		put_task_struct(victim);
> @@ -958,6 +961,60 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
>  	put_task_struct(victim);
>  }
>  
> +static int oom_kill_memcg_member(struct task_struct *task, void *unused)
> +{
> +	if (!tsk_is_oom_victim(task))
> +		__oom_kill_process(task);
> +	return 0;
> +}
> +
> +static bool oom_kill_memcg_victim(struct oom_control *oc)
> +{
> +	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
> +				      DEFAULT_RATELIMIT_BURST);
> +
> +	if (oc->chosen) {
> +		if (oc->chosen != (void *)-1UL) {
> +			if (__ratelimit(&oom_rs))
> +				dump_header(oc, oc->chosen);
> +
> +			__oom_kill_process(oc->chosen);
> +			put_task_struct(oc->chosen);
> +			schedule_timeout_killable(1);
> +		}
> +		return true;
> +
> +	} else if (oc->chosen_memcg) {
> +		if (oc->chosen_memcg == (void *)-1UL)
> +			return true;

Can you format the above chosen == (void *)-1UL the same way? That
makes it easier to see that it's checking the same thing.

Thanks
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [Kernel Newbies]     [Security]     [Netfilter]     [Bugtraq]     [Linux FS]     [Yosemite Forum]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Samba]     [Video 4 Linux]     [Device Mapper]     [Linux Resources]

  Powered by Linux