[PATCH 3/3] memcg oom: bail out from the charge path if no victim found

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Without considering the manually triggered OOM, if no victim found in
system OOM, the system will be deadlocked on memory, however if no
victim found in memcg OOM, it can charge successfully and runs well.
This behavior in memcg oom is not proper because that can prevent the
memcg from being limited.

Take an easy example.
        $ cd /sys/fs/cgroup/foo/
	$ echo $$ > cgroup.procs
	$ echo 200M > memory.max
	$ cat memory.max
	209715200
	$ echo -1000 > /proc/$$/oom_score_adj
Then, let's run a memhog task in memcg foo, which will allocate 1G
memory and keeps running.
	$ /home/yafang/test/memhog &
Then memory.current will be greater than memory.max. Run bellow command
in another shell.
	$ cat /sys/fs/cgroup/foo/memory.current
	1097228288
The tasks which have already allocated memory and won't allocate new
memory still runs well. This behavior makes nonsense.

This patch is to improve it.
If no victim found in memcg oom, we should force the current task to
wait until there's available pages. That is similar with the behavior in
memcg1 when oom_kill_disable is set.

In memcg2, the memcg oom can also be triggered manually - by reducing
memory.max. We should distinguish this manually triggered memcg oom with
other reguler memcg oom, so a magic key "oom_control.order == -2" is set
in this situation. The tasks waiting in memcg oom will be waked up when we
enlarge the memory.max. As these tasks is killable, we can also kill
them directly.

In memcg1, it cooperates well with memory.oom_control(oom_kill_disable).
The tasks waiting in memcg1 oom can be waked up by enlarging
memory.limit_in_bytes or disabling oom_kill_disable. As oom_kill_disable
can be set manually, we should distinguish it with other reguler memcg
oom as well. A member named memcg_oom_wait is introduced in struct
task_struct to handle it.

Signed-off-by: Yafang Shao <laoar.shao@xxxxxxxxx>
---
 include/linux/memcontrol.h |  8 ++++++--
 include/linux/sched.h      |  1 +
 mm/memcontrol.c            | 25 +++++++++++++++++++++----
 mm/oom_kill.c              | 22 ++++++++++++++++++++++
 4 files changed, 50 insertions(+), 6 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 767bac135787..5df2c08e2720 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -569,7 +569,8 @@ static inline bool task_in_memcg_oom(struct task_struct *p)
 static inline enum oom_status task_in_memcg_oom_set(struct task_struct *p,
 						    struct mem_cgroup *memcg,
 						    gfp_t mask,
-						    int order)
+						    int order,
+						    bool force)
 {
 	if (!current->in_user_fault)
 		return OOM_SKIPPED;
@@ -578,6 +579,8 @@ static inline enum oom_status task_in_memcg_oom_set(struct task_struct *p,
 	p->memcg_in_oom = memcg;
 	p->memcg_oom_gfp_mask = mask;
 	p->memcg_oom_order = order;
+	if (force)
+		p->memcg_oom_wait = true;
 
 	return OOM_ASYNC;
 }
@@ -1051,7 +1054,8 @@ static inline bool task_in_memcg_oom(struct task_struct *p)
 static inline enum oom_status task_in_memcg_oom_set(struct task_struct *p,
 						    struct mem_cgroup *memcg,
 						    gfp_t mask,
-						    int order)
+						    int order,
+						    bool force)
 {
 	return OOM_SUCCESS;
 }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4418f5cb8324..cc1c7de7c248 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -774,6 +774,7 @@ struct task_struct {
 #endif
 #ifdef CONFIG_MEMCG
 	unsigned			in_user_fault:1;
+	unsigned			memcg_oom_wait:1;
 #endif
 #ifdef CONFIG_COMPAT_BRK
 	unsigned			brk_randomized:1;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d6cb1b786045..a637e13a4964 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1801,7 +1801,8 @@ static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int
 	 * victim and then we have to bail out from the charge path.
 	 */
 	if (memcg->oom_kill_disable)
-		return task_in_memcg_oom_set(current, memcg, mask, order);
+		return task_in_memcg_oom_set(current, memcg, mask,
+					     order, false);
 
 	mem_cgroup_mark_under_oom(memcg);
 
@@ -1863,7 +1864,8 @@ bool mem_cgroup_oom_synchronize(bool handle)
 	if (locked)
 		mem_cgroup_oom_notify(memcg);
 
-	if (locked && !memcg->oom_kill_disable) {
+	if (locked && !memcg->oom_kill_disable &&
+	    !current->memcg_oom_wait) {
 		mem_cgroup_unmark_under_oom(memcg);
 		finish_wait(&memcg_oom_waitq, &owait.wait);
 		mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
@@ -1883,7 +1885,10 @@ bool mem_cgroup_oom_synchronize(bool handle)
 		 */
 		memcg_oom_recover(memcg);
 	}
+
 cleanup:
+	if (current->memcg_oom_wait)
+		current->memcg_oom_wait = false;
 	current->memcg_in_oom = NULL;
 	css_put(&memcg->css);
 	return true;
@@ -6056,6 +6061,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
 	unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
 	bool drained = false;
+	bool enlarge = false;
 	unsigned long max;
 	int err;
 
@@ -6069,7 +6075,12 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
 	for (;;) {
 		unsigned long nr_pages = page_counter_read(&memcg->memory);
 
-		if (nr_pages <= max)
+		if (nr_pages < max) {
+			enlarge = true;
+			break;
+		}
+
+		if (nr_pages == max)
 			break;
 
 		if (signal_pending(current))
@@ -6081,6 +6092,9 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
 			continue;
 		}
 
+		if (memcg->under_oom)
+			break;
+
 		if (nr_reclaims) {
 			if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
 							  GFP_KERNEL, true))
@@ -6089,11 +6103,14 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
 		}
 
 		memcg_memory_event(memcg, MEMCG_OOM);
-		if (mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0) !=
+		if (mem_cgroup_out_of_memory(memcg, GFP_KERNEL, -2) !=
 		    OOM_SUCCESS)
 			break;
 	}
 
+	if (enlarge)
+		memcg_oom_recover(memcg);
+
 	memcg_wb_domain_size_changed(memcg);
 	return nbytes;
 }
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d5a941bea2d7..df564495c8b2 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -158,6 +158,15 @@ static inline bool is_sysrq_oom(struct oom_control *oc)
 	return oc->order == -1;
 }
 
+/*
+ * order == -2 means the oom kill is triggered by reducing memcg max,
+ * otherwise only for display puerposes.
+ */
+static bool is_max_write_oom(struct oom_control *oc)
+{
+	return oc->order == -2;
+}
+
 /* return true if the task is not adequate as candidate victim task. */
 static bool oom_unkillable_task(struct task_struct *p)
 {
@@ -1108,6 +1117,19 @@ enum oom_status out_of_memory(struct oom_control *oc)
 		 */
 		if (!is_sysrq_oom(oc) && !is_memcg_oom(oc))
 			panic("System is deadlocked on memory\n");
+
+		/* Bail out from the charge path if we can't find a victim. */
+		if (is_memcg_oom(oc)) {
+			if (is_max_write_oom(oc))
+				return OOM_SKIPPED;
+
+			return task_in_memcg_oom_set(current,
+						     oc->memcg,
+						     oc->gfp_mask,
+						     oc->order,
+						     true);
+		}
+
 	}
 	if (oc->chosen && oc->chosen != (void *)-1UL)
 		oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
-- 
2.18.2





[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux