[PATCH] mm, memcg: introduce per memcg oom_score_adj

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



- Why do we need a per-memcg oom_score_adj setting?
It is easy to deploy and very convenient for containers.
When we use containers, we always treat a memcg as a whole. With a
per-memcg oom_score_adj setting we don't need to set it process by process,
which is exhausting for the user to do for every process in a memcg.

This patch introduces a file named memory.oom.score_adj.
Its valid range is -1000 to +1000, the same as the process-level
oom_score_adj.
When the OOM killer is invoked, the effective oom_score_adj is computed as
below:
    effective oom_score_adj = original oom_score_adj + memory.oom.score_adj
The effective value is clamped to the same range, -1000 to +1000.
This acts as a hook to re-calculate the oom_score_adj.

Signed-off-by: Yafang Shao <laoar.shao@xxxxxxxxx>
Cc: Michal Hocko <mhocko@xxxxxxxx>
Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
Cc: Vladimir Davydov <vdavydov.dev@xxxxxxxxx>
Cc: Roman Gushchin <guro@xxxxxx>
---
 include/linux/memcontrol.h | 24 ++++++++++++++++++++++++
 mm/memcontrol.c            | 38 ++++++++++++++++++++++++++++++++++++++
 mm/oom_kill.c              | 20 ++++++++------------
 3 files changed, 70 insertions(+), 12 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 2cd4359..d2dbde5 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -21,6 +21,7 @@
 #include <linux/vmstat.h>
 #include <linux/writeback.h>
 #include <linux/page-flags.h>
+#include <linux/oom.h>
 
 struct mem_cgroup;
 struct page;
@@ -224,6 +225,7 @@ struct mem_cgroup {
 	 * Should the OOM killer kill all belonging tasks, had it kill one?
 	 */
 	bool oom_group;
+	short oom_score_adj;
 
 	/* protected by memcg_oom_lock */
 	bool		oom_lock;
@@ -538,6 +540,35 @@ static inline bool task_in_memcg_oom(struct task_struct *p)
 	return p->memcg_in_oom;
 }
 
+/*
+ * Return the effective oom_score_adj for @p: @task_adj plus the
+ * oom_score_adj of the memcg @p belongs to, clamped to
+ * [OOM_SCORE_ADJ_MIN, OOM_SCORE_ADJ_MAX]. Tasks in the root memcg keep
+ * @task_adj unchanged.
+ */
+static inline int mem_cgroup_score_adj(struct task_struct *p, int task_adj)
+{
+	struct mem_cgroup *memcg;
+	int adj = task_adj;
+
+	/*
+	 * The task -> memcg association is looked up through RCU-protected
+	 * cgroup state; callers are not guaranteed to hold rcu_read_lock().
+	 */
+	rcu_read_lock();
+	memcg = mem_cgroup_from_task(p);
+	if (memcg != root_mem_cgroup) {
+		adj += memcg->oom_score_adj;
+		if (adj < OOM_SCORE_ADJ_MIN)
+			adj = OOM_SCORE_ADJ_MIN;
+		else if (adj > OOM_SCORE_ADJ_MAX)
+			adj = OOM_SCORE_ADJ_MAX;
+	}
+	rcu_read_unlock();
+
+	return adj;
+}
+
 bool mem_cgroup_oom_synchronize(bool wait);
 struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
 					    struct mem_cgroup *oom_domain);
@@ -987,6 +1006,11 @@ static inline bool task_in_memcg_oom(struct task_struct *p)
 	return false;
 }
 
+static inline int mem_cgroup_score_adj(struct task_struct *p, int task_adj)
+{
+	return task_adj;	/* stub: no memcg bias applied */
+}
+
 static inline bool mem_cgroup_oom_synchronize(bool wait)
 {
 	return false;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6f5c0c5..065285c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5856,6 +5856,41 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
 	return nbytes;
 }
 
+/* Show the memcg's oom_score_adj bias as a decimal number. */
+static int memory_oom_score_adj_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+	seq_printf(m, "%d\n", memcg->oom_score_adj);
+
+	return 0;
+}
+
+/*
+ * Parse and store a new oom_score_adj bias for the memcg. Values outside
+ * [OOM_SCORE_ADJ_MIN, OOM_SCORE_ADJ_MAX] are rejected with -EINVAL.
+ */
+static ssize_t memory_oom_score_adj_write(struct kernfs_open_file *of,
+					  char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	int oom_score_adj;
+	int ret;
+
+	/* strstrip() never returns NULL, so no NULL check is needed. */
+	ret = kstrtoint(strstrip(buf), 0, &oom_score_adj);
+	if (ret)
+		return ret;
+
+	if (oom_score_adj < OOM_SCORE_ADJ_MIN ||
+	    oom_score_adj > OOM_SCORE_ADJ_MAX)
+		return -EINVAL;
+
+	memcg->oom_score_adj = oom_score_adj;
+
+	return nbytes;
+}
+
 static struct cftype memory_files[] = {
 	{
 		.name = "current",
@@ -5909,6 +5941,12 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
 		.seq_show = memory_oom_group_show,
 		.write = memory_oom_group_write,
 	},
+	{
+		.name = "oom.score_adj",
+		.flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
+		.seq_show = memory_oom_score_adj_show,
+		.write = memory_oom_score_adj_write,
+	},
 	{ }	/* terminate */
 };
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index eda2e2a..f3b0276 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -212,13 +212,13 @@ unsigned long oom_badness(struct task_struct *p, unsigned long totalpages)
 	 * unkillable or have been already oom reaped or the are in
 	 * the middle of vfork
 	 */
-	adj = (long)p->signal->oom_score_adj;
+	adj = mem_cgroup_score_adj(p, p->signal->oom_score_adj);
 	if (adj == OOM_SCORE_ADJ_MIN ||
 			test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
 			in_vfork(p)) {
 		task_unlock(p);
 		return 0;
 	}
 
 	/*
 	 * The baseline for the badness score is the proportion of RAM that each
@@ -404,7 +398,8 @@ static int dump_task(struct task_struct *p, void *arg)
 		task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
 		mm_pgtables_bytes(task->mm),
 		get_mm_counter(task->mm, MM_SWAPENTS),
-		task->signal->oom_score_adj, task->comm);
+		mem_cgroup_score_adj(task, task->signal->oom_score_adj),
+		task->comm);
 	task_unlock(task);
 
 	return 0;
@@ -453,7 +448,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p)
 {
 	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
 		current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
-			current->signal->oom_score_adj);
+		mem_cgroup_score_adj(current, current->signal->oom_score_adj));
 	if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
 		pr_warn("COMPACTION is disabled!!!\n");
 
@@ -939,8 +934,8 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
  */
 static int oom_kill_memcg_member(struct task_struct *task, void *message)
 {
-	if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN &&
-	    !is_global_init(task)) {
+	if (mem_cgroup_score_adj(task, task->signal->oom_score_adj) !=
+	    OOM_SCORE_ADJ_MIN && !is_global_init(task)) {
 		get_task_struct(task);
 		__oom_kill_process(task, message);
 	}
@@ -1085,7 +1080,8 @@ bool out_of_memory(struct oom_control *oc)
 	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
 	    current->mm && !oom_unkillable_task(current) &&
 	    oom_cpuset_eligible(current, oc) &&
-	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
+	    mem_cgroup_score_adj(current, current->signal->oom_score_adj) !=
+	    OOM_SCORE_ADJ_MIN) {
 		get_task_struct(current);
 		oc->chosen = current;
 		oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
-- 
1.8.3.1





[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux