Re: [PATCH 4/4] oom: don't ignore rss in nascent mm

KOSAKI Motohiro <kosaki.motohiro@xxxxxxxxxxxxxx> · Mon, 27 Sep 2010 11:50:02 +0900 (JST)

> On 09/16, KOSAKI Motohiro wrote:
> >
> > ChangeLog
> >  o since v1
> >    - Always use thread group leader's ->in_exec_mm.
> 
> Confused ;)
> 
> > +static unsigned long oom_rss_swap_usage(struct task_struct *p)
> > +{
> > +	struct task_struct *t = p;
> > +	struct task_struct *leader = p->group_leader;
> > +	unsigned long points = 0;
> > +
> > +	do {
> > +		task_lock(t);
> > +		if (t->mm) {
> > +			points += get_mm_rss(t->mm);
> > +			points += get_mm_counter(t->mm, MM_SWAPENTS);
> > +			task_unlock(t);
> > +			break;
> > +		}
> > +		task_unlock(t);
> > +	} while_each_thread(p, t);
> > +
> > +	/*
> > +	 * If the process is in execve() processing, we have to concern
> > +	 * about both old and new mm.
> > +	 */
> > +	task_lock(leader);
> > +	if (leader->in_exec_mm) {
> > +		points += get_mm_rss(leader->in_exec_mm);
> > +		points += get_mm_counter(leader->in_exec_mm, MM_SWAPENTS);
> > +	}
> > +	task_unlock(leader);
> > +
> > +	return points;
> > +}
> 
> This patch relies on fact that we can't race with de_thread() (and btw
> the change in de_thread() looks bogus). Then why ->in_exec_mm lives in
> task_struct ?
> 
> To me, this looks a bit strange. I think we should either do not use
> ->group_leader to hold ->in_exec_mm like your previous patch did, or
> move ->in_exec_mm into signal_struct. The previous 3/4 ensures that
> only one thread can set ->in_exec_mm.

Hm, okay. I'll do that.


> 
> And I don't think oom_rss_swap_usage() should replace find_lock_task_mm()
> in oom_badness(), I mean something like this:
> 
> 	static unsigned long oom_rss_swap_usage(struct mm_struct *mm)
> 	{
> 		return get_mm_rss(mm) + get_mm_counter(mm, MM_SWAPENTS);
> 	}
> 
> 	unsigned int oom_badness(struct task_struct *p, ...)
> 	{
> 		int points = 0;
> 
> 		if (unlikely(p->signal->in_exec_mm)) {
> 			task_lock(p->group_leader);
> 			if (p->signal->in_exec_mm)
> 				points = oom_rss_swap_usage(p->signal->in_exec_mm);
> 			task_unlock(p->group_leader);
> 		}
> 
> 		p = find_lock_task_mm(p);
> 		if (!p)
> 			return points;
> 
> 		...
> 	}
> 
> but this is the matter of taste.
> 
> What do you think?

Personally, I don't think this is a big matter, but I always take the
reviewer's opinion if I have no reason to oppose it. Will fix.



---------------------------------------------------------------------------
>From 882ba08dd61de3ebd429470ac11ac979e50d1615 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@xxxxxxxxxxxxxx>
Date: Sun, 12 Sep 2010 13:26:11 +0900
Subject: [PATCH] oom: don't ignore rss in nascent mm

ChangeLog
 o since v2
   - Move ->in_exec_mm from task_struct to signal_struct
   - clean up oom_rss_swap_usage()
 o since v1
   - Always use thread group leader's ->in_exec_mm.
     This makes OOM slightly more efficient when a process has many threads.
   - Add the link of Brad's explanation to the description.

Brad Spengler published a local memory-allocation DoS that
evades the OOM-killer (though not the virtual memory RLIMIT):
http://www.grsecurity.net/~spender/64bit_dos.c

execve() creates a new mm struct, sets up the stack, and copies argv
into it. This means the task temporarily has two mms during execve().
Unfortunately, this nascent mm is not pointed to by any task, so the
OOM-killer can't detect its memory usage. As a result, the OOM-killer
may kill the wrong task.

Thus, this patch adds a signal_struct->in_exec_mm member and tracks
nascent mm usage.

Cc: pageexec@xxxxxxxxxxx
Cc: Roland McGrath <roland@xxxxxxxxxx>
Cc: Solar Designer <solar@xxxxxxxxxxxx>
Cc: Eugene Teo <eteo@xxxxxxxxxx>
Reported-by: Brad Spengler <spender@xxxxxxxxxxxxxx>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@xxxxxxxxxxxxxx>
---
 fs/compat.c             |    4 +++-
 fs/exec.c               |   16 +++++++++++++++-
 include/linux/binfmts.h |    1 +
 include/linux/sched.h   |    1 +
 mm/oom_kill.c           |   26 +++++++++++++++++++-------
 5 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/fs/compat.c b/fs/compat.c
index 718c706..b631120 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1567,8 +1567,10 @@ int compat_do_execve(char * filename,
 	return retval;
 
 out:
-	if (bprm->mm)
+	if (bprm->mm) {
+		set_exec_mm(NULL);
 		mmput(bprm->mm);
+	}
 
 out_file:
 	if (bprm->file) {
diff --git a/fs/exec.c b/fs/exec.c
index 160eb46..15ab7b3 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -347,6 +347,8 @@ int bprm_mm_init(struct linux_binprm *bprm)
 	if (err)
 		goto err;
 
+	set_exec_mm(mm);
+
 	return 0;
 
 err:
@@ -745,6 +747,7 @@ static int exec_mmap(struct mm_struct *mm)
 	tsk->mm = mm;
 	tsk->active_mm = mm;
 	activate_mm(active_mm, mm);
+	tsk->signal->in_exec_mm = NULL;
 	task_unlock(tsk);
 	arch_pick_mmap_layout(mm);
 	if (old_mm) {
@@ -1314,6 +1317,15 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 
 EXPORT_SYMBOL(search_binary_handler);
 
+void set_exec_mm(struct mm_struct *mm)
+{
+	struct task_struct *leader = current->group_leader;
+
+	task_lock(leader);
+	leader->signal->in_exec_mm = mm;
+	task_unlock(leader);
+}
+
 /*
  * sys_execve() executes a new program.
  */
@@ -1402,8 +1414,10 @@ int do_execve(const char * filename,
 	return retval;
 
 out:
-	if (bprm->mm)
+	if (bprm->mm) {
+		set_exec_mm(NULL);
 		mmput (bprm->mm);
+	}
 
 out_file:
 	if (bprm->file) {
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index a065612..2fde1ba 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -133,6 +133,7 @@ extern void install_exec_creds(struct linux_binprm *bprm);
 extern void do_coredump(long signr, int exit_code, struct pt_regs *regs);
 extern void set_binfmt(struct linux_binfmt *new);
 extern void free_bprm(struct linux_binprm *);
+extern void set_exec_mm(struct mm_struct *mm);
 
 #endif /* __KERNEL__ */
 #endif /* _LINUX_BINFMTS_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 960a867..10a771d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -627,6 +627,7 @@ struct signal_struct {
 	struct mutex cred_guard_mutex;	/* guard against foreign influences on
 					 * credential calculations
 					 * (notably. ptrace) */
+	struct mm_struct *in_exec_mm;	/* temporary nascent mm in execve */
 };
 
 /* Context switch must be unlocked if interrupts are to be enabled */
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index c1beda0..18c12d1 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -120,6 +120,15 @@ struct task_struct *find_lock_task_mm(struct task_struct *p)
 	return NULL;
 }
 
+/*
+ * The baseline for the badness score is the proportion of RAM that each
+ * task's rss and swap space use.
+ */
+static unsigned long oom_rss_swap_usage(struct mm_struct *mm)
+{
+	return get_mm_rss(mm) + get_mm_counter(mm, MM_SWAPENTS);
+}
+
 /* return true if the task is not adequate as candidate victim task. */
 static bool oom_unkillable_task(struct task_struct *p, struct mem_cgroup *mem,
 			   const nodemask_t *nodemask)
@@ -151,7 +160,7 @@ static bool oom_unkillable_task(struct task_struct *p, struct mem_cgroup *mem,
 unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *mem,
 			  const nodemask_t *nodemask)
 {
-	unsigned long points;
+	unsigned long points = 0;
 	unsigned long points_orig;
 	int oom_adj = p->signal->oom_adj;
 	long oom_score_adj = p->signal->oom_score_adj;
@@ -169,15 +178,18 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *mem,
 	if (p->flags & PF_OOM_ORIGIN)
 		return ULONG_MAX;
 
+	/* The task is now processing execve(). then it has second mm */
+	if (unlikely(p->signal->in_exec_mm)) {
+		task_lock(p->group_leader);
+		if (p->signal->in_exec_mm)
+			points = oom_rss_swap_usage(p->signal->in_exec_mm);
+		task_unlock(p->group_leader);
+	}
+
 	p = find_lock_task_mm(p);
 	if (!p)
 		return 0;
-
-	/*
-	 * The baseline for the badness score is the proportion of RAM that each
-	 * task's rss and swap space use.
-	 */
-	points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS));
+	points += oom_rss_swap_usage(p->mm);
 	task_unlock(p);
 
 	/*
-- 
1.6.5.2






--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html