> Brad Spengler published a local memory-allocation DoS that > evades the OOM-killer (though not the virtual memory RLIMIT): > http://www.grsecurity.net/~spender/64bit_dos.c > > The recent changes to create a stack guard page helps slightly to > discourage this attack, but it is not sufficient. Compiling it statically > moves the libraries out of the way, allowing the stack VMA to fill the > entire TASK_SIZE. > > There are two issues: > 1) the OOM killer doesn't notice this argv memory explosion > 2) the argv expansion does not check if rlim[RLIMIT_STACK].rlim_cur is -1. > > I figure a quick solution for #2 would be the following patch. However, > running multiple copies of this program could result in similar OOM > behavior, so issue #1 still needs a solution. > >Reported-by: Brad Spengler <spender@xxxxxxxxxxxxxx> >Signed-off-by: Kees Cook <kees.cook@xxxxxxxxxxxxx> Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@xxxxxxxxxxxxxx> And, I have a patch for #1. Can you please see this? Alternative idea is to change rss accounting itself. >From d4e114e5d31b14ebfc399d4b1fb142c7dfce0ca4 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro <kosaki.motohiro@xxxxxxxxxxxxxx> Date: Thu, 19 Aug 2010 20:40:20 +0900 Subject: [PATCH] oom: don't ignore temporary rss while execve execve() makes new mm struct and setup stack and argv vector, Unfortunately this new mm is not pointed any tasks, then oom-kill can't detect this memory usage. therefore oom-kill may kill incorrect task. This patch added in-exec rss treatness to oom. Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@xxxxxxxxxxxxxx> --- fs/compat.c | 8 ++++++-- fs/exec.c | 19 +++++++++++++++++-- include/linux/binfmts.h | 1 + include/linux/sched.h | 1 + mm/oom_kill.c | 36 +++++++++++++++++++++++++++--------- 5 files changed, 52 insertions(+), 13 deletions(-) diff --git a/fs/compat.c b/fs/compat.c index 718c706..643140c 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1527,6 +1527,7 @@ int compat_do_execve(char * filename, retval = bprm_mm_init(bprm); if (retval) goto out_file; + set_exec_mm(bprm->mm); bprm->argc = compat_count(argv, MAX_ARG_STRINGS); if ((retval = bprm->argc) < 0) @@ -1560,6 +1561,7 @@ int compat_do_execve(char * filename, /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; + set_exec_mm(NULL); acct_update_integrals(current); free_bprm(bprm); if (displaced) @@ -1567,8 +1569,10 @@ int compat_do_execve(char * filename, return retval; out: - if (bprm->mm) - mmput(bprm->mm); + if (current->in_exec_mm) { + struct mm_struct *in_exec_mm = set_exec_mm(NULL); + mmput (in_exec_mm); + } out_file: if (bprm->file) { diff --git a/fs/exec.c b/fs/exec.c index 2d94552..85192e1 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1314,6 +1314,17 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) EXPORT_SYMBOL(search_binary_handler); +struct mm_struct* set_exec_mm(struct mm_struct *mm) +{ + struct mm_struct *old = current->in_exec_mm; + + task_lock(current); + current->in_exec_mm = mm; + task_unlock(current); + + return old; +} + /* * sys_execve() executes a new program. */ @@ -1361,6 +1372,7 @@ int do_execve(const char * filename, retval = bprm_mm_init(bprm); if (retval) goto out_file; + set_exec_mm(bprm->mm); bprm->argc = count(argv, MAX_ARG_STRINGS); if ((retval = bprm->argc) < 0) @@ -1395,6 +1407,7 @@ int do_execve(const char * filename, /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; + set_exec_mm(NULL); acct_update_integrals(current); free_bprm(bprm); if (displaced) @@ -1402,8 +1415,10 @@ int do_execve(const char * filename, return retval; out: - if (bprm->mm) - mmput (bprm->mm); + if (current->in_exec_mm) { + struct mm_struct *in_exec_mm = set_exec_mm(NULL); + mmput (in_exec_mm); + } out_file: if (bprm->file) { diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index a065612..8cf61eb 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -133,6 +133,7 @@ extern void install_exec_creds(struct linux_binprm *bprm); extern void do_coredump(long signr, int exit_code, struct pt_regs *regs); extern void set_binfmt(struct linux_binfmt *new); extern void free_bprm(struct linux_binprm *); +extern struct mm_struct* set_exec_mm(struct mm_struct *mm); #endif /* __KERNEL__ */ #endif /* _LINUX_BINFMTS_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 1e2a6db..d413757 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1217,6 +1217,7 @@ struct task_struct { struct plist_node pushable_tasks; struct mm_struct *mm, *active_mm; + struct mm_struct *in_exec_mm; #if defined(SPLIT_RSS_COUNTING) struct task_rss_stat rss_stat; #endif diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 57c05f7..7fc6916 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -120,6 +120,30 @@ struct task_struct *find_lock_task_mm(struct task_struct *p) return NULL; } +static unsigned long calculate_rss_swap(struct task_struct *p) +{ + struct task_struct *t = p; + int mm_accounted = 0; + unsigned long points = 0; + + do { + task_lock(t); + if (!mm_accounted && t->mm) { + points += get_mm_rss(t->mm); + points += get_mm_counter(t->mm, MM_SWAPENTS); + mm_accounted = 1; + } + if (t->in_exec_mm) { + points += get_mm_rss(t->in_exec_mm); + points += get_mm_counter(t->in_exec_mm, MM_SWAPENTS); + } + task_unlock(t); + } while_each_thread(p, t); + + return points; +} + + /* return true if the task is not adequate as candidate victim task. */ static bool oom_unkillable_task(struct task_struct *p, struct mem_cgroup *mem, const nodemask_t *nodemask) @@ -157,16 +181,11 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, if (oom_unkillable_task(p, mem, nodemask)) return 0; - p = find_lock_task_mm(p); - if (!p) - return 0; - /* * Shortcut check for OOM_SCORE_ADJ_MIN so the entire heuristic doesn't * need to be executed for something that cannot be killed. */ if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { - task_unlock(p); return 0; } @@ -175,7 +194,6 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, * priority for oom killing. */ if (p->flags & PF_OOM_ORIGIN) { - task_unlock(p); return 1000; } @@ -190,9 +208,9 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, * The baseline for the badness score is the proportion of RAM that each * task's rss and swap space use. */ - points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 / - totalpages; - task_unlock(p); + points = calculate_rss_swap(p) * 1000 / totalpages; + if (!points) + return 0; /* * Root processes get 3% bonus, just like the __vm_enough_memory() -- 1.6.5.2 -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html