Michal and Johannes, can we please stop this stupid behavior now? Reproducer: ---------- #define _GNU_SOURCE #include <stdio.h> #include <stdlib.h> #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include <unistd.h> #include <sched.h> #include <sys/mman.h> #define NUMTHREADS 256 #define MMAPSIZE 4 * 10485760 #define STACKSIZE 4096 static int pipe_fd[2] = { EOF, EOF }; static int memory_eater(void *unused) { int fd = open("/dev/zero", O_RDONLY); char *buf = mmap(NULL, MMAPSIZE, PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_SHARED, EOF, 0); read(pipe_fd[0], buf, 1); read(fd, buf, MMAPSIZE); pause(); return 0; } int main(int argc, char *argv[]) { int i; char *stack; FILE *fp; const unsigned long size = 1048576 * 200; mkdir("/sys/fs/cgroup/memory/test1", 0755); fp = fopen("/sys/fs/cgroup/memory/test1/memory.limit_in_bytes", "w"); fprintf(fp, "%lu\n", size); fclose(fp); fp = fopen("/sys/fs/cgroup/memory/test1/tasks", "w"); fprintf(fp, "%u\n", getpid()); fclose(fp); if (setgid(-2) || setuid(-2) || pipe(pipe_fd)) return 1; if (fork() == 0) { stack = mmap(NULL, STACKSIZE * NUMTHREADS, PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_SHARED, EOF, 0); for (i = 0; i < NUMTHREADS; i++) if (clone(memory_eater, stack + (i + 1) * STACKSIZE, CLONE_SIGHAND | CLONE_THREAD | CLONE_VM | CLONE_FS | CLONE_FILES, NULL) == -1) break; close(pipe_fd[1]); pause(); } close(pipe_fd[0]); for (i = 0; i < NUMTHREADS / 2; i++) if (fork() == 0) { close(pipe_fd[1]); pause(); } sleep(1); close(pipe_fd[1]); pause(); return 0; } ---------- Complete log is at http://I-love.SAKURA.ne.jp/tmp/serial-20190106.txt.xz : ---------- [ 79.104729] a.out invoked oom-killer: gfp_mask=0x6200ca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=0 (...snipped...) [ 79.237203] memory: usage 204800kB, limit 204800kB, failcnt 2834 [ 79.242176] memory+swap: usage 204800kB, limit 9007199254740988kB, failcnt 0 [ 79.245175] kmem: usage 23420kB, limit 9007199254740988kB, failcnt 0 [ 79.247945] Memory cgroup stats for /test1: cache:177456KB rss:3420KB rss_huge:0KB shmem:177456KB mapped_file:177540KB dirty:0KB writeback:0KB swap:0KB inactive_anon:177676KB active_anon:3612KB inactive_file:0KB active_file:0KB unevictable:0KB [ 79.256726] oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),cpuset=/,mems_allowed=0,oom_memcg=/test1,task_memcg=/test1,task=a.out,pid=8204,uid=-2 [ 79.262470] Memory cgroup out of memory: Kill process 8204 (a.out) score 822 or sacrifice child [ 79.266901] Killed process 8204 (a.out) total-vm:10491132kB, anon-rss:92kB, file-rss:444kB, shmem-rss:167028kB [ 79.272974] oom_reaper: reaped process 8447 (a.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:167488kB [ 79.277733] a.out invoked oom-killer: gfp_mask=0x6200ca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=0 (...snipped...) [ 79.386222] memory: usage 204708kB, limit 204800kB, failcnt 2837 [ 79.412519] Killed process 8205 (a.out) total-vm:4348kB, anon-rss:92kB, file-rss:0kB, shmem-rss:0kB (...snipped...) [ 79.539042] memory: usage 204600kB, limit 204800kB, failcnt 2838 [ 79.564617] Killed process 8206 (a.out) total-vm:4348kB, anon-rss:92kB, file-rss:0kB, shmem-rss:0kB (...snipped...) [ 81.967741] Memory cgroup out of memory: Kill process 8203 (a.out) score 6 or sacrifice child [ 81.971760] Killed process 8203 (a.out) total-vm:4348kB, anon-rss:92kB, file-rss:1156kB, shmem-rss:0kB [ 81.977329] a.out invoked oom-killer: gfp_mask=0x6200ca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=0 (...snipped...) [ 81.977529] memory: usage 187160kB, limit 204800kB, failcnt 2838 [ 81.977530] memory+swap: usage 187160kB, limit 9007199254740988kB, failcnt 0 [ 81.977531] kmem: usage 8264kB, limit 9007199254740988kB, failcnt 0 [ 81.977532] Memory cgroup stats for /test1: cache:178248KB rss:372KB rss_huge:0KB shmem:178248KB mapped_file:178332KB dirty:0KB writeback:0KB swap:0KB inactive_anon:178568KB active_anon:0KB inactive_file:0KB active_file:0KB unevictable:0KB [ 81.977545] Out of memory and no killable processes... (...snipped...) [ 87.914960] a.out invoked oom-killer: gfp_mask=0x6200ca(GFP_HIGHUSER_MOVABLE), order=0, oom_score_adj=0 (...snipped...) [ 88.019110] memory: usage 183472kB, limit 204800kB, failcnt 2838 [ 88.021629] memory+swap: usage 183472kB, limit 9007199254740988kB, failcnt 0 [ 88.024513] kmem: usage 4448kB, limit 9007199254740988kB, failcnt 0 [ 88.027137] Memory cgroup stats for /test1: cache:178512KB rss:372KB rss_huge:0KB shmem:178512KB mapped_file:178464KB dirty:0KB writeback:0KB swap:0KB inactive_anon:178760KB active_anon:0KB inactive_file:0KB active_file:0KB unevictable:0KB [ 88.036008] Out of memory and no killable processes... ---------- >From 0fb58415770a83d6c40d471e1840f8bc4a35ca83 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa <penguin-kernel@xxxxxxxxxxxxxxxxxxx> Date: Wed, 26 Dec 2018 19:13:35 +0900 Subject: [PATCH] memcg: killed threads should not invoke memcg OOM killer If $N > $M, a single process with $N threads in a memcg group can easily kill all $M processes in that memcg group, for mem_cgroup_out_of_memory() does not check if current thread needs to invoke the memcg OOM killer. T1@P1 |T2...$N@P1|P2...$M |OOM reaper ----------+----------+----------+---------- # all sleeping try_charge() mem_cgroup_out_of_memory() mutex_lock(oom_lock) try_charge() mem_cgroup_out_of_memory() mutex_lock(oom_lock) out_of_memory() select_bad_process() oom_kill_process(P1) wake_oom_reaper() oom_reap_task() # ignores P1 mutex_unlock(oom_lock) out_of_memory() select_bad_process(P2...M) # all killed by T2...N@P1 wake_oom_reaper() oom_reap_task() # ignores P2...M mutex_unlock(oom_lock) We don't need to invoke the memcg OOM killer if current thread was killed when waiting for oom_lock, for mem_cgroup_oom_synchronize(true) and memory_max_write() can bail out upon SIGKILL, and try_charge() allows already killed/exiting threads to make forward progress. Signed-off-by: Tetsuo Handa <penguin-kernel@xxxxxxxxxxxxxxxxxxx> --- mm/memcontrol.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b860dd4f7..b0d3bf3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1389,8 +1389,13 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, }; bool ret; - mutex_lock(&oom_lock); - ret = out_of_memory(&oc); + if (mutex_lock_killable(&oom_lock)) + return true; + /* + * A few threads which were not waiting at mutex_lock_killable() can + * fail to bail out. Therefore, check again after holding oom_lock. + */ + ret = fatal_signal_pending(current) || out_of_memory(&oc); mutex_unlock(&oom_lock); return ret; } -- 1.8.3.1