The patch titled Subject: kernel/exit.c: release ptraced tasks before zap_pid_ns_processes has been added to the -mm tree. Its filename is kernel-release-ptraced-tasks-before-zap_pid_ns_processes.patch This patch should soon appear at http://ozlabs.org/~akpm/mmots/broken-out/kernel-release-ptraced-tasks-before-zap_pid_ns_processes.patch and later at http://ozlabs.org/~akpm/mmotm/broken-out/kernel-release-ptraced-tasks-before-zap_pid_ns_processes.patch Before you just go and hit "reply", please: a) Consider who else should be cc'ed b) Prefer to cc a suitable mailing list as well c) Ideally: find the original patch on the mailing list and do a reply-to-all to that, adding suitable additional cc's *** Remember to use Documentation/process/submit-checklist.rst when testing your code *** The -mm tree is included into linux-next and is updated there every 3-4 working days ------------------------------------------------------ From: Andrei Vagin <avagin@xxxxxxxxx> Subject: kernel/exit.c: release ptraced tasks before zap_pid_ns_processes Currently, exit_ptrace() adds all ptraced tasks to a dead list, then zap_pid_ns_processes() waits for all tasks in the current pid ns, and only then are tasks from the dead list released. zap_pid_ns_processes() can get stuck waiting for tasks from the dead list. In this case, we will have one unkillable process with one or more dead children. The reproducer for this problem is below. Here you can find its effect without this fix: We have one alive process which is stuck in zap_pid_ns_processes: $ ps axf ... 
11831 pts/0 S 0:00 [ptrace_pidns] 11833 pts/0 Zl 0:00 \_ [ptrace_pidns] <defunct> $ cat /proc/11831/stack [<0>] do_wait+0x1fa/0x2c0 [<0>] kernel_wait4+0x9e/0x150 [<0>] zap_pid_ns_processes+0x17d/0x270 [<0>] do_exit+0xa15/0xbd0 [<0>] do_group_exit+0x47/0xc0 [<0>] get_signal+0x28c/0x850 [<0>] do_signal+0x36/0x630 [<0>] exit_to_usermode_loop+0x62/0xc0 [<0>] prepare_exit_to_usermode+0xb4/0xe0 [<0>] retint_user+0x8/0x18 [<0>] 0xffffffffffffffff The child process has two threads which were ptraced by the parent: $ ls /proc/11833/task/ 11833 11834 The parent can't wait for the child, because the thread group isn't empty, but a thread is in the dead list: $ cat /proc/1183{1,3,4}/status | grep '\(NSpid\|TracerPid\|State\)' State: S (sleeping) TracerPid: 0 NSpid: 11831 1 State: Z (zombie) TracerPid: 0 NSpid: 11833 2 State: X (dead) TracerPid: 0 NSpid: 11834 3 ====== ptrace_pidns.c ======= #define _GNU_SOURCE /* See feature_test_macros(7) */ #include <unistd.h> #include <sys/ptrace.h> #include <sys/types.h> #include <sys/time.h> #include <sys/resource.h> #include <sys/wait.h> #include <sys/syscall.h> /* For SYS_xxx definitions */ #include <pthread.h> #include <stdio.h> #include <stdlib.h> int pfd[2]; void *thread_fn() { pid_t tid = syscall(SYS_gettid); write(pfd[1], &tid, sizeof(tid)); sleep(1000); return NULL; } int main() { pid_t pid, tid, ppid = getpid(); pthread_t t; if (pipe(pfd)) return 1; pid = fork(); if (pid < 0) return 1; if (pid == 0) { pthread_create(&t, NULL, thread_fn, (void *)(unsigned long)ppid); sleep(1000); return 0; } printf("fork: %d\n", pid); if (read(pfd[0], &tid, sizeof(tid)) != sizeof(tid)) return 1; printf("thread: %d\n", tid); if (ptrace(PTRACE_ATTACH, tid, 0, 0)) return 1; if (wait4(tid, NULL, __WALL, NULL) != tid) return 1; if (ptrace(PTRACE_ATTACH, pid, 0, 0)) return 1; if (wait4(pid, NULL, __WALL, NULL) != pid) return 1; kill(pid, SIGKILL); *((int *)(0)) = 0xdead; return 0; } Link: http://lkml.kernel.org/r/20190102205939.26231-1-avagin@xxxxxxxxx 
Signed-off-by: Andrei Vagin <avagin@xxxxxxxxx> Cc: Oleg Nesterov <oleg@xxxxxxxxxx> Cc: "Eric W. Biederman" <ebiederm@xxxxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- --- a/kernel/exit.c~kernel-release-ptraced-tasks-before-zap_pid_ns_processes +++ a/kernel/exit.c @@ -664,9 +664,6 @@ static void forget_original_parent(struc { struct task_struct *p, *t, *reaper; - if (unlikely(!list_empty(&father->ptraced))) - exit_ptrace(father, dead); - /* Can drop and reacquire tasklist_lock */ reaper = find_child_reaper(father); if (list_empty(&father->children)) @@ -705,8 +702,18 @@ static void exit_notify(struct task_stru LIST_HEAD(dead); write_lock_irq(&tasklist_lock); - forget_original_parent(tsk, &dead); + if (unlikely(!list_empty(&tsk->ptraced))) + exit_ptrace(tsk, &dead); + write_unlock_irq(&tasklist_lock); + + /* Ptraced tasks have to be released before zap_pid_ns_processes(). */ + list_for_each_entry_safe(p, n, &dead, ptrace_entry) { + list_del_init(&p->ptrace_entry); + release_task(p); + } + write_lock_irq(&tasklist_lock); + forget_original_parent(tsk, &dead); if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); _ Patches currently in -mm which might be from avagin@xxxxxxxxx are ptrace-take-into-account-saved_sigmask-in-ptrace_getsetsigmask.patch kernel-release-ptraced-tasks-before-zap_pid_ns_processes.patch include-replace-tsk-to-task-in-linux-sched-signalh.patch