回复: [PATCH] pid: add handling of too many zombie processes

<huyd12@xxxxxxxxxxxxxxx> · Thu, 9 Feb 2023 15:14:57 +0800

Any comments will be appreciated.



-----邮件原件-----
发件人: liuq131@xxxxxxxxxxxxxxx <liuq131@xxxxxxxxxxxxxxx> 
发送时间: 2023年2月8日 17:49
收件人: akpm@xxxxxxxxxxxxxxxxxxxx
抄送: agruenba@xxxxxxxxxx; linux-mm@xxxxxxxxx; linux-kernel@xxxxxxxxxxxxxxx;
huyd12@xxxxxxxxxxxxxxx; liuq <liuq131@xxxxxxxxxxxxxxx>
主题: [PATCH] pid: add handling of too many zombie processes

There is a common situation that a parent process forks many child processes
to execute tasks, but the parent process does not execute wait/waitpid when
the child process exits, resulting in a large number of child processes
becoming zombie processes.

At this time, if the number of processes in the system out of
kernel.pid_max, the new fork syscall will fail, and the system will not be
able to execute any command at this time (unless an old process exits)

eg:
[root@lq-workstation ~]# ls
-bash: fork: retry: Resource temporarily unavailable
-bash: fork: retry: Resource temporarily unavailable
-bash: fork: retry: Resource temporarily unavailable
-bash: fork: retry: Resource temporarily unavailable
-bash: fork: Resource temporarily unavailable [root@lq-workstation ~]#
reboot
-bash: fork: retry: Resource temporarily unavailable
-bash: fork: retry: Resource temporarily unavailable
-bash: fork: retry: Resource temporarily unavailable
-bash: fork: retry: Resource temporarily unavailable
-bash: fork: Resource temporarily unavailable

I dealt with this situation in the alloc_pid function, and found a process
with the most zombie subprocesses, and more than 10(or other reasonable
values?) zombie subprocesses, so I tried to kill this process to release the
pid resources.

Signed-off-by: liuq <liuq131@xxxxxxxxxxxxxxx>
---
 include/linux/mm.h |  2 ++
 kernel/pid.c       |  6 +++-
 mm/oom_kill.c      | 70 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 77 insertions(+), 1 deletion(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h index
8f857163ac89..afcff08a3878 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1940,6 +1940,8 @@ static inline void clear_page_pfmemalloc(struct page
*page)
  * Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
  */
 extern void pagefault_out_of_memory(void);
+extern void pid_max_oom_check(struct pid_namespace *ns);
+
 
 #define offset_in_page(p)	((unsigned long)(p) & ~PAGE_MASK)
 #define offset_in_thp(page, p)	((unsigned long)(p) & (thp_size(page) - 1))
diff --git a/kernel/pid.c b/kernel/pid.c index 3fbc5e46b721..1a9a60e19ab6
100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -237,7 +237,11 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t
*set_tid,
 		idr_preload_end();
 
 		if (nr < 0) {
-			retval = (nr == -ENOSPC) ? -EAGAIN : nr;
+			retval = nr;
+			if (nr == -ENOSPC) {
+				retval = -EAGAIN;
+				pid_max_oom_check(tmp);
+			}
 			goto out_free;
 		}
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1276e49b31b0..18d05d706f48
100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -1260,3 +1260,73 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd,
unsigned int, flags)
 	return -ENOSYS;
 #endif /* CONFIG_MMU */
 }
+
+static void oom_pid_evaluate_task(struct task_struct *p,
+	struct task_struct **max_zombie_task, int *max_zombie_num) {
+	struct task_struct *child;
+	int zombie_num = 0;
+
+	list_for_each_entry(child, &p->children, sibling) {
+		if (child->exit_state == EXIT_ZOMBIE)
+			zombie_num++;
+	}
+	if (zombie_num > *max_zombie_num) {
+		*max_zombie_num = zombie_num;
+		*max_zombie_task = p;
+	}
+}
+#define MAX_ZOMBIE_NUM 10
+struct task_struct *pid_max_bad_process(struct pid_namespace *ns) {
+	int max_zombie_num = 0;
+	struct task_struct *max_zombie_task = &init_task;
+	struct task_struct *p;
+
+	rcu_read_lock();
+	for_each_process(p)
+		oom_pid_evaluate_task(p, &max_zombie_task, &max_zombie_num);
+	rcu_read_unlock();
+
+	if (max_zombie_num > MAX_ZOMBIE_NUM) {
+		pr_info("process %d has %d zombie child\n",
+			task_pid_nr_ns(max_zombie_task, ns),
max_zombie_num);
+		return max_zombie_task;
+	}
+
+	return NULL;
+}
+
+void pid_max_oom_kill_process(struct task_struct *task) {
+	struct oom_control oc = {
+		.zonelist = NULL,
+		.nodemask = NULL,
+		.memcg = NULL,
+		.gfp_mask = 0,
+		.order = 0,
+	};
+
+	get_task_struct(task);
+	oc.chosen = task;
+
+	if (mem_cgroup_oom_synchronize(true))
+		return;
+
+	if (!mutex_trylock(&oom_lock))
+		return;
+
+	oom_kill_process(&oc, "Out of pid max(oom_kill_allocating_task)");
+	mutex_unlock(&oom_lock);
+}
+
+void pid_max_oom_check(struct pid_namespace *ns) {
+	struct task_struct *p;
+
+	p = pid_max_bad_process(ns);
+	if (p) {
+		pr_info("oom_kill process %d\n", task_pid_nr_ns(p, ns));
+		pid_max_oom_kill_process(p);
+	}
+}
--
2.27.0