Any comments will be appreciated. -----邮件原件----- 发件人: liuq131@xxxxxxxxxxxxxxx <liuq131@xxxxxxxxxxxxxxx> 发送时间: 2023年2月8日 17:49 收件人: akpm@xxxxxxxxxxxxxxxxxxxx 抄送: agruenba@xxxxxxxxxx; linux-mm@xxxxxxxxx; linux-kernel@xxxxxxxxxxxxxxx; huyd12@xxxxxxxxxxxxxxx; liuq <liuq131@xxxxxxxxxxxxxxx> 主题: [PATCH] pid: add handling of too many zombie processes It is a common situation that a parent process forks many child processes to execute tasks but does not call wait/waitpid when the child processes exit, so a large number of child processes become zombie processes. If the number of processes in the system then reaches kernel.pid_max, every new fork syscall fails, and the system is unable to execute any command (unless an old process exits), e.g.: [root@lq-workstation ~]# ls -bash: fork: retry: Resource temporarily unavailable -bash: fork: retry: Resource temporarily unavailable -bash: fork: retry: Resource temporarily unavailable -bash: fork: retry: Resource temporarily unavailable -bash: fork: Resource temporarily unavailable [root@lq-workstation ~]# reboot -bash: fork: retry: Resource temporarily unavailable -bash: fork: retry: Resource temporarily unavailable -bash: fork: retry: Resource temporarily unavailable -bash: fork: retry: Resource temporarily unavailable -bash: fork: Resource temporarily unavailable This patch handles the situation in the alloc_pid function: when pid allocation fails with -ENOSPC, it finds the process with the most zombie child processes and, if that process has more than 10 (or some other reasonable value?) zombie children, tries to kill it to release the pid resources. 
Signed-off-by: liuq <liuq131@xxxxxxxxxxxxxxx> --- include/linux/mm.h | 2 ++ kernel/pid.c | 6 +++- mm/oom_kill.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 8f857163ac89..afcff08a3878 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1940,6 +1940,8 @@ static inline void clear_page_pfmemalloc(struct page *page) * Can be called by the pagefault handler when it gets a VM_FAULT_OOM. */ extern void pagefault_out_of_memory(void); +extern void pid_max_oom_check(struct pid_namespace *ns); + #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) #define offset_in_thp(page, p) ((unsigned long)(p) & (thp_size(page) - 1)) diff --git a/kernel/pid.c b/kernel/pid.c index 3fbc5e46b721..1a9a60e19ab6 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -237,7 +237,11 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, idr_preload_end(); if (nr < 0) { - retval = (nr == -ENOSPC) ? 
-EAGAIN : nr; + retval = nr; + if (nr == -ENOSPC) { + retval = -EAGAIN; + pid_max_oom_check(tmp); + } goto out_free; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1276e49b31b0..18d05d706f48 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1260,3 +1260,73 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) return -ENOSYS; #endif /* CONFIG_MMU */ } + +static void oom_pid_evaluate_task(struct task_struct *p, + struct task_struct **max_zombie_task, int *max_zombie_num) { + struct task_struct *child; + int zombie_num = 0; + + list_for_each_entry(child, &p->children, sibling) { + if (child->exit_state == EXIT_ZOMBIE) + zombie_num++; + } + if (zombie_num > *max_zombie_num) { + *max_zombie_num = zombie_num; + *max_zombie_task = p; + } +} +#define MAX_ZOMBIE_NUM 10 +struct task_struct *pid_max_bad_process(struct pid_namespace *ns) { + int max_zombie_num = 0; + struct task_struct *max_zombie_task = &init_task; + struct task_struct *p; + + rcu_read_lock(); + for_each_process(p) + oom_pid_evaluate_task(p, &max_zombie_task, &max_zombie_num); + rcu_read_unlock(); + + if (max_zombie_num > MAX_ZOMBIE_NUM) { + pr_info("process %d has %d zombie child\n", + task_pid_nr_ns(max_zombie_task, ns), max_zombie_num); + return max_zombie_task; + } + + return NULL; +} + +void pid_max_oom_kill_process(struct task_struct *task) { + struct oom_control oc = { + .zonelist = NULL, + .nodemask = NULL, + .memcg = NULL, + .gfp_mask = 0, + .order = 0, + }; + + get_task_struct(task); + oc.chosen = task; + + if (mem_cgroup_oom_synchronize(true)) + return; + + if (!mutex_trylock(&oom_lock)) + return; + + oom_kill_process(&oc, "Out of pid max(oom_kill_allocating_task)"); + mutex_unlock(&oom_lock); +} + +void pid_max_oom_check(struct pid_namespace *ns) { + struct task_struct *p; + + p = pid_max_bad_process(ns); + if (p) { + pr_info("oom_kill process %d\n", task_pid_nr_ns(p, ns)); + pid_max_oom_kill_process(p); + } +} -- 2.27.0