On Tue, Nov 05, 2024 at 11:10:24AM +0800, Yun Zhou wrote: > It is necessary to have a different pid_max in different containers. > For example, multiple containers are running on a host, one of which > is Android, and its 32 bit bionic libc only accepts pid <= 65535. So > it requires the global pid_max <= 65535. This will cause configuration > conflicts with other containers and also limit the maximum number of > tasks for the entire system. > > Signed-off-by: Yun Zhou <yun.zhou@xxxxxxxxxxxxx> > --- > - Remove sentinels from ctl_table arrays. > v1 - https://lore.kernel.org/all/20241030052933.1041408-1-yun.zhou@xxxxxxxxxxxxx/ > --- > include/linux/pid_namespace.h | 1 + > kernel/pid.c | 12 +++++------ > kernel/pid_namespace.c | 34 ++++++++++++++++++++++++++----- > kernel/sysctl.c | 9 -------- > kernel/trace/pid_list.c | 2 +- > kernel/trace/trace.h | 2 -- > kernel/trace/trace_sched_switch.c | 2 +- > 7 files changed, 38 insertions(+), 24 deletions(-) ... > diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c > index d70ab49d5b4a..a5a8254825d5 100644 > --- a/kernel/pid_namespace.c > +++ b/kernel/pid_namespace.c > @@ -111,6 +111,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns > ns->user_ns = get_user_ns(user_ns); > ns->ucounts = ucounts; > ns->pid_allocated = PIDNS_ADDING; > + ns->pid_max = parent_pid_ns->pid_max; > #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) > ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns); > #endif > @@ -280,19 +281,44 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write, > > return ret; > } > +#endif /* CONFIG_CHECKPOINT_RESTORE */ > + > +static int pid_max_ns_ctl_handler(const struct ctl_table *table, int write, > + void *buffer, size_t *lenp, loff_t *ppos) > +{ > + struct pid_namespace *pid_ns = task_active_pid_ns(current); > + struct ctl_table tmp = *table; > + > + if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns)) > + return -EPERM; > + > + tmp.data = &pid_ns->pid_max; > + if (pid_ns->parent) > + tmp.extra2 = &pid_ns->parent->pid_max; > + > + return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); > +} > > -extern int pid_max; > static struct ctl_table pid_ns_ctl_table[] = { > +#ifdef CONFIG_CHECKPOINT_RESTORE > { > .procname = "ns_last_pid", > .maxlen = sizeof(int), > .mode = 0666, /* permissions are checked in the handler */ > .proc_handler = pid_ns_ctl_handler, > .extra1 = SYSCTL_ZERO, > - .extra2 = &pid_max, > + .extra2 = &init_pid_ns.pid_max, > }, > -}; > #endif /* CONFIG_CHECKPOINT_RESTORE */ > + { > + .procname = "pid_max", > + .maxlen = sizeof(int), > + .mode = 0644, > + .proc_handler = pid_max_ns_ctl_handler, > + .extra1 = &pid_max_min, > + .extra2 = &pid_max_max, > + }, > +}; I see here that the sysctls are without sentinel. Reviewed-by: Joel Granados <joel.granados@xxxxxxxxxx> -- Joel Granados