On Fri, Feb 21, 2025 at 06:02:49PM +0100, Michal Koutný wrote: > Noone would need to use this allocation strategy (it's slower, pid > numbers collide sooner). Its primary purpose are pid namespaces in > conjunction with pids.max cgroup limit which keeps (virtual) pid numbers > below the given limit. This is for 32-bit userspace programs that may > not work well with pid numbers above 65536. > > Link: https://lore.kernel.org/r/20241122132459.135120-1-aleksandr.mikhalitsyn@xxxxxxxxxxxxx/ > Signed-off-by: Michal Koutný <mkoutny@xxxxxxxx> > --- > Documentation/admin-guide/sysctl/kernel.rst | 2 ++ > include/linux/pid_namespace.h | 3 +++ > kernel/pid.c | 12 +++++++-- > kernel/pid_namespace.c | 28 +++++++++++++++------ > 4 files changed, 36 insertions(+), 9 deletions(-) > > diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst > index a43b78b4b6464..f5e68d1c8849f 100644 > --- a/Documentation/admin-guide/sysctl/kernel.rst > +++ b/Documentation/admin-guide/sysctl/kernel.rst > @@ -1043,6 +1043,8 @@ The last pid allocated in the current (the one task using this sysctl > lives in) pid namespace. When selecting a pid for a next task on fork > kernel tries to allocate a number starting from this one. > > +When set to -1, first-fit pid numbering is used instead of the next-fit. I strongly disagree with this approach. This is way worse then making pid_max per pid namespace. I'm fine if you come up with something else that's purely based on cgroups somehow and is uniform across 64-bit and 32-bit. Allowing to change the pid allocation strategy just for 32-bit is not the solution and not mergable. > + > > powersave-nap (PPC only) > ======================== > diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h > index f9f9931e02d6a..10bf66ca78590 100644 > --- a/include/linux/pid_namespace.h > +++ b/include/linux/pid_namespace.h > @@ -41,6 +41,9 @@ struct pid_namespace { > #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) > int memfd_noexec_scope; > #endif > +#ifdef CONFIG_IA32_EMULATION > + bool pid_noncyclic; > +#endif > } __randomize_layout; > > extern struct pid_namespace init_pid_ns; > diff --git a/kernel/pid.c b/kernel/pid.c > index aa2a7d4da4555..e9da1662b8821 100644 > --- a/kernel/pid.c > +++ b/kernel/pid.c > @@ -191,6 +191,10 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, > > for (i = ns->level; i >= 0; i--) { > int tid = 0; > + bool pid_noncyclic = 0; > +#ifdef CONFIG_IA32_EMULATION > + pid_noncyclic = READ_ONCE(tmp->pid_noncyclic); > +#endif > > if (set_tid_size) { > tid = set_tid[ns->level - i]; > @@ -235,8 +239,12 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, > * Store a null pointer so find_pid_ns does not find > * a partially initialized PID (see below). > */ > - nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, > - pid_max, GFP_ATOMIC); > + if (likely(!pid_noncyclic)) > + nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, > + pid_max, GFP_ATOMIC); > + else > + nr = idr_alloc(&tmp->idr, NULL, pid_min, > + pid_max, GFP_ATOMIC); > } > spin_unlock_irq(&pidmap_lock); > idr_preload_end(); > diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c > index 0f23285be4f92..ceda94a064294 100644 > --- a/kernel/pid_namespace.c > +++ b/kernel/pid_namespace.c > @@ -113,6 +113,9 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns > ns->pid_allocated = PIDNS_ADDING; > #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) > ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns); > +#endif > +#ifdef CONFIG_IA32_EMULATION > + ns->pid_noncyclic = READ_ONCE(parent_pid_ns->pid_noncyclic); > #endif > return ns; > > @@ -260,7 +263,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) > return; > } > > -#ifdef CONFIG_CHECKPOINT_RESTORE > +#if defined(CONFIG_CHECKPOINT_RESTORE) || defined(CONFIG_IA32_EMULATION) > static int pid_ns_ctl_handler(const struct ctl_table *table, int write, > void *buffer, size_t *lenp, loff_t *ppos) > { > @@ -271,12 +274,23 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write, > if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns)) > return -EPERM; > > - next = idr_get_cursor(&pid_ns->idr) - 1; > + next = -1; > +#ifdef CONFIG_IA32_EMULATION > + if (!pid_ns->pid_noncyclic) > +#endif > + next += idr_get_cursor(&pid_ns->idr); > > tmp.data = &next; > ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); > - if (!ret && write) > - idr_set_cursor(&pid_ns->idr, next + 1); > + if (!ret && write) { > + if (next > -1) > + idr_set_cursor(&pid_ns->idr, next + 1); > + else if (!IS_ENABLED(CONFIG_IA32_EMULATION)) > + ret = -EINVAL; > +#ifdef CONFIG_IA32_EMULATION > + WRITE_ONCE(pid_ns->pid_noncyclic, next == -1); > +#endif > + } > > return ret; > } > @@ -288,11 +302,11 @@ static const struct ctl_table pid_ns_ctl_table[] = { > .maxlen = sizeof(int), > .mode = 0666, /* permissions are checked in the handler */ > .proc_handler = pid_ns_ctl_handler, > - .extra1 = SYSCTL_ZERO, > + .extra1 = SYSCTL_NEG_ONE, > .extra2 = &pid_max, > }, > }; > -#endif /* CONFIG_CHECKPOINT_RESTORE */ > +#endif /* CONFIG_CHECKPOINT_RESTORE || CONFIG_IA32_EMULATION */ > > int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) > { > @@ -449,7 +463,7 @@ static __init int pid_namespaces_init(void) > { > pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT); > > -#ifdef CONFIG_CHECKPOINT_RESTORE > +#if defined(CONFIG_CHECKPOINT_RESTORE) || defined(CONFIG_IA32_EMULATION) > register_sysctl_init("kernel", pid_ns_ctl_table); > #endif > > -- > 2.48.1 >