Am Fr., 21. Feb. 2025 um 18:02 Uhr schrieb Michal Koutný <mkoutny@xxxxxxxx>: > > Noone would need to use this allocation strategy (it's slower, pid > numbers collide sooner). Its primary purpose are pid namespaces in > conjunction with pids.max cgroup limit which keeps (virtual) pid numbers > below the given limit. This is for 32-bit userspace programs that may > not work well with pid numbers above 65536. > > Link: https://lore.kernel.org/r/20241122132459.135120-1-aleksandr.mikhalitsyn@xxxxxxxxxxxxx/ > Signed-off-by: Michal Koutný <mkoutny@xxxxxxxx> Dear Michal, sorry for such a long delay with reply on your patches. > --- > Documentation/admin-guide/sysctl/kernel.rst | 2 ++ > include/linux/pid_namespace.h | 3 +++ > kernel/pid.c | 12 +++++++-- > kernel/pid_namespace.c | 28 +++++++++++++++------ > 4 files changed, 36 insertions(+), 9 deletions(-) > > diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst > index a43b78b4b6464..f5e68d1c8849f 100644 > --- a/Documentation/admin-guide/sysctl/kernel.rst > +++ b/Documentation/admin-guide/sysctl/kernel.rst > @@ -1043,6 +1043,8 @@ The last pid allocated in the current (the one task using this sysctl > lives in) pid namespace. When selecting a pid for a next task on fork > kernel tries to allocate a number starting from this one. > > +When set to -1, first-fit pid numbering is used instead of the next-fit. > + > > powersave-nap (PPC only) > ======================== > diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h > index f9f9931e02d6a..10bf66ca78590 100644 > --- a/include/linux/pid_namespace.h > +++ b/include/linux/pid_namespace.h > @@ -41,6 +41,9 @@ struct pid_namespace { > #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) > int memfd_noexec_scope; > #endif > +#ifdef CONFIG_IA32_EMULATION Unfortunately, this does not work for our use case as it's x86-specific. In the original cover letter [1] it was written: >In any case, there are workloads that have expections about how large >pid numbers they accept. Either for historical reasons or architectural >reasons. One concreate example is the 32-bit version of Android's bionic >libc which requires pid numbers less than 65536. There are workloads >where it is run in a 32-bit container on a 64-bit kernel. If the host And I have just confirmed with folks from Canonical, who work on Anbox (Android in container project), that they use Arm machines (both armhf/arm64). And one of the reasons to add this feature is to make legacy 32-bit Android Bionic libc to work [2]. [1] https://lore.kernel.org/all/20241122132459.135120-1-aleksandr.mikhalitsyn@xxxxxxxxxxxxx/ [2] https://android.googlesource.com/platform/bionic.git/+/HEAD/docs/32-bit-abi.md#is-too-small-for-large-pids Kind regards, Alex > + bool pid_noncyclic; > +#endif > } __randomize_layout; > > extern struct pid_namespace init_pid_ns; > diff --git a/kernel/pid.c b/kernel/pid.c > index aa2a7d4da4555..e9da1662b8821 100644 > --- a/kernel/pid.c > +++ b/kernel/pid.c > @@ -191,6 +191,10 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, > > for (i = ns->level; i >= 0; i--) { > int tid = 0; > + bool pid_noncyclic = 0; > +#ifdef CONFIG_IA32_EMULATION > + pid_noncyclic = READ_ONCE(tmp->pid_noncyclic); > +#endif > > if (set_tid_size) { > tid = set_tid[ns->level - i]; > @@ -235,8 +239,12 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, > * Store a null pointer so find_pid_ns does not find > * a partially initialized PID (see below). > */ > - nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, > - pid_max, GFP_ATOMIC); > + if (likely(!pid_noncyclic)) > + nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, > + pid_max, GFP_ATOMIC); > + else > + nr = idr_alloc(&tmp->idr, NULL, pid_min, > + pid_max, GFP_ATOMIC); > } > spin_unlock_irq(&pidmap_lock); > idr_preload_end(); > diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c > index 0f23285be4f92..ceda94a064294 100644 > --- a/kernel/pid_namespace.c > +++ b/kernel/pid_namespace.c > @@ -113,6 +113,9 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns > ns->pid_allocated = PIDNS_ADDING; > #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) > ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns); > +#endif > +#ifdef CONFIG_IA32_EMULATION > + ns->pid_noncyclic = READ_ONCE(parent_pid_ns->pid_noncyclic); > #endif > return ns; > > @@ -260,7 +263,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) > return; > } > > -#ifdef CONFIG_CHECKPOINT_RESTORE > +#if defined(CONFIG_CHECKPOINT_RESTORE) || defined(CONFIG_IA32_EMULATION) > static int pid_ns_ctl_handler(const struct ctl_table *table, int write, > void *buffer, size_t *lenp, loff_t *ppos) > { > @@ -271,12 +274,23 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write, > if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns)) > return -EPERM; > > - next = idr_get_cursor(&pid_ns->idr) - 1; > + next = -1; > +#ifdef CONFIG_IA32_EMULATION > + if (!pid_ns->pid_noncyclic) > +#endif > + next += idr_get_cursor(&pid_ns->idr); > > tmp.data = &next; > ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); > - if (!ret && write) > - idr_set_cursor(&pid_ns->idr, next + 1); > + if (!ret && write) { > + if (next > -1) > + idr_set_cursor(&pid_ns->idr, next + 1); > + else if (!IS_ENABLED(CONFIG_IA32_EMULATION)) > + ret = -EINVAL; > +#ifdef CONFIG_IA32_EMULATION > + WRITE_ONCE(pid_ns->pid_noncyclic, next == -1); > +#endif > + } > > return ret; > } > @@ -288,11 +302,11 @@ static const struct ctl_table pid_ns_ctl_table[] = { > .maxlen = sizeof(int), > .mode = 0666, /* permissions are checked in the handler */ > .proc_handler = pid_ns_ctl_handler, > - .extra1 = SYSCTL_ZERO, > + .extra1 = SYSCTL_NEG_ONE, > .extra2 = &pid_max, > }, > }; > -#endif /* CONFIG_CHECKPOINT_RESTORE */ > +#endif /* CONFIG_CHECKPOINT_RESTORE || CONFIG_IA32_EMULATION */ > > int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) > { > @@ -449,7 +463,7 @@ static __init int pid_namespaces_init(void) > { > pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT); > > -#ifdef CONFIG_CHECKPOINT_RESTORE > +#if defined(CONFIG_CHECKPOINT_RESTORE) || defined(CONFIG_IA32_EMULATION) > register_sysctl_init("kernel", pid_ns_ctl_table); > #endif > > -- > 2.48.1 >