Am Fr., 21. Feb. 2025 um 18:02 Uhr schrieb Michal Koutný <mkoutny@xxxxxxxx>: > > This reverts commit 7863dcc72d0f4b13a641065670426435448b3d80. If we revert this one, then we should also revert a corresponding kselftest: https://github.com/torvalds/linux/commit/615ab43b838bb982dc234feff75ee9ad35447c5d > > It is already difficult for users to troubleshoot which of multiple pid > limits restricts their workload. I'm afraid making pid_max > per-(hierarchical-)NS will contribute to confusion. > Also, the implementation copies the limit upon creation from > parent, this pattern showed cumbersome with some attributes in legacy > cgroup controllers -- it's subject to race condition between parent's > limit modification and children creation and once copied it must be > changed in the descendant. > > This is very similar to what pids.max of a cgroup (already) does that > can be used as an alternative. > > Link: https://lore.kernel.org/r/bnxhqrq7tip6jl2hu6jsvxxogdfii7ugmafbhgsogovrchxfyp@kagotkztqurt/ > Signed-off-by: Michal Koutný <mkoutny@xxxxxxxx> > --- > include/linux/pid.h | 3 + > include/linux/pid_namespace.h | 10 +-- > kernel/pid.c | 125 ++---------------------------- > kernel/pid_namespace.c | 43 +++------- > kernel/sysctl.c | 9 +++ > kernel/trace/pid_list.c | 2 +- > kernel/trace/trace.h | 2 + > kernel/trace/trace_sched_switch.c | 2 +- > 8 files changed, 35 insertions(+), 161 deletions(-) > > diff --git a/include/linux/pid.h b/include/linux/pid.h > index 98837a1ff0f33..fe575fcdb4afa 100644 > --- a/include/linux/pid.h > +++ b/include/linux/pid.h > @@ -108,6 +108,9 @@ extern void exchange_tids(struct task_struct *task, struct task_struct *old); > extern void transfer_pid(struct task_struct *old, struct task_struct *new, > enum pid_type); > > +extern int pid_max; > +extern int pid_max_min, pid_max_max; > + > /* > * look up a PID in the hash table. Must be called with the tasklist_lock > * or rcu_read_lock() held. > diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h > index 7c67a58111998..f9f9931e02d6a 100644 > --- a/include/linux/pid_namespace.h > +++ b/include/linux/pid_namespace.h > @@ -30,7 +30,6 @@ struct pid_namespace { > struct task_struct *child_reaper; > struct kmem_cache *pid_cachep; > unsigned int level; > - int pid_max; > struct pid_namespace *parent; > #ifdef CONFIG_BSD_PROCESS_ACCT > struct fs_pin *bacct; > @@ -39,14 +38,9 @@ struct pid_namespace { > struct ucounts *ucounts; > int reboot; /* group exit code if this pidns was rebooted */ > struct ns_common ns; > - struct work_struct work; > -#ifdef CONFIG_SYSCTL > - struct ctl_table_set set; > - struct ctl_table_header *sysctls; > -#if defined(CONFIG_MEMFD_CREATE) > +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) > int memfd_noexec_scope; > #endif > -#endif > } __randomize_layout; > > extern struct pid_namespace init_pid_ns; > @@ -123,8 +117,6 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) > extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk); > void pidhash_init(void); > void pid_idr_init(void); > -int register_pidns_sysctls(struct pid_namespace *pidns); > -void unregister_pidns_sysctls(struct pid_namespace *pidns); > > static inline bool task_is_in_init_pid_ns(struct task_struct *tsk) > { > diff --git a/kernel/pid.c b/kernel/pid.c > index 924084713be8b..aa2a7d4da4555 100644 > --- a/kernel/pid.c > +++ b/kernel/pid.c > @@ -61,8 +61,10 @@ struct pid init_struct_pid = { > }, } > }; > > -static int pid_max_min = RESERVED_PIDS + 1; > -static int pid_max_max = PID_MAX_LIMIT; > +int pid_max = PID_MAX_DEFAULT; > + > +int pid_max_min = RESERVED_PIDS + 1; > +int pid_max_max = PID_MAX_LIMIT; > > /* > * PID-map pages start out as NULL, they get allocated upon > @@ -81,7 +83,6 @@ struct pid_namespace init_pid_ns = { > #ifdef CONFIG_PID_NS > .ns.ops = &pidns_operations, > #endif > - .pid_max = PID_MAX_DEFAULT, > #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) > .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC, > #endif > @@ -190,7 +191,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, > > for (i = ns->level; i >= 0; i--) { > int tid = 0; > - int pid_max = READ_ONCE(tmp->pid_max); > > if (set_tid_size) { > tid = set_tid[ns->level - i]; > @@ -644,118 +644,17 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) > return fd; > } > > -#ifdef CONFIG_SYSCTL > -static struct ctl_table_set *pid_table_root_lookup(struct ctl_table_root *root) > -{ > - return &task_active_pid_ns(current)->set; > -} > - > -static int set_is_seen(struct ctl_table_set *set) > -{ > - return &task_active_pid_ns(current)->set == set; > -} > - > -static int pid_table_root_permissions(struct ctl_table_header *head, > - const struct ctl_table *table) > -{ > - struct pid_namespace *pidns = > - container_of(head->set, struct pid_namespace, set); > - int mode = table->mode; > - > - if (ns_capable(pidns->user_ns, CAP_SYS_ADMIN) || > - uid_eq(current_euid(), make_kuid(pidns->user_ns, 0))) > - mode = (mode & S_IRWXU) >> 6; > - else if (in_egroup_p(make_kgid(pidns->user_ns, 0))) > - mode = (mode & S_IRWXG) >> 3; > - else > - mode = mode & S_IROTH; > - return (mode << 6) | (mode << 3) | mode; > -} > - > -static void pid_table_root_set_ownership(struct ctl_table_header *head, > - kuid_t *uid, kgid_t *gid) > -{ > - struct pid_namespace *pidns = > - container_of(head->set, struct pid_namespace, set); > - kuid_t ns_root_uid; > - kgid_t ns_root_gid; > - > - ns_root_uid = make_kuid(pidns->user_ns, 0); > - if (uid_valid(ns_root_uid)) > - *uid = ns_root_uid; > - > - ns_root_gid = make_kgid(pidns->user_ns, 0); > - if (gid_valid(ns_root_gid)) > - *gid = ns_root_gid; > -} > - > -static struct ctl_table_root pid_table_root = { > - .lookup = pid_table_root_lookup, > - .permissions = pid_table_root_permissions, > - .set_ownership = pid_table_root_set_ownership, > -}; > - > -static const struct ctl_table pid_table[] = { > - { > - .procname = "pid_max", > - .data = &init_pid_ns.pid_max, > - .maxlen = sizeof(int), > - .mode = 0644, > - .proc_handler = proc_dointvec_minmax, > - .extra1 = &pid_max_min, > - .extra2 = &pid_max_max, > - }, > -}; > -#endif > - > -int register_pidns_sysctls(struct pid_namespace *pidns) > -{ > -#ifdef CONFIG_SYSCTL > - struct ctl_table *tbl; > - > - setup_sysctl_set(&pidns->set, &pid_table_root, set_is_seen); > - > - tbl = kmemdup(pid_table, sizeof(pid_table), GFP_KERNEL); > - if (!tbl) > - return -ENOMEM; > - tbl->data = &pidns->pid_max; > - pidns->pid_max = min(pid_max_max, max_t(int, pidns->pid_max, > - PIDS_PER_CPU_DEFAULT * num_possible_cpus())); > - > - pidns->sysctls = __register_sysctl_table(&pidns->set, "kernel", tbl, > - ARRAY_SIZE(pid_table)); > - if (!pidns->sysctls) { > - kfree(tbl); > - retire_sysctl_set(&pidns->set); > - return -ENOMEM; > - } > -#endif > - return 0; > -} > - > -void unregister_pidns_sysctls(struct pid_namespace *pidns) > -{ > -#ifdef CONFIG_SYSCTL > - const struct ctl_table *tbl; > - > - tbl = pidns->sysctls->ctl_table_arg; > - unregister_sysctl_table(pidns->sysctls); > - retire_sysctl_set(&pidns->set); > - kfree(tbl); > -#endif > -} > - > void __init pid_idr_init(void) > { > /* Verify no one has done anything silly: */ > BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING); > > /* bump default and minimum pid_max based on number of cpus */ > - init_pid_ns.pid_max = min(pid_max_max, max_t(int, init_pid_ns.pid_max, > - PIDS_PER_CPU_DEFAULT * num_possible_cpus())); > + pid_max = min(pid_max_max, max_t(int, pid_max, > + PIDS_PER_CPU_DEFAULT * num_possible_cpus())); > pid_max_min = max_t(int, pid_max_min, > PIDS_PER_CPU_MIN * num_possible_cpus()); > - pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max, pid_max_min); > + pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min); > > idr_init(&init_pid_ns.idr); > > @@ -766,16 +665,6 @@ void __init pid_idr_init(void) > NULL); > } > > -static __init int pid_namespace_sysctl_init(void) > -{ > -#ifdef CONFIG_SYSCTL > - /* "kernel" directory will have already been initialized. */ > - BUG_ON(register_pidns_sysctls(&init_pid_ns)); > -#endif > - return 0; > -} > -subsys_initcall(pid_namespace_sysctl_init); > - > static struct file *__pidfd_fget(struct task_struct *task, int fd) > { > struct file *file; > diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c > index 8f6cfec87555a..0f23285be4f92 100644 > --- a/kernel/pid_namespace.c > +++ b/kernel/pid_namespace.c > @@ -70,8 +70,6 @@ static void dec_pid_namespaces(struct ucounts *ucounts) > dec_ucount(ucounts, UCOUNT_PID_NAMESPACES); > } > > -static void destroy_pid_namespace_work(struct work_struct *work); > - > static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns, > struct pid_namespace *parent_pid_ns) > { > @@ -107,27 +105,17 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns > goto out_free_idr; > ns->ns.ops = &pidns_operations; > > - ns->pid_max = parent_pid_ns->pid_max; > - err = register_pidns_sysctls(ns); > - if (err) > - goto out_free_inum; > - > refcount_set(&ns->ns.count, 1); > ns->level = level; > ns->parent = get_pid_ns(parent_pid_ns); > ns->user_ns = get_user_ns(user_ns); > ns->ucounts = ucounts; > ns->pid_allocated = PIDNS_ADDING; > - INIT_WORK(&ns->work, destroy_pid_namespace_work); > - > #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) > ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns); > #endif > - > return ns; > > -out_free_inum: > - ns_free_inum(&ns->ns); > out_free_idr: > idr_destroy(&ns->idr); > kmem_cache_free(pid_ns_cachep, ns); > @@ -149,28 +137,12 @@ static void delayed_free_pidns(struct rcu_head *p) > > static void destroy_pid_namespace(struct pid_namespace *ns) > { > - unregister_pidns_sysctls(ns); > - > ns_free_inum(&ns->ns); > > idr_destroy(&ns->idr); > call_rcu(&ns->rcu, delayed_free_pidns); > } > > -static void destroy_pid_namespace_work(struct work_struct *work) > -{ > - struct pid_namespace *ns = > - container_of(work, struct pid_namespace, work); > - > - do { > - struct pid_namespace *parent; > - > - parent = ns->parent; > - destroy_pid_namespace(ns); > - ns = parent; > - } while (ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count)); > -} > - > struct pid_namespace *copy_pid_ns(unsigned long flags, > struct user_namespace *user_ns, struct pid_namespace *old_ns) > { > @@ -183,8 +155,15 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, > > void put_pid_ns(struct pid_namespace *ns) > { > - if (ns && ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count)) > - schedule_work(&ns->work); > + struct pid_namespace *parent; > + > + while (ns != &init_pid_ns) { > + parent = ns->parent; > + if (!refcount_dec_and_test(&ns->ns.count)) > + break; > + destroy_pid_namespace(ns); > + ns = parent; > + } > } > EXPORT_SYMBOL_GPL(put_pid_ns); > > @@ -295,7 +274,6 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write, > next = idr_get_cursor(&pid_ns->idr) - 1; > > tmp.data = &next; > - tmp.extra2 = &pid_ns->pid_max; > ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); > if (!ret && write) > idr_set_cursor(&pid_ns->idr, next + 1); > @@ -303,6 +281,7 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write, > return ret; > } > > +extern int pid_max; > static const struct ctl_table pid_ns_ctl_table[] = { > { > .procname = "ns_last_pid", > @@ -310,7 +289,7 @@ static const struct ctl_table pid_ns_ctl_table[] = { > .mode = 0666, /* permissions are checked in the handler */ > .proc_handler = pid_ns_ctl_handler, > .extra1 = SYSCTL_ZERO, > - .extra2 = &init_pid_ns.pid_max, > + .extra2 = &pid_max, > }, > }; > #endif /* CONFIG_CHECKPOINT_RESTORE */ > diff --git a/kernel/sysctl.c b/kernel/sysctl.c > index cb57da499ebb1..bb739608680f2 100644 > --- a/kernel/sysctl.c > +++ b/kernel/sysctl.c > @@ -1803,6 +1803,15 @@ static const struct ctl_table kern_table[] = { > .proc_handler = proc_dointvec, > }, > #endif > + { > + .procname = "pid_max", > + .data = &pid_max, > + .maxlen = sizeof(int), > + .mode = 0644, > + .proc_handler = proc_dointvec_minmax, > + .extra1 = &pid_max_min, > + .extra2 = &pid_max_max, > + }, > { > .procname = "panic_on_oops", > .data = &panic_on_oops, > diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c > index c62b9b3cfb3d8..4966e6bbdf6f3 100644 > --- a/kernel/trace/pid_list.c > +++ b/kernel/trace/pid_list.c > @@ -414,7 +414,7 @@ struct trace_pid_list *trace_pid_list_alloc(void) > int i; > > /* According to linux/thread.h, pids can be no bigger that 30 bits */ > - WARN_ON_ONCE(init_pid_ns.pid_max > (1 << 30)); > + WARN_ON_ONCE(pid_max > (1 << 30)); > > pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL); > if (!pid_list) > diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h > index 9c21ba45b7af6..46c65402ad7e5 100644 > --- a/kernel/trace/trace.h > +++ b/kernel/trace/trace.h > @@ -732,6 +732,8 @@ extern unsigned long tracing_thresh; > > /* PID filtering */ > > +extern int pid_max; > + > bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, > pid_t search_pid); > bool trace_ignore_this_task(struct trace_pid_list *filtered_pids, > diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c > index cb49f7279dc80..573b5d8e8a28e 100644 > --- a/kernel/trace/trace_sched_switch.c > +++ b/kernel/trace/trace_sched_switch.c > @@ -442,7 +442,7 @@ int trace_alloc_tgid_map(void) > if (tgid_map) > return 0; > > - tgid_map_max = init_pid_ns.pid_max; > + tgid_map_max = pid_max; > map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map), > GFP_KERNEL); > if (!map) > -- > 2.48.1 >