On 09.12.2024 14:46, Christian Brauner wrote: > So far we've been using an idr to track pidfs inodes. For some time now > each struct pid has a unique 64bit value that is used as the inode > number on 64 bit. That unique inode couldn't be used for looking up a > specific struct pid though. > > Now that we support file handles we need this ability while avoiding to > leak actual pid identifiers into userspace which can be problematic in > containers. > > So far I had used an idr-based mechanism where the idr is used to > generate a 32 bit number and each time it wraps we increment an upper > bit value and generate a unique 64 bit value. The lower 32 bits are used > to lookup the pid. > > I've been looking at the maple tree because it now has > mas_alloc_cyclic(). Since it uses unsigned long it would simplify the > 64bit implementation and its dense node mode supposedly also helps to > mitigate fragmentation. > > Signed-off-by: Christian Brauner <brauner@xxxxxxxxxx> This patch landed in today's linux-next as commit a2c8e88a30f7 ("pidfs: use maple tree"). In my tests I found that it triggers the following lockdep warning, what probably means that something has not been properly initialized: ================================ WARNING: inconsistent lock state 6.13.0-rc1-00015-ga2c8e88a30f7 #15489 Not tainted -------------------------------- inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage. swapper/0/0 [HC1[1]:SC0[0]:HE0:SE1] takes: c25d4d10 (&sighand->siglock){?.+.}-{3:3}, at: __lock_task_sighand+0x80/0x1bc {HARDIRQ-ON-W} state was registered at: lockdep_hardirqs_on_prepare+0x1a4/0x2c0 trace_hardirqs_on+0x94/0xa0 _raw_spin_unlock_irq+0x20/0x50 mtree_erase+0x88/0xbc free_pid+0xc/0xd4 release_task+0x630/0x7a8 do_exit+0x6b8/0xa64 call_usermodehelper_exec_async+0x120/0x144 ret_from_fork+0x14/0x28 irq event stamp: 1017892 hardirqs last enabled at (1017891): [<c0c8e510>] default_idle_call+0x18/0x13c hardirqs last disabled at (1017892): [<c0100b94>] __irq_svc+0x54/0xd0 softirqs last enabled at (1017868): [<c013b410>] handle_softirqs+0x328/0x520 softirqs last disabled at (1017835): [<c013b7b4>] __irq_exit_rcu+0x144/0x1f0 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&sighand->siglock); <Interrupt> lock(&sighand->siglock); *** DEADLOCK *** 2 locks held by swapper/0/0: #0: c137b4d0 (rcu_read_lock){....}-{1:3}, at: kill_pid_info_type+0x2c/0x168 #1: c137b4d0 (rcu_read_lock){....}-{1:3}, at: __lock_task_sighand+0x0/0x1bc stack backtrace: CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.13.0-rc1-00015-ga2c8e88a30f7 #15489 Hardware name: Samsung Exynos (Flattened Device Tree) Call trace: unwind_backtrace from show_stack+0x10/0x14 show_stack from dump_stack_lvl+0x68/0x88 dump_stack_lvl from print_usage_bug.part.0+0x24c/0x270 print_usage_bug.part.0 from mark_lock.part.0+0xc20/0x129c mark_lock.part.0 from __lock_acquire+0xae8/0x2a00 __lock_acquire from lock_acquire+0x134/0x384 lock_acquire from _raw_spin_lock_irqsave+0x4c/0x68 _raw_spin_lock_irqsave from __lock_task_sighand+0x80/0x1bc __lock_task_sighand from group_send_sig_info+0x120/0x1b4 group_send_sig_info from kill_pid_info_type+0x8c/0x168 kill_pid_info_type from it_real_fn+0x5c/0x120 it_real_fn from __hrtimer_run_queues+0xcc/0x538 __hrtimer_run_queues from hrtimer_interrupt+0x128/0x2c4 hrtimer_interrupt from exynos4_mct_tick_isr+0x44/0x7c exynos4_mct_tick_isr from handle_percpu_devid_irq+0x84/0x158 handle_percpu_devid_irq from generic_handle_domain_irq+0x24/0x34 generic_handle_domain_irq from gic_handle_irq+0x88/0xa8 gic_handle_irq from generic_handle_arch_irq+0x34/0x44 generic_handle_arch_irq from __irq_svc+0x8c/0xd0 Exception stack(0xc1301f20 to 0xc1301f68) ... __irq_svc from default_idle_call+0x1c/0x13c default_idle_call from do_idle+0x23c/0x2ac do_idle from cpu_startup_entry+0x28/0x2c cpu_startup_entry from kernel_init+0x0/0x12c --->8--- > --- > fs/pidfs.c | 52 +++++++++++++++++++++++++++++++--------------------- > kernel/pid.c | 34 +++++++++++++++++----------------- > 2 files changed, 48 insertions(+), 38 deletions(-) > > diff --git a/fs/pidfs.c b/fs/pidfs.c > index 7a1d606b09d7b315e780c81fc7341f4ec82f2639..4a622f906fc210d5e81ba2ac856cfe0ca930f219 100644 > --- a/fs/pidfs.c > +++ b/fs/pidfs.c > @@ -19,14 +19,15 @@ > #include <linux/ipc_namespace.h> > #include <linux/time_namespace.h> > #include <linux/utsname.h> > +#include <linux/maple_tree.h> > #include <net/net_namespace.h> > > #include "internal.h" > #include "mount.h" > > -static DEFINE_IDR(pidfs_ino_idr); > - > -static u32 pidfs_ino_upper_32_bits = 0; > +static struct maple_tree pidfs_ino_mtree = MTREE_INIT(pidfs_ino_mtree, > + MT_FLAGS_ALLOC_RANGE | > + MT_FLAGS_LOCK_IRQ); > > #if BITS_PER_LONG == 32 > /* > @@ -34,8 +35,6 @@ static u32 pidfs_ino_upper_32_bits = 0; > * the higher 32 bits are the generation number. The starting > * value for the inode number and the generation number is one. > */ > -static u32 pidfs_ino_lower_32_bits = 1; > - > static inline unsigned long pidfs_ino(u64 ino) > { > return lower_32_bits(ino); > @@ -49,8 +48,6 @@ static inline u32 pidfs_gen(u64 ino) > > #else > > -static u32 pidfs_ino_lower_32_bits = 0; > - > /* On 64 bit simply return ino. */ > static inline unsigned long pidfs_ino(u64 ino) > { > @@ -71,30 +68,43 @@ static inline u32 pidfs_gen(u64 ino) > */ > int pidfs_add_pid(struct pid *pid) > { > - u32 upper; > - int lower; > + static unsigned long lower_next = 0; > + static u32 pidfs_ino_upper_32_bits = 0; > + unsigned long lower; > + int ret; > + MA_STATE(mas, &pidfs_ino_mtree, 0, 0); > > /* > * Inode numbering for pidfs start at 2. This avoids collisions > * with the root inode which is 1 for pseudo filesystems. > */ > - lower = idr_alloc_cyclic(&pidfs_ino_idr, pid, 2, 0, GFP_ATOMIC); > - if (lower >= 0 && lower < pidfs_ino_lower_32_bits) > - pidfs_ino_upper_32_bits++; > - upper = pidfs_ino_upper_32_bits; > - pidfs_ino_lower_32_bits = lower; > - if (lower < 0) > - return lower; > - > - pid->ino = ((u64)upper << 32) | lower; > + mtree_lock(&pidfs_ino_mtree); > + ret = mas_alloc_cyclic(&mas, &lower, pid, 2, ULONG_MAX, &lower_next, > + GFP_KERNEL); > + if (ret < 0) > + goto out_unlock; > + > +#if BITS_PER_LONG == 32 > + if (ret == 1) { > + u32 higher; > + > + if (check_add_overflow(pidfs_ino_upper_32_bits, 1, &higher)) > + goto out_unlock; > + pidfs_ino_upper_32_bits = higher; > + } > +#endif > + pid->ino = ((u64)pidfs_ino_upper_32_bits << 32) | lower; > pid->stashed = NULL; > - return 0; > + > +out_unlock: > + mtree_unlock(&pidfs_ino_mtree); > + return ret; > } > > /* The idr number to remove is the lower 32 bits of the inode. */ > void pidfs_remove_pid(struct pid *pid) > { > - idr_remove(&pidfs_ino_idr, lower_32_bits(pid->ino)); > + mtree_erase(&pidfs_ino_mtree, pidfs_ino(pid->ino)); > } > > #ifdef CONFIG_PROC_FS > @@ -522,7 +532,7 @@ static struct pid *pidfs_ino_get_pid(u64 ino) > > guard(rcu)(); > > - pid = idr_find(&pidfs_ino_idr, lower_32_bits(pid_ino)); > + pid = mtree_load(&pidfs_ino_mtree, pid_ino); > if (!pid) > return NULL; > > diff --git a/kernel/pid.c b/kernel/pid.c > index 6131543e7c090c164a2bac014f8eeee61926b13d..28100bbac4c130e192abf65d88cca9d330971c5c 100644 > --- a/kernel/pid.c > +++ b/kernel/pid.c > @@ -131,6 +131,8 @@ void free_pid(struct pid *pid) > int i; > unsigned long flags; > > + pidfs_remove_pid(pid); > + > spin_lock_irqsave(&pidmap_lock, flags); > for (i = 0; i <= pid->level; i++) { > struct upid *upid = pid->numbers + i; > @@ -152,7 +154,6 @@ void free_pid(struct pid *pid) > } > > idr_remove(&ns->idr, upid->nr); > - pidfs_remove_pid(pid); > } > spin_unlock_irqrestore(&pidmap_lock, flags); > > @@ -249,16 +250,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, > tmp = tmp->parent; > } > > - /* > - * ENOMEM is not the most obvious choice especially for the case > - * where the child subreaper has already exited and the pid > - * namespace denies the creation of any new processes. But ENOMEM > - * is what we have exposed to userspace for a long time and it is > - * documented behavior for pid namespaces. So we can't easily > - * change it even if there were an error code better suited. > - */ > - retval = -ENOMEM; > - > get_pid_ns(ns); > refcount_set(&pid->count, 1); > spin_lock_init(&pid->lock); > @@ -269,12 +260,23 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, > INIT_HLIST_HEAD(&pid->inodes); > > upid = pid->numbers + ns->level; > - idr_preload(GFP_KERNEL); > - spin_lock_irq(&pidmap_lock); > - if (!(ns->pid_allocated & PIDNS_ADDING)) > - goto out_unlock; > + > retval = pidfs_add_pid(pid); > if (retval) > + goto out_free; > + > + /* > + * ENOMEM is not the most obvious choice especially for the case > + * where the child subreaper has already exited and the pid > + * namespace denies the creation of any new processes. But ENOMEM > + * is what we have exposed to userspace for a long time and it is > + * documented behavior for pid namespaces. So we can't easily > + * change it even if there were an error code better suited. > + */ > + retval = -ENOMEM; > + > + spin_lock_irq(&pidmap_lock); > + if (!(ns->pid_allocated & PIDNS_ADDING)) > goto out_unlock; > for ( ; upid >= pid->numbers; --upid) { > /* Make the PID visible to find_pid_ns. */ > @@ -282,13 +284,11 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, > upid->ns->pid_allocated++; > } > spin_unlock_irq(&pidmap_lock); > - idr_preload_end(); > > return pid; > > out_unlock: > spin_unlock_irq(&pidmap_lock); > - idr_preload_end(); > put_pid_ns(ns); > > out_free: > Best regards -- Marek Szyprowski, PhD Samsung R&D Institute Poland