Re: [PATCH RFC v2 2/2] pidfs: use maple tree

Marek Szyprowski <m.szyprowski@xxxxxxxxxxx> · Fri, 13 Dec 2024 11:35:48 +0100

On 09.12.2024 14:46, Christian Brauner wrote:
> So far we've been using an idr to track pidfs inodes. For some time now
> each struct pid has a unique 64bit value that is used as the inode
> number on 64 bit. That unique inode couldn't be used for looking up a
> specific struct pid though.
>
> Now that we support file handles we need this ability while avoiding to
> leak actual pid identifiers into userspace which can be problematic in
> containers.
>
> So far I had used an idr-based mechanism where the idr is used to
> generate a 32 bit number and each time it wraps we increment an upper
> bit value and generate a unique 64 bit value. The lower 32 bits are used
> to lookup the pid.
>
> I've been looking at the maple tree because it now has
> mas_alloc_cyclic(). Since it uses unsigned long it would simplify the
> 64bit implementation and its dense node mode supposedly also helps to
> mitigate fragmentation.
>
> Signed-off-by: Christian Brauner <brauner@xxxxxxxxxx>

This patch landed in today's linux-next as commit a2c8e88a30f7 ("pidfs: 
use maple tree"). In my tests I found that it triggers the following 
lockdep warning, what probably means that something has not been 
properly initialized:

================================
WARNING: inconsistent lock state
6.13.0-rc1-00015-ga2c8e88a30f7 #15489 Not tainted
--------------------------------
inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.
swapper/0/0 [HC1[1]:SC0[0]:HE0:SE1] takes:
c25d4d10 (&sighand->siglock){?.+.}-{3:3}, at: __lock_task_sighand+0x80/0x1bc
{HARDIRQ-ON-W} state was registered at:
   lockdep_hardirqs_on_prepare+0x1a4/0x2c0
   trace_hardirqs_on+0x94/0xa0
   _raw_spin_unlock_irq+0x20/0x50
   mtree_erase+0x88/0xbc
   free_pid+0xc/0xd4
   release_task+0x630/0x7a8
   do_exit+0x6b8/0xa64
   call_usermodehelper_exec_async+0x120/0x144
   ret_from_fork+0x14/0x28
irq event stamp: 1017892
hardirqs last  enabled at (1017891): [<c0c8e510>] 
default_idle_call+0x18/0x13c
hardirqs last disabled at (1017892): [<c0100b94>] __irq_svc+0x54/0xd0
softirqs last  enabled at (1017868): [<c013b410>] 
handle_softirqs+0x328/0x520
softirqs last disabled at (1017835): [<c013b7b4>] __irq_exit_rcu+0x144/0x1f0

other info that might help us debug this:
  Possible unsafe locking scenario:

        CPU0
        ----
   lock(&sighand->siglock);
   <Interrupt>
     lock(&sighand->siglock);

  *** DEADLOCK ***

2 locks held by swapper/0/0:
  #0: c137b4d0 (rcu_read_lock){....}-{1:3}, at: 
kill_pid_info_type+0x2c/0x168
  #1: c137b4d0 (rcu_read_lock){....}-{1:3}, at: 
__lock_task_sighand+0x0/0x1bc

stack backtrace:
CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 
6.13.0-rc1-00015-ga2c8e88a30f7 #15489
Hardware name: Samsung Exynos (Flattened Device Tree)
Call trace:
  unwind_backtrace from show_stack+0x10/0x14
  show_stack from dump_stack_lvl+0x68/0x88
  dump_stack_lvl from print_usage_bug.part.0+0x24c/0x270
  print_usage_bug.part.0 from mark_lock.part.0+0xc20/0x129c
  mark_lock.part.0 from __lock_acquire+0xae8/0x2a00
  __lock_acquire from lock_acquire+0x134/0x384
  lock_acquire from _raw_spin_lock_irqsave+0x4c/0x68
  _raw_spin_lock_irqsave from __lock_task_sighand+0x80/0x1bc
  __lock_task_sighand from group_send_sig_info+0x120/0x1b4
  group_send_sig_info from kill_pid_info_type+0x8c/0x168
  kill_pid_info_type from it_real_fn+0x5c/0x120
  it_real_fn from __hrtimer_run_queues+0xcc/0x538
  __hrtimer_run_queues from hrtimer_interrupt+0x128/0x2c4
  hrtimer_interrupt from exynos4_mct_tick_isr+0x44/0x7c
  exynos4_mct_tick_isr from handle_percpu_devid_irq+0x84/0x158
  handle_percpu_devid_irq from generic_handle_domain_irq+0x24/0x34
  generic_handle_domain_irq from gic_handle_irq+0x88/0xa8
  gic_handle_irq from generic_handle_arch_irq+0x34/0x44
  generic_handle_arch_irq from __irq_svc+0x8c/0xd0
Exception stack(0xc1301f20 to 0xc1301f68)
...
  __irq_svc from default_idle_call+0x1c/0x13c
  default_idle_call from do_idle+0x23c/0x2ac
  do_idle from cpu_startup_entry+0x28/0x2c
  cpu_startup_entry from kernel_init+0x0/0x12c

--->8---

> ---
>   fs/pidfs.c   | 52 +++++++++++++++++++++++++++++++---------------------
>   kernel/pid.c | 34 +++++++++++++++++-----------------
>   2 files changed, 48 insertions(+), 38 deletions(-)
>
> diff --git a/fs/pidfs.c b/fs/pidfs.c
> index 7a1d606b09d7b315e780c81fc7341f4ec82f2639..4a622f906fc210d5e81ba2ac856cfe0ca930f219 100644
> --- a/fs/pidfs.c
> +++ b/fs/pidfs.c
> @@ -19,14 +19,15 @@
>   #include <linux/ipc_namespace.h>
>   #include <linux/time_namespace.h>
>   #include <linux/utsname.h>
> +#include <linux/maple_tree.h>
>   #include <net/net_namespace.h>
>   
>   #include "internal.h"
>   #include "mount.h"
>   
> -static DEFINE_IDR(pidfs_ino_idr);
> -
> -static u32 pidfs_ino_upper_32_bits = 0;
> +static struct maple_tree pidfs_ino_mtree = MTREE_INIT(pidfs_ino_mtree,
> +						      MT_FLAGS_ALLOC_RANGE |
> +						      MT_FLAGS_LOCK_IRQ);
>   
>   #if BITS_PER_LONG == 32
>   /*
> @@ -34,8 +35,6 @@ static u32 pidfs_ino_upper_32_bits = 0;
>    * the higher 32 bits are the generation number. The starting
>    * value for the inode number and the generation number is one.
>    */
> -static u32 pidfs_ino_lower_32_bits = 1;
> -
>   static inline unsigned long pidfs_ino(u64 ino)
>   {
>   	return lower_32_bits(ino);
> @@ -49,8 +48,6 @@ static inline u32 pidfs_gen(u64 ino)
>   
>   #else
>   
> -static u32 pidfs_ino_lower_32_bits = 0;
> -
>   /* On 64 bit simply return ino. */
>   static inline unsigned long pidfs_ino(u64 ino)
>   {
> @@ -71,30 +68,43 @@ static inline u32 pidfs_gen(u64 ino)
>    */
>   int pidfs_add_pid(struct pid *pid)
>   {
> -	u32 upper;
> -	int lower;
> +	static unsigned long lower_next = 0;
> +	static u32 pidfs_ino_upper_32_bits = 0;
> +	unsigned long lower;
> +	int ret;
> +	MA_STATE(mas, &pidfs_ino_mtree, 0, 0);
>   
>           /*
>   	 * Inode numbering for pidfs start at 2. This avoids collisions
>   	 * with the root inode which is 1 for pseudo filesystems.
>            */
> -	lower = idr_alloc_cyclic(&pidfs_ino_idr, pid, 2, 0, GFP_ATOMIC);
> -	if (lower >= 0 && lower < pidfs_ino_lower_32_bits)
> -		pidfs_ino_upper_32_bits++;
> -	upper = pidfs_ino_upper_32_bits;
> -	pidfs_ino_lower_32_bits = lower;
> -	if (lower < 0)
> -		return lower;
> -
> -	pid->ino = ((u64)upper << 32) | lower;
> +	mtree_lock(&pidfs_ino_mtree);
> +	ret = mas_alloc_cyclic(&mas, &lower, pid, 2, ULONG_MAX, &lower_next,
> +			       GFP_KERNEL);
> +	if (ret < 0)
> +		goto out_unlock;
> +
> +#if BITS_PER_LONG == 32
> +	if (ret == 1) {
> +		u32 higher;
> +
> +		if (check_add_overflow(pidfs_ino_upper_32_bits, 1, &higher))
> +			goto out_unlock;
> +		pidfs_ino_upper_32_bits = higher;
> +	}
> +#endif
> +	pid->ino = ((u64)pidfs_ino_upper_32_bits << 32) | lower;
>   	pid->stashed = NULL;
> -	return 0;
> +
> +out_unlock:
> +	mtree_unlock(&pidfs_ino_mtree);
> +	return ret;
>   }
>   
>   /* The idr number to remove is the lower 32 bits of the inode. */
>   void pidfs_remove_pid(struct pid *pid)
>   {
> -	idr_remove(&pidfs_ino_idr, lower_32_bits(pid->ino));
> +	mtree_erase(&pidfs_ino_mtree, pidfs_ino(pid->ino));
>   }
>   
>   #ifdef CONFIG_PROC_FS
> @@ -522,7 +532,7 @@ static struct pid *pidfs_ino_get_pid(u64 ino)
>   
>   	guard(rcu)();
>   
> -	pid = idr_find(&pidfs_ino_idr, lower_32_bits(pid_ino));
> +	pid = mtree_load(&pidfs_ino_mtree, pid_ino);
>   	if (!pid)
>   		return NULL;
>   
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 6131543e7c090c164a2bac014f8eeee61926b13d..28100bbac4c130e192abf65d88cca9d330971c5c 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -131,6 +131,8 @@ void free_pid(struct pid *pid)
>   	int i;
>   	unsigned long flags;
>   
> +	pidfs_remove_pid(pid);
> +
>   	spin_lock_irqsave(&pidmap_lock, flags);
>   	for (i = 0; i <= pid->level; i++) {
>   		struct upid *upid = pid->numbers + i;
> @@ -152,7 +154,6 @@ void free_pid(struct pid *pid)
>   		}
>   
>   		idr_remove(&ns->idr, upid->nr);
> -		pidfs_remove_pid(pid);
>   	}
>   	spin_unlock_irqrestore(&pidmap_lock, flags);
>   
> @@ -249,16 +250,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
>   		tmp = tmp->parent;
>   	}
>   
> -	/*
> -	 * ENOMEM is not the most obvious choice especially for the case
> -	 * where the child subreaper has already exited and the pid
> -	 * namespace denies the creation of any new processes. But ENOMEM
> -	 * is what we have exposed to userspace for a long time and it is
> -	 * documented behavior for pid namespaces. So we can't easily
> -	 * change it even if there were an error code better suited.
> -	 */
> -	retval = -ENOMEM;
> -
>   	get_pid_ns(ns);
>   	refcount_set(&pid->count, 1);
>   	spin_lock_init(&pid->lock);
> @@ -269,12 +260,23 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
>   	INIT_HLIST_HEAD(&pid->inodes);
>   
>   	upid = pid->numbers + ns->level;
> -	idr_preload(GFP_KERNEL);
> -	spin_lock_irq(&pidmap_lock);
> -	if (!(ns->pid_allocated & PIDNS_ADDING))
> -		goto out_unlock;
> +
>   	retval = pidfs_add_pid(pid);
>   	if (retval)
> +		goto out_free;
> +
> +	/*
> +	 * ENOMEM is not the most obvious choice especially for the case
> +	 * where the child subreaper has already exited and the pid
> +	 * namespace denies the creation of any new processes. But ENOMEM
> +	 * is what we have exposed to userspace for a long time and it is
> +	 * documented behavior for pid namespaces. So we can't easily
> +	 * change it even if there were an error code better suited.
> +	 */
> +	retval = -ENOMEM;
> +
> +	spin_lock_irq(&pidmap_lock);
> +	if (!(ns->pid_allocated & PIDNS_ADDING))
>   		goto out_unlock;
>   	for ( ; upid >= pid->numbers; --upid) {
>   		/* Make the PID visible to find_pid_ns. */
> @@ -282,13 +284,11 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
>   		upid->ns->pid_allocated++;
>   	}
>   	spin_unlock_irq(&pidmap_lock);
> -	idr_preload_end();
>   
>   	return pid;
>   
>   out_unlock:
>   	spin_unlock_irq(&pidmap_lock);
> -	idr_preload_end();
>   	put_pid_ns(ns);
>   
>   out_free:
>
Best regards
-- 
Marek Szyprowski, PhD
Samsung R&D Institute Poland