Re: [PATCH v3 03/10] fs: lockless mntns rbtree lookup

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, Dec 13, 2024 at 12:03:42AM +0100, Christian Brauner wrote:
> Currently we use a read-write lock but for the simple search case we can
> make this lockless. Creating a new mount namespace is a rather rare
> event compared with querying mounts in a foreign mount namespace. Once
> this is picked up by e.g., systemd to list mounts in another mount in
> it's isolated services or in containers this will be used a lot so this
> seems worthwhile doing.
> 
> Signed-off-by: Christian Brauner <brauner@xxxxxxxxxx>
> ---
>  fs/mount.h     |   5 ++-
>  fs/namespace.c | 119 +++++++++++++++++++++++++++++++++++----------------------
>  2 files changed, 77 insertions(+), 47 deletions(-)
> 
> diff --git a/fs/mount.h b/fs/mount.h
> index 185fc56afc13338f8185fe818051444d540cbd5b..36ead0e45e8aa7614c00001102563a711d9dae6e 100644
> --- a/fs/mount.h
> +++ b/fs/mount.h
> @@ -12,7 +12,10 @@ struct mnt_namespace {
>  	struct user_namespace	*user_ns;
>  	struct ucounts		*ucounts;
>  	u64			seq;	/* Sequence number to prevent loops */
> -	wait_queue_head_t poll;
> +	union {
> +		wait_queue_head_t	poll;
> +		struct rcu_head		mnt_ns_rcu;
> +	};
>  	u64 event;
>  	unsigned int		nr_mounts; /* # of mounts in the namespace */
>  	unsigned int		pending_mounts;
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 10fa18dd66018fadfdc9d18c59a851eed7bd55ad..52adee787eb1b6ee8831705b2b121854c3370fb3 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -79,6 +79,8 @@ static DECLARE_RWSEM(namespace_sem);
>  static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
>  static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
>  static DEFINE_RWLOCK(mnt_ns_tree_lock);
> +static seqcount_rwlock_t mnt_ns_tree_seqcount = SEQCNT_RWLOCK_ZERO(mnt_ns_tree_seqcount, &mnt_ns_tree_lock);
> +
>  static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
>  
>  struct mount_kattr {
> @@ -105,17 +107,6 @@ EXPORT_SYMBOL_GPL(fs_kobj);
>   */
>  __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
>  
> -static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
> -{
> -	u64 seq_b = ns->seq;
> -
> -	if (seq < seq_b)
> -		return -1;
> -	if (seq > seq_b)
> -		return 1;
> -	return 0;
> -}
> -
>  static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
>  {
>  	if (!node)
> @@ -123,19 +114,41 @@ static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
>  	return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
>  }
>  
> -static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
> +static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b)
>  {
>  	struct mnt_namespace *ns_a = node_to_mnt_ns(a);
>  	struct mnt_namespace *ns_b = node_to_mnt_ns(b);
>  	u64 seq_a = ns_a->seq;
> +	u64 seq_b = ns_b->seq;
> +
> +	if (seq_a < seq_b)
> +		return -1;
> +	if (seq_a > seq_b)
> +		return 1;
> +	return 0;
> +}
>  
> -	return mnt_ns_cmp(seq_a, ns_b) < 0;
> +static inline void mnt_ns_tree_write_lock(void)
> +{
> +	write_lock(&mnt_ns_tree_lock);
> +	write_seqcount_begin(&mnt_ns_tree_seqcount);
> +}
> +
> +static inline void mnt_ns_tree_write_unlock(void)
> +{
> +	write_seqcount_end(&mnt_ns_tree_seqcount);
> +	write_unlock(&mnt_ns_tree_lock);
>  }
>  
>  static void mnt_ns_tree_add(struct mnt_namespace *ns)
>  {
> -	guard(write_lock)(&mnt_ns_tree_lock);
> -	rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
> +	struct rb_node *node;
> +
> +	mnt_ns_tree_write_lock();
> +	node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
> +	mnt_ns_tree_write_unlock();
> +
> +	WARN_ON_ONCE(node);
>  }
>  
>  static void mnt_ns_release(struct mnt_namespace *ns)
> @@ -150,41 +163,36 @@ static void mnt_ns_release(struct mnt_namespace *ns)
>  }
>  DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
>  
> +static void mnt_ns_release_rcu(struct rcu_head *rcu)
> +{
> +	struct mnt_namespace *mnt_ns;
> +
> +	mnt_ns = container_of(rcu, struct mnt_namespace, mnt_ns_rcu);
> +	mnt_ns_release(mnt_ns);
> +}
> +
>  static void mnt_ns_tree_remove(struct mnt_namespace *ns)
>  {
>  	/* remove from global mount namespace list */
>  	if (!is_anon_ns(ns)) {
> -		guard(write_lock)(&mnt_ns_tree_lock);
> +		mnt_ns_tree_write_lock();
>  		rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
> +		mnt_ns_tree_write_unlock();
>  	}
>  
> -	mnt_ns_release(ns);
> +	call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
>  }
>  
> -/*
> - * Returns the mount namespace which either has the specified id, or has the
> - * next smallest id afer the specified one.
> - */
> -static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
> +static int mnt_ns_find(const void *key, const struct rb_node *node)
>  {
> -	struct rb_node *node = mnt_ns_tree.rb_node;
> -	struct mnt_namespace *ret = NULL;
> -
> -	lockdep_assert_held(&mnt_ns_tree_lock);
> -
> -	while (node) {
> -		struct mnt_namespace *n = node_to_mnt_ns(node);
> +	const u64 mnt_ns_id = *(u64 *)key;
> +	const struct mnt_namespace *ns = node_to_mnt_ns(node);
>  
> -		if (mnt_ns_id <= n->seq) {
> -			ret = node_to_mnt_ns(node);
> -			if (mnt_ns_id == n->seq)
> -				break;
> -			node = node->rb_left;
> -		} else {
> -			node = node->rb_right;
> -		}
> -	}
> -	return ret;
> +	if (mnt_ns_id < ns->seq)
> +		return -1;
> +	if (mnt_ns_id > ns->seq)
> +		return 1;
> +	return 0;
>  }
>  
>  /*
> @@ -194,18 +202,37 @@ static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
>   * namespace the @namespace_sem must first be acquired. If the namespace has
>   * already shut down before acquiring @namespace_sem, {list,stat}mount() will
>   * see that the mount rbtree of the namespace is empty.
> + *
> + * Note the lookup is lockless protected by a sequence counter. We only
> + * need to guard against false negatives as false positives aren't
> + * possible. So if we didn't find a mount namespace and the sequence
> + * counter has changed we need to retry. If the sequence counter is
> + * still the same we know the search actually failed.
>   */
>  static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
>  {
> -       struct mnt_namespace *ns;
> +	struct mnt_namespace *ns;
> +	struct rb_node *node;
> +	unsigned int seq;
> +
> +	guard(rcu)();
> +	do {
> +		seq = read_seqcount_begin(&mnt_ns_tree_seqcount);
> +		node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find);
> +		if (node)
> +			break;
> +	} while (read_seqcount_retry(&mnt_ns_tree_seqcount, seq));
>  
> -       guard(read_lock)(&mnt_ns_tree_lock);
> -       ns = mnt_ns_find_id_at(mnt_ns_id);
> -       if (!ns || ns->seq != mnt_ns_id)
> -               return NULL;
> +	if (!node)
> +		return NULL;
>  
> -       refcount_inc(&ns->passive);
> -       return ns;
> +	/*
> +	 * The last reference count is put with RCU delay so we can
> +	 * unconditonally acquire a reference here.
> +	 */
> +	ns = node_to_mnt_ns(node);
> +	refcount_inc(&ns->passive);
> +	return ns;
>  }
>  
>  static inline void lock_mount_hash(void)
> 
> -- 
> 2.45.2
>

Hi Christian Brauner ,

Greetings!

I used Syzkaller and found that there is WARNING in mnt_ns_release in linux v6.13-rc3.

After bisection and the first bad commit is:
"
5eda70f550d7 fs: lockless mntns rbtree lookup
"

All detailed into can be found at:
https://github.com/laifryiee/syzkaller_logs/tree/main/241219_115855_mnt_ns_release
Syzkaller repro code:
https://github.com/laifryiee/syzkaller_logs/tree/main/241219_115855_mnt_ns_release/repro.c
Syzkaller repro syscall steps:
https://github.com/laifryiee/syzkaller_logs/tree/main/241219_115855_mnt_ns_release/repro.prog
Syzkaller report:
https://github.com/laifryiee/syzkaller_logs/tree/main/241219_115855_mnt_ns_release/repro.report
Kconfig(make olddefconfig):
https://github.com/laifryiee/syzkaller_logs/tree/main/241219_115855_mnt_ns_release/kconfig_origin
Bisect info:
https://github.com/laifryiee/syzkaller_logs/tree/main/241219_115855_mnt_ns_release/bisect_info.log
bzImage:
https://github.com/laifryiee/syzkaller_logs/raw/refs/heads/main/241219_115855_mnt_ns_release/bzImage_fdb298fa865b0136f7be842e6c2e6310dede421a
Issue dmesg:
https://github.com/laifryiee/syzkaller_logs/blob/main/241219_115855_mnt_ns_release/fdb298fa865b0136f7be842e6c2e6310dede421a_dmesg.log

"
[  268.453608] ------------[ cut here ]------------
[  268.453889] WARNING: CPU: 0 PID: 10683 at fs/namespace.c:163 mnt_ns_release+0x18d/0x200
[  268.454274] Modules linked in:
[  268.454431] CPU: 0 UID: 0 PID: 10683 Comm: repro Not tainted 6.13.0-rc3-next-20241217-fdb298fa865b #1
[  268.454857] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qe4
[  268.455380] RIP: 0010:mnt_ns_release+0x18d/0x200
[  268.455604] Code: ff ff 48 c7 c7 30 67 29 87 e8 ff ea ac 03 bf 01 00 00 00 89 c3 89 c6 e8 91 81 7e ff 83 fb 019
[  268.456446] RSP: 0018:ffff88806c409df8 EFLAGS: 00010246
[  268.456693] RAX: 0000000000000000 RBX: 0000000000000001 RCX: ffffffff820995df
[  268.457023] RDX: ffff888020b40000 RSI: ffffffff820995ed RDI: 0000000000000005
[  268.457359] RBP: ffff88806c409e18 R08: ffff888017a66e48 R09: fffffbfff15085a3
[  268.457861] R10: 0000000000000001 R11: 0000000000000001 R12: ffff888017a66e00
[  268.458188] R13: ffff88806c409eb8 R14: ffffffff816e536a R15: 0000000000000003
[  268.458523] FS:  00007f7f8ffbf600(0000) GS:ffff88806c400000(0000) knlGS:0000000000000000
[  268.458880] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  268.459150] CR2: 0000000000000000 CR3: 0000000017be6005 CR4: 0000000000770ef0
[  268.459486] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  268.459808] DR3: 0000000000000000 DR6: 00000000ffff07f0 DR7: 0000000000000400
[  268.460136] PKRU: 55555554
[  268.460271] Call Trace:
[  268.460394]  <IRQ>
[  268.460500]  ? show_regs+0x6d/0x80
[  268.460671]  ? __warn+0xf3/0x390
[  268.460830]  ? report_bug+0x25e/0x4b0
[  268.461008]  ? mnt_ns_release+0x18d/0x200
[  268.461197]  ? report_bug+0x2cb/0x4b0
[  268.461377]  ? mnt_ns_release+0x18d/0x200
[  268.461587]  ? mnt_ns_release+0x18e/0x200
[  268.461781]  ? handle_bug+0xf1/0x190
[  268.461961]  ? exc_invalid_op+0x3c/0x80
[  268.462152]  ? asm_exc_invalid_op+0x1f/0x30
[  268.462357]  ? rcu_core+0x86a/0x1920
[  268.462539]  ? mnt_ns_release+0x17f/0x200
[  268.462733]  ? mnt_ns_release+0x18d/0x200
[  268.462929]  ? mnt_ns_release+0x18d/0x200
[  268.463118]  ? mnt_ns_release+0x18d/0x200
[  268.463313]  ? rcu_core+0x86a/0x1920
[  268.463491]  mnt_ns_release_rcu+0x1f/0x30
[  268.463688]  rcu_core+0x86c/0x1920
[  268.463863]  ? __pfx_rcu_core+0x10/0x10
[  268.464058]  rcu_core_si+0x12/0x20
[  268.464229]  handle_softirqs+0x1c5/0x860
[  268.464431]  __irq_exit_rcu+0x10e/0x170
[  268.464620]  irq_exit_rcu+0x12/0x30
[  268.464795]  sysvec_apic_timer_interrupt+0xb4/0xd0
[  268.465029]  </IRQ>
[  268.465139]  <TASK>
[  268.465251]  asm_sysvec_apic_timer_interrupt+0x1f/0x30
[  268.465517] RIP: 0010:mnt_ns_tree_add+0xbe/0x490
[  268.465745] Code: 0f 85 34 03 00 00 49 bd 00 00 00 00 00 fc ff df 4d 8b 64 24 38 e8 c2 7a 7e ff 48 8d 7b 98 48d
[  268.466596] RSP: 0018:ffff88801f1b7cc8 EFLAGS: 00000246
[  268.466845] RAX: 1ffff11001afa347 RBX: ffff88800d7d1aa0 RCX: ffffffff8209996c
[  268.467180] RDX: ffff888020b40000 RSI: ffffffff8209992e RDI: ffff88800d7d1a38
[  268.467517] RBP: ffff88801f1b7cf8 R08: 0000000000000000 R09: fffffbfff15085a2
[  268.467844] R10: 0000000000000008 R11: 0000000000000001 R12: 0000000000002725
[  268.468167] R13: dffffc0000000000 R14: ffff888014a71c00 R15: ffff88800d7d12a8
[  268.468501]  ? mnt_ns_tree_add+0xec/0x490
[  268.468697]  ? mnt_ns_tree_add+0xae/0x490
[  268.468891]  ? mnt_ns_tree_add+0xae/0x490
[  268.469088]  copy_mnt_ns+0x5ed/0xa90
[  268.469272]  create_new_namespaces+0xe2/0xb40
[  268.469514]  ? security_capable+0x19d/0x1e0
[  268.469727]  unshare_nsproxy_namespaces+0xca/0x200
[  268.469959]  ksys_unshare+0x482/0xae0
[  268.470138]  ? __pfx_ksys_unshare+0x10/0x10
[  268.470346]  ? __audit_syscall_entry+0x39c/0x500
[  268.470572]  __x64_sys_unshare+0x3a/0x50
[  268.470760]  x64_sys_call+0xd3e/0x2140
[  268.470942]  do_syscall_64+0x6d/0x140
[  268.471118]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[  268.471362] RIP: 0033:0x7f7f8fc3ee5d
[  268.471535] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 898
[  268.472375] RSP: 002b:00007ffd32de1248 EFLAGS: 00000286 ORIG_RAX: 0000000000000110
[  268.472719] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f7f8fc3ee5d
[  268.473052] RDX: ffffffffffffff80 RSI: ffffffffffffff80 RDI: 0000000020060480
[  268.473386] RBP: 00007ffd32de1250 R08: 00007ffd32de1280 R09: 00007ffd32de1280
[  268.473720] R10: 00007ffd32de1280 R11: 0000000000000286 R12: 00007ffd32de13a8
[  268.474046] R13: 0000000000401713 R14: 0000000000403e08 R15: 00007f7f90006000
[  268.474387]  </TASK>
[  268.474499] irq event stamp: 1528
[  268.474665] hardirqs last  enabled at (1536): [<ffffffff81662885>] __up_console_sem+0x95/0xb0
[  268.475065] hardirqs last disabled at (1543): [<ffffffff8166286a>] __up_console_sem+0x7a/0xb0
[  268.475454] softirqs last  enabled at (0): [<ffffffff81463a8e>] copy_process+0x1d4e/0x6a40
[  268.475830] softirqs last disabled at (661): [<ffffffff8148a84e>] __irq_exit_rcu+0x10e/0x170
[  268.476213] ---[ end trace 0000000000000000 ]---
"

Regards,
Yi Lai

---

If you don't need the following environment to reproduce the problem or if you
already have one reproduced environment, please ignore the following information.

How to reproduce:
git clone https://gitlab.com/xupengfe/repro_vm_env.git
cd repro_vm_env
tar -xvf repro_vm_env.tar.gz
cd repro_vm_env; ./start3.sh  // it needs qemu-system-x86_64 and I used v7.1.0
  // start3.sh will load bzImage_2241ab53cbb5cdb08a6b2d4688feb13971058f65 v6.2-rc5 kernel
  // You could change the bzImage_xxx as you want
  // Maybe you need to remove line "-drive if=pflash,format=raw,readonly=on,file=./OVMF_CODE.fd \" for different qemu version
You could use below command to log in, there is no password for root.
ssh -p 10023 root@localhost

After login vm(virtual machine) successfully, you could transfer reproduced
binary to the vm by below way, and reproduce the problem in vm:
gcc -pthread -o repro repro.c
scp -P 10023 repro root@localhost:/root/

Get the bzImage for target kernel:
Please use target kconfig and copy it to kernel_src/.config
make olddefconfig
make -jx bzImage           //x should equal or less than cpu num your pc has

Fill the bzImage file into above start3.sh to load the target kernel in vm.


Tips:
If you already have qemu-system-x86_64, please ignore below info.
If you want to install qemu v7.1.0 version:
git clone https://github.com/qemu/qemu.git
cd qemu
git checkout -f v7.1.0
mkdir build
cd build
yum install -y ninja-build.x86_64
yum -y install libslirp-devel.x86_64
../configure --target-list=x86_64-softmmu --enable-kvm --enable-vnc --enable-gtk --enable-sdl --enable-usb-redir --enable-slirp
make
make install 




[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [NTFS 3]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [NTFS 3]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux