Re: [2.6.38] Deadlock between rename_lock and vfsmount_lock.

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



I established steps to reproduce.

On a 2.6.38 kernel built with CONFIG_SMP=y running on an SMP machine, run

  while :; do newns /sbin/pivot_root /proc/ /proc/sys/; done

on one terminal and run

  while :; do /bin/ls -l /proc/*/exe; done

on another terminal. (The "newns" is a program that unshares the mnt namespace
before execve() using CLONE_NEWNS.)

Below patch releases/reacquires vfsmount_lock when rescheduling is required.
---
 fs/dcache.c             |   15 ++++++++++++++-
 fs/namespace.c          |    6 ++++++
 include/linux/sched.h   |    2 ++
 include/linux/seqlock.h |    4 ++++
 4 files changed, 26 insertions(+), 1 deletion(-)

--- linux-2.6.38.orig/include/linux/sched.h
+++ linux-2.6.38/include/linux/sched.h
@@ -1528,6 +1528,8 @@ struct task_struct {
 		unsigned long memsw_bytes; /* uncharged mem+swap usage */
 	} memcg_batch;
 #endif
+	u8 vfsmntlock_held_for_write;
+	u8 retry_vfsmntlock;
 };
 
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
--- linux-2.6.38.orig/include/linux/seqlock.h
+++ linux-2.6.38/include/linux/seqlock.h
@@ -82,6 +82,8 @@ static inline int write_tryseqlock(seqlo
 	return ret;
 }
 
+extern bool need_to_giveup(const seqlock_t *sl);
+
 /* Start of read calculation -- fetch last complete writer token */
 static __always_inline unsigned read_seqbegin(const seqlock_t *sl)
 {
@@ -92,6 +94,8 @@ repeat:
 	smp_rmb();
 	if (unlikely(ret & 1)) {
 		cpu_relax();
+		if (need_to_giveup(sl))
+			return ret;
 		goto repeat;
 	}
 
--- linux-2.6.38.orig/fs/dcache.c
+++ linux-2.6.38/fs/dcache.c
@@ -2861,6 +2861,7 @@ int is_subdir(struct dentry *new_dentry,
 	if (new_dentry == old_dentry)
 		return 1;
 
+	current->retry_vfsmntlock = 0;
 	do {
 		/* for restarting inner loop in case of seq retry */
 		seq = read_seqbegin(&rename_lock);
@@ -2874,11 +2875,23 @@ int is_subdir(struct dentry *new_dentry,
 		else
 			result = 0;
 		rcu_read_unlock();
-	} while (read_seqretry(&rename_lock, seq));
+	} while (read_seqretry(&rename_lock, seq) && !current->retry_vfsmntlock);
 
 	return result;
 }
 
+bool need_to_giveup(const seqlock_t *sl)
+{
+	if (sl == &rename_lock && current->vfsmntlock_held_for_write &&
+	    need_resched()) {
+		current->retry_vfsmntlock = true;
+		WARN(1, "Requesting for releasing vfsmount_lock");
+		return true;
+	}
+	return false;
+}
+EXPORT_SYMBOL(need_to_giveup);
+
 int path_is_under(struct path *path1, struct path *path2)
 {
 	struct vfsmount *mnt = path1->mnt;
--- linux-2.6.38.orig/fs/namespace.c
+++ linux-2.6.38/fs/namespace.c
@@ -2515,7 +2515,9 @@ SYSCALL_DEFINE2(pivot_root, const char _
 		goto out2; /* not attached */
 	/* make sure we can reach put_old from new_root */
 	tmp = old.mnt;
+retry_vfsmnt_lock:
 	br_write_lock(vfsmount_lock);
+	current->vfsmntlock_held_for_write = 1;
 	if (tmp != new.mnt) {
 		for (;;) {
 			if (tmp->mnt_parent == tmp)
@@ -2536,6 +2538,7 @@ SYSCALL_DEFINE2(pivot_root, const char _
 	attach_mnt(new.mnt, &root_parent);
 	touch_mnt_namespace(current->nsproxy->mnt_ns);
 	br_write_unlock(vfsmount_lock);
+	current->vfsmntlock_held_for_write = 0;
 	chroot_fs_refs(&root, &new);
 
 	error = 0;
@@ -2552,6 +2555,9 @@ out0:
 	return error;
 out3:
 	br_write_unlock(vfsmount_lock);
+	current->vfsmntlock_held_for_write = 0;
+	if (current->retry_vfsmntlock)
+		goto retry_vfsmnt_lock;
 	goto out2;
 }
 

You will see warning like below.

[  330.216989] ------------[ cut here ]------------
[  330.218601] WARNING: at fs/dcache.c:2888 need_to_giveup+0x52/0x60()
[  330.219509] Hardware name: VMware Virtual Platform
[  330.220644] Requesting for releasing vfsmount_lock
[  330.221426] Modules linked in: ipv6 video sbs sbshc battery ac floppy rtc_cmos button rtc_core serio_raw rtc_lib pcnet32 mii i2c_piix4 i2c_core mptspi mptscsih mptbase scsi_transport_spi ext3 jbd mbcache
[  330.230432] Pid: 8936, comm: pivot_root Tainted: G        W   2.6.38 #2
[  330.232003] Call Trace:
[  330.233153]  [<c04d2bb2>] ? need_to_giveup+0x52/0x60
[  330.234420]  [<c043f6bc>] ? warn_slowpath_common+0x7c/0xa0
[  330.235418]  [<c04d2bb2>] ? need_to_giveup+0x52/0x60
[  330.236417]  [<c043f75e>] ? warn_slowpath_fmt+0x2e/0x30
[  330.237459]  [<c04d2bb2>] ? need_to_giveup+0x52/0x60
[  330.238492]  [<c04d2d37>] ? is_subdir+0xb7/0xe0
[  330.239429]  [<c04d2cbd>] ? is_subdir+0x3d/0xe0
[  330.240420]  [<c04da8eb>] ? sys_pivot_root+0x22b/0x2a0
[  330.241775]  [<c06c519c>] ? restore_all_notrace+0x0/0x18
[  330.243005]  [<c04289a0>] ? do_page_fault+0x0/0x420
[  330.244414]  [<c0402c50>] ? sysenter_do_call+0x12/0x36
[  330.245416] ---[ end trace 5017bf44a4ddf1e2 ]---

3.6.37 seems to be OK because __d_path() was protected by dcache_lock.

Regards.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]
  Powered by Linux