I established steps to reproduce. On a 2.6.38 kernel built with CONFIG_SMP=y running on an SMP machine, run while :; do newns /sbin/pivot_root /proc/ /proc/sys/; done on one terminal and run while :; do /bin/ls -l /proc/*/exe; done on another terminal. (The "newns" is a program that unshares the mnt namespace before execve() using CLONE_NEWNS.) Below patch releases/reacquires vfsmount_lock when rescheduling is required. --- fs/dcache.c | 15 ++++++++++++++- fs/namespace.c | 6 ++++++ include/linux/sched.h | 2 ++ include/linux/seqlock.h | 4 ++++ 4 files changed, 26 insertions(+), 1 deletion(-) --- linux-2.6.38.orig/include/linux/sched.h +++ linux-2.6.38/include/linux/sched.h @@ -1528,6 +1528,8 @@ struct task_struct { unsigned long memsw_bytes; /* uncharged mem+swap usage */ } memcg_batch; #endif + u8 vfsmntlock_held_for_write; + u8 retry_vfsmntlock; }; /* Future-safe accessor for struct task_struct's cpus_allowed. */ --- linux-2.6.38.orig/include/linux/seqlock.h +++ linux-2.6.38/include/linux/seqlock.h @@ -82,6 +82,8 @@ static inline int write_tryseqlock(seqlo return ret; } +extern bool need_to_giveup(const seqlock_t *sl); + /* Start of read calculation -- fetch last complete writer token */ static __always_inline unsigned read_seqbegin(const seqlock_t *sl) { @@ -92,6 +94,8 @@ repeat: smp_rmb(); if (unlikely(ret & 1)) { cpu_relax(); + if (need_to_giveup(sl)) + return ret; goto repeat; } --- linux-2.6.38.orig/fs/dcache.c +++ linux-2.6.38/fs/dcache.c @@ -2861,6 +2861,7 @@ int is_subdir(struct dentry *new_dentry, if (new_dentry == old_dentry) return 1; + current->retry_vfsmntlock = 0; do { /* for restarting inner loop in case of seq retry */ seq = read_seqbegin(&rename_lock); @@ -2874,11 +2875,23 @@ int is_subdir(struct dentry *new_dentry, else result = 0; rcu_read_unlock(); - } while (read_seqretry(&rename_lock, seq)); + } while (read_seqretry(&rename_lock, seq) && !current->retry_vfsmntlock); return result; } +bool need_to_giveup(const seqlock_t *sl) +{ + if (sl == &rename_lock && current->vfsmntlock_held_for_write && + need_resched()) { + current->retry_vfsmntlock = true; + WARN(1, "Requesting for releasing vfsmount_lock"); + return true; + } + return false; +} +EXPORT_SYMBOL(need_to_giveup); + int path_is_under(struct path *path1, struct path *path2) { struct vfsmount *mnt = path1->mnt; --- linux-2.6.38.orig/fs/namespace.c +++ linux-2.6.38/fs/namespace.c @@ -2515,7 +2515,9 @@ SYSCALL_DEFINE2(pivot_root, const char _ goto out2; /* not attached */ /* make sure we can reach put_old from new_root */ tmp = old.mnt; +retry_vfsmnt_lock: br_write_lock(vfsmount_lock); + current->vfsmntlock_held_for_write = 1; if (tmp != new.mnt) { for (;;) { if (tmp->mnt_parent == tmp) @@ -2536,6 +2538,7 @@ SYSCALL_DEFINE2(pivot_root, const char _ attach_mnt(new.mnt, &root_parent); touch_mnt_namespace(current->nsproxy->mnt_ns); br_write_unlock(vfsmount_lock); + current->vfsmntlock_held_for_write = 0; chroot_fs_refs(&root, &new); error = 0; @@ -2552,6 +2555,9 @@ out0: return error; out3: br_write_unlock(vfsmount_lock); + current->vfsmntlock_held_for_write = 0; + if (current->retry_vfsmntlock) + goto retry_vfsmnt_lock; goto out2; } You will see warning like below. [ 330.216989] ------------[ cut here ]------------ [ 330.218601] WARNING: at fs/dcache.c:2888 need_to_giveup+0x52/0x60() [ 330.219509] Hardware name: VMware Virtual Platform [ 330.220644] Requesting for releasing vfsmount_lock [ 330.221426] Modules linked in: ipv6 video sbs sbshc battery ac floppy rtc_cmos button rtc_core serio_raw rtc_lib pcnet32 mii i2c_piix4 i2c_core mptspi mptscsih mptbase scsi_transport_spi ext3 jbd mbcache [ 330.230432] Pid: 8936, comm: pivot_root Tainted: G W 2.6.38 #2 [ 330.232003] Call Trace: [ 330.233153] [<c04d2bb2>] ? need_to_giveup+0x52/0x60 [ 330.234420] [<c043f6bc>] ? warn_slowpath_common+0x7c/0xa0 [ 330.235418] [<c04d2bb2>] ? need_to_giveup+0x52/0x60 [ 330.236417] [<c043f75e>] ? warn_slowpath_fmt+0x2e/0x30 [ 330.237459] [<c04d2bb2>] ? need_to_giveup+0x52/0x60 [ 330.238492] [<c04d2d37>] ? is_subdir+0xb7/0xe0 [ 330.239429] [<c04d2cbd>] ? is_subdir+0x3d/0xe0 [ 330.240420] [<c04da8eb>] ? sys_pivot_root+0x22b/0x2a0 [ 330.241775] [<c06c519c>] ? restore_all_notrace+0x0/0x18 [ 330.243005] [<c04289a0>] ? do_page_fault+0x0/0x420 [ 330.244414] [<c0402c50>] ? sysenter_do_call+0x12/0x36 [ 330.245416] ---[ end trace 5017bf44a4ddf1e2 ]--- 3.6.37 seems to be OK because __d_path() was protected by dcache_lock. Regards. -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html