[PATCH 3/3] vfs: fix filesystem_sync vs write race on rw=>ro remount v2

Dmitry Monakhov <dmonakhov@xxxxxxxxxx> · Sat, 27 Feb 2010 03:25:49 +0300

Currently on rw=>ro remount we have following race
| mount /mnt -oremount,ro | write-task |
|-------------------------+------------|
|                         | open(RDWR) |
| shrink_dcache_sb(sb);   |            |
| sync_filesystem(sb);    |            |
|                         | write()    |
|                         | close()    |
| fs_may_remount_ro(sb)   |            |
| sb->s_flags = new_flags |            |

Later writeback or sync() will result in error due to MS_RDONLY flag
In case of ext4 this result in jbd2_start failure on writeback
LOG: ext4_da_writepages: jbd2_start: 1024 pages, ino 1431; err -30

This patch fix the issue like follows:
1) Introduce ST_REMOUNT_RO bit with will be set on remount start
   This bit will be cleared if new writers appear.
2) Redesign fs_may_remount_ro. Now it is just calculate sum of
   corresponding vfsmounts
3) The rest is simple, we just perform sync and check than no new
   writers appear.

##TESTCASE_BEGIN:
#! /bin/bash -x
DEV=/dev/sdb5
FSTYPE=ext4
BINDIR=/home/dmon
MNTOPT="data=ordered"
umount /mnt
mkfs.${FSTYPE}  ${DEV} || exit 1
mount  ${DEV} /mnt -o${MNTOPT} || exit 1
${BINDIR}/fsstress -p1 -l999999999 -n9999999999 -d /mnt/test &
sleep 15
mount /mnt -oremount,ro,${MNTOPT}
sleep 1
killall -9 fsstress
sync
# after this you may get following message in dmesg
# "ext4_da_writepages: jbd2_start: 1024 pages, ino 1431; err -30"
##TESTCASE_END

Signed-off-by: Dmitry Monakhov <dmonakhov@xxxxxxxxxx>
---
 fs/file_table.c    |   24 ------------------------
 fs/namespace.c     |   48 ++++++++++++++++++++++++++++++++++++++++++++----
 fs/super.c         |   33 +++++++++++++++++++++++++++------
 include/linux/fs.h |    1 +
 4 files changed, 72 insertions(+), 34 deletions(-)

diff --git a/fs/file_table.c b/fs/file_table.c
index b98404b..32d37af 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -348,30 +348,6 @@ void file_kill(struct file *file)
 	}
 }
 
-int fs_may_remount_ro(struct super_block *sb)
-{
-	struct file *file;
-
-	/* Check that no files are currently opened for writing. */
-	file_list_lock();
-	list_for_each_entry(file, &sb->s_files, f_u.fu_list) {
-		struct inode *inode = file->f_path.dentry->d_inode;
-
-		/* File with pending delete? */
-		if (inode->i_nlink == 0)
-			goto too_bad;
-
-		/* Writeable file? */
-		if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))
-			goto too_bad;
-	}
-	file_list_unlock();
-	return 1; /* Tis' cool bro. */
-too_bad:
-	file_list_unlock();
-	return 0;
-}
-
 /**
  *	mark_files_ro - mark all files read-only
  *	@sb: superblock in question
diff --git a/fs/namespace.c b/fs/namespace.c
index e816097..bc79f1f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -277,6 +277,13 @@ int mnt_want_write(struct vfsmount *mnt)
 		dec_mnt_writers(mnt);
 		ret = -EROFS;
 		goto out;
+	} else {
+		/*
+		 * Clear ST_REMOUNT_RO flag to let remount task know
+		 * about new writers.
+		 */
+		if (unlikely(test_bit(ST_REMOUNT_RO, &mnt->mnt_sb->s_state)))
+			clear_bit(ST_REMOUNT_RO, &mnt->mnt_sb->s_state);
 	}
 out:
 	preempt_enable();
@@ -341,11 +348,10 @@ void mnt_drop_write(struct vfsmount *mnt)
 }
 EXPORT_SYMBOL_GPL(mnt_drop_write);
 
-static int mnt_make_readonly(struct vfsmount *mnt)
+static int mnt_check_writers(struct vfsmount *mnt, int set_ro)
 {
 	int ret = 0;
 
-	spin_lock(&vfsmount_lock);
 	mnt->mnt_flags |= MNT_WRITE_HOLD;
 	/*
 	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
@@ -371,14 +377,23 @@ static int mnt_make_readonly(struct vfsmount *mnt)
 	 */
 	if (count_mnt_writers(mnt) > 0)
 		ret = -EBUSY;
-	else
-		mnt->mnt_flags |= MNT_READONLY;
+	else {
+		if (likely(set_ro))
+			mnt->mnt_flags |= MNT_READONLY;
+	}
 	/*
 	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
 	 * that become unheld will see MNT_READONLY.
 	 */
 	smp_wmb();
 	mnt->mnt_flags &= ~MNT_WRITE_HOLD;
+	return ret;
+}
+static int mnt_make_readonly(struct vfsmount *mnt)
+{
+	int ret;
+	spin_lock(&vfsmount_lock);
+	ret = mnt_check_writers(mnt, 1);
 	spin_unlock(&vfsmount_lock);
 	return ret;
 }
@@ -389,6 +404,31 @@ static void __mnt_unmake_readonly(struct vfsmount *mnt)
 	mnt->mnt_flags &= ~MNT_READONLY;
 	spin_unlock(&vfsmount_lock);
 }
+/**
+ * Check whenever is it possible to remount given sb to readonly.
+ * @sb : super block in question
+ *
+ * Caller is responsible to set ST_REMOUNT_RO state before the call.
+ */
+int fs_may_remount_ro(struct super_block *sb)
+{
+	struct vfsmount *mnt;
+	int ret = 1;
+	spin_lock(&vfsmount_lock);
+	list_for_each_entry(mnt, &sb->s_vfsmount, mnt_sb_list) {
+		ret = !mnt_check_writers(mnt, 0);
+		if (ret)
+			break;
+	}
+	spin_unlock(&vfsmount_lock);
+	/*
+	 * If new writer appears after we have checked all vfsmounts.
+	 * Then ST_REMOUNT_RO bit will be cleared.
+	 */
+	if (!test_bit(ST_REMOUNT_RO, &sb->s_state))
+		ret = 0;
+	return ret;
+}
 
 void super_add_mnt(struct super_block *sb, struct vfsmount *mnt)
 {
diff --git a/fs/super.c b/fs/super.c
index d3e0083..8cf148b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -571,6 +571,9 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	int retval;
 	int remount_rw, remount_ro;
 
+	remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
+	remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
+
 	if (sb->s_frozen != SB_UNFROZEN)
 		return -EBUSY;
 
@@ -581,18 +584,33 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 
 	if (flags & MS_RDONLY)
 		acct_auto_close(sb);
+	if (remount_ro) {
+		if (force)
+			mark_files_ro(sb);
+		/*
+		 * Store ST_REMOUNT_RO flag. New writers (in any) will clrear
+		 * this bit.
+		 */
+		set_bit(ST_REMOUNT_RO, &sb->s_state);
+		/*
+		 * This store should be visible before we do.
+		 */
+		smp_mb();
+		/*
+		 * To prevent sync vs write race condition we have to check
+		 * writers before filesystem sync.
+		 */
+		if (!fs_may_remount_ro(sb))
+			return -EBUSY;
+	}
 	shrink_dcache_sb(sb);
 	sync_filesystem(sb);
 
-	remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
-	remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
 
 	/* If we are remounting RDONLY and current sb is read/write,
-	   make sure there are no rw files opened */
+	   make sure there are no new rw files opened */
 	if (remount_ro) {
-		if (force)
-			mark_files_ro(sb);
-		else if (!fs_may_remount_ro(sb))
+		if (!fs_may_remount_ro(sb))
 			return -EBUSY;
 		retval = vfs_dq_off(sb, 1);
 		if (retval < 0 && retval != -ENOSYS)
@@ -617,6 +635,9 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	 */
 	if (remount_ro && sb->s_bdev)
 		invalidate_bdev(sb->s_bdev);
+
+	WARN_ON(remount_ro && !fs_may_remount_ro(sb));
+	clear_bit(ST_REMOUNT_RO, &sb->s_state);
 	return 0;
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 75f057d..0ad7aa4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1308,6 +1308,7 @@ extern int send_sigurg(struct fown_struct *fown);
 
 enum {
 	ST_FS_SYNC, 	/* fssync is about to happen */
+	ST_REMOUNT_RO,	/* readonly remount is in progress */
 };
 extern struct list_head super_blocks;
 extern spinlock_t sb_lock;
-- 
1.6.6

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html