[PATCH 7/9] fsfreeze: freeze_super and thaw_bdev don't play well together

Fernando Luis Vázquez Cao <fernando_b1@xxxxxxxxxxxxx> · Fri, 05 Oct 2012 14:42:40 +0900

thaw_bdev() has re-entrancy guards to allow freezes to nest
together. That is, it ensures that the filesystem is not thawed
until the last thaw command is issued. This is needed to prevent the
filesystem from being unfrozen while an existing freezer is still
operating on the filesystem in a frozen state (e.g. dm-snapshot).

Currently, freeze_super() and thaw_super() bypasses these guards,
and as a result manual freezing and unfreezing via the ioctl methods
do not nest correctly. hence mixing userspace directed freezes with
block device level freezes result in inconsistency due to premature
thawing of the filesystem.

Move the re-enterency guards to the superblock and into freeze_super
and thaw_super() so that userspace directed freezes nest correctly
again. Caveat: Things work as expected as long as direct calls to
thaw_super are always in response to a previous sb level freeze. In
other words an unpaired call to thaw_super can still thaw a
filesystem frozen using freeze_bdev (this issue could be addressed
in a follow-up patch if deemed necessary).

This patch retains the bdev level mutex and counter to keep the
"feature" that we can freeze a block device that does not have a
filesystem mounted yet.

IMPORTANT CAVEAT: This patch restores the nesting feature of the fsfreeze ioctl
which got removed by commit 18e9e5104fcd9a973ffe3eed3816c87f2a1b6cd2
("Introduce freeze_super and thaw_super for the fsfreeze ioctl"). It could be
argued that it is too late to fix the userland ABI breakage caused by that
patch and that the current ABI is the one that should be kept. If this is the
respective maintainer(s) opinion this could be modified to not allow fsfreeze
ioctl nesting.

Cc: Josef Bacik <jbacik@xxxxxxxxxxxx>
Cc: Eric Sandeen <sandeen@xxxxxxxxxx>
Cc: Christoph Hellwig <hch@xxxxxxxxxxxxx>
Cc: Jan Kara <jack@xxxxxxx>
Cc: Dave Chinner <dchinner@xxxxxxxxxx>
Cc: Konishi Ryusuke <konishi.ryusuke@xxxxxxxxxxxxx>
Cc: Steven Whitehouse <swhiteho@xxxxxxxxxx>
Signed-off-by: Fernando Luis Vazquez Cao <fernando@xxxxxxxxxxxxx>
---

diff -urNp linux-3.6-orig/fs/block_dev.c linux-3.6/fs/block_dev.c

--- linux-3.6-orig/fs/block_dev.c	2012-10-04 15:05:42.168084928 +0900
+++ linux-3.6/fs/block_dev.c	2012-10-04 15:08:40.228086607 +0900
@@ -257,16 +257,18 @@ int fsync_bdev(struct block_device *bdev
 EXPORT_SYMBOL(fsync_bdev);
 
 /**
- * freeze_bdev  --  lock a filesystem and force it into a consistent state
+ * freeze_bdev  --  lock a block device
  * @bdev:	blockdevice to lock
  *
- * If a superblock is found on this device, we take the s_umount semaphore
- * on it to make sure nobody unmounts until the snapshot creation is done.
- * The reference counter (bd_fsfreeze_count) guarantees that only the last
- * unfreeze process can unfreeze the frozen filesystem actually when multiple
- * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
- * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
- * actually.
+ * Locks the block device and, if present, the associated filesystem too.
+ *
+ * The reference counter (bd_fsfreeze_count) is used to implement the feature
+ * that allows one to freeze a block device that does not have a filesystem
+ * mounted yet. For filesystems using mount_bdev the kernel takes care of
+ * things by preventing the mount operation from succeeding if the underlying
+ * block device is frozen. Other filesystems should check this counter or risk
+ * a situation where a freeze_bdev user (e.g. dm snapshot) and mount race,
+ * which may lead to inconsistencies.
  */
 struct super_block *freeze_bdev(struct block_device *bdev)
 {
@@ -274,17 +276,7 @@ struct super_block *freeze_bdev(struct b
 	int error = 0;
 
 	mutex_lock(&bdev->bd_fsfreeze_mutex);
-	if (++bdev->bd_fsfreeze_count > 1) {
-		/*
-		 * We don't even need to grab a reference - the first call
-		 * to freeze_bdev grab an active reference and only the last
-		 * thaw_bdev drops it.
-		 */
-		sb = get_super(bdev);
-		drop_super(sb);
-		mutex_unlock(&bdev->bd_fsfreeze_mutex);
-		return sb;
-	}
+	bdev->bd_fsfreeze_count++;
 
 	sb = get_active_super(bdev);
 	if (!sb)
@@ -297,36 +289,48 @@ struct super_block *freeze_bdev(struct b
 		return ERR_PTR(error);
 	}
 	deactivate_super(sb);
- out:
+out:
 	sync_blockdev(bdev);
 	mutex_unlock(&bdev->bd_fsfreeze_mutex);
-	return sb;	/* thaw_bdev releases s->s_umount */
+	return sb;
 }
 EXPORT_SYMBOL(freeze_bdev);
 
 /**
- * thaw_bdev  -- unlock filesystem
+ * thaw_bdev  -- unlock a block device
  * @bdev:	blockdevice to unlock
  * @sb:		associated superblock
  *
- * Unlocks the filesystem and marks it writeable again after freeze_bdev().
+ * Unlocks the block device and, if present, the associated filesystem too.
  */
 int thaw_bdev(struct block_device *bdev, struct super_block *sb)
 {
 	int error = -EINVAL;
 
 	mutex_lock(&bdev->bd_fsfreeze_mutex);
+
 	if (!bdev->bd_fsfreeze_count)
 		goto out;
 
-	if (--bdev->bd_fsfreeze_count > 0 || !sb) {
+	bdev->bd_fsfreeze_count--;
+
+	if (!sb) {
 		error = 0;
 		goto out;
 	}
 
 	error = thaw_super(sb);
-	if (error)
+	/* If the superblock is already unfrozen, i.e. thaw_super() returned
+	 * -EINVAL, we consider the block device level thaw successful. This
+	 * behavior is important in a scenario where a filesystem frozen using
+	 * freeze_bdev() is thawed through the superblock level API; if we
+	 * caused the subsequent thaw_bdev() to fail bdev->bd_fsfreeze_count
+	 * would not go back to 0 which means that future calls to freeze_bdev()
+	 * would not freeze the superblock, just increase the counter. */
+	if (error && error != -EINVAL)
 		bdev->bd_fsfreeze_count++;
+	else
+		error = 0;
 out:
 	mutex_unlock(&bdev->bd_fsfreeze_mutex);
 	return error;
diff -urNp linux-3.6-orig/fs/gfs2/ops_fstype.c linux-3.6/fs/gfs2/ops_fstype.c
--- linux-3.6-orig/fs/gfs2/ops_fstype.c	2012-10-01 08:47:46.000000000 +0900
+++ linux-3.6/fs/gfs2/ops_fstype.c	2012-10-04 15:08:40.228086607 +0900
@@ -1289,9 +1289,13 @@ static struct dentry *gfs2_mount(struct
 		return ERR_CAST(bdev);
 
 	/*
-	 * once the super is inserted into the list by sget, s_umount
-	 * will protect the lockfs code from trying to start a snapshot
-	 * while we are mounting
+	 * Once the super is inserted into the list by sget, s_umount will
+	 * protect the block device level fsfreeze code from trying to start a
+	 * snapshot while we are mounting. There is one caveat though: the
+	 * mount can still fail after sget(), in which case we would release
+	 * s_umount before the superblock is fully initialized. The fsfreeze
+	 * code takes care of the latter case by canceling the freeze and
+	 * bailing out if the MS_BORN flag is not set in the superblock.
 	 */
 	mutex_lock(&bdev->bd_fsfreeze_mutex);
 	if (bdev->bd_fsfreeze_count > 0) {
diff -urNp linux-3.6-orig/fs/nilfs2/super.c linux-3.6/fs/nilfs2/super.c
--- linux-3.6-orig/fs/nilfs2/super.c	2012-10-01 08:47:46.000000000 +0900
+++ linux-3.6/fs/nilfs2/super.c	2012-10-04 15:08:40.228086607 +0900
@@ -1274,9 +1274,13 @@ nilfs_mount(struct file_system_type *fs_
 	}
 
 	/*
-	 * once the super is inserted into the list by sget, s_umount
-	 * will protect the lockfs code from trying to start a snapshot
-	 * while we are mounting
+	 * Once the super is inserted into the list by sget, s_umount will
+	 * protect the block device level fsfreeze code from trying to start a
+	 * snapshot while we are mounting. There is one caveat though: the
+	 * mount can still fail after sget(), in which case we would release
+	 * s_umount before the superblock is fully initialized. The fsfreeze
+	 * code takes care of the latter case by canceling the freeze and
+	 * bailing out if the MS_BORN flag is not set in the superblock.
 	 */
 	mutex_lock(&sd.bdev->bd_fsfreeze_mutex);
 	if (sd.bdev->bd_fsfreeze_count > 0) {
diff -urNp linux-3.6-orig/fs/super.c linux-3.6/fs/super.c
--- linux-3.6-orig/fs/super.c	2012-10-04 15:06:00.264085275 +0900
+++ linux-3.6/fs/super.c	2012-10-04 15:09:21.164086840 +0900
@@ -1033,9 +1033,13 @@ struct dentry *mount_bdev(struct file_sy
 		return ERR_CAST(bdev);
 
 	/*
-	 * once the super is inserted into the list by sget, s_umount
-	 * will protect the lockfs code from trying to start a snapshot
-	 * while we are mounting
+	 * Once the super is inserted into the list by sget, s_umount will
+	 * protect the block device level fsfreeze code from trying to start a
+	 * snapshot while we are mounting. There is one caveat though: the
+	 * mount can still fail after sget(), in which case we would release
+	 * s_umount before the superblock is fully initialized. The fsfreeze
+	 * code takes care of the latter case by canceling the freeze and
+	 * bailing out if the MS_BORN flag is not set in the superblock.
 	 */
 	mutex_lock(&bdev->bd_fsfreeze_mutex);
 	if (bdev->bd_fsfreeze_count > 0) {
@@ -1330,8 +1334,11 @@ static void sb_wait_write(struct super_b
  * @sb: the super to lock
  *
  * Syncs the super to make sure the filesystem is consistent and calls the fs's
- * freeze_fs.  Subsequent calls to this without first thawing the fs will return
- * -EBUSY.
+ * freeze_fs. The reference counter (s_freeze_count) guarantees that only the
+ * last unfreeze process can unfreeze the frozen filesystem actually when
+ * multiple freeze requests arrive simultaneously. It counts up in
+ * freeze_super() and counts down in thaw_super(). When it becomes 0,
+ * thaw_super() will execute the unfreeze.
  *
  * During this function, sb->s_writers.frozen goes through these values:
  *
@@ -1356,29 +1363,29 @@ static void sb_wait_write(struct super_b
  * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is
  * mostly auxiliary for filesystems to verify they do not modify frozen fs.
  *
- * sb->s_writers.frozen is protected by sb->s_umount.
+ * sb->s_writers.frozen and sb->s_freeze_count are protected by sb->s_umount.
  */
 int freeze_super(struct super_block *sb)
 {
-	int ret;
+	int ret = 0;
 
 	atomic_inc(&sb->s_active);
 	down_write(&sb->s_umount);
-	if (sb->s_writers.frozen != SB_UNFROZEN) {
-		deactivate_locked_super(sb);
-		return -EBUSY;
-	}
 
-	if (!(sb->s_flags & MS_BORN)) {
-		up_write(&sb->s_umount);
-		return 0;	/* sic - it's "nothing to do" */
-	}
+	if (++sb->s_freeze_count > 1)
+		goto out_deactivate;
+
+	/* If MS_BORN is not set it means we have a failing mount (this is
+	 * possible if we got here from freeze_bdev()). Keep an active
+	 * reference so that the superblock is not killed until it is thawed
+	 * via thaw_bdev(). */
+	if (!(sb->s_flags & MS_BORN))
+		goto out_unlock;
 
 	if (sb->s_flags & MS_RDONLY) {
 		/* Nothing to do really... */
 		sb->s_writers.frozen = SB_FREEZE_COMPLETE;
-		up_write(&sb->s_umount);
-		return 0;
+		goto out_unlock;
 	}
 
 	/* From now on, no new normal writers can start */
@@ -1410,11 +1417,11 @@ int freeze_super(struct super_block *sb)
 		if (ret) {
 			printk(KERN_ERR
 				"VFS:Filesystem freeze failed\n");
+			sb->s_freeze_count--;
 			sb->s_writers.frozen = SB_UNFROZEN;
 			smp_wmb();
 			wake_up(&sb->s_writers.wait_unfrozen);
-			deactivate_locked_super(sb);
-			return ret;
+			goto out_deactivate;
 		}
 	}
 	/*
@@ -1422,8 +1429,12 @@ int freeze_super(struct super_block *sb)
 	 * sees write activity when frozen is set to SB_FREEZE_COMPLETE.
 	 */
 	sb->s_writers.frozen = SB_FREEZE_COMPLETE;
+out_unlock:
 	up_write(&sb->s_umount);
-	return 0;
+	return ret;
+out_deactivate:
+	deactivate_locked_super(sb);
+	return ret;
 }
 EXPORT_SYMBOL(freeze_super);
 
@@ -1433,6 +1444,10 @@ EXPORT_SYMBOL(freeze_super);
  *
  * Unlocks the filesystem and marks it writeable again.
  *
+ * Returns -EINVAL if @sb is not frozen, the number of nested freezes remaining
+ * after this thaw if it succeeded or the corresponding error code otherwise.
+ * If the unfreeze fails, @sb is left in the frozen state.
+ *
  * This is the unlocked version of thaw_super, so it is the caller's job to
  * to protect the superblock by grabbing the s_umount semaphore in write mode
  * and release it again on return. See thaw_super() for an example.
@@ -1441,11 +1456,16 @@ static int __thaw_super(struct super_blo
 {
 	int error = 0;
 
-	if (sb->s_writers.frozen == SB_UNFROZEN) {
+	if (!sb->s_freeze_count) {
 		error = -EINVAL;
 		goto out;
 	}
 
+	if (--sb->s_freeze_count > 0) {
+		error = sb->s_freeze_count;
+		goto out;
+	}
+
 	if (sb->s_flags & MS_RDONLY)
 		goto out_thaw;
 
@@ -1454,6 +1474,7 @@ static int __thaw_super(struct super_blo
 		if (error) {
 			printk(KERN_ERR
 				"VFS:Filesystem thaw failed\n");
+			sb->s_freeze_count++;
 			goto out;
 		}
 	}
@@ -1477,11 +1498,11 @@ int thaw_super(struct super_block *sb)
 	int res;
 	down_write(&sb->s_umount);
 	res = __thaw_super(sb);
-	if (!res)
+	if (!res) /* Active reference released after last thaw. */
 		deactivate_locked_super(sb);
 	else
 		up_write(&sb->s_umount);
-	return res;
+	return res > 0 ? 0 : res;
 }
 EXPORT_SYMBOL(thaw_super);
 
@@ -1489,6 +1510,9 @@ static void do_thaw_one(struct super_blo
 {
 	int res;
 
+	if (sb->s_writers.frozen == SB_UNFROZEN)
+		return;
+
 	if (sb->s_bdev) {
 		char b[BDEVNAME_SIZE];
 		printk(KERN_WARNING "Emergency Thaw on %s.\n",
@@ -1497,6 +1521,7 @@ static void do_thaw_one(struct super_blo
 
 	/* We got here from __iterate_supers with the superblock lock taken
 	 * so we can call the lockless version of thaw_super() safely. */
+	sb->s_freeze_count = 1; /* avoid multiple calls to __thaw_super */
 	res = __thaw_super(sb);
 	/* If we are going to drop the final active reference call
 	 * deactivate_locked_super to clean things up. In the general case
diff -urNp linux-3.6-orig/include/linux/fs.h linux-3.6/include/linux/fs.h
--- linux-3.6-orig/include/linux/fs.h	2012-10-04 15:06:00.264085275 +0900
+++ linux-3.6/include/linux/fs.h	2012-10-04 15:08:40.232086608 +0900
@@ -1578,6 +1578,9 @@ struct super_block {
 
 	/* Being remounted read-only */
 	int s_readonly_remount;
+
+	/* Number of nested freezes */
+	int s_freeze_count;
 };
 
 /* superblock cache pruning functions */


--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html