[PATCH 3/8] fsfreeze: freeze_super and thaw_bdev don't play well together

Fernando Luis Vázquez Cao <fernando_b1@xxxxxxxxxxxxx> · Thu, 12 Jul 2012 18:05:43 +0900

Changes from Dave Chinner's version:
 - Remove the s_frozen check in freeze_super(), which is no longer needed
   now that the function is re-entrant.
 - Decrement freeze counter if the freeze_fs callback fails.

---

thaw_bdev() has re-entrancy guards to allow freezes to nest
together. That is, it ensures that the filesystem is not thawed
until the last thaw command is issued. This is needed to prevent the
filesystem from being unfrozen while an existing freezer is still
operating on the filesystem in a frozen state (e.g. dm-snapshot).

Currently, freeze_super() and thaw_super() bypass these guards, and
as a result manual freezing and unfreezing via the ioctl methods do
not nest correctly. Hence, mixing userspace-directed freezes with
block-device-level freezes results in inconsistency due to premature
thawing of the filesystem.

Move the re-entrancy guards to the superblock and into freeze_super()
and thaw_super() so that userspace-directed freezes nest correctly
again. Caveat: things work as expected as long as direct calls to
thaw_super() are always in response to a previous sb-level freeze. In
other words, an unpaired call to thaw_super() can still thaw a
filesystem frozen using freeze_bdev() (this issue could be addressed
in a follow-up patch if deemed necessary).

This patch retains the bdev level mutex and counter to keep the
"feature" that we can freeze a block device that does not have a
filesystem mounted yet.

Cc: Christoph Hellwig <hch@xxxxxxxxxxxxx>
Cc: Jan Kara <jack@xxxxxxx>
Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx>
Signed-off-by: Fernando Luis Vazquez Cao <fernando@xxxxxxxxxxxxx>
---

diff -urNp vfs-orig/fs/block_dev.c vfs/fs/block_dev.c

--- vfs-orig/fs/block_dev.c	2012-07-12 14:31:38.936631141 +0900
+++ vfs/fs/block_dev.c	2012-07-12 15:03:57.032627014 +0900
@@ -257,16 +257,18 @@ int fsync_bdev(struct block_device *bdev
 EXPORT_SYMBOL(fsync_bdev);
 
 /**
- * freeze_bdev  --  lock a filesystem and force it into a consistent state
+ * freeze_bdev  --  lock a block device
  * @bdev:	blockdevice to lock
  *
- * If a superblock is found on this device, we take the s_umount semaphore
- * on it to make sure nobody unmounts until the snapshot creation is done.
- * The reference counter (bd_fsfreeze_count) guarantees that only the last
- * unfreeze process can unfreeze the frozen filesystem actually when multiple
- * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
- * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
- * actually.
+ * Locks the block device and, if present, the associated filesystem too.
+ *
+ * The reference counter (bd_fsfreeze_count) is used to implement the feature
+ * that allows one to freeze a block device that does not have a filesystem
+ * mounted yet. For filesystems using mount_bdev the kernel takes care of
+ * things by preventing the mount operation from succeeding if the underlying
+ * block device is frozen. Other filesystems should check this counter or risk
+ * a situation where a freeze_bdev user (e.g. dm snapshot) and mount race,
+ * which may lead to inconsistencies.
  */
 struct super_block *freeze_bdev(struct block_device *bdev)
 {
@@ -274,17 +276,7 @@ struct super_block *freeze_bdev(struct b
 	int error = 0;
 
 	mutex_lock(&bdev->bd_fsfreeze_mutex);
-	if (++bdev->bd_fsfreeze_count > 1) {
-		/*
-		 * We don't even need to grab a reference - the first call
-		 * to freeze_bdev grab an active reference and only the last
-		 * thaw_bdev drops it.
-		 */
-		sb = get_super(bdev);
-		drop_super(sb);
-		mutex_unlock(&bdev->bd_fsfreeze_mutex);
-		return sb;
-	}
+	bdev->bd_fsfreeze_count++;
 
 	sb = get_active_super(bdev);
 	if (!sb)
@@ -297,30 +289,33 @@ struct super_block *freeze_bdev(struct b
 		return ERR_PTR(error);
 	}
 	deactivate_super(sb);
- out:
+out:
 	sync_blockdev(bdev);
 	mutex_unlock(&bdev->bd_fsfreeze_mutex);
-	return sb;	/* thaw_bdev releases s->s_umount */
+	return sb;
 }
 EXPORT_SYMBOL(freeze_bdev);
 
 /**
- * __thaw_bdev  -- unlock filesystem
+ * __thaw_bdev  -- unlock a block device
  * @bdev:	blockdevice to unlock
  * @sb:		associated superblock
  * @emergency:	emergency thaw
  *
- * Unlocks the filesystem and marks it writeable again after freeze_bdev().
+ * Unlocks the block device and, if present, the associated filesystem too.
  */
 static int __thaw_bdev(struct block_device *bdev, struct super_block *sb, int emergency)
 {
 	int error = -EINVAL;
 
 	mutex_lock(&bdev->bd_fsfreeze_mutex);
+
 	if (!bdev->bd_fsfreeze_count)
 		goto out;
 
-	if (--bdev->bd_fsfreeze_count > 0 || !sb) {
+	bdev->bd_fsfreeze_count--;
+
+	if (!sb) {
 		error = 0;
 		goto out;
 	}
@@ -336,13 +331,6 @@ out:
 	return error;
 }
 
-/**
- * thaw_bdev  -- unlock filesystem
- * @bdev:	blockdevice to unlock
- * @sb:		associated superblock
- *
- * Unlocks the filesystem and marks it writeable again after freeze_bdev().
- */
 int thaw_bdev(struct block_device *bdev, struct super_block *sb)
 {
 	return __thaw_bdev(bdev, sb, 0);
diff -urNp vfs-orig/fs/gfs2/ops_fstype.c vfs/fs/gfs2/ops_fstype.c
--- vfs-orig/fs/gfs2/ops_fstype.c	2012-07-04 18:57:54.000000000 +0900
+++ vfs/fs/gfs2/ops_fstype.c	2012-07-12 15:04:37.924628170 +0900
@@ -1275,11 +1275,6 @@ static struct dentry *gfs2_mount(struct
 	if (IS_ERR(bdev))
 		return ERR_CAST(bdev);
 
-	/*
-	 * once the super is inserted into the list by sget, s_umount
-	 * will protect the lockfs code from trying to start a snapshot
-	 * while we are mounting
-	 */
 	mutex_lock(&bdev->bd_fsfreeze_mutex);
 	if (bdev->bd_fsfreeze_count > 0) {
 		mutex_unlock(&bdev->bd_fsfreeze_mutex);
diff -urNp vfs-orig/fs/nilfs2/super.c vfs/fs/nilfs2/super.c
--- vfs-orig/fs/nilfs2/super.c	2012-07-04 18:57:54.000000000 +0900
+++ vfs/fs/nilfs2/super.c	2012-07-12 15:04:55.112642078 +0900
@@ -1277,11 +1277,6 @@ nilfs_mount(struct file_system_type *fs_
 		goto failed;
 	}
 
-	/*
-	 * once the super is inserted into the list by sget, s_umount
-	 * will protect the lockfs code from trying to start a snapshot
-	 * while we are mounting
-	 */
 	mutex_lock(&sd.bdev->bd_fsfreeze_mutex);
 	if (sd.bdev->bd_fsfreeze_count > 0) {
 		mutex_unlock(&sd.bdev->bd_fsfreeze_mutex);
diff -urNp vfs-orig/fs/super.c vfs/fs/super.c
--- vfs-orig/fs/super.c	2012-07-12 14:31:38.964628483 +0900
+++ vfs/fs/super.c	2012-07-12 16:50:27.172627639 +0900
@@ -187,6 +187,8 @@ static struct super_block *alloc_super(s
 		s->s_shrink.seeks = DEFAULT_SEEKS;
 		s->s_shrink.shrink = prune_super;
 		s->s_shrink.batch = 1024;
+
+		mutex_init(&s->s_freeze_mutex);
 	}
 out:
 	return s;
@@ -984,11 +986,6 @@ struct dentry *mount_bdev(struct file_sy
 	if (IS_ERR(bdev))
 		return ERR_CAST(bdev);
 
-	/*
-	 * once the super is inserted into the list by sget, s_umount
-	 * will protect the lockfs code from trying to start a snapshot
-	 * while we are mounting
-	 */
 	mutex_lock(&bdev->bd_fsfreeze_mutex);
 	if (bdev->bd_fsfreeze_count > 0) {
 		mutex_unlock(&bdev->bd_fsfreeze_mutex);
@@ -1168,30 +1165,29 @@ out:
  * @sb: the super to lock
  *
  * Syncs the super to make sure the filesystem is consistent and calls the fs's
- * freeze_fs.  Subsequent calls to this without first thawing the fs will return
- * -EBUSY.
+ * freeze_fs.  The reference counter (s_freeze_count) guarantees that only the
+ * last unfreeze process can unfreeze the frozen filesystem actually when
+ * multiple freeze requests arrive simultaneously. It counts up in
+ * freeze_super() and counts down in thaw_super(). When it becomes 0,
+ * thaw_super() will execute the unfreeze.
  */
 int freeze_super(struct super_block *sb)
 {
-	int ret;
+	int ret = 0;
 
 	atomic_inc(&sb->s_active);
 	down_write(&sb->s_umount);
-	if (sb->s_frozen) {
-		deactivate_locked_super(sb);
-		return -EBUSY;
-	}
+	mutex_lock(&sb->s_freeze_mutex);
+	if (++sb->s_freeze_count > 1)
+		goto out_deactivate;
 
-	if (!(sb->s_flags & MS_BORN)) {
-		up_write(&sb->s_umount);
-		return 0;	/* sic - it's "nothing to do" */
-	}
+	if (!(sb->s_flags & MS_BORN))
+		goto out_active;	/* sic - it's "nothing to do" */
 
 	if (sb->s_flags & MS_RDONLY) {
 		sb->s_frozen = SB_FREEZE_TRANS;
 		smp_wmb();
-		up_write(&sb->s_umount);
-		return 0;
+		goto out_active;
 	}
 
 	sb->s_frozen = SB_FREEZE_WRITE;
@@ -1206,17 +1202,24 @@ int freeze_super(struct super_block *sb)
 	if (sb->s_op->freeze_fs) {
 		ret = sb->s_op->freeze_fs(sb);
 		if (ret) {
+			sb->s_freeze_count--;
 			printk(KERN_ERR
 				"VFS:Filesystem freeze failed\n");
 			sb->s_frozen = SB_UNFROZEN;
 			smp_wmb();
 			wake_up(&sb->s_wait_unfrozen);
-			deactivate_locked_super(sb);
-			return ret;
+			goto out_deactivate;
 		}
 	}
+
+out_active:
 	up_write(&sb->s_umount);
-	return 0;
+out_unlock:
+	mutex_unlock(&sb->s_freeze_mutex);
+	return ret;
+out_deactivate:
+	deactivate_locked_super(sb);
+	goto out_unlock;
 }
 EXPORT_SYMBOL(freeze_super);
 
@@ -1226,6 +1229,10 @@ EXPORT_SYMBOL(freeze_super);
  * @emergency:	emergency thaw
  *
  * Unlocks the filesystem and marks it writeable again after freeze_super().
+ * Returns -EINVAL if @sb is not frozen, 0 if the filesystem specific unfreeze
+ * function was executed and succeeded or the corresponding error code
+ * otherwise. If the unfreeze fails, @sb is left in the frozen state.
+ *
  * This is the unlocked version of thaw_super and has to be called with the
  * sb->s_umount lock held in the non-emergency thaw case.
  */
@@ -1233,29 +1240,33 @@ static int __thaw_super(struct super_blo
 {
 	int error = 0;
 
-	if (sb->s_frozen == SB_UNFROZEN) {
+	mutex_lock(&sb->s_freeze_mutex);
+	if (!sb->s_freeze_count) {
 		error = -EINVAL;
-		goto out;
+		goto out_unlock;
 	}
+	sb->s_freeze_count = emergency ? 1 : sb->s_freeze_count;
+
+	if (--sb->s_freeze_count > 0)
+		goto out_unlock;
 
 	if (sb->s_flags & MS_RDONLY)
-		goto out_thaw;
+		goto out_unfreeze;
 
 	if (sb->s_op->unfreeze_fs) {
 		error = sb->s_op->unfreeze_fs(sb);
 		if (error) {
 			printk(KERN_ERR
 				"VFS:Filesystem thaw failed\n");
+			sb->s_freeze_count++;
 			sb->s_frozen = SB_FREEZE_TRANS;
-			goto out;
+			goto out_unlock;
 		}
 	}
-
-out_thaw:
+out_unfreeze:
 	sb->s_frozen = SB_UNFROZEN;
 	smp_wmb();
 	wake_up(&sb->s_wait_unfrozen);
-
 	/*
 	 * When called from emergency scope, we cannot grab the s_umount lock
 	 * so we cannot deactivate the superblock. This may leave unbalanced
@@ -1264,7 +1275,8 @@ out_thaw:
 	 */
 	if (!emergency)
 		deactivate_locked_super(sb);
-out:
+out_unlock:
+	mutex_unlock(&sb->s_freeze_mutex);
 	return error;
 }
 
diff -urNp vfs-orig/include/linux/fs.h vfs/include/linux/fs.h
--- vfs-orig/include/linux/fs.h	2012-07-12 14:31:39.008626843 +0900
+++ vfs/include/linux/fs.h	2012-07-12 15:58:45.692627465 +0900
@@ -1542,6 +1542,9 @@ struct super_block {
 
 	/* Being remounted read-only */
 	int s_readonly_remount;
+
+	int			s_freeze_count; /* nr of nested freezes */
+	struct mutex		s_freeze_mutex; /* nesting lock */
 };
 
 /* superblock cache pruning functions */


--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html