[PATCH v2] loop: don't hold lo->lo_mutex from lo_open() and lo_release()

syzbot is reporting a circular locking problem at __loop_clr_fd() [1],
because commit 87579e9b7d8dc36e ("loop: use worker per cgroup instead of
kworker") started calling destroy_workqueue() with disk->open_mutex held.

Commit 322c4293ecc58110 ("loop: make autoclear operation asynchronous")
tried to fix this problem by deferring __loop_clr_fd() from lo_release()
to a WQ context, but that commit was reverted because some userspace
programs depend on __loop_clr_fd() having completed by the time close()
returns to user mode.

This time, we try to fix this problem by deferring __loop_clr_fd() from
lo_release() to a task work context. Since task_work_add() is not
exported and Christoph does not want to export it, this patch uses the
anonymous inode interface as a wrapper for calling task_work_add() via
fput().
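
For reference, below is a minimal sketch (not part of this patch; the
demo_* names are made up) of the trick this patch relies on: when the
final fput() on an anonymous inode file happens in process context, the
cleanup is deferred via task_work_add(), so the file's ->release()
callback runs right before the current syscall returns to user mode.

	#include <linux/anon_inodes.h>
	#include <linux/file.h>
	#include <linux/fs.h>

	static int demo_no_open(struct inode *inode, struct file *file)
	{
		/* The file is never installed into an fd table. */
		return -ENXIO;
	}

	static int demo_deferred(struct inode *inode, struct file *file)
	{
		/* Runs from task work context when the last fput() came
		   from process context. */
		pr_info("deferred work for %p\n", file->private_data);
		return 0;
	}

	static const struct file_operations demo_fops = {
		.open    = demo_no_open,
		.release = demo_deferred,
	};

	static int demo_schedule_task_work(void *data)
	{
		struct file *f = anon_inode_getfile("loop.demo", &demo_fops,
						    data, O_CLOEXEC);

		if (IS_ERR(f))
			return PTR_ERR(f);
		/*
		 * Dropping the last reference from process context queues the
		 * cleanup via task_work_add(), so demo_deferred() runs before
		 * the current syscall returns to user mode rather than in a WQ.
		 */
		fput(f);
		return 0;
	}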

Although Jan found two bugs in /bin/mount, one of which has been fixed
in the util-linux package [2], I found that not holding lo->lo_mutex from
lo_open() causes occasional I/O errors [3] (due to the

	if (lo->lo_state != Lo_bound)
		return BLK_STS_IOERR;

check in loop_queue_rq()) when systemd-udevd opens a loop device and
reads from it before loop_configure() updates lo->lo_state to Lo_bound.

That is, we not only need to wait for __loop_clr_fd() upon the close()
event (in order to avoid unexpected umount() failures) but also need to
wait for lo->lo_mutex upon the open() event (in order to avoid unexpected
I/O errors). Therefore, schedule a task work from both lo_open() and
lo_release().
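
Concretely, the open-side task work only needs to wait; condensed from
lo_post_open() in the diff below (all names are from this patch):

	/* Wait for loop_configure()/lo_post_release() to complete. */
	if (mutex_lock_killable(&lo->lo_mutex) == 0)
		mutex_unlock(&lo->lo_mutex);
	/* Also wait for a __loop_clr_fd() still running after lo_mutex was dropped. */
	wait_event_killable(loop_rundown_wait,
			    data_race(lo->lo_state) != Lo_rundown);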

Link: https://syzkaller.appspot.com/bug?extid=6479585dfd4dedd3f7e1 [1]
Link: https://github.com/util-linux/util-linux/commit/3e1fc3bbee99384c [2]
Link: https://lkml.kernel.org/r/a72c59c6-298b-e4ba-b1f5-2275afab49a1@xxxxxxxxxxxxxxxxxxx [3]
Reported-by: syzbot <syzbot+6479585dfd4dedd3f7e1@xxxxxxxxxxxxxxxxxxxxxxxxx>
Cc: Jan Kara <jack@xxxxxxx>
Cc: Christoph Hellwig <hch@xxxxxx>
Signed-off-by: Tetsuo Handa <penguin-kernel@xxxxxxxxxxxxxxxxxxx>
---
 drivers/block/loop.c | 244 +++++++++++++++++++++++++++++++------------
 drivers/block/loop.h |   1 +
 2 files changed, 180 insertions(+), 65 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 19fe19eaa50e..2f60356ad428 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -80,6 +80,7 @@
 #include <linux/blk-cgroup.h>
 #include <linux/sched/mm.h>
 #include <linux/statfs.h>
+#include <linux/anon_inodes.h>
 
 #include "loop.h"
 
@@ -90,6 +91,8 @@
 static DEFINE_IDR(loop_index_idr);
 static DEFINE_MUTEX(loop_ctl_mutex);
 static DEFINE_MUTEX(loop_validate_mutex);
+static DEFINE_SPINLOCK(loop_delete_spinlock);
+static DECLARE_WAIT_QUEUE_HEAD(loop_rundown_wait);
 
 /**
  * loop_global_lock_killable() - take locks for safe loop_validate_file() test
@@ -1088,7 +1091,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
 	return error;
 }
 
-static void __loop_clr_fd(struct loop_device *lo, bool release)
+static void __loop_clr_fd(struct loop_device *lo)
 {
 	struct file *filp;
 	gfp_t gfp = lo->old_gfp_mask;
@@ -1150,8 +1153,6 @@ static void __loop_clr_fd(struct loop_device *lo, bool release)
 	/* let user-space know about this change */
 	kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE);
 	mapping_set_gfp_mask(filp->f_mapping, gfp);
-	/* This is safe: open() is still holding a reference. */
-	module_put(THIS_MODULE);
 	blk_mq_unfreeze_queue(lo->lo_queue);
 
 	disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE);
@@ -1159,37 +1160,18 @@ static void __loop_clr_fd(struct loop_device *lo, bool release)
 	if (lo->lo_flags & LO_FLAGS_PARTSCAN) {
 		int err;
 
-		/*
-		 * open_mutex has been held already in release path, so don't
-		 * acquire it if this function is called in such case.
-		 *
-		 * If the reread partition isn't from release path, lo_refcnt
-		 * must be at least one and it can only become zero when the
-		 * current holder is released.
-		 */
-		if (!release)
-			mutex_lock(&lo->lo_disk->open_mutex);
+		mutex_lock(&lo->lo_disk->open_mutex);
 		err = bdev_disk_changed(lo->lo_disk, false);
-		if (!release)
-			mutex_unlock(&lo->lo_disk->open_mutex);
+		mutex_unlock(&lo->lo_disk->open_mutex);
 		if (err)
 			pr_warn("%s: partition scan of loop%d failed (rc=%d)\n",
 				__func__, lo->lo_number, err);
 		/* Device is gone, no point in returning error */
 	}
 
-	/*
-	 * lo->lo_state is set to Lo_unbound here after above partscan has
-	 * finished. There cannot be anybody else entering __loop_clr_fd() as
-	 * Lo_rundown state protects us from all the other places trying to
-	 * change the 'lo' device.
-	 */
 	lo->lo_flags = 0;
 	if (!part_shift)
 		lo->lo_disk->flags |= GENHD_FL_NO_PART;
-	mutex_lock(&lo->lo_mutex);
-	lo->lo_state = Lo_unbound;
-	mutex_unlock(&lo->lo_mutex);
 
 	/*
 	 * Need not hold lo_mutex to fput backing file. Calling fput holding
@@ -1197,6 +1179,11 @@ static void __loop_clr_fd(struct loop_device *lo, bool release)
 	 * fput can take open_mutex which is usually taken before lo_mutex.
 	 */
 	fput(filp);
+	mutex_lock(&lo->lo_mutex);
+	lo->lo_state = Lo_unbound;
+	mutex_unlock(&lo->lo_mutex);
+	wake_up_all(&loop_rundown_wait);
+	module_put(THIS_MODULE);
 }
 
 static int loop_clr_fd(struct loop_device *lo)
@@ -1228,7 +1215,7 @@ static int loop_clr_fd(struct loop_device *lo)
 	lo->lo_state = Lo_rundown;
 	mutex_unlock(&lo->lo_mutex);
 
-	__loop_clr_fd(lo, false);
+	__loop_clr_fd(lo);
 	return 0;
 }
 
@@ -1720,52 +1707,176 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode,
 }
 #endif
 
-static int lo_open(struct block_device *bdev, fmode_t mode)
+static int lo_no_open(struct inode *inode, struct file *file)
 {
-	struct loop_device *lo = bdev->bd_disk->private_data;
-	int err;
+	return -ENXIO;
+}
 
-	err = mutex_lock_killable(&lo->lo_mutex);
-	if (err)
-		return err;
-	if (lo->lo_state == Lo_deleting)
-		err = -ENXIO;
-	else
-		atomic_inc(&lo->lo_refcnt);
-	mutex_unlock(&lo->lo_mutex);
-	return err;
+static int lo_post_open(struct inode *inode, struct file *file)
+{
+	struct loop_device *lo = file->private_data;
+
+	/* Nothing to do if lo_open() failed. */
+	if (!lo)
+		return 0;
+	/* Wait for loop_configure()/lo_post_release() to complete. */
+	if (mutex_lock_killable(&lo->lo_mutex) == 0)
+		mutex_unlock(&lo->lo_mutex);
+	/*
+	 * Also wait for __loop_clr_fd() to complete because
+	 * lo_post_release() releases lo->lo_mutex before __loop_clr_fd()
+	 * is called.
+	 */
+	wait_event_killable(loop_rundown_wait, data_race(lo->lo_state) != Lo_rundown);
+	atomic_dec(&lo->async_pending);
+	return 0;
 }
 
-static void lo_release(struct gendisk *disk, fmode_t mode)
+static const struct file_operations loop_open_fops = {
+	.open = lo_no_open,
+	.release = lo_post_open,
+};
+
+static int lo_post_release(struct inode *inode, struct file *file)
 {
-	struct loop_device *lo = disk->private_data;
+	struct gendisk *disk = file->private_data;
+	struct loop_device *lo;
 
+	/* Nothing to do if lo_open() failed. */
+	if (!disk)
+		return 0;
+	lo = disk->private_data;
 	mutex_lock(&lo->lo_mutex);
-	if (atomic_dec_return(&lo->lo_refcnt))
+	/* Check whether this loop device can be cleared. */
+	if (atomic_dec_return(&lo->lo_refcnt) || lo->lo_state != Lo_bound)
 		goto out_unlock;
-
+	/*
+	 * Clear this loop device since nobody is using it. Note that since
+	 * lo_open() increments lo->lo_refcnt without holding lo->lo_mutex,
+	 * we might no longer be the last user by now, but at least there was
+	 * a moment with no users.
+	 *
+	 * In autoclear mode, destroy WQ and remove configuration.
+	 * Otherwise flush possible ongoing bios in WQ and keep configuration.
+	 */
 	if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) {
-		if (lo->lo_state != Lo_bound)
-			goto out_unlock;
 		lo->lo_state = Lo_rundown;
 		mutex_unlock(&lo->lo_mutex);
-		/*
-		 * In autoclear mode, stop the loop thread
-		 * and remove configuration after last close.
-		 */
-		__loop_clr_fd(lo, true);
-		return;
-	} else if (lo->lo_state == Lo_bound) {
-		/*
-		 * Otherwise keep thread (if running) and config,
-		 * but flush possible ongoing bios in thread.
-		 */
+		__loop_clr_fd(lo);
+		goto done;
+	} else {
 		blk_mq_freeze_queue(lo->lo_queue);
 		blk_mq_unfreeze_queue(lo->lo_queue);
 	}
 
 out_unlock:
 	mutex_unlock(&lo->lo_mutex);
+done:
+	/* Drop references which would have been dropped after lo_release(). */
+	kobject_put(&disk_to_dev(disk)->kobj);
+	module_put(disk->fops->owner);
+	atomic_dec(&lo->async_pending);
+	return 0;
+}
+
+static const struct file_operations loop_close_fops = {
+	.open = lo_no_open,
+	.release = lo_post_release,
+};
+
+struct release_event {
+	struct list_head head;
+	struct file *file;
+};
+
+static LIST_HEAD(release_event_spool);
+static DEFINE_SPINLOCK(release_event_spinlock);
+
+static int lo_open(struct block_device *bdev, fmode_t mode)
+{
+	struct loop_device *lo = bdev->bd_disk->private_data;
+	int err = 0;
+	/*
+	 * Reserve an anon_inode for calling lo_post_open() before returning to
+	 * user mode. Since fput() by kernel thread defers actual cleanup to WQ
+	 * context, there is no point with calling lo_post_open() from kernel
+	 * threads.
+	 */
+	struct file *open = (current->flags & PF_KTHREAD) ? NULL :
+		anon_inode_getfile("loop.open", &loop_open_fops, NULL, O_CLOEXEC);
+	/*
+	 * Reserve an anon_inode for calling lo_post_release() from
+	 * lo_release() before returning to user mode.
+	 */
+	struct file *close = anon_inode_getfile("loop.close", &loop_close_fops, NULL, O_CLOEXEC);
+	struct release_event *ev = kmalloc(sizeof(*ev), GFP_KERNEL | __GFP_NOWARN);
+
+	if (!ev || IS_ERR(open) || IS_ERR(close)) {
+		err = -ENOMEM;
+		goto cleanup;
+	}
+
+	spin_lock(&loop_delete_spinlock);
+	/* lo->lo_state may be changed to any Lo_* but Lo_deleting. */
+	if (data_race(lo->lo_state) == Lo_deleting)
+		err = -ENXIO;
+	else
+		atomic_inc(&lo->lo_refcnt);
+	spin_unlock(&loop_delete_spinlock);
+	if (err)
+		goto cleanup;
+
+	/* close->private_data is set when lo_release() is called. */
+	ev->file = close;
+	spin_lock(&release_event_spinlock);
+	list_add(&ev->head, &release_event_spool);
+	spin_unlock(&release_event_spinlock);
+
+	/*
+	 * Try to avoid accessing a loop device in Lo_rundown state.
+	 *
+	 * Since the task_work list is LIFO, lo_post_release() will run before
+	 * lo_post_open() does if an error occurs after lo_open() returned but
+	 * before returning to user mode. This means that lo->lo_refcnt may
+	 * already be 0 when lo_post_open() runs. Therefore, use
+	 * lo->async_pending in order to prevent loop_remove() from releasing
+	 * this loop device.
+	 */
+	if (open) {
+		open->private_data = lo;
+		atomic_inc(&lo->async_pending);
+		fput(open);
+	}
+	return 0;
+ cleanup:
+	if (!IS_ERR_OR_NULL(open))
+		fput(open);
+	if (!IS_ERR(close))
+		fput(close);
+	kfree(ev);
+	return err;
+}
+
+static void lo_release(struct gendisk *disk, fmode_t mode)
+{
+	struct loop_device *lo = disk->private_data;
+	struct release_event *ev;
+
+	atomic_inc(&lo->async_pending);
+	/*
+	 * Fetch from the spool. Since every successful lo_open() call is paired
+	 * with a lo_release() call, the spool is guaranteed not to be empty.
+	 */
+	spin_lock(&release_event_spinlock);
+	ev = list_first_entry(&release_event_spool, typeof(*ev), head);
+	list_del(&ev->head);
+	spin_unlock(&release_event_spinlock);
+	/* Hold references which will be dropped after lo_release(). */
+	__module_get(disk->fops->owner);
+	kobject_get(&disk_to_dev(disk)->kobj);
+	ev->file->private_data = disk;
+	fput(ev->file);
+	kfree(ev);
 }
 
 static const struct block_device_operations lo_fops = {
@@ -2029,6 +2140,7 @@ static int loop_add(int i)
 	if (!part_shift)
 		disk->flags |= GENHD_FL_NO_PART;
 	atomic_set(&lo->lo_refcnt, 0);
+	atomic_set(&lo->async_pending, 0);
 	mutex_init(&lo->lo_mutex);
 	lo->lo_number		= i;
 	spin_lock_init(&lo->lo_lock);
@@ -2070,6 +2182,9 @@ static int loop_add(int i)
 
 static void loop_remove(struct loop_device *lo)
 {
+	/* Wait for task work and/or WQ to complete. */
+	while (atomic_read(&lo->async_pending))
+		schedule_timeout_uninterruptible(1);
 	/* Make this loop device unreachable from pathname. */
 	del_gendisk(lo->lo_disk);
 	blk_cleanup_disk(lo->lo_disk);
@@ -2118,19 +2233,18 @@ static int loop_control_remove(int idx)
 	ret = mutex_lock_killable(&lo->lo_mutex);
 	if (ret)
 		goto mark_visible;
-	if (lo->lo_state != Lo_unbound ||
-	    atomic_read(&lo->lo_refcnt) > 0) {
-		mutex_unlock(&lo->lo_mutex);
+	spin_lock(&loop_delete_spinlock);
+	/* Mark this loop device no longer open()-able if nobody is using it. */
+	if (lo->lo_state != Lo_unbound || atomic_read(&lo->lo_refcnt) > 0)
 		ret = -EBUSY;
-		goto mark_visible;
-	}
-	/* Mark this loop device no longer open()-able. */
-	lo->lo_state = Lo_deleting;
+	else
+		lo->lo_state = Lo_deleting;
+	spin_unlock(&loop_delete_spinlock);
 	mutex_unlock(&lo->lo_mutex);
-
-	loop_remove(lo);
-	return 0;
-
+	if (!ret) {
+		loop_remove(lo);
+		return 0;
+	}
 mark_visible:
 	/* Show this loop device again. */
 	mutex_lock(&loop_ctl_mutex);
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 082d4b6bfc6a..20fc5eebe455 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -56,6 +56,7 @@ struct loop_device {
 	struct gendisk		*lo_disk;
 	struct mutex		lo_mutex;
 	bool			idr_visible;
+	atomic_t		async_pending;
 };
 
 struct loop_cmd {
-- 
2.32.0