On Fri 07-01-22 20:04:31, Tetsuo Handa wrote: > Jan Stancek is reporting that commit 322c4293ecc58110 ("loop: make > autoclear operation asynchronous") broke LTP tests which run /bin/mount > and /bin/umount in close succession like > > while :; do mount -o loop,ro isofs.iso isofs/; umount isofs/; done > > . This is because since /usr/lib/systemd/systemd-udevd asynchronously > opens the loop device which /bin/mount and /bin/umount are operating, > autoclear from lo_release() is more likely triggered by systemd-udevd than > by mount or umount. And unfortunately, /bin/mount fails if read of superblock > (for examining filesystem type) returned an error due to the backing file > being cleared by __loop_clr_fd(). It turned out that disk->open_mutex was > by chance serving as a barrier for serializing "__loop_clr_fd() from > lo_release()" and "vfs_read() after lo_open()", and we need to restore > this barrier (without reintroducing circular locking dependency). > > Also, the kernel test robot is reporting that that commit broke xfstest > which does > > umount ext2 on xfs > umount xfs > > sequence. > > One of the approaches for fixing these problems is to revert that commit and > instead remove destroy_workqueue() from __loop_clr_fd(), for it turned out > that we did not need to call flush_workqueue() from __loop_clr_fd() since > blk_mq_freeze_queue() blocks until all pending "struct work_struct" are > processed by loop_process_work(). But we are not sure whether it is safe > to wait blk_mq_freeze_queue() etc. with disk->open_mutex held; it could > be simply because dependency is not clear enough for fuzzers to cover and > lockdep to detect. > > Therefore, before taking the revert approach, let's try if we can use the task > work approach which is called without locks held while the caller can > wait for completion of that task work before returning to user mode. 
> > This patch tries to make lo_open()/lo_release() to locklessly wait for > __loop_clr_fd() by inserting a task work into lo_open()/lo_release() if > possible. > > Reported-by: kernel test robot <oliver.sang@xxxxxxxxx> > Reported-by: Jan Stancek <jstancek@xxxxxxxxxx> > Signed-off-by: Tetsuo Handa <penguin-kernel@xxxxxxxxxxxxxxxxxxx> Eek, I agree that the patch may improve the situation but is the complexity and ugliness really worth it? I mean using task work in loop_schedule_rundown() makes a lot of sense because the loop while :; do mount -o loop,ro isofs.iso isofs/; umount isofs/; done will not fail because of dangling work in the workqueue after umount -> __loop_clr_fd(). But when other processes like systemd-udevd start to mess with the loop device, then you have no control whether the following mount will see isofs.iso busy or not - it depends on when systemd-udevd decides to close the loop device. What your waiting in lo_open() achieves is only that if __loop_clr_fd() from systemd-udevd happens to run at the same time as lo_open() from mount, then we won't see EBUSY. But IMO that is not worth the complexity in lo_open() because if systemd-udevd happens to close the loop device a millisecond later, you will get EBUSY anyway (and you would get it even in the past). Or am I missing something? Honza > --- > Changes in v2: > Need to also wait on lo_open(), per Jan's testcase. 
> > drivers/block/loop.c | 70 ++++++++++++++++++++++++++++++++++++++++---- > drivers/block/loop.h | 5 +++- > 2 files changed, 68 insertions(+), 7 deletions(-) > > diff --git a/drivers/block/loop.c b/drivers/block/loop.c > index b1b05c45c07c..8ef6da186c5c 100644 > --- a/drivers/block/loop.c > +++ b/drivers/block/loop.c > @@ -89,6 +89,7 @@ > static DEFINE_IDR(loop_index_idr); > static DEFINE_MUTEX(loop_ctl_mutex); > static DEFINE_MUTEX(loop_validate_mutex); > +static DECLARE_WAIT_QUEUE_HEAD(loop_rundown_wait); > > /** > * loop_global_lock_killable() - take locks for safe loop_validate_file() test > @@ -1172,13 +1173,12 @@ static void loop_rundown_completed(struct loop_device *lo) > mutex_lock(&lo->lo_mutex); > lo->lo_state = Lo_unbound; > mutex_unlock(&lo->lo_mutex); > + wake_up_all(&loop_rundown_wait); > module_put(THIS_MODULE); > } > > -static void loop_rundown_workfn(struct work_struct *work) > +static void loop_rundown_start(struct loop_device *lo) > { > - struct loop_device *lo = container_of(work, struct loop_device, > - rundown_work); > struct block_device *bdev = lo->lo_device; > struct gendisk *disk = lo->lo_disk; > > @@ -1188,6 +1188,18 @@ static void loop_rundown_workfn(struct work_struct *work) > loop_rundown_completed(lo); > } > > +static void loop_rundown_callbackfn(struct callback_head *callback) > +{ > + loop_rundown_start(container_of(callback, struct loop_device, > + rundown.callback)); > +} > + > +static void loop_rundown_workfn(struct work_struct *work) > +{ > + loop_rundown_start(container_of(work, struct loop_device, > + rundown.work)); > +} > + > static void loop_schedule_rundown(struct loop_device *lo) > { > struct block_device *bdev = lo->lo_device; > @@ -1195,8 +1207,13 @@ static void loop_schedule_rundown(struct loop_device *lo) > > __module_get(disk->fops->owner); > kobject_get(&bdev->bd_device.kobj); > - INIT_WORK(&lo->rundown_work, loop_rundown_workfn); > - queue_work(system_long_wq, &lo->rundown_work); > + if (!(current->flags & 
PF_KTHREAD)) { > + init_task_work(&lo->rundown.callback, loop_rundown_callbackfn); > + if (!task_work_add(current, &lo->rundown.callback, TWA_RESUME)) > + return; > + } > + INIT_WORK(&lo->rundown.work, loop_rundown_workfn); > + queue_work(system_long_wq, &lo->rundown.work); > } > > static int loop_clr_fd(struct loop_device *lo) > @@ -1721,19 +1738,60 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode, > } > #endif > > +struct loop_rundown_waiter { > + struct callback_head callback; > + struct loop_device *lo; > +}; > + > +static void loop_rundown_waiter_callbackfn(struct callback_head *callback) > +{ > + struct loop_rundown_waiter *lrw = > + container_of(callback, struct loop_rundown_waiter, callback); > + > + /* > + * Locklessly wait for completion of __loop_clr_fd(). > + * This should be safe because of the following rules. > + * > + * (a) From locking dependency perspective, this function is called > + * without any locks held. > + * (b) From execution ordering perspective, this function is called > + * by the moment lo_open() from open() syscall returns to user > + * mode. > + * (c) From use-after-free protection perspective, this function is > + * called before loop_remove() is called, for lo->lo_refcnt taken > + * by lo_open() prevents loop_control_remove() and loop_exit(). 
> + */ > + wait_event_killable(loop_rundown_wait, data_race(lrw->lo->lo_state) != Lo_rundown); > + kfree(lrw); > +} > + > static int lo_open(struct block_device *bdev, fmode_t mode) > { > struct loop_device *lo = bdev->bd_disk->private_data; > + struct loop_rundown_waiter *lrw = > + kmalloc(sizeof(*lrw), GFP_KERNEL | __GFP_NOWARN); > int err; > > + if (!lrw) > + return -ENOMEM; > err = mutex_lock_killable(&lo->lo_mutex); > - if (err) > + if (err) { > + kfree(lrw); > return err; > + } > if (lo->lo_state == Lo_deleting) > err = -ENXIO; > else > atomic_inc(&lo->lo_refcnt); > mutex_unlock(&lo->lo_mutex); > + if (!err && !(current->flags & PF_KTHREAD)) { > + init_task_work(&lrw->callback, loop_rundown_waiter_callbackfn); > + lrw->lo = lo; > + if (task_work_add(current, &lrw->callback, TWA_RESUME)) > + kfree(lrw); > + } else { > + kfree(lrw); > + } > return err; > } > > diff --git a/drivers/block/loop.h b/drivers/block/loop.h > index 918a7a2dc025..596472f9cde3 100644 > --- a/drivers/block/loop.h > +++ b/drivers/block/loop.h > @@ -56,7 +56,10 @@ struct loop_device { > struct gendisk *lo_disk; > struct mutex lo_mutex; > bool idr_visible; > - struct work_struct rundown_work; > + union { > + struct work_struct work; > + struct callback_head callback; > + } rundown; > }; > > struct loop_cmd { > -- > 2.32.0 > > -- Jan Kara <jack@xxxxxxxx> SUSE Labs, CR