On 2022/01/17 17:15, Christoph Hellwig wrote: > On Sat, Jan 15, 2022 at 09:34:10AM +0900, Tetsuo Handa wrote: >> Christoph is not a fan of proliferating the use of task_work_add(). Can we go with exporting task_work_add() > > Not a fan != NAK. If we can't think of anything better we'll have to do > that. Note that I also have a task_work_add API cleanup pending that makes > it a lot less ugly. > >> for this release cycle? Or instead can we go with providing release() callback without disk->open_mutex held >> ( https://lkml.kernel.org/r/08d703d1-8b32-ec9b-2b50-54b8376d3d40@xxxxxxxxxxxxxxxxxxx ) ? > > This one OTOH is a hard NAK as this is an API that will just cause a lot > of problems. What problems can you think of with [PATCH 1/4] below? I found that patches below are robuster than task_work_add() approach because the loop module does not need to know about refcounts which the core block layer manipulates. If we go with task_work_add() approach, the loop module needs to be updated in sync with refcount manipulations in the core block layer. Subject: [PATCH 1/4] block: add post_open/post_release block device callbacks Commit 322c4293ecc58110 ("loop: make autoclear operation asynchronous") silenced a circular locking dependency warning, but caused a breakage for /bin/mount and /bin/umount users. Further analysis by Jan revealed that waiting for I/O completion with disk->open_mutex held has possibility of deadlock. We need to fix this breakage, without waiting for I/O completion with disk->open_mutex held. We can't temporarily drop disk->open_mutex inside open and release block device callbacks. We could export task_work_add() for the loop module, but Christoph is not a fan of proliferating the use of task_work_add(). Therefore, add post_open and post_release block device callbacks which allows performing an extra operation without holding disk->open_mutex after returning from open and release block device callbacks respectively. Cc: Jan Kara <jack@xxxxxxx> Cc: Christoph Hellwig <hch@xxxxxx> Signed-off-by: Tetsuo Handa <penguin-kernel@xxxxxxxxxxxxxxxxxxx> --- block/bdev.c | 30 +++++++++++++++++++++--------- include/linux/blkdev.h | 6 ++++++ 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index 8bf93a19041b..efde8ffd0df6 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -683,15 +683,16 @@ static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) if (test_bit(GD_NEED_PART_SCAN, &disk->state)) bdev_disk_changed(disk, false); bdev->bd_openers++; - return 0;; + return 0; } -static void blkdev_put_whole(struct block_device *bdev, fmode_t mode) +static bool blkdev_put_whole(struct block_device *bdev, fmode_t mode) { if (!--bdev->bd_openers) blkdev_flush_mapping(bdev); if (bdev->bd_disk->fops->release) bdev->bd_disk->fops->release(bdev->bd_disk, mode); + return true; } static int blkdev_get_part(struct block_device *part, fmode_t mode) @@ -721,15 +722,15 @@ static int blkdev_get_part(struct block_device *part, fmode_t mode) return ret; } -static void blkdev_put_part(struct block_device *part, fmode_t mode) +static bool blkdev_put_part(struct block_device *part, fmode_t mode) { struct block_device *whole = bdev_whole(part); if (--part->bd_openers) - return; + return false; blkdev_flush_mapping(part); whole->bd_disk->open_partitions--; - blkdev_put_whole(whole, mode); + return blkdev_put_whole(whole, mode); } struct block_device *blkdev_get_no_open(dev_t dev) @@ -784,6 +785,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) bool unblock_events = true; struct block_device *bdev; struct gendisk *disk; + bool release_called = false; int ret; ret = devcgroup_check_permission(DEVCG_DEV_BLOCK, @@ -812,10 +814,13 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) goto abort_claiming; if (!try_module_get(disk->fops->owner)) goto abort_claiming; - if (bdev_is_partition(bdev)) + if (bdev_is_partition(bdev)) { ret = blkdev_get_part(bdev, mode); - else + if (ret) + release_called = true; + } else { ret = blkdev_get_whole(bdev, mode); + } if (ret) goto put_module; if (mode & FMODE_EXCL) { @@ -835,6 +840,8 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) } } mutex_unlock(&disk->open_mutex); + if (bdev->bd_disk->fops->post_open) + bdev->bd_disk->fops->post_open(bdev->bd_disk); if (unblock_events) disk_unblock_events(disk); @@ -845,6 +852,8 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) if (mode & FMODE_EXCL) bd_abort_claiming(bdev, holder); mutex_unlock(&disk->open_mutex); + if (release_called && bdev->bd_disk->fops->post_release) + bdev->bd_disk->fops->post_release(bdev->bd_disk); disk_unblock_events(disk); put_blkdev: blkdev_put_no_open(bdev); @@ -893,6 +902,7 @@ EXPORT_SYMBOL(blkdev_get_by_path); void blkdev_put(struct block_device *bdev, fmode_t mode) { struct gendisk *disk = bdev->bd_disk; + bool release_called; /* * Sync early if it looks like we're the last one. If someone else @@ -944,10 +954,12 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE); if (bdev_is_partition(bdev)) - blkdev_put_part(bdev, mode); + release_called = blkdev_put_part(bdev, mode); else - blkdev_put_whole(bdev, mode); + release_called = blkdev_put_whole(bdev, mode); mutex_unlock(&disk->open_mutex); + if (release_called && bdev->bd_disk->fops->post_release) + bdev->bd_disk->fops->post_release(bdev->bd_disk); module_put(disk->fops->owner); blkdev_put_no_open(bdev); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9c95df26fc26..f35e92fd72d0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1227,6 +1227,12 @@ struct block_device_operations { * driver. */ int (*alternative_gpt_sector)(struct gendisk *disk, sector_t *sector); + /* + * Special callback for doing extra operations without + * disk->open_mutex held. Used by loop driver. + */ + void (*post_open)(struct gendisk *disk); + void (*post_release)(struct gendisk *disk); }; #ifdef CONFIG_COMPAT -- 2.32.0 Subject: [PATCH 2/4] loop: don't hold lo->lo_mutex from lo_open() Waiting for I/O completion with disk->open_mutex held has possibility of deadlock. Since disk->open_mutex => lo->lo_mutex dependency is recorded by lo_open(), and blk_mq_freeze_queue() by e.g. loop_set_status() waits for I/O completion with lo->lo_mutex held, from locking dependency chain perspective waiting for I/O completion with disk->open_mutex held still remains. Introduce loop_delete_spinlock dedicated for protecting lo->lo_state versus lo->lo_refcnt race in lo_open() and loop_remove_control(). Cc: Jan Kara <jack@xxxxxxx> Cc: Christoph Hellwig <hch@xxxxxx> Signed-off-by: Tetsuo Handa <penguin-kernel@xxxxxxxxxxxxxxxxxxx> --- drivers/block/loop.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index b1b05c45c07c..9b9f821d1ea7 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -89,6 +89,7 @@ static DEFINE_IDR(loop_index_idr); static DEFINE_MUTEX(loop_ctl_mutex); static DEFINE_MUTEX(loop_validate_mutex); +static DEFINE_SPINLOCK(loop_delete_spinlock); /** * loop_global_lock_killable() - take locks for safe loop_validate_file() test @@ -1724,16 +1725,15 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode, static int lo_open(struct block_device *bdev, fmode_t mode) { struct loop_device *lo = bdev->bd_disk->private_data; - int err; + int err = 0; - err = mutex_lock_killable(&lo->lo_mutex); - if (err) - return err; - if (lo->lo_state == Lo_deleting) + spin_lock(&loop_delete_spinlock); + /* lo->lo_state may be changed to any Lo_* but Lo_deleting. */ + if (data_race(lo->lo_state) == Lo_deleting) err = -ENXIO; else atomic_inc(&lo->lo_refcnt); - mutex_unlock(&lo->lo_mutex); + spin_unlock(&loop_delete_spinlock); return err; } @@ -2119,19 +2119,18 @@ static int loop_control_remove(int idx) ret = mutex_lock_killable(&lo->lo_mutex); if (ret) goto mark_visible; - if (lo->lo_state != Lo_unbound || - atomic_read(&lo->lo_refcnt) > 0) { - mutex_unlock(&lo->lo_mutex); + spin_lock(&loop_delete_spinlock); + /* Mark this loop device no longer open()-able if nobody is using. */ + if (lo->lo_state != Lo_unbound || atomic_read(&lo->lo_refcnt) > 0) ret = -EBUSY; - goto mark_visible; - } - /* Mark this loop device no longer open()-able. */ - lo->lo_state = Lo_deleting; + else + lo->lo_state = Lo_deleting; + spin_unlock(&loop_delete_spinlock); mutex_unlock(&lo->lo_mutex); - - loop_remove(lo); - return 0; - + if (!ret) { + loop_remove(lo); + return 0; + } mark_visible: /* Show this loop device again. */ mutex_lock(&loop_ctl_mutex); -- 2.32.0 Subject: [PATCH 3/4] loop: move lo_release to lo_post_release The kernel test robot is reporting that xfstest which does umount ext2 on xfs umount xfs sequence started failing, for commit 322c4293ecc58110 ("loop: make autoclear operation asynchronous") removed a guarantee that fput() of backing file is processed before lo_release() from close() returns to user mode. To fix this breakage while killing disk->open_mutex => lo->lo_mutex dependency, defer whole lo_release() to lo_post_release() and make autoclear operation synchronous again. Reported-by: kernel test robot <oliver.sang@xxxxxxxxx> Cc: Jan Kara <jack@xxxxxxx> Cc: Christoph Hellwig <hch@xxxxxx> Signed-off-by: Tetsuo Handa <penguin-kernel@xxxxxxxxxxxxxxxxxxx> --- drivers/block/loop.c | 60 +++++++++++--------------------------------- drivers/block/loop.h | 1 - 2 files changed, 15 insertions(+), 46 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 9b9f821d1ea7..84a3889037d7 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1166,40 +1166,12 @@ static void __loop_clr_fd(struct loop_device *lo) lo->lo_disk->flags |= GENHD_FL_NO_PART; fput(filp); -} - -static void loop_rundown_completed(struct loop_device *lo) -{ mutex_lock(&lo->lo_mutex); lo->lo_state = Lo_unbound; mutex_unlock(&lo->lo_mutex); module_put(THIS_MODULE); } -static void loop_rundown_workfn(struct work_struct *work) -{ - struct loop_device *lo = container_of(work, struct loop_device, - rundown_work); - struct block_device *bdev = lo->lo_device; - struct gendisk *disk = lo->lo_disk; - - __loop_clr_fd(lo); - kobject_put(&bdev->bd_device.kobj); - module_put(disk->fops->owner); - loop_rundown_completed(lo); -} - -static void loop_schedule_rundown(struct loop_device *lo) -{ - struct block_device *bdev = lo->lo_device; - struct gendisk *disk = lo->lo_disk; - - __module_get(disk->fops->owner); - kobject_get(&bdev->bd_device.kobj); - INIT_WORK(&lo->rundown_work, loop_rundown_workfn); - queue_work(system_long_wq, &lo->rundown_work); -} - static int loop_clr_fd(struct loop_device *lo) { int err; @@ -1230,7 +1202,6 @@ static int loop_clr_fd(struct loop_device *lo) mutex_unlock(&lo->lo_mutex); __loop_clr_fd(lo); - loop_rundown_completed(lo); return 0; } @@ -1737,30 +1708,29 @@ static int lo_open(struct block_device *bdev, fmode_t mode) return err; } -static void lo_release(struct gendisk *disk, fmode_t mode) +static void lo_post_release(struct gendisk *disk) { struct loop_device *lo = disk->private_data; mutex_lock(&lo->lo_mutex); - if (atomic_dec_return(&lo->lo_refcnt)) - goto out_unlock; + /* Check whether this loop device can be cleared. */ + if (atomic_dec_return(&lo->lo_refcnt) || lo->lo_state != Lo_bound) + goto out_unlock; + /* + * Clear this loop device since nobody is using. Note that since lo_open() + * increments lo->lo_refcnt without holding lo->lo_mutex, I might become + * no longer the last user, but there is a fact that there was no user. + * + * In autoclear mode, destroy WQ and remove configuration. + * Otherwise flush possible ongoing bios in WQ and keep configuration. + */ if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) { - if (lo->lo_state != Lo_bound) - goto out_unlock; lo->lo_state = Lo_rundown; mutex_unlock(&lo->lo_mutex); - /* - * In autoclear mode, stop the loop thread - * and remove configuration after last close. - */ - loop_schedule_rundown(lo); + __loop_clr_fd(lo); return; - } else if (lo->lo_state == Lo_bound) { - /* - * Otherwise keep thread (if running) and config, - * but flush possible ongoing bios in thread. - */ + } else { blk_mq_freeze_queue(lo->lo_queue); blk_mq_unfreeze_queue(lo->lo_queue); } @@ -1772,7 +1742,7 @@ static void lo_release(struct gendisk *disk, fmode_t mode) static const struct block_device_operations lo_fops = { .owner = THIS_MODULE, .open = lo_open, - .release = lo_release, + .post_release = lo_post_release, .ioctl = lo_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = lo_compat_ioctl, diff --git a/drivers/block/loop.h b/drivers/block/loop.h index 918a7a2dc025..082d4b6bfc6a 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -56,7 +56,6 @@ struct loop_device { struct gendisk *lo_disk; struct mutex lo_mutex; bool idr_visible; - struct work_struct rundown_work; }; struct loop_cmd { -- 2.32.0 Subject: [PATCH 4/4] loop: wait for __loop_clr_fd() to complete upon lo_open() Since lo_open() and lo_release() were previously serialized via disk->open_mutex, new file descriptors returned by open() never reached a loop device in Lo_rundown state unless ioctl(LOOP_CLR_FD) was inside __loop_clr_fd(). But now that since lo_open() and lo_release() no longer hold lo->lo_mutex in order to kill disk->open_mutex => lo->lo_mutex depencency, new file descriptors returned by open() can easily reach a loop device in Lo_rundown state. For example, Jan Stancek is reporting that LTP tests which do mount/umount in close succession like # for i in `seq 1 2`;do mount -t iso9660 -o loop /root/isofs.iso /mnt/isofs; umount /mnt/isofs/; done started failing. Since commit 322c4293ecc58110 ("loop: make autoclear operation asynchronous") removed disk->open_mutex() serialization, /bin/mount started to fail when __loop_clr_fd() was called from WQ context after lo_open() returned [1]. /bin/mount needs to be updated to check ioctl(LOOP_GET_STATUS) after open() in order to confirm that lo->lo_state remains Lo_bound. But we need some migration period for allowing users to update their util-linux package. Thus, meantime emulate serialization between lo_open() and lo_release() without using disk->open_mutex. Link: https://lkml.kernel.org/r/20220113154735.hdzi4cqsz5jt6asp@xxxxxxxxxx [1] Reported-by: Jan Stancek <jstancek@xxxxxxxxxx> Analyzed-by: Jan Kara <jack@xxxxxxx> Cc: Christoph Hellwig <hch@xxxxxx> Signed-off-by: Tetsuo Handa <penguin-kernel@xxxxxxxxxxxxxxxxxxx> --- drivers/block/loop.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 84a3889037d7..cd19af969209 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -90,6 +90,7 @@ static DEFINE_IDR(loop_index_idr); static DEFINE_MUTEX(loop_ctl_mutex); static DEFINE_MUTEX(loop_validate_mutex); static DEFINE_SPINLOCK(loop_delete_spinlock); +static DECLARE_WAIT_QUEUE_HEAD(loop_rundown_wait); /** * loop_global_lock_killable() - take locks for safe loop_validate_file() test @@ -1169,6 +1170,7 @@ static void __loop_clr_fd(struct loop_device *lo) mutex_lock(&lo->lo_mutex); lo->lo_state = Lo_unbound; mutex_unlock(&lo->lo_mutex); + wake_up_all(&loop_rundown_wait); module_put(THIS_MODULE); } @@ -1708,6 +1710,18 @@ static int lo_open(struct block_device *bdev, fmode_t mode) return err; } +static void lo_post_open(struct gendisk *disk) +{ + struct loop_device *lo = disk->private_data; + + /* Wait for lo_post_release() to leave lo->lo_mutex section. */ + if (mutex_lock_killable(&lo->lo_mutex)) + return; + mutex_unlock(&lo->lo_mutex); + /* Also wait for __loop_clr_fd() to complete if Lo_rundown was set. */ + wait_event_killable(loop_rundown_wait, data_race(lo->lo_state) != Lo_rundown); +} + static void lo_post_release(struct gendisk *disk) { struct loop_device *lo = disk->private_data; @@ -1742,6 +1756,7 @@ static void lo_post_release(struct gendisk *disk) static const struct block_device_operations lo_fops = { .owner = THIS_MODULE, .open = lo_open, + .post_open = lo_post_open, .post_release = lo_post_release, .ioctl = lo_ioctl, #ifdef CONFIG_COMPAT -- 2.32.0