Hi, On 04.08.2021 11:41, Christoph Hellwig wrote: > device mapper needs to register holders before it is ready to do I/O. > Currently it does so by registering the disk early, which can leave > the disk and queue in a weird half state where the queue is registered > with the disk, except for sysfs and the elevator. And this state has > been a bit promlematic before, and will get more so when sorting out > the responsibilities between the queue and the disk. > > Support registering holders on an initialized but not registered disk > instead by delaying the sysfs registration until the disk is registered. > > Signed-off-by: Christoph Hellwig <hch@xxxxxx> > Reviewed-by: Mike Snitzer <snitzer@xxxxxxxxxx> This patch landed in today's linux-next (20210810) as commit d62633873590 ("block: support delayed holder registration"). It triggers a following lockdep warning on ARM64's virt 'machine' on QEmu: ====================================================== WARNING: possible circular locking dependency detected 5.14.0-rc4+ #10642 Not tainted ------------------------------------------------------ systemd-udevd/227 is trying to acquire lock: ffffb6b41952d628 (mtd_table_mutex){+.+.}-{3:3}, at: blktrans_open+0x40/0x250 but task is already holding lock: ffff0eacc403bb18 (&disk->open_mutex){+.+.}-{3:3}, at: blkdev_get_by_dev+0x110/0x2f8 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&disk->open_mutex){+.+.}-{3:3}: __mutex_lock+0xa4/0x978 mutex_lock_nested+0x54/0x60 bd_register_pending_holders+0x2c/0x118 __device_add_disk+0x1d8/0x368 device_add_disk+0x10/0x18 add_mtd_blktrans_dev+0x2dc/0x428 mtdblock_add_mtd+0x68/0x98 blktrans_notify_add+0x44/0x70 add_mtd_device+0x430/0x4a0 mtd_device_parse_register+0x1a4/0x2b0 physmap_flash_probe+0x44c/0x780 platform_probe+0x90/0xd8 really_probe+0x138/0x2d0 __driver_probe_device+0x78/0xd8 driver_probe_device+0x40/0x110 __driver_attach+0xcc/0x118 bus_for_each_dev+0x68/0xc8 driver_attach+0x20/0x28 bus_add_driver+0x168/0x1f8 driver_register+0x60/0x110 __platform_driver_register+0x24/0x30 physmap_init+0x18/0x20 do_one_initcall+0x84/0x450 kernel_init_freeable+0x31c/0x38c kernel_init+0x20/0x120 ret_from_fork+0x10/0x18 -> #0 (mtd_table_mutex){+.+.}-{3:3}: __lock_acquire+0xff4/0x1840 lock_acquire+0x130/0x3e8 __mutex_lock+0xa4/0x978 mutex_lock_nested+0x54/0x60 blktrans_open+0x40/0x250 blkdev_get_whole+0x28/0x120 blkdev_get_by_dev+0x15c/0x2f8 blkdev_open+0x50/0xb0 do_dentry_open+0x238/0x3c0 vfs_open+0x28/0x30 path_openat+0x720/0x938 do_filp_open+0x80/0x108 do_sys_openat2+0x1b4/0x2c8 do_sys_open+0x68/0x88 __arm64_compat_sys_openat+0x1c/0x28 invoke_syscall+0x40/0xf8 el0_svc_common+0x60/0x100 do_el0_svc_compat+0x1c/0x48 el0_svc_compat+0x20/0x30 el0t_32_sync_handler+0xec/0x140 el0t_32_sync+0x168/0x16c other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&disk->open_mutex); lock(mtd_table_mutex); lock(&disk->open_mutex); lock(mtd_table_mutex); *** DEADLOCK *** 1 lock held by systemd-udevd/227: #0: ffff0eacc403bb18 (&disk->open_mutex){+.+.}-{3:3}, at: blkdev_get_by_dev+0x110/0x2f8 stack backtrace: CPU: 1 PID: 227 Comm: systemd-udevd Not tainted 5.14.0-rc4+ #10642 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0x0/0x1d0 show_stack+0x14/0x20 dump_stack_lvl+0x88/0xb0 dump_stack+0x14/0x2c print_circular_bug.isra.50+0x1ac/0x200 check_noncircular+0x134/0x148 __lock_acquire+0xff4/0x1840 lock_acquire+0x130/0x3e8 __mutex_lock+0xa4/0x978 mutex_lock_nested+0x54/0x60 blktrans_open+0x40/0x250 blkdev_get_whole+0x28/0x120 blkdev_get_by_dev+0x15c/0x2f8 blkdev_open+0x50/0xb0 do_dentry_open+0x238/0x3c0 vfs_open+0x28/0x30 path_openat+0x720/0x938 do_filp_open+0x80/0x108 do_sys_openat2+0x1b4/0x2c8 do_sys_open+0x68/0x88 __arm64_compat_sys_openat+0x1c/0x28 invoke_syscall+0x40/0xf8 el0_svc_common+0x60/0x100 do_el0_svc_compat+0x1c/0x48 el0_svc_compat+0x20/0x30 el0t_32_sync_handler+0xec/0x140 el0t_32_sync+0x168/0x16c If this is a false positive, then it should be annotated as such. > --- > block/genhd.c | 10 +++++++ > block/holder.c | 68 ++++++++++++++++++++++++++++++++----------- > include/linux/genhd.h | 5 ++++ > 3 files changed, 66 insertions(+), 17 deletions(-) > > diff --git a/block/genhd.c b/block/genhd.c > index cd4eab744667..db916f779077 100644 > --- a/block/genhd.c > +++ b/block/genhd.c > @@ -447,6 +447,16 @@ static void register_disk(struct device *parent, struct gendisk *disk, > kobject_create_and_add("holders", &ddev->kobj); > disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); > > + /* > + * XXX: this is a mess, can't wait for real error handling in add_disk. > + * Make sure ->slave_dir is NULL if we failed some of the registration > + * so that the cleanup in bd_unlink_disk_holder works properly. > + */ > + if (bd_register_pending_holders(disk) < 0) { > + kobject_put(disk->slave_dir); > + disk->slave_dir = NULL; > + } > + > if (disk->flags & GENHD_FL_HIDDEN) > return; > > diff --git a/block/holder.c b/block/holder.c > index 11e65d99a9fb..4568cc4f6827 100644 > --- a/block/holder.c > +++ b/block/holder.c > @@ -28,6 +28,19 @@ static void del_symlink(struct kobject *from, struct kobject *to) > sysfs_remove_link(from, kobject_name(to)); > } > > +static int __link_disk_holder(struct block_device *bdev, struct gendisk *disk) > +{ > + int ret; > + > + ret = add_symlink(disk->slave_dir, bdev_kobj(bdev)); > + if (ret) > + return ret; > + ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); > + if (ret) > + del_symlink(disk->slave_dir, bdev_kobj(bdev)); > + return ret; > +} > + > /** > * bd_link_disk_holder - create symlinks between holding disk and slave bdev > * @bdev: the claimed slave bdev > @@ -66,7 +79,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) > WARN_ON_ONCE(!bdev->bd_holder); > > /* FIXME: remove the following once add_disk() handles errors */ > - if (WARN_ON(!disk->slave_dir || !bdev->bd_holder_dir)) > + if (WARN_ON(!bdev->bd_holder_dir)) > goto out_unlock; > > holder = bd_find_holder_disk(bdev, disk); > @@ -84,28 +97,28 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) > INIT_LIST_HEAD(&holder->list); > holder->bdev = bdev; > holder->refcnt = 1; > - > - ret = add_symlink(disk->slave_dir, bdev_kobj(bdev)); > - if (ret) > - goto out_free; > - > - ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); > - if (ret) > - goto out_del; > + if (disk->slave_dir) { > + ret = __link_disk_holder(bdev, disk); > + if (ret) { > + kfree(holder); > + goto out_unlock; > + } > + } > > list_add(&holder->list, &disk->slave_bdevs); > - goto out_unlock; > - > -out_del: > - del_symlink(disk->slave_dir, bdev_kobj(bdev)); > -out_free: > - kfree(holder); > out_unlock: > mutex_unlock(&disk->open_mutex); > return ret; > } > EXPORT_SYMBOL_GPL(bd_link_disk_holder); > > +static void __unlink_disk_holder(struct block_device *bdev, > + struct gendisk *disk) > +{ > + del_symlink(disk->slave_dir, bdev_kobj(bdev)); > + del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); > +} > + > /** > * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() > * @bdev: the calimed slave bdev > @@ -123,11 +136,32 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) > mutex_lock(&disk->open_mutex); > holder = bd_find_holder_disk(bdev, disk); > if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { > - del_symlink(disk->slave_dir, bdev_kobj(bdev)); > - del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); > + if (disk->slave_dir) > + __unlink_disk_holder(bdev, disk); > list_del_init(&holder->list); > kfree(holder); > } > mutex_unlock(&disk->open_mutex); > } > EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); > + > +int bd_register_pending_holders(struct gendisk *disk) > +{ > + struct bd_holder_disk *holder; > + int ret; > + > + mutex_lock(&disk->open_mutex); > + list_for_each_entry(holder, &disk->slave_bdevs, list) { > + ret = __link_disk_holder(holder->bdev, disk); > + if (ret) > + goto out_undo; > + } > + mutex_unlock(&disk->open_mutex); > + return 0; > + > +out_undo: > + list_for_each_entry_continue_reverse(holder, &disk->slave_bdevs, list) > + __unlink_disk_holder(holder->bdev, disk); > + mutex_unlock(&disk->open_mutex); > + return ret; > +} > diff --git a/include/linux/genhd.h b/include/linux/genhd.h > index 0721807d76ee..80952f038d79 100644 > --- a/include/linux/genhd.h > +++ b/include/linux/genhd.h > @@ -323,6 +323,7 @@ long compat_blkdev_ioctl(struct file *, unsigned, unsigned long); > #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED > int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); > void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk); > +int bd_register_pending_holders(struct gendisk *disk); > #else > static inline int bd_link_disk_holder(struct block_device *bdev, > struct gendisk *disk) > @@ -333,6 +334,10 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev, > struct gendisk *disk) > { > } > +static inline int bd_register_pending_holders(struct gendisk *disk) > +{ > + return 0; > +} > #endif /* CONFIG_BLOCK_HOLDER_DEPRECATED */ > > dev_t part_devt(struct gendisk *disk, u8 partno); Best regards -- Marek Szyprowski, PhD Samsung R&D Institute Poland