On 11/26, Bart Van Assche wrote: > On 11/26/19 10:29 AM, Jaegeuk Kim wrote: > > On 11/25, Bart Van Assche wrote: > > > diff --git a/drivers/block/loop.c b/drivers/block/loop.c > > > index 739b372a5112..84bdb3a6f6d0 100644 > > > --- a/drivers/block/loop.c > > > +++ b/drivers/block/loop.c > > > @@ -1264,14 +1264,17 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) > > > goto out_unlock; > > > } > > > > > > - if (lo->lo_offset != info->lo_offset || > > > - lo->lo_sizelimit != info->lo_sizelimit) { > > > - sync_blockdev(lo->lo_device); > > > - kill_bdev(lo->lo_device); > > > - } > > > + /* > > > + * Drain the page cache and the request queue. Set the "dying" flag to > > > + * prevent that kill_bdev() locks up. > > > + */ > > > + sync_blockdev(lo->lo_device); > > > > > > - /* I/O need to be drained during transfer transition */ > > > - blk_mq_freeze_queue(lo->lo_queue); > > > + blk_set_queue_dying(lo->lo_queue); > > > + blk_mq_freeze_queue_wait(lo->lo_queue); > > > + > > > + /* Kill buffers that got dirtied after the sync_blockdev() call. */ > > > > Any race condition where we can truncate any dirty pages written between > > sync_blockdev() and kill_bdev()? > > > > Thanks, > > > > > + kill_bdev(lo->lo_device); > > Hi Jaegeuk, > > As you know sync_blockdev() triggers a call to filemap_write_and_wait(). > That function in turn calls mapping->a_ops->writepages() with sync_mode == > WB_SYNC_ALL. I think that causes sync_blockdev() to wait until all dirty > pages have been written so we don't have to worry about pages dirtied before > the sync_blockdev() call started. My concern is on some dirty pages after sync_blockdev(), since truncate_inode_pages() in kill_bdev() will cancel dirty pages. > > Should we try to handle read and write requests that are submitted to a loop > device while the loop device block size, offset and/or size are being > modified or is it OK to fail such requests? 
The blk_set_queue_dying() and > blk_mq_freeze_queue_wait() calls set the DYING queue flag and also wait for > all ongoing block layer requests submitted to the loop device to finish. All > later submit_bio(), generic_make_request(), direct_make_request() and > blk_get_request() calls will fail with BLK_STS_IOERR or -ENODEV, including > those triggered by kill_bdev(). In other words, the above patch causes I/O > to fail that is submitted concurrently with kill_bdev(). Do you agree with > failing I/O requests submitted to a loop device while the loop device block > size, offset and/or size are being modified? Actually, the previous patch returned EAGAIN, if there was an interim IO which produced some dirty pages between kill_bdev() and blk_mq_freeze_queue_wait(). IMHO, we can't do blk_set_queue_dying() which gives EIO to those IOs. Looking at the flow again, how about this? --- drivers/block/loop.c | 67 ++++++++++++++++++++++---------------------- fs/block_dev.c | 12 ++++++++ include/linux/fs.h | 3 ++ 3 files changed, 48 insertions(+), 34 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index f6f77eaa7217..0f3eca774655 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1224,6 +1224,23 @@ static int loop_clr_fd(struct loop_device *lo) return __loop_clr_fd(lo, false); } +static void +loop_mq_freeze_queue(struct loop_device *lo) +{ + for (;;) { + sync_blockdev(lo->lo_device); + + /* I/O need to be drained during transfer transition */ + blk_mq_freeze_queue(lo->lo_queue); + + if (is_dirty_bdev(lo->lo_device)) + blk_mq_unfreeze_queue(lo->lo_queue); + else + break; + } + kill_bdev(lo->lo_device); +} + static int loop_set_status(struct loop_device *lo, const struct loop_info64 *info) { @@ -1232,6 +1249,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) kuid_t uid = current_uid(); struct block_device *bdev; bool partscan = false; + bool drop_cache; err = mutex_lock_killable(&loop_ctl_mutex); if (err) @@ -1251,14 
+1269,14 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) goto out_unlock; } - if (lo->lo_offset != info->lo_offset || - lo->lo_sizelimit != info->lo_sizelimit) { - sync_blockdev(lo->lo_device); - kill_bdev(lo->lo_device); - } + drop_cache = (lo->lo_offset != info->lo_offset || + lo->lo_sizelimit != info->lo_sizelimit); /* I/O need to be drained during transfer transition */ - blk_mq_freeze_queue(lo->lo_queue); + if (drop_cache) + loop_mq_freeze_queue(lo); + else + blk_mq_freeze_queue(lo->lo_queue); err = loop_release_xfer(lo); if (err) @@ -1283,16 +1301,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) if (err) goto out_unfreeze; - if (lo->lo_offset != info->lo_offset || - lo->lo_sizelimit != info->lo_sizelimit) { - /* kill_bdev should have truncated all the pages */ - if (lo->lo_device->bd_inode->i_mapping->nrpages) { - err = -EAGAIN; - pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n", - __func__, lo->lo_number, lo->lo_file_name, - lo->lo_device->bd_inode->i_mapping->nrpages); - goto out_unfreeze; - } + if (drop_cache) { if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) { err = -EFBIG; goto out_unfreeze; @@ -1518,7 +1527,7 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg) static int loop_set_block_size(struct loop_device *lo, unsigned long arg) { - int err = 0; + bool drop_cache; if (lo->lo_state != Lo_bound) return -ENXIO; @@ -1526,31 +1535,21 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg) if (arg < 512 || arg > PAGE_SIZE || !is_power_of_2(arg)) return -EINVAL; - if (lo->lo_queue->limits.logical_block_size != arg) { - sync_blockdev(lo->lo_device); - kill_bdev(lo->lo_device); - } - - blk_mq_freeze_queue(lo->lo_queue); + drop_cache = (lo->lo_queue->limits.logical_block_size != arg); - /* kill_bdev should have truncated all the pages */ - if (lo->lo_queue->limits.logical_block_size != arg && - 
lo->lo_device->bd_inode->i_mapping->nrpages) { - err = -EAGAIN; - pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n", - __func__, lo->lo_number, lo->lo_file_name, - lo->lo_device->bd_inode->i_mapping->nrpages); - goto out_unfreeze; - } + /* I/O need to be drained during transfer transition */ + if (drop_cache) + loop_mq_freeze_queue(lo); + else + blk_mq_freeze_queue(lo->lo_queue); blk_queue_logical_block_size(lo->lo_queue, arg); blk_queue_physical_block_size(lo->lo_queue, arg); blk_queue_io_min(lo->lo_queue, arg); loop_update_dio(lo); -out_unfreeze: blk_mq_unfreeze_queue(lo->lo_queue); - return err; + return 0; } static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd, diff --git a/fs/block_dev.c b/fs/block_dev.c index 9c073dbdc1b0..81fe3beef92b 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -74,6 +74,18 @@ static void bdev_write_inode(struct block_device *bdev) spin_unlock(&inode->i_lock); } +bool is_dirty_bdev(struct block_device *bdev) +{ + struct inode *inode = bdev->bd_inode; + bool dirty; + + spin_lock(&inode->i_lock); + dirty = inode->i_state & I_DIRTY ? true: false; + spin_unlock(&inode->i_lock); + return dirty; +} +EXPORT_SYMBOL(is_dirty_bdev); + /* Kill _all_ buffers and pagecache , dirty or not.. 
*/ void kill_bdev(struct block_device *bdev) { diff --git a/include/linux/fs.h b/include/linux/fs.h index e0d909d35763..02156ad96be2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2570,6 +2570,7 @@ extern void bdput(struct block_device *); extern void invalidate_bdev(struct block_device *); extern void iterate_bdevs(void (*)(struct block_device *, void *), void *); extern int sync_blockdev(struct block_device *bdev); +extern bool is_dirty_bdev(struct block_device *bdev); extern void kill_bdev(struct block_device *); extern struct super_block *freeze_bdev(struct block_device *); extern void emergency_thaw_all(void); @@ -2583,9 +2584,11 @@ static inline bool sb_is_blkdev_sb(struct super_block *sb) { return sb == blockdev_superblock; } + #else static inline void bd_forget(struct inode *inode) {} static inline int sync_blockdev(struct block_device *bdev) { return 0; } +static inline bool is_dirty_bdev(struct block_device *bdev) { return false; } static inline void kill_bdev(struct block_device *bdev) {} static inline void invalidate_bdev(struct block_device *bdev) {} -- 2.19.0.605.g01d371f741-goog