On Thursday 26 February 2004 3:41 pm, Chris Mason wrote: > On Thu, 2004-02-26 at 15:43, Kevin Corry wrote: > > I've been trying to test out the VFS-lock patch, but haven't been having > > any luck with it. I applied it to a clean 2.6.3 kernel (along with the > > kgdb patch from -mm), and it consistently BUGs when it tries to mount the > > root filesystem. I've captured the console output through gdb and > > attached it below, along with the kernel config. My root filesystem is > > ext3 if that helps. > > > > I'll keep hunting for the problem, but I figured I'd send this > > information to you to start with. > > This one should work better for you. I've completed some simple tests (just light loads so far) with the new VFS-lock patch, and things seem to be working correctly. I've included the patch below, along with the proposed changes to dm.c to actually call the APIs. -- Kevin Corry kevcorry@xxxxxxxxxx http://evms.sourceforge.net/ VFS-Lock patch. --- diff/drivers/md/dm.c 2004-02-25 16:19:20.000000000 -0600 +++ source/drivers/md/dm.c 2004-02-26 16:46:04.000000000 -0600 @@ -12,6 +12,7 @@ #include <linux/moduleparam.h> #include <linux/blkpg.h> #include <linux/bio.h> +#include <linux/buffer_head.h> #include <linux/mempool.h> #include <linux/slab.h> @@ -46,6 +47,7 @@ */ #define DMF_BLOCK_IO 0 #define DMF_SUSPENDED 1 +#define DMF_FS_LOCKED 2 struct mapped_device { struct rw_semaphore lock; @@ -826,6 +828,24 @@ return 0; } +static void __lock_disk(struct gendisk *disk) +{ + struct block_device *bdev = bdget_disk(disk, 0); + if (bdev) { + fsync_bdev_lockfs(bdev); + bdput(bdev); + } +} + +static void __unlock_disk(struct gendisk *disk) +{ + struct block_device *bdev = bdget_disk(disk, 0); + if (bdev) { + unlockfs(bdev); + bdput(bdev); + } +} + /* * We need to be able to change a mapping table under a mounted * filesystem. 
For example we might want to move some data in @@ -837,12 +857,23 @@ { DECLARE_WAITQUEUE(wait, current); - down_write(&md->lock); + /* Flush I/O to the device. */ + down_read(&md->lock); + if (test_bit(DMF_BLOCK_IO, &md->flags)) { + up_read(&md->lock); + return -EINVAL; + } + + if (!test_and_set_bit(DMF_FS_LOCKED, &md->flags)) { + __lock_disk(md->disk); + } + up_read(&md->lock); /* * First we set the BLOCK_IO flag so no more ios will be * mapped. */ + down_write(&md->lock); if (test_bit(DMF_BLOCK_IO, &md->flags)) { up_write(&md->lock); return -EINVAL; @@ -892,11 +923,13 @@ dm_table_resume_targets(md->map); clear_bit(DMF_SUSPENDED, &md->flags); clear_bit(DMF_BLOCK_IO, &md->flags); + clear_bit(DMF_FS_LOCKED, &md->flags); def = bio_list_get(&md->deferred); __flush_deferred_io(md, def); up_write(&md->lock); + __unlock_disk(md->disk); blk_run_queues(); return 0; --- diff/fs/block_dev.c 2004-02-26 15:50:53.000000000 -0600 +++ source/fs/block_dev.c 2004-02-26 15:50:41.000000000 -0600 @@ -242,6 +242,7 @@ { memset(bdev, 0, sizeof(*bdev)); sema_init(&bdev->bd_sem, 1); + sema_init(&bdev->bd_mount_sem, 1); INIT_LIST_HEAD(&bdev->bd_inodes); INIT_LIST_HEAD(&bdev->bd_list); inode_init_once(&ei->vfs_inode); --- diff/fs/buffer.c 2004-02-18 10:39:07.000000000 -0600 +++ source/fs/buffer.c 2004-02-26 10:59:52.000000000 -0600 @@ -259,6 +259,17 @@ return sync_blockdev(bdev); } +int fsync_bdev_lockfs(struct block_device *bdev) +{ + int res; + res = fsync_bdev(bdev); + if (res) + return res; + sync_super_lockfs(bdev); + return sync_blockdev(bdev); +} +EXPORT_SYMBOL(fsync_bdev_lockfs); + /* * sync everything. Start out by waking pdflush, because that writes back * all queues in parallel. 
--- diff/fs/reiserfs/super.c 2004-02-17 21:57:47.000000000 -0600 +++ source/fs/reiserfs/super.c 2004-02-26 10:59:51.000000000 -0600 @@ -82,7 +82,7 @@ reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); reiserfs_block_writes(&th) ; - journal_end(&th, s, 1) ; + journal_end_sync(&th, s, 1) ; } s->s_dirt = dirty; reiserfs_write_unlock(s); --- diff/fs/super.c 2004-02-18 10:39:09.000000000 -0600 +++ source/fs/super.c 2004-02-26 10:59:52.000000000 -0600 @@ -293,6 +293,62 @@ } /* + * triggered by the device mapper code to lock a filesystem and force + * it into a consistent state. + * + * This takes the block device bd_mount_sem to make sure no new mounts + * happen on bdev until unlockfs is called. If a super is found on this + * block device, we hold a read lock on the s->s_umount sem to make sure + * nobody unmounts until the snapshot creation is done + */ +void sync_super_lockfs(struct block_device *bdev) +{ + struct super_block *sb; + down(&bdev->bd_mount_sem); + sb = get_super(bdev); + if (sb) { + lock_super(sb); + if (sb->s_dirt && sb->s_op->write_super) + sb->s_op->write_super(sb); + if (sb->s_op->write_super_lockfs) + sb->s_op->write_super_lockfs(sb); + unlock_super(sb); + } + /* unlockfs releases s->s_umount and bd_mount_sem */ +} + +void unlockfs(struct block_device *bdev) +{ + struct list_head *p; + /* + * copied from get_super, but we need to + * do special things since lockfs left the + * s_umount sem held + */ + spin_lock(&sb_lock); + list_for_each(p, &super_blocks) { + struct super_block *s = sb_entry(p); + /* + * if there is a super for this block device + * in the list, get_super must have found it + * during sync_super_lockfs, so our drop_super + * will drop the reference created there. 
+ */ + if (s->s_bdev == bdev && s->s_root) { + spin_unlock(&sb_lock); + if (s->s_op->unlockfs) + s->s_op->unlockfs(s); + drop_super(s); + goto unlock; + } + } + spin_unlock(&sb_lock); +unlock: + up(&bdev->bd_mount_sem); +} +EXPORT_SYMBOL(unlockfs); + +/* * Note: check the dirty flag before waiting, so we don't * hold up the sync while mounting a device. (The newly * mounted device won't need syncing.) @@ -613,7 +669,14 @@ if (IS_ERR(bdev)) return (struct super_block *)bdev; + /* + * once the super is inserted into the list by sget, s_umount + * will protect the lockfs code from trying to start a snapshot + * while we are mounting + */ + down(&bdev->bd_mount_sem); s = sget(fs_type, test_bdev_super, set_bdev_super, bdev); + up(&bdev->bd_mount_sem); if (IS_ERR(s)) goto out; --- diff/include/linux/buffer_head.h 2004-02-17 21:57:12.000000000 -0600 +++ source/include/linux/buffer_head.h 2004-02-26 10:59:52.000000000 -0600 @@ -164,6 +164,8 @@ wait_queue_head_t *bh_waitq_head(struct buffer_head *bh); void wake_up_buffer(struct buffer_head *bh); int fsync_bdev(struct block_device *); +int fsync_bdev_lockfs(struct block_device *); +void unlockfs(struct block_device *); int fsync_super(struct super_block *); int fsync_no_super(struct block_device *); struct buffer_head *__find_get_block(struct block_device *, sector_t, int); --- diff/include/linux/fs.h 2004-02-18 10:39:10.000000000 -0600 +++ source/include/linux/fs.h 2004-02-26 10:59:52.000000000 -0600 @@ -346,6 +346,7 @@ struct inode * bd_inode; /* will die */ int bd_openers; struct semaphore bd_sem; /* open/close mutex */ + struct semaphore bd_mount_sem; /* mount mutex */ struct list_head bd_inodes; void * bd_holder; int bd_holders; @@ -1221,6 +1222,7 @@ extern int filemap_fdatawait(struct address_space *); extern int filemap_write_and_wait(struct address_space *mapping); extern void sync_supers(void); +extern void sync_super_lockfs(struct block_device *); extern void sync_filesystems(int wait); extern void 
emergency_sync(void); extern void emergency_remount(void);