Currently on rw=>ro remount we have following race | mount /mnt -oremount,ro | write-task | |-------------------------+------------| | | open(RDWR) | | shrink_dcache_sb(sb); | | | sync_filesystem(sb); | | | | write() | | | close() | | fs_may_remount_ro(sb) | | | sb->s_flags = new_flags | | Later writeback or sync() will result in error due to MS_RDONLY flag In case of ext4 this result in jbd2_start failure on writeback LOG: ext4_da_writepages: jbd2_start: 1024 pages, ino 1431; err -30 This patch fix the issue like follows: 1) Introduce ST_REMOUNT_RO bit with will be set on remount start This bit will be cleared if new writers appear. 2) Redesign fs_may_remount_ro. Now it is just calculate sum of corresponding vfsmounts 3) The rest is simple, we just perform sync and check than no new writers appear. ##TESTCASE_BEGIN: #! /bin/bash -x DEV=/dev/sdb5 FSTYPE=ext4 BINDIR=/home/dmon MNTOPT="data=ordered" umount /mnt mkfs.${FSTYPE} ${DEV} || exit 1 mount ${DEV} /mnt -o${MNTOPT} || exit 1 ${BINDIR}/fsstress -p1 -l999999999 -n9999999999 -d /mnt/test & sleep 15 mount /mnt -oremount,ro,${MNTOPT} sleep 1 killall -9 fsstress sync # after this you may get following message in dmesg # "ext4_da_writepages: jbd2_start: 1024 pages, ino 1431; err -30" ##TESTCASE_END Signed-off-by: Dmitry Monakhov <dmonakhov@xxxxxxxxxx> --- fs/file_table.c | 24 ------------------------ fs/namespace.c | 48 ++++++++++++++++++++++++++++++++++++++++++++---- fs/super.c | 33 +++++++++++++++++++++++++++------ include/linux/fs.h | 1 + 4 files changed, 72 insertions(+), 34 deletions(-) diff --git a/fs/file_table.c b/fs/file_table.c index b98404b..32d37af 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -348,30 +348,6 @@ void file_kill(struct file *file) } } -int fs_may_remount_ro(struct super_block *sb) -{ - struct file *file; - - /* Check that no files are currently opened for writing. */ - file_list_lock(); - list_for_each_entry(file, &sb->s_files, f_u.fu_list) { - struct inode *inode = file->f_path.dentry->d_inode; - - /* File with pending delete? */ - if (inode->i_nlink == 0) - goto too_bad; - - /* Writeable file? */ - if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE)) - goto too_bad; - } - file_list_unlock(); - return 1; /* Tis' cool bro. */ -too_bad: - file_list_unlock(); - return 0; -} - /** * mark_files_ro - mark all files read-only * @sb: superblock in question diff --git a/fs/namespace.c b/fs/namespace.c index e816097..bc79f1f 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -277,6 +277,13 @@ int mnt_want_write(struct vfsmount *mnt) dec_mnt_writers(mnt); ret = -EROFS; goto out; + } else { + /* + * Clear ST_REMOUNT_RO flag to let remount task know + * about new writers. + */ + if (unlikely(test_bit(ST_REMOUNT_RO, &mnt->mnt_sb->s_state))) + clear_bit(ST_REMOUNT_RO, &mnt->mnt_sb->s_state); } out: preempt_enable(); @@ -341,11 +348,10 @@ void mnt_drop_write(struct vfsmount *mnt) } EXPORT_SYMBOL_GPL(mnt_drop_write); -static int mnt_make_readonly(struct vfsmount *mnt) +static int mnt_check_writers(struct vfsmount *mnt, int set_ro) { int ret = 0; - spin_lock(&vfsmount_lock); mnt->mnt_flags |= MNT_WRITE_HOLD; /* * After storing MNT_WRITE_HOLD, we'll read the counters. This store @@ -371,14 +377,23 @@ static int mnt_make_readonly(struct vfsmount *mnt) */ if (count_mnt_writers(mnt) > 0) ret = -EBUSY; - else - mnt->mnt_flags |= MNT_READONLY; + else { + if (likely(set_ro)) + mnt->mnt_flags |= MNT_READONLY; + } /* * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers * that become unheld will see MNT_READONLY. */ smp_wmb(); mnt->mnt_flags &= ~MNT_WRITE_HOLD; + return ret; +} +static int mnt_make_readonly(struct vfsmount *mnt) +{ + int ret; + spin_lock(&vfsmount_lock); + ret = mnt_check_writers(mnt, 1); spin_unlock(&vfsmount_lock); return ret; } @@ -389,6 +404,31 @@ static void __mnt_unmake_readonly(struct vfsmount *mnt) mnt->mnt_flags &= ~MNT_READONLY; spin_unlock(&vfsmount_lock); } +/** + * Check whenever is it possible to remount given sb to readonly. + * @sb : super block in question + * + * Caller is responsible to set ST_REMOUNT_RO state before the call. + */ +int fs_may_remount_ro(struct super_block *sb) +{ + struct vfsmount *mnt; + int ret = 1; + spin_lock(&vfsmount_lock); + list_for_each_entry(mnt, &sb->s_vfsmount, mnt_sb_list) { + ret = !mnt_check_writers(mnt, 0); + if (ret) + break; + } + spin_unlock(&vfsmount_lock); + /* + * If new writer appears after we have checked all vfsmounts. + * Then ST_REMOUNT_RO bit will be cleared. + */ + if (!test_bit(ST_REMOUNT_RO, &sb->s_state)) + ret = 0; + return ret; +} void super_add_mnt(struct super_block *sb, struct vfsmount *mnt) { diff --git a/fs/super.c b/fs/super.c index d3e0083..8cf148b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -571,6 +571,9 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) int retval; int remount_rw, remount_ro; + remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); + remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY); + if (sb->s_frozen != SB_UNFROZEN) return -EBUSY; @@ -581,18 +584,33 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) if (flags & MS_RDONLY) acct_auto_close(sb); + if (remount_ro) { + if (force) + mark_files_ro(sb); + /* + * Store ST_REMOUNT_RO flag. New writers (in any) will clrear + * this bit. + */ + set_bit(ST_REMOUNT_RO, &sb->s_state); + /* + * This store should be visible before we do. + */ + smp_mb(); + /* + * To prevent sync vs write race condition we have to check + * writers before filesystem sync. + */ + if (!fs_may_remount_ro(sb)) + return -EBUSY; + } shrink_dcache_sb(sb); sync_filesystem(sb); - remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); - remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY); /* If we are remounting RDONLY and current sb is read/write, - make sure there are no rw files opened */ + make sure there are no new rw files opened */ if (remount_ro) { - if (force) - mark_files_ro(sb); - else if (!fs_may_remount_ro(sb)) + if (!fs_may_remount_ro(sb)) return -EBUSY; retval = vfs_dq_off(sb, 1); if (retval < 0 && retval != -ENOSYS) @@ -617,6 +635,9 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) */ if (remount_ro && sb->s_bdev) invalidate_bdev(sb->s_bdev); + + WARN_ON(remount_ro && !fs_may_remount_ro(sb)); + clear_bit(ST_REMOUNT_RO, &sb->s_state); return 0; } diff --git a/include/linux/fs.h b/include/linux/fs.h index 75f057d..0ad7aa4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1308,6 +1308,7 @@ extern int send_sigurg(struct fown_struct *fown); enum { ST_FS_SYNC, /* fssync is about to happen */ + ST_REMOUNT_RO, /* readonly remount is in progress */ }; extern struct list_head super_blocks; extern spinlock_t sb_lock; -- 1.6.6 -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html