Some filesystems (currently only btrfs) expose sysfs interfaces that modify on-disk data. Normal file operations can use mnt_want_write_file() to protect the write, but a change made through sysfs is not bound to any file in the filesystem. So introduce a new sb_want_write() that does the protection against a super block. It acts much like mnt_want_write(), but only requires the super block itself to be read-write in order to succeed. The implementation uses an atomic value as a simplified rw-semaphore that only provides non-blocking (trylock) operations. We don't use a traditional rw-sem because in do_umount() we need to block incoming sb_want_write() calls until the sb is killed if this is the last mount instance. However, kill_sb() can be deferred to another thread, so down_write() and up_write() would happen in different threads, which is not allowed. This patch also slightly modifies struct super_block, do_umount() and do_remount_sb(): they now check for outstanding sb_want_write() holders and refuse, with -EBUSY, to unmount the *last* mount instance of a super block or to remount it read-only while any exist. Cc: linux-fsdevel <linux-fsdevel@xxxxxxxxxxxxxxx> Signed-off-by: Qu Wenruo <quwenruo@xxxxxxxxxxxxxx> Signed-off-by: Al Viro <viro@xxxxxxxxxxxxxxxxxx> Reviewed-by: David Sterba <dsterba@xxxxxxx> --- Changelog: v4: Newly introduced. v5: Change name to sb_want_write() and receive sb as parameter. v6: Add a better check when umounting the last instance of a super block, so that an sb_want_write() waiting for the fs to be unfrozen or for a transaction will prevent the umount. v7: Use an atomic instead of a manually implemented rw-sem. Add check for remount ro. Fix some missing unlocks in error handlers. Add internal helper functions instead of open-coding.
--- fs/internal.h | 25 ++++++++++++++++++ fs/namespace.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++ fs/super.c | 15 ++++++++++- include/linux/fs.h | 6 +++++ include/linux/mount.h | 2 ++ 5 files changed, 117 insertions(+), 1 deletion(-) diff --git a/fs/internal.h b/fs/internal.h index e9a61fe..8d6ef11 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -69,6 +69,31 @@ extern int __mnt_want_write_file(struct file *); extern void __mnt_drop_write(struct vfsmount *); extern void __mnt_drop_write_file(struct file *); +/* rw_sem like read/write down trylock helpers for sb_want_write() */ +static inline int __sb_read_down_trylock(struct super_block *sb) +{ + if (!atomic_add_unless(&sb->s_want_write_count, 1, -1)) + return 0; + return 1; +} + +static inline int __sb_write_down_trylock(struct super_block *sb) +{ + if (atomic_cmpxchg(&sb->s_want_write_count, 0, -1)) + return 0; + return 1; +} + +static inline void __sb_read_up(struct super_block *sb) +{ + atomic_dec(&sb->s_want_write_count); +} + +static inline void __sb_write_up(struct super_block *sb) +{ + atomic_set(&sb->s_want_write_count, 0); +} + /* * fs_struct.c */ diff --git a/fs/namespace.c b/fs/namespace.c index cd1e968..a4e8946 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1105,6 +1105,47 @@ struct vfsmount *mntget(struct vfsmount *mnt) } EXPORT_SYMBOL(mntget); +/** + * sb_want_write - get write acess to a super block + * @sb: the superblock of the filesystem + * + * This tells the low-level filesystem that a write is about to be performed to + * it, and makes sure that the writes are allowed (superblock is read-write, + * filesystem is not frozen) before returning success. + * When the write operation is finished, sb_drop_write() must be called. + * This is much like mnt_want_write() as a refcount, but only needs + * the superblock to be read-write. 
+ */ +int sb_want_write(struct super_block *sb) +{ + + if (!__sb_read_down_trylock(sb)) + return -EBUSY; + + sb_start_write(sb); + if (sb->s_readonly_remount || sb->s_flags & MS_RDONLY) { + sb_drop_write(sb); + return -EROFS; + } + return 0; +} +EXPORT_SYMBOL(sb_want_write); + +/** + * sb_drop_write - give up write acess to a super block + * @sb: the superblock on which to give up write access + * + * Tells the low-level filesystem that we are done performing writes to it and + * also allows filesystem to be frozen/remount ro again. Must be matched with + * sb_want_write() call above. + */ +void sb_drop_write(struct super_block *sb) +{ + sb_end_write(sb); + __sb_read_up(sb); +} +EXPORT_SYMBOL(sb_drop_write); + struct vfsmount *mnt_clone_internal(struct path *path) { struct mount *p; @@ -1382,6 +1423,9 @@ static void shrink_submounts(struct mount *mnt); static int do_umount(struct mount *mnt, int flags) { struct super_block *sb = mnt->mnt.mnt_sb; + struct mount *tmp; + int mounts = 0; + int sb_write_hold = 0; int retval; retval = security_sb_umount(&mnt->mnt, flags); @@ -1455,6 +1499,25 @@ static int do_umount(struct mount *mnt, int flags) lock_mount_hash(); event++; + /* + * Check for blocking sb_want_write if the mount is the last mount + * instance of the superblock (+1 for namespace mount), and block + * further comming sb_want_write(). + */ + list_for_each_entry(tmp, &sb->s_mounts, mnt_instance) { + mounts++; + if (mounts > 2) + break; + } + + if (mounts == 2) { + if (!__sb_write_down_trylock(sb)) { + retval = -EBUSY; + goto out; + } + sb_write_hold = 1; + } + if (flags & MNT_DETACH) { if (!list_empty(&mnt->mnt_list)) umount_tree(mnt, 2); @@ -1468,6 +1531,13 @@ static int do_umount(struct mount *mnt, int flags) retval = 0; } } +out: + /* + * Only unblock sb_want_write() if umount of last instance failed + * If umount succeeded, no need to unblock and let it die with sb. 
+ */ + if (sb_write_hold && retval) + __sb_write_up(sb); unlock_mount_hash(); namespace_unlock(); return retval; diff --git a/fs/super.c b/fs/super.c index eae088f..10f49e4 100644 --- a/fs/super.c +++ b/fs/super.c @@ -230,6 +230,9 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) s->s_shrink.count_objects = super_cache_count; s->s_shrink.batch = 1024; s->s_shrink.flags = SHRINKER_NUMA_AWARE; + + atomic_set(&s->s_want_write_count, 0); + return s; fail: @@ -694,6 +697,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) { int retval; int remount_ro; + int sb_write_hold = 0; if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; @@ -716,6 +720,10 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) return -EBUSY; remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); } + /* Don't remount ro if there is sb_want_write() waiting */ + if (!__sb_write_down_trylock(sb)) + return -EBUSY; + sb_write_hold = 1; } shrink_dcache_sb(sb); @@ -728,7 +736,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) } else { retval = sb_prepare_remount_readonly(sb); if (retval) - return retval; + goto cancel_readonly; } } @@ -757,10 +765,15 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) */ if (remount_ro && sb->s_bdev) invalidate_bdev(sb->s_bdev); + if (sb_write_hold) + __sb_write_up(sb); + return 0; cancel_readonly: sb->s_readonly_remount = 0; + if (sb_write_hold) + __sb_write_up(sb); return retval; } diff --git a/include/linux/fs.h b/include/linux/fs.h index 42efe13..dc852e8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1305,6 +1305,12 @@ struct super_block { * Indicates how deep in a filesystem stack this SB is */ int s_stack_depth; + + /* + * sb_want_write() protector, to ensure remount ro or umount the last + * instance to return -EBUSY if there is sb_want_write() waiting. 
+ */ + atomic_t s_want_write_count; }; extern struct timespec current_fs_time(struct super_block *sb); diff --git a/include/linux/mount.h b/include/linux/mount.h index c2c561d..abf4495 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -74,8 +74,10 @@ struct path; extern int mnt_want_write(struct vfsmount *mnt); extern int mnt_want_write_file(struct file *file); extern int mnt_clone_write(struct vfsmount *mnt); +extern int sb_want_write(struct super_block *sb); extern void mnt_drop_write(struct vfsmount *mnt); extern void mnt_drop_write_file(struct file *file); +extern void sb_drop_write(struct super_block *sb); extern void mntput(struct vfsmount *mnt); extern struct vfsmount *mntget(struct vfsmount *mnt); extern struct vfsmount *mnt_clone_internal(struct path *path); -- 2.2.2 -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html