From: Omar Sandoval <osandov@xxxxxx> Currently, the sync() syscall is system-wide, so any process in a container can cause significant I/O stalls across the system by calling sync(). This is even true for filesystems which are not accessible in the process' mount namespace. This patch scopes sync() to only write out filesystems reachable in the current mount namespace, except for the initial mount namespace, which still syncs everything to avoid surprises. This fixes the broken isolation we were seeing here. Signed-off-by: Omar Sandoval <osandov@xxxxxx> --- v2->v3: Fix stupid missed unlock bug v1->v2: Grab mount_lock while iterating mounts fs/sync.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 56 insertions(+), 14 deletions(-) diff --git a/fs/sync.c b/fs/sync.c index 6e0a2cbaf6de..654da445340a 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -17,6 +17,7 @@ #include <linux/quotaops.h> #include <linux/backing-dev.h> #include "internal.h" +#include "mount.h" #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ SYNC_FILE_RANGE_WAIT_AFTER) @@ -68,16 +69,51 @@ int sync_filesystem(struct super_block *sb) } EXPORT_SYMBOL(sync_filesystem); -static void sync_inodes_one_sb(struct super_block *sb, void *arg) +struct sb_sync { + /* + * Only sync superblocks reachable from this namespace. If NULL, sync + * everything. + */ + struct mnt_namespace *mnt_ns; + + /* ->sync_fs() wait argument. */ + int wait; +}; + +static int sb_reachable(struct super_block *sb, struct mnt_namespace *mnt_ns) { - if (!sb_rdonly(sb)) + struct mount *mnt; + int ret = 0; + + if (!mnt_ns) + return 1; + + read_seqlock_excl(&mount_lock); + list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { + if (mnt->mnt_ns == mnt_ns) { + ret = 1; + break; + } + } + read_sequnlock_excl(&mount_lock); + return ret; +} + +static void sync_inodes_one_sb(struct super_block *sb, void *p) +{ + struct sb_sync *arg = p; + + if (!sb_rdonly(sb) && sb_reachable(sb, arg->mnt_ns)) sync_inodes_sb(sb); } -static void sync_fs_one_sb(struct super_block *sb, void *arg) +static void sync_fs_one_sb(struct super_block *sb, void *p) { - if (!sb_rdonly(sb) && sb->s_op->sync_fs) - sb->s_op->sync_fs(sb, *(int *)arg); + struct sb_sync *arg = p; + + if (!sb_rdonly(sb) && sb_reachable(sb, arg->mnt_ns) && + sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, arg->wait); } static void fdatawrite_one_bdev(struct block_device *bdev, void *arg) @@ -107,12 +143,18 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg) */ SYSCALL_DEFINE0(sync) { - int nowait = 0, wait = 1; + struct sb_sync arg = { + .mnt_ns = current->nsproxy->mnt_ns, + }; + + if (arg.mnt_ns == init_task.nsproxy->mnt_ns) + arg.mnt_ns = NULL; wakeup_flusher_threads(WB_REASON_SYNC); - iterate_supers(sync_inodes_one_sb, NULL); - iterate_supers(sync_fs_one_sb, &nowait); - iterate_supers(sync_fs_one_sb, &wait); + iterate_supers(sync_inodes_one_sb, &arg); + iterate_supers(sync_fs_one_sb, &arg); + arg.wait = 1; + iterate_supers(sync_fs_one_sb, &arg); iterate_bdevs(fdatawrite_one_bdev, NULL); iterate_bdevs(fdatawait_one_bdev, NULL); if (unlikely(laptop_mode)) @@ -122,17 +164,17 @@ SYSCALL_DEFINE0(sync) static void do_sync_work(struct work_struct *work) { - int nowait = 0; + struct sb_sync arg = {}; /* * Sync twice to reduce the possibility we skipped some inodes / pages * because they were temporarily locked */ - iterate_supers(sync_inodes_one_sb, &nowait); - iterate_supers(sync_fs_one_sb, &nowait); + iterate_supers(sync_inodes_one_sb, &arg); + iterate_supers(sync_fs_one_sb, &arg); iterate_bdevs(fdatawrite_one_bdev, NULL); - iterate_supers(sync_inodes_one_sb, &nowait); - iterate_supers(sync_fs_one_sb, &nowait); + iterate_supers(sync_inodes_one_sb, &arg); + iterate_supers(sync_fs_one_sb, &arg); iterate_bdevs(fdatawrite_one_bdev, NULL); printk("Emergency Sync complete\n"); kfree(work); -- 2.16.1