On Fri, Jan 26, 2018 at 02:58:39PM -0800, Omar Sandoval wrote: > From: Omar Sandoval <osandov@xxxxxx> > > Currently, the sync() syscall is system-wide, so any process in a > container can cause significant I/O stalls across the system by calling > sync(). This is even true for filesystems which are not accessible in > the process' mount namespace. This patch scopes sync() to only write out > filesystems reachable in the current mount namespace, except for the > initial mount namespace, which still syncs everything to avoid > surprises. This fixes the broken isolation we were seeing here. > > Signed-off-by: Omar Sandoval <osandov@xxxxxx> > --- > fs/sync.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++-------------- > 1 file changed, 51 insertions(+), 14 deletions(-) > > diff --git a/fs/sync.c b/fs/sync.c > index 6e0a2cbaf6de..bde1e3196298 100644 > --- a/fs/sync.c > +++ b/fs/sync.c > @@ -17,6 +17,7 @@ > #include <linux/quotaops.h> > #include <linux/backing-dev.h> > #include "internal.h" > +#include "mount.h" > > #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ > SYNC_FILE_RANGE_WAIT_AFTER) > @@ -68,16 +69,46 @@ int sync_filesystem(struct super_block *sb) > } > EXPORT_SYMBOL(sync_filesystem); > > -static void sync_inodes_one_sb(struct super_block *sb, void *arg) > +struct sb_sync { > + /* > + * Only sync superblocks reachable from this namespace. If NULL, sync > + * everything. > + */ > + struct mnt_namespace *mnt_ns; > + > + /* ->sync_fs() wait argument. */ > + int wait; > +}; > + > +static int sb_reachable(struct super_block *sb, struct mnt_namespace *mnt_ns) > +{ > + struct mount *mnt; > + > + if (!mnt_ns) > + return 1; > + > + list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { > + if (mnt->mnt_ns == mnt_ns) > + return 1; > + } Sigh, of course, I forgot to grab the proper locks here. Will send a v2. > + return 0; > +} > + > +static void sync_inodes_one_sb(struct super_block *sb, void *p) > { > - if (!sb_rdonly(sb)) > + struct sb_sync *arg = p; > + > + if (!sb_rdonly(sb) && sb_reachable(sb, arg->mnt_ns)) > sync_inodes_sb(sb); > } > > -static void sync_fs_one_sb(struct super_block *sb, void *arg) > +static void sync_fs_one_sb(struct super_block *sb, void *p) > { > - if (!sb_rdonly(sb) && sb->s_op->sync_fs) > - sb->s_op->sync_fs(sb, *(int *)arg); > + struct sb_sync *arg = p; > + > + if (!sb_rdonly(sb) && sb_reachable(sb, arg->mnt_ns) && > + sb->s_op->sync_fs) > + sb->s_op->sync_fs(sb, arg->wait); > } > > static void fdatawrite_one_bdev(struct block_device *bdev, void *arg) > @@ -107,12 +138,18 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg) > */ > SYSCALL_DEFINE0(sync) > { > - int nowait = 0, wait = 1; > + struct sb_sync arg = { > + .mnt_ns = current->nsproxy->mnt_ns, > + }; > + > + if (arg.mnt_ns == init_task.nsproxy->mnt_ns) > + arg.mnt_ns = NULL; > > wakeup_flusher_threads(WB_REASON_SYNC); > - iterate_supers(sync_inodes_one_sb, NULL); > - iterate_supers(sync_fs_one_sb, &nowait); > - iterate_supers(sync_fs_one_sb, &wait); > + iterate_supers(sync_inodes_one_sb, &arg); > + iterate_supers(sync_fs_one_sb, &arg); > + arg.wait = 1; > + iterate_supers(sync_fs_one_sb, &arg); > iterate_bdevs(fdatawrite_one_bdev, NULL); > iterate_bdevs(fdatawait_one_bdev, NULL); > if (unlikely(laptop_mode)) > @@ -122,17 +159,17 @@ SYSCALL_DEFINE0(sync) > > static void do_sync_work(struct work_struct *work) > { > - int nowait = 0; > + struct sb_sync arg = {}; > > /* > * Sync twice to reduce the possibility we skipped some inodes / pages > * because they were temporarily locked > */ > - iterate_supers(sync_inodes_one_sb, &nowait); > - iterate_supers(sync_fs_one_sb, &nowait); > + iterate_supers(sync_inodes_one_sb, &arg); > + iterate_supers(sync_fs_one_sb, &arg); > iterate_bdevs(fdatawrite_one_bdev, NULL); > - iterate_supers(sync_inodes_one_sb, &nowait); > - iterate_supers(sync_fs_one_sb, &nowait); > + iterate_supers(sync_inodes_one_sb, &arg); > + iterate_supers(sync_fs_one_sb, &arg); > iterate_bdevs(fdatawrite_one_bdev, NULL); > printk("Emergency Sync complete\n"); > kfree(work); > -- > 2.16.1 >