[PATCH] fs: only sync() superblocks reachable from the current namespace

Omar Sandoval <osandov@xxxxxxxxxxx> · Fri, 26 Jan 2018 14:58:39 -0800

From: Omar Sandoval <osandov@xxxxxx>

Currently, the sync() syscall is system-wide, so any process in a
container can cause significant I/O stalls across the system by calling
sync(). This is true even for filesystems which are not accessible in
the process's mount namespace. This patch scopes sync() to write out
only the filesystems reachable in the current mount namespace, except
for the initial mount namespace, which still syncs everything to avoid
surprises. This fixes the broken I/O isolation between containers that
we were seeing.

Signed-off-by: Omar Sandoval <osandov@xxxxxx>
---
 fs/sync.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 51 insertions(+), 14 deletions(-)

diff --git a/fs/sync.c b/fs/sync.c
index 6e0a2cbaf6de..bde1e3196298 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -17,6 +17,7 @@
 #include <linux/quotaops.h>
 #include <linux/backing-dev.h>
 #include "internal.h"
+#include "mount.h"
 
 #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
 			SYNC_FILE_RANGE_WAIT_AFTER)
@@ -68,16 +69,63 @@ int sync_filesystem(struct super_block *sb)
 }
 EXPORT_SYMBOL(sync_filesystem);
 
-static void sync_inodes_one_sb(struct super_block *sb, void *arg)
+/* Argument handed to the per-superblock callbacks via iterate_supers(). */
+struct sb_sync {
+	/*
+	 * Only sync superblocks reachable from this namespace. If NULL, sync
+	 * everything.
+	 */
+	struct mnt_namespace *mnt_ns;
+
+	/* ->sync_fs() wait argument. */
+	int wait;
+};
+
+/*
+ * Return true if @sb has at least one mount attached to @mnt_ns, or if
+ * @mnt_ns is NULL (no namespace filtering requested).
+ *
+ * sb->s_mounts is modified under mount_lock, so hold it (as a reader) while
+ * walking the list; otherwise the walk can race with a concurrent mount or
+ * umount adding or removing entries.
+ */
+static bool sb_reachable(struct super_block *sb, struct mnt_namespace *mnt_ns)
+{
+	struct mount *mnt;
+	bool reachable = false;
+
+	if (!mnt_ns)
+		return true;
+
+	read_seqlock_excl(&mount_lock);
+	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
+		if (mnt->mnt_ns == mnt_ns) {
+			reachable = true;
+			break;
+		}
+	}
+	read_sequnlock_excl(&mount_lock);
+
+	return reachable;
+}
+
+/* Write back the inodes of @sb if it is writable and reachable. */
+static void sync_inodes_one_sb(struct super_block *sb, void *p)
 {
-	if (!sb_rdonly(sb))
+	struct sb_sync *arg = p;
+
+	if (!sb_rdonly(sb) && sb_reachable(sb, arg->mnt_ns))
 		sync_inodes_sb(sb);
 }
 
-static void sync_fs_one_sb(struct super_block *sb, void *arg)
+/* Call ->sync_fs(), if provided, on @sb if it is writable and reachable. */
+static void sync_fs_one_sb(struct super_block *sb, void *p)
 {
-	if (!sb_rdonly(sb) && sb->s_op->sync_fs)
-		sb->s_op->sync_fs(sb, *(int *)arg);
+	struct sb_sync *arg = p;
+
+	if (!sb_rdonly(sb) && sb_reachable(sb, arg->mnt_ns) &&
+	    sb->s_op->sync_fs)
+		sb->s_op->sync_fs(sb, arg->wait);
 }
 
 static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
@@ -107,12 +138,18 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
  */
 SYSCALL_DEFINE0(sync)
 {
-	int nowait = 0, wait = 1;
+	struct sb_sync arg = {
+		.mnt_ns = current->nsproxy->mnt_ns,
+	};
+
+	if (arg.mnt_ns == init_task.nsproxy->mnt_ns)
+		arg.mnt_ns = NULL;
 
 	wakeup_flusher_threads(WB_REASON_SYNC);
-	iterate_supers(sync_inodes_one_sb, NULL);
-	iterate_supers(sync_fs_one_sb, &nowait);
-	iterate_supers(sync_fs_one_sb, &wait);
+	iterate_supers(sync_inodes_one_sb, &arg);
+	iterate_supers(sync_fs_one_sb, &arg);
+	arg.wait = 1;
+	iterate_supers(sync_fs_one_sb, &arg);
 	iterate_bdevs(fdatawrite_one_bdev, NULL);
 	iterate_bdevs(fdatawait_one_bdev, NULL);
 	if (unlikely(laptop_mode))
@@ -122,17 +159,17 @@ SYSCALL_DEFINE0(sync)
 
 static void do_sync_work(struct work_struct *work)
 {
-	int nowait = 0;
+	struct sb_sync arg = {};
 
 	/*
 	 * Sync twice to reduce the possibility we skipped some inodes / pages
 	 * because they were temporarily locked
 	 */
-	iterate_supers(sync_inodes_one_sb, &nowait);
-	iterate_supers(sync_fs_one_sb, &nowait);
+	iterate_supers(sync_inodes_one_sb, &arg);
+	iterate_supers(sync_fs_one_sb, &arg);
 	iterate_bdevs(fdatawrite_one_bdev, NULL);
-	iterate_supers(sync_inodes_one_sb, &nowait);
-	iterate_supers(sync_fs_one_sb, &nowait);
+	iterate_supers(sync_inodes_one_sb, &arg);
+	iterate_supers(sync_fs_one_sb, &arg);
 	iterate_bdevs(fdatawrite_one_bdev, NULL);
 	printk("Emergency Sync complete\n");
 	kfree(work);
-- 
2.16.1