[PATCH v2] fs: only sync() superblocks reachable from the current namespace

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Omar Sandoval <osandov@xxxxxx>

Currently, the sync() syscall is system-wide, so any process in a
container can cause significant I/O stalls across the system by calling
sync(). This is even true for filesystems which are not accessible in
the process' mount namespace. This patch scopes sync() to only write out
filesystems reachable in the current mount namespace, except for the
initial mount namespace, which still syncs everything to avoid
surprises. This fixes the broken isolation we were seeing here.

Signed-off-by: Omar Sandoval <osandov@xxxxxx>
---
Grab mount_lock while iterating mounts.

 fs/sync.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 53 insertions(+), 14 deletions(-)

diff --git a/fs/sync.c b/fs/sync.c
index 6e0a2cbaf6de..03d27b48b972 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -17,6 +17,7 @@
 #include <linux/quotaops.h>
 #include <linux/backing-dev.h>
 #include "internal.h"
+#include "mount.h"
 
 #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
 			SYNC_FILE_RANGE_WAIT_AFTER)
@@ -68,16 +69,48 @@ int sync_filesystem(struct super_block *sb)
 }
 EXPORT_SYMBOL(sync_filesystem);
 
-static void sync_inodes_one_sb(struct super_block *sb, void *arg)
+struct sb_sync {
+	/*
+	 * Only sync superblocks reachable from this namespace. If NULL, sync
+	 * everything.
+	 */
+	struct mnt_namespace *mnt_ns;
+
+	/* ->sync_fs() wait argument. */
+	int wait;
+};
+
+static int sb_reachable(struct super_block *sb, struct mnt_namespace *mnt_ns)
+{
+	struct mount *mnt;
+
+	if (!mnt_ns)
+		return 1;
+
+	read_seqlock_excl(&mount_lock);
+	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
+		if (mnt->mnt_ns == mnt_ns)
+			return 1;
+	}
+	read_sequnlock_excl(&mount_lock);
+	return 0;
+}
+
+static void sync_inodes_one_sb(struct super_block *sb, void *p)
 {
-	if (!sb_rdonly(sb))
+	struct sb_sync *arg = p;
+
+	if (!sb_rdonly(sb) && sb_reachable(sb, arg->mnt_ns))
 		sync_inodes_sb(sb);
 }
 
-static void sync_fs_one_sb(struct super_block *sb, void *arg)
+static void sync_fs_one_sb(struct super_block *sb, void *p)
 {
-	if (!sb_rdonly(sb) && sb->s_op->sync_fs)
-		sb->s_op->sync_fs(sb, *(int *)arg);
+	struct sb_sync *arg = p;
+
+	if (!sb_rdonly(sb) && sb_reachable(sb, arg->mnt_ns) &&
+	    sb->s_op->sync_fs)
+		sb->s_op->sync_fs(sb, arg->wait);
 }
 
 static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
@@ -107,12 +140,18 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
  */
 SYSCALL_DEFINE0(sync)
 {
-	int nowait = 0, wait = 1;
+	struct sb_sync arg = {
+		.mnt_ns = current->nsproxy->mnt_ns,
+	};
+
+	if (arg.mnt_ns == init_task.nsproxy->mnt_ns)
+		arg.mnt_ns = NULL;
 
 	wakeup_flusher_threads(WB_REASON_SYNC);
-	iterate_supers(sync_inodes_one_sb, NULL);
-	iterate_supers(sync_fs_one_sb, &nowait);
-	iterate_supers(sync_fs_one_sb, &wait);
+	iterate_supers(sync_inodes_one_sb, &arg);
+	iterate_supers(sync_fs_one_sb, &arg);
+	arg.wait = 1;
+	iterate_supers(sync_fs_one_sb, &arg);
 	iterate_bdevs(fdatawrite_one_bdev, NULL);
 	iterate_bdevs(fdatawait_one_bdev, NULL);
 	if (unlikely(laptop_mode))
@@ -122,17 +161,17 @@ SYSCALL_DEFINE0(sync)
 
 static void do_sync_work(struct work_struct *work)
 {
-	int nowait = 0;
+	struct sb_sync arg = {};
 
 	/*
 	 * Sync twice to reduce the possibility we skipped some inodes / pages
 	 * because they were temporarily locked
 	 */
-	iterate_supers(sync_inodes_one_sb, &nowait);
-	iterate_supers(sync_fs_one_sb, &nowait);
+	iterate_supers(sync_inodes_one_sb, &arg);
+	iterate_supers(sync_fs_one_sb, &arg);
 	iterate_bdevs(fdatawrite_one_bdev, NULL);
-	iterate_supers(sync_inodes_one_sb, &nowait);
-	iterate_supers(sync_fs_one_sb, &nowait);
+	iterate_supers(sync_inodes_one_sb, &arg);
+	iterate_supers(sync_fs_one_sb, &arg);
 	iterate_bdevs(fdatawrite_one_bdev, NULL);
 	printk("Emergency Sync complete\n");
 	kfree(work);
-- 
2.16.1




[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]
  Powered by Linux