On Jan 8, 2018, at 8:10 PM, Amir Goldstein <amir73il@xxxxxxxxx> wrote:
>
> On Mon, Jan 8, 2018 at 10:41 AM, Chengguang Xu <cgxu519@xxxxxxxxxx> wrote:
>> Currently a syncfs(2) call on overlayfs simply calls sync_filesystem()
>> on the upper_sb to synchronize all dirty inodes in the upper filesystem,
>> regardless of whether the overlay owns the inode. This has the obvious
>> shortcomings below.
>>
>> (1) Performance
>> Synchronization is probably heavy in most cases, especially when the
>> upper filesystem is not dedicated to the target overlayfs.
>>
>> (2) Interference
>> Unplanned synchronization will probably impact the IO performance of
>> processes that use other overlayfs instances on the same upper
>> filesystem, or that use the upper filesystem directly.
>>
>> This patch iterates the overlay inodes to sync only the target dirty
>> inodes in the upper filesystem, and waits on a temporary waiting list
>> to avoid waiting on inodes that started writeback afterwards. By doing
>> this, it reduces the cost of synchronization and does not seriously
>> impact the IO performance of unrelated processes.
>
> This looks like a very good improvement!
>
>>
>> Signed-off-by: Chengguang Xu <cgxu519@xxxxxxxxxx>
>> ---
>>  fs/overlayfs/ovl_entry.h |   4 ++
>>  fs/overlayfs/super.c     | 101 +++++++++++++++++++++++++++++++++++++++++++++--
>>  2 files changed, 102 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
>> index 9d0bc03..e92c425 100644
>> --- a/fs/overlayfs/ovl_entry.h
>> +++ b/fs/overlayfs/ovl_entry.h
>> @@ -52,6 +52,8 @@ struct ovl_fs {
>>  	/* Did we take the inuse lock? */
>>  	bool upperdir_locked;
>>  	bool workdir_locked;
>> +	/* ovl inode sync list lock */
>> +	spinlock_t ovl_sync_list_lock;
>>  };
>>
>>  /* private information held for every overlayfs dentry */
>> @@ -80,6 +82,8 @@ struct ovl_inode {
>>
>>  	/* synchronize copy up and more */
>>  	struct mutex lock;
>> +	/* ovl inode sync list */
>> +	struct list_head sync_list;
>>  };
>>
>>  static inline struct ovl_inode *OVL_I(struct inode *inode)
>> diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
>> index 76440fe..17f1406 100644
>> --- a/fs/overlayfs/super.c
>> +++ b/fs/overlayfs/super.c
>> @@ -17,6 +17,8 @@
>>  #include <linux/statfs.h>
>>  #include <linux/seq_file.h>
>>  #include <linux/posix_acl_xattr.h>
>> +#include <linux/writeback.h>
>> +#include <linux/blkdev.h>
>>  #include "overlayfs.h"
>>
>>  MODULE_AUTHOR("Miklos Szeredi <miklos@xxxxxxxxxx>");
>> @@ -195,6 +197,7 @@ static struct inode *ovl_alloc_inode(struct super_block *sb)
>>  	oi->__upperdentry = NULL;
>>  	oi->lower = NULL;
>>  	mutex_init(&oi->lock);
>> +	INIT_LIST_HEAD(&oi->sync_list);
>>
>>  	return &oi->vfs_inode;
>>  }
>> @@ -252,6 +255,96 @@ static void ovl_put_super(struct super_block *sb)
>>  	ovl_free_fs(ofs);
>>  }
>>
>> +/**
>> + * ovl_sync_filesystem
>> + * @sb: The overlayfs super block
>> + *
>> + * Sync underlying dirty inodes in upper filesystem and wait for completion.
>> + * Use temporary wait list to avoid waiting on inodes that have started
>> + * writeback after this point.
>> + */
>> +static int ovl_sync_filesystem(struct super_block *sb)
>> +{
>> +	struct ovl_fs *ofs = sb->s_fs_info;
>> +	struct super_block *upper_sb = ofs->upper_mnt->mnt_sb;
>> +	struct ovl_inode *oi, *oi_next;
>> +	struct inode *inode, *i_next;
>> +	struct inode *upper_inode;
>> +	struct blk_plug plug;
>> +	LIST_HEAD(ovl_sync_list);
>> +
>> +	struct writeback_control wbc = {
>> +		.sync_mode	= WB_SYNC_ALL,
>
> Shouldn't this be WB_SYNC_NONE? We are explicitly waiting on the
> inodes on the list afterwards...

IIUC, when wait == 1 we should use WB_SYNC_ALL mode.
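
To make the distinction concrete, here is a minimal sketch of my
understanding; ovl_start_upper_writeback() is a hypothetical helper,
not part of this patch:

/*
 * Hypothetical helper illustrating the difference (not in the patch).
 * WB_SYNC_NONE writeback is best-effort and may skip busy pages, while
 * WB_SYNC_ALL guarantees every dirty page gets submitted, which is what
 * a data-integrity sync such as syncfs(2) with wait == 1 needs, even if
 * completion is awaited separately afterwards.
 */
static void ovl_start_upper_writeback(struct inode *upper_inode, bool for_sync)
{
	struct writeback_control wbc = {
		.sync_mode	= for_sync ? WB_SYNC_ALL : WB_SYNC_NONE,
		.for_sync	= for_sync,
		.range_start	= 0,
		.range_end	= LLONG_MAX,
		.nr_to_write	= LONG_MAX,
	};

	/* sync_inode() writes both the data pages and the inode itself. */
	sync_inode(upper_inode, &wbc);
}

So even though we wait again later, the submission pass itself should
stay WB_SYNC_ALL for correctness.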
>> +		.for_sync	= 1,
>> +		.range_start	= 0,
>> +		.range_end	= LLONG_MAX,
>> +		.nr_to_write	= LONG_MAX,
>> +	};
>> +
>> +	blk_start_plug(&plug);
>> +	spin_lock(&sb->s_inode_list_lock);
>> +	list_for_each_entry_safe(inode, i_next, &sb->s_inodes, i_sb_list) {
>> +		upper_inode = ovl_inode_upper(inode);
>> +		if (!upper_inode)
>> +			continue;
>> +
>> +		spin_lock(&upper_inode->i_lock);
>> +		if (upper_inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE) ||
>> +		    list_empty(&upper_inode->i_io_list)) {
>> +			spin_unlock(&upper_inode->i_lock);
>> +			continue;
>> +		}
>> +		spin_unlock(&upper_inode->i_lock);
>> +
>> +		if (!igrab(inode))
>> +			continue;
>> +
>> +		if (!igrab(upper_inode)) {
>> +			iput(inode);
>> +			continue;
>> +		}
>> +		spin_unlock(&sb->s_inode_list_lock);
>> +
>> +		sync_inode(upper_inode, &wbc);
>
> Maybe it is allowed to call sync_inode() under s_inode_list_lock
> if using WB_SYNC_NONE???

I think if we hold s_inode_list_lock for a long time, it will probably
affect inode allocation/destruction, because every inode needs to be
added to and removed from the sb->s_inodes list.

> Do we not need to check the return value from sync_inode()?

I have no idea what extra work we could do in the error case;
__sync_filesystem() itself does not check the return code seriously.

> I bet it could be EROFS if the upper fs was remounted read-only
> and we did not check for that before starting to sync...

I'm curious whether there is a real use case like this? Anyway, let's
add the check for safety.

>> +		spin_lock(&ofs->ovl_sync_list_lock);
>> +		if (list_empty(&OVL_I(inode)->sync_list))
>> +			list_add(&OVL_I(inode)->sync_list, &ovl_sync_list);
>> +		else {
>> +			iput(upper_inode);
>> +			iput(inode);
>> +		}
>> +		spin_unlock(&ofs->ovl_sync_list_lock);
>> +
>> +		if (need_resched()) {
>> +			blk_finish_plug(&plug);
>> +			cond_resched();
>> +			blk_start_plug(&plug);
>> +		}
>> +		spin_lock(&sb->s_inode_list_lock);
>> +	}
>> +	spin_unlock(&sb->s_inode_list_lock);
>> +	blk_finish_plug(&plug);
>> +
>> +	mutex_lock(&upper_sb->s_sync_lock);
>
> I suppose mutex_lock_interruptible() is in order.

I can't see the benefit of it; however, considering interruption by a
signal, I'll replace the temporary waiting list with the sb->s_inodes
list.
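
Roughly like the following second pass; this is only a rough sketch
with a hypothetical name (ovl_wait_upper_inodes), and it glosses over
the same iterator-stability details around dropping s_inode_list_lock
as the current patch does:

/*
 * Rough sketch of the alternative (hypothetical, not in this patch):
 * instead of collecting inodes on a private ovl_sync_list, walk
 * sb->s_inodes a second time and wait on every upper inode whose
 * writeback was started in the first pass.
 */
static void ovl_wait_upper_inodes(struct super_block *sb)
{
	struct inode *inode, *i_next;
	struct inode *upper_inode;

	spin_lock(&sb->s_inode_list_lock);
	list_for_each_entry_safe(inode, i_next, &sb->s_inodes, i_sb_list) {
		upper_inode = ovl_inode_upper(inode);
		if (!upper_inode || !igrab(inode))
			continue;

		spin_unlock(&sb->s_inode_list_lock);

		/* Wait for the writeback kicked off in the first pass,
		 * keeping errors for a later error-reporting pass. */
		filemap_fdatawait_keep_errors(upper_inode->i_mapping);

		iput(inode);
		cond_resched();
		spin_lock(&sb->s_inode_list_lock);
	}
	spin_unlock(&sb->s_inode_list_lock);
}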
>> +	list_for_each_entry_safe(oi, oi_next, &ovl_sync_list, sync_list) {
>> +		upper_inode = ovl_inode_upper(&oi->vfs_inode);
>> +
>> +		filemap_fdatawait_keep_errors(upper_inode->i_mapping);
>> +		list_del_init(&oi->sync_list);
>> +		iput(upper_inode);
>> +		iput(&oi->vfs_inode);
>> +
>> +		if (need_resched())
>> +			cond_resched();
>> +	}
>> +	mutex_unlock(&upper_sb->s_sync_lock);
>> +
>> +	if (upper_sb->s_op->sync_fs)
>> +		upper_sb->s_op->sync_fs(upper_sb, 1);
>> +
>> +	return sync_blockdev(upper_sb->s_bdev);
>> +}
>> +
>>  /* Sync real dirty inodes in upper filesystem (if it exists) */
>>  static int ovl_sync_fs(struct super_block *sb, int wait)
>>  {
>> @@ -266,8 +359,8 @@ static int ovl_sync_fs(struct super_block *sb, int wait)
>>  	 * If this is a sync(2) call or an emergency sync, all the super blocks
>>  	 * will be iterated, including upper_sb, so no need to do anything.
>>  	 *
>> -	 * If this is a syncfs(2) call, then we do need to call
>> -	 * sync_filesystem() on upper_sb, but enough if we do it when being
>> +	 * If this is a syncfs(2) call, then we do need to call sync_inode()
>> +	 * on the underlying dirty upper_inode, but enough if we do it when being
>>  	 * called with wait == 1.
>>  	 */
>>  	if (!wait)
>> @@ -276,7 +369,7 @@ static int ovl_sync_fs(struct super_block *sb, int wait)
>>  	upper_sb = ofs->upper_mnt->mnt_sb;
>>
>>  	down_read(&upper_sb->s_umount);
>> -	ret = sync_filesystem(upper_sb);
>> +	ret = ovl_sync_filesystem(sb);
>
> Note that with this change, we are skipping the sb_rdonly(upper_sb) test
> (inside sync_filesystem()). We should probably skip syncing in that case.

I will do.

Thanks,
Chengguang
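
P.S. To be explicit about what I plan to do, below is an untested
sketch of the read-only check; only the sb_rdonly() test is new, the
rest is abbreviated from the patch above:

/* Untested sketch of the planned check: skip syncing when the upper
 * filesystem is read-only, mirroring the sb_rdonly() test that
 * sync_filesystem() performs internally. The test is done under
 * s_umount so a concurrent remount cannot race with it. */
static int ovl_sync_fs(struct super_block *sb, int wait)
{
	struct ovl_fs *ofs = sb->s_fs_info;
	struct super_block *upper_sb;
	int ret = 0;

	if (!ofs->upper_mnt)
		return 0;
	if (!wait)
		return 0;

	upper_sb = ofs->upper_mnt->mnt_sb;

	down_read(&upper_sb->s_umount);
	/* Nothing to sync if the upper fs is (or became) read-only. */
	if (!sb_rdonly(upper_sb))
		ret = ovl_sync_filesystem(sb);
	up_read(&upper_sb->s_umount);

	return ret;
}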