On Thu 26-09-13 21:05:06, Jan Kara wrote: > When there are processes heavily creating small files while sync(2) is > running, it can easily happen that quite some new files are created > between WB_SYNC_NONE and WB_SYNC_ALL pass of sync(2). That can happen > especially if there are several busy filesystems (remember that sync > traverses filesystems sequentially and waits in WB_SYNC_ALL phase on one > fs before starting it on another fs). Because WB_SYNC_ALL pass is slow > (e.g. causes a transaction commit and cache flush for each inode in > ext3), resulting sync(2) times are rather large. > > The following script reproduces the problem: > --- I've realized this '---' is going to screw up git-am. So I'll resend with a different separator. Sorry for the noise. Honza > function run_writers > { > for (( i = 0; i < 10; i++ )); do > mkdir $1/dir$i > for (( j = 0; j < 40000; j++ )); do > dd if=/dev/zero of=$1/dir$i/$j bs=4k count=4 &>/dev/null > done & > done > } > > for dir in "$@"; do > run_writers $dir > done > > sleep 40 > time sync > ---- > > Fix the problem by disregarding inodes dirtied after sync(2) was called > in the WB_SYNC_ALL pass. To allow for this, sync_inodes_sb() now takes a > time stamp when sync has started which is used for setting up work for > flusher threads. > > To give some numbers, when above script is run on two ext4 filesystems on > simple SATA drive, the average sync time from 10 runs is 267.549 seconds > with standard deviation 104.799426. With the patched kernel, the average > sync time from 10 runs is 2.995 seconds with standard deviation 0.096. > > Signed-off-by: Jan Kara <jack@xxxxxxx> > --- > fs/fs-writeback.c | 17 ++++++++--------- > fs/sync.c | 15 +++++++++------ > fs/xfs/xfs_super.c | 2 +- > include/linux/writeback.h | 2 +- > 4 files changed, 19 insertions(+), 17 deletions(-) > > diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c > index 9f4935b..70837da 100644 > --- a/fs/fs-writeback.c > +++ b/fs/fs-writeback.c > @@ -39,7 +39,7 @@ > struct wb_writeback_work { > long nr_pages; > struct super_block *sb; > - unsigned long *older_than_this; > + unsigned long older_than_this; > enum writeback_sync_modes sync_mode; > unsigned int tagged_writepages:1; > unsigned int for_kupdate:1; > @@ -248,8 +248,7 @@ static int move_expired_inodes(struct list_head *delaying_queue, > > while (!list_empty(delaying_queue)) { > inode = wb_inode(delaying_queue->prev); > - if (work->older_than_this && > - inode_dirtied_after(inode, *work->older_than_this)) > + if (inode_dirtied_after(inode, work->older_than_this)) > break; > list_move(&inode->i_wb_list, &tmp); > moved++; > @@ -791,12 +790,11 @@ static long wb_writeback(struct bdi_writeback *wb, > { > unsigned long wb_start = jiffies; > long nr_pages = work->nr_pages; > - unsigned long oldest_jif; > struct inode *inode; > long progress; > > - oldest_jif = jiffies; > - work->older_than_this = &oldest_jif; > + if (!work->older_than_this) > + work->older_than_this = jiffies; > > spin_lock(&wb->list_lock); > for (;;) { > @@ -830,10 +828,10 @@ static long wb_writeback(struct bdi_writeback *wb, > * safe. > */ > if (work->for_kupdate) { > - oldest_jif = jiffies - > + work->older_than_this = jiffies - > msecs_to_jiffies(dirty_expire_interval * 10); > } else if (work->for_background) > - oldest_jif = jiffies; > + work->older_than_this = jiffies; > > trace_writeback_start(wb->bdi, work); > if (list_empty(&wb->b_io)) > @@ -1350,13 +1348,14 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb); > * This function writes and waits on any dirty inode belonging to this > * super_block. > */ > -void sync_inodes_sb(struct super_block *sb) > +void sync_inodes_sb(struct super_block *sb, unsigned long older_than_this) > { > DECLARE_COMPLETION_ONSTACK(done); > struct wb_writeback_work work = { > .sb = sb, > .sync_mode = WB_SYNC_ALL, > .nr_pages = LONG_MAX, > + .older_than_this = older_than_this, > .range_cyclic = 0, > .done = &done, > .reason = WB_REASON_SYNC, > diff --git a/fs/sync.c b/fs/sync.c > index 905f3f6..ff96f99 100644 > --- a/fs/sync.c > +++ b/fs/sync.c > @@ -27,10 +27,11 @@ > * wait == 1 case since in that case write_inode() functions do > * sync_dirty_buffer() and thus effectively write one block at a time. > */ > -static int __sync_filesystem(struct super_block *sb, int wait) > +static int __sync_filesystem(struct super_block *sb, int wait, > + unsigned long start) > { > if (wait) > - sync_inodes_sb(sb); > + sync_inodes_sb(sb, start); > else > writeback_inodes_sb(sb, WB_REASON_SYNC); > > @@ -47,6 +48,7 @@ static int __sync_filesystem(struct super_block *sb, int wait) > int sync_filesystem(struct super_block *sb) > { > int ret; > + unsigned long start = jiffies; > > /* > * We need to be protected against the filesystem going from > @@ -60,17 +62,17 @@ int sync_filesystem(struct super_block *sb) > if (sb->s_flags & MS_RDONLY) > return 0; > > - ret = __sync_filesystem(sb, 0); > + ret = __sync_filesystem(sb, 0, start); > if (ret < 0) > return ret; > - return __sync_filesystem(sb, 1); > + return __sync_filesystem(sb, 1, start); > } > EXPORT_SYMBOL_GPL(sync_filesystem); > > static void sync_inodes_one_sb(struct super_block *sb, void *arg) > { > if (!(sb->s_flags & MS_RDONLY)) > - sync_inodes_sb(sb); > + sync_inodes_sb(sb, *((unsigned long *)arg)); > } > > static void sync_fs_one_sb(struct super_block *sb, void *arg) > @@ -102,9 +104,10 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg) > SYSCALL_DEFINE0(sync) > { > int nowait = 0, wait = 1; > + unsigned long start = jiffies; > > wakeup_flusher_threads(0, WB_REASON_SYNC); > - iterate_supers(sync_inodes_one_sb, NULL); > + iterate_supers(sync_inodes_one_sb, &start); > iterate_supers(sync_fs_one_sb, &nowait); > iterate_supers(sync_fs_one_sb, &wait); > iterate_bdevs(fdatawrite_one_bdev, NULL); > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c > index 15188cc..8968f50 100644 > --- a/fs/xfs/xfs_super.c > +++ b/fs/xfs/xfs_super.c > @@ -918,7 +918,7 @@ xfs_flush_inodes( > struct super_block *sb = mp->m_super; > > if (down_read_trylock(&sb->s_umount)) { > - sync_inodes_sb(sb); > + sync_inodes_sb(sb, jiffies); > up_read(&sb->s_umount); > } > } > diff --git a/include/linux/writeback.h b/include/linux/writeback.h > index 021b8a3..fc0e432 100644 > --- a/include/linux/writeback.h > +++ b/include/linux/writeback.h > @@ -97,7 +97,7 @@ void writeback_inodes_sb_nr(struct super_block *, unsigned long nr, > int try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason); > int try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr, > enum wb_reason reason); > -void sync_inodes_sb(struct super_block *); > +void sync_inodes_sb(struct super_block *sb, unsigned long older_than_this); > void wakeup_flusher_threads(long nr_pages, enum wb_reason reason); > void inode_wait_for_writeback(struct inode *inode); > > -- > 1.8.1.4 > -- Jan Kara <jack@xxxxxxx> SUSE Labs, CR -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html