Re: [PATCH] writeback: Do not sync data dirtied after sync start

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Thu 26-09-13 21:05:06, Jan Kara wrote:
> When there are processes heavily creating small files while sync(2) is
> running, it can easily happen that quite some new files are created
> between WB_SYNC_NONE and WB_SYNC_ALL pass of sync(2). That can happen
> especially if there are several busy filesystems (remember that sync
> traverses filesystems sequentially and waits in WB_SYNC_ALL phase on one
> fs before starting it on another fs). Because WB_SYNC_ALL pass is slow
> (e.g. causes a transaction commit and cache flush for each inode in
> ext3), resulting sync(2) times are rather large.
> 
> The following script reproduces the problem:
> ---
  I've realized this '---' is going to screw up git-am. So I'll resend with
a different separator. Sorry for the noise.

								Honza
> function run_writers
> {
>   for (( i = 0; i < 10; i++ )); do
>     mkdir $1/dir$i
>     for (( j = 0; j < 40000; j++ )); do
>       dd if=/dev/zero of=$1/dir$i/$j bs=4k count=4 &>/dev/null
>     done &
>   done
> }
> 
> for dir in "$@"; do
>   run_writers $dir
> done
> 
> sleep 40
> time sync
> ----
> 
> Fix the problem by disregarding inodes dirtied after sync(2) was called
> in the WB_SYNC_ALL pass. To allow for this, sync_inodes_sb() now takes a
> time stamp when sync has started which is used for setting up work for
> flusher threads.
> 
> To give some numbers, when above script is run on two ext4 filesystems on
> simple SATA drive, the average sync time from 10 runs is 267.549 seconds
> with standard deviation 104.799426. With the patched kernel, the average
> sync time from 10 runs is 2.995 seconds with standard deviation 0.096.
> 
> Signed-off-by: Jan Kara <jack@xxxxxxx>
> ---
>  fs/fs-writeback.c         | 17 ++++++++---------
>  fs/sync.c                 | 15 +++++++++------
>  fs/xfs/xfs_super.c        |  2 +-
>  include/linux/writeback.h |  2 +-
>  4 files changed, 19 insertions(+), 17 deletions(-)
> 
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index 9f4935b..70837da 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -39,7 +39,7 @@
>  struct wb_writeback_work {
>  	long nr_pages;
>  	struct super_block *sb;
> -	unsigned long *older_than_this;
> +	unsigned long older_than_this;
>  	enum writeback_sync_modes sync_mode;
>  	unsigned int tagged_writepages:1;
>  	unsigned int for_kupdate:1;
> @@ -248,8 +248,7 @@ static int move_expired_inodes(struct list_head *delaying_queue,
>  
>  	while (!list_empty(delaying_queue)) {
>  		inode = wb_inode(delaying_queue->prev);
> -		if (work->older_than_this &&
> -		    inode_dirtied_after(inode, *work->older_than_this))
> +		if (inode_dirtied_after(inode, work->older_than_this))
>  			break;
>  		list_move(&inode->i_wb_list, &tmp);
>  		moved++;
> @@ -791,12 +790,11 @@ static long wb_writeback(struct bdi_writeback *wb,
>  {
>  	unsigned long wb_start = jiffies;
>  	long nr_pages = work->nr_pages;
> -	unsigned long oldest_jif;
>  	struct inode *inode;
>  	long progress;
>  
> -	oldest_jif = jiffies;
> -	work->older_than_this = &oldest_jif;
> +	if (!work->older_than_this)
> +		work->older_than_this = jiffies;
>  
>  	spin_lock(&wb->list_lock);
>  	for (;;) {
> @@ -830,10 +828,10 @@ static long wb_writeback(struct bdi_writeback *wb,
>  		 * safe.
>  		 */
>  		if (work->for_kupdate) {
> -			oldest_jif = jiffies -
> +			work->older_than_this = jiffies -
>  				msecs_to_jiffies(dirty_expire_interval * 10);
>  		} else if (work->for_background)
> -			oldest_jif = jiffies;
> +			work->older_than_this = jiffies;
>  
>  		trace_writeback_start(wb->bdi, work);
>  		if (list_empty(&wb->b_io))
> @@ -1350,13 +1348,14 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb);
>   * This function writes and waits on any dirty inode belonging to this
>   * super_block.
>   */
> -void sync_inodes_sb(struct super_block *sb)
> +void sync_inodes_sb(struct super_block *sb, unsigned long older_than_this)
>  {
>  	DECLARE_COMPLETION_ONSTACK(done);
>  	struct wb_writeback_work work = {
>  		.sb		= sb,
>  		.sync_mode	= WB_SYNC_ALL,
>  		.nr_pages	= LONG_MAX,
> +		.older_than_this = older_than_this,
>  		.range_cyclic	= 0,
>  		.done		= &done,
>  		.reason		= WB_REASON_SYNC,
> diff --git a/fs/sync.c b/fs/sync.c
> index 905f3f6..ff96f99 100644
> --- a/fs/sync.c
> +++ b/fs/sync.c
> @@ -27,10 +27,11 @@
>   * wait == 1 case since in that case write_inode() functions do
>   * sync_dirty_buffer() and thus effectively write one block at a time.
>   */
> -static int __sync_filesystem(struct super_block *sb, int wait)
> +static int __sync_filesystem(struct super_block *sb, int wait,
> +			     unsigned long start)
>  {
>  	if (wait)
> -		sync_inodes_sb(sb);
> +		sync_inodes_sb(sb, start);
>  	else
>  		writeback_inodes_sb(sb, WB_REASON_SYNC);
>  
> @@ -47,6 +48,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
>  int sync_filesystem(struct super_block *sb)
>  {
>  	int ret;
> +	unsigned long start = jiffies;
>  
>  	/*
>  	 * We need to be protected against the filesystem going from
> @@ -60,17 +62,17 @@ int sync_filesystem(struct super_block *sb)
>  	if (sb->s_flags & MS_RDONLY)
>  		return 0;
>  
> -	ret = __sync_filesystem(sb, 0);
> +	ret = __sync_filesystem(sb, 0, start);
>  	if (ret < 0)
>  		return ret;
> -	return __sync_filesystem(sb, 1);
> +	return __sync_filesystem(sb, 1, start);
>  }
>  EXPORT_SYMBOL_GPL(sync_filesystem);
>  
>  static void sync_inodes_one_sb(struct super_block *sb, void *arg)
>  {
>  	if (!(sb->s_flags & MS_RDONLY))
> -		sync_inodes_sb(sb);
> +		sync_inodes_sb(sb, *((unsigned long *)arg));
>  }
>  
>  static void sync_fs_one_sb(struct super_block *sb, void *arg)
> @@ -102,9 +104,10 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
>  SYSCALL_DEFINE0(sync)
>  {
>  	int nowait = 0, wait = 1;
> +	unsigned long start = jiffies;
>  
>  	wakeup_flusher_threads(0, WB_REASON_SYNC);
> -	iterate_supers(sync_inodes_one_sb, NULL);
> +	iterate_supers(sync_inodes_one_sb, &start);
>  	iterate_supers(sync_fs_one_sb, &nowait);
>  	iterate_supers(sync_fs_one_sb, &wait);
>  	iterate_bdevs(fdatawrite_one_bdev, NULL);
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 15188cc..8968f50 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -918,7 +918,7 @@ xfs_flush_inodes(
>  	struct super_block	*sb = mp->m_super;
>  
>  	if (down_read_trylock(&sb->s_umount)) {
> -		sync_inodes_sb(sb);
> +		sync_inodes_sb(sb, jiffies);
>  		up_read(&sb->s_umount);
>  	}
>  }
> diff --git a/include/linux/writeback.h b/include/linux/writeback.h
> index 021b8a3..fc0e432 100644
> --- a/include/linux/writeback.h
> +++ b/include/linux/writeback.h
> @@ -97,7 +97,7 @@ void writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
>  int try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason);
>  int try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
>  				  enum wb_reason reason);
> -void sync_inodes_sb(struct super_block *);
> +void sync_inodes_sb(struct super_block *sb, unsigned long older_than_this);
>  void wakeup_flusher_threads(long nr_pages, enum wb_reason reason);
>  void inode_wait_for_writeback(struct inode *inode);
>  
> -- 
> 1.8.1.4
> 
-- 
Jan Kara <jack@xxxxxxx>
SUSE Labs, CR
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]
  Powered by Linux