On Tue, Apr 14, 2020 at 09:15:29PM -0700, Darrick J. Wong wrote: > From: Darrick J. Wong <darrick.wong@xxxxxxxxxx> > > Move the inode dirty data flushing to a workqueue so that multiple > threads can take advantage of a single thread's flushing work. The > ratelimiting technique used in bdd4ee4 was not successful, because > threads that skipped the inode flush scan due to ratelimiting would > ENOSPC early, which caused occasional (but noticeable) changes in > behavior and sporadic fstest regressions. > > Therfore, make all the writer threads wait on a single inode flush, Therefore > which eliminates both the stampeding hoards of flushers and the small hordes ? :) > window in which a write could fail with ENOSPC because it lost the > ratelimit race after even another thread freed space. > > Fixes: bdd4ee4f8407 ("xfs: ratelimit inode flush on buffered write ENOSPC") > Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx> > --- > v2: run it on the sync workqueue > --- Reviewed-by: Brian Foster <bfoster@xxxxxxxxxx> > fs/xfs/xfs_mount.h | 6 +++++- > fs/xfs/xfs_super.c | 40 ++++++++++++++++++++++------------------ > 2 files changed, 27 insertions(+), 19 deletions(-) > > diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h > index 50c43422fa17..b2e4598fdf7d 100644 > --- a/fs/xfs/xfs_mount.h > +++ b/fs/xfs/xfs_mount.h > @@ -167,8 +167,12 @@ typedef struct xfs_mount { > struct xfs_kobj m_error_meta_kobj; > struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX]; > struct xstats m_stats; /* per-fs stats */ > - struct ratelimit_state m_flush_inodes_ratelimit; > > + /* > + * Workqueue item so that we can coalesce multiple inode flush attempts > + * into a single flush. > + */ > + struct work_struct m_flush_inodes_work; > struct workqueue_struct *m_buf_workqueue; > struct workqueue_struct *m_unwritten_workqueue; > struct workqueue_struct *m_cil_workqueue; > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c > index abf06bf9c3f3..424bb9a2d532 100644 > --- a/fs/xfs/xfs_super.c > +++ b/fs/xfs/xfs_super.c > @@ -516,6 +516,20 @@ xfs_destroy_mount_workqueues( > destroy_workqueue(mp->m_buf_workqueue); > } > > +static void > +xfs_flush_inodes_worker( > + struct work_struct *work) > +{ > + struct xfs_mount *mp = container_of(work, struct xfs_mount, > + m_flush_inodes_work); > + struct super_block *sb = mp->m_super; > + > + if (down_read_trylock(&sb->s_umount)) { > + sync_inodes_sb(sb); > + up_read(&sb->s_umount); > + } > +} > + > /* > * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK > * or a page lock. We use sync_inodes_sb() here to ensure we block while waiting > @@ -526,15 +540,15 @@ void > xfs_flush_inodes( > struct xfs_mount *mp) > { > - struct super_block *sb = mp->m_super; > - > - if (!__ratelimit(&mp->m_flush_inodes_ratelimit)) > + /* > + * If flush_work() returns true then that means we waited for a flush > + * which was already in progress. Don't bother running another scan. > + */ > + if (flush_work(&mp->m_flush_inodes_work)) > return; > > - if (down_read_trylock(&sb->s_umount)) { > - sync_inodes_sb(sb); > - up_read(&sb->s_umount); > - } > + queue_work(mp->m_sync_workqueue, &mp->m_flush_inodes_work); > + flush_work(&mp->m_flush_inodes_work); > } > > /* Catch misguided souls that try to use this interface on XFS */ > @@ -1369,17 +1383,6 @@ xfs_fc_fill_super( > if (error) > goto out_free_names; > > - /* > - * Cap the number of invocations of xfs_flush_inodes to 16 for every > - * quarter of a second. The magic numbers here were determined by > - * observation neither to cause stalls in writeback when there are a > - * lot of IO threads and the fs is near ENOSPC, nor cause any fstest > - * regressions. YMMV. > - */ > - ratelimit_state_init(&mp->m_flush_inodes_ratelimit, HZ / 4, 16); > - ratelimit_set_flags(&mp->m_flush_inodes_ratelimit, > - RATELIMIT_MSG_ON_RELEASE); > - > error = xfs_init_mount_workqueues(mp); > if (error) > goto out_close_devices; > @@ -1752,6 +1755,7 @@ static int xfs_init_fs_context( > spin_lock_init(&mp->m_perag_lock); > mutex_init(&mp->m_growlock); > atomic_set(&mp->m_active_trans, 0); > + INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker); > INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); > INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); > INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker); >