On Wed, Apr 15, 2020 at 07:05:04AM -0400, Brian Foster wrote: > On Tue, Apr 14, 2020 at 09:15:29PM -0700, Darrick J. Wong wrote: > > From: Darrick J. Wong <darrick.wong@xxxxxxxxxx> > > > > Move the inode dirty data flushing to a workqueue so that multiple > > threads can take advantage of a single thread's flushing work. The > > ratelimiting technique used in bdd4ee4 was not successful, because > > threads that skipped the inode flush scan due to ratelimiting would > > ENOSPC early, which caused occasional (but noticeable) changes in > > behavior and sporadic fstest regressions. > > > > Therfore, make all the writer threads wait on a single inode flush, > > Therefore > > > which eliminates both the stampeding hoards of flushers and the small > > hordes ? :) Fixed both of these. Thanks for the review! (Apparently I started this reply days ago and forgot to send it; the patch ofc is in -rc2...) --D > > > window in which a write could fail with ENOSPC because it lost the > > ratelimit race after even another thread freed space. > > > > Fixes: bdd4ee4f8407 ("xfs: ratelimit inode flush on buffered write ENOSPC") > > Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx> > > --- > > v2: run it on the sync workqueue > > --- > > Reviewed-by: Brian Foster <bfoster@xxxxxxxxxx> > > > fs/xfs/xfs_mount.h | 6 +++++- > > fs/xfs/xfs_super.c | 40 ++++++++++++++++++++++------------------ > > 2 files changed, 27 insertions(+), 19 deletions(-) > > > > diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h > > index 50c43422fa17..b2e4598fdf7d 100644 > > --- a/fs/xfs/xfs_mount.h > > +++ b/fs/xfs/xfs_mount.h > > @@ -167,8 +167,12 @@ typedef struct xfs_mount { > > struct xfs_kobj m_error_meta_kobj; > > struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX]; > > struct xstats m_stats; /* per-fs stats */ > > - struct ratelimit_state m_flush_inodes_ratelimit; > > > > + /* > > + * Workqueue item so that we can coalesce multiple inode flush attempts > > + * into a single flush. > > + */ > > + struct work_struct m_flush_inodes_work; > > struct workqueue_struct *m_buf_workqueue; > > struct workqueue_struct *m_unwritten_workqueue; > > struct workqueue_struct *m_cil_workqueue; > > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c > > index abf06bf9c3f3..424bb9a2d532 100644 > > --- a/fs/xfs/xfs_super.c > > +++ b/fs/xfs/xfs_super.c > > @@ -516,6 +516,20 @@ xfs_destroy_mount_workqueues( > > destroy_workqueue(mp->m_buf_workqueue); > > } > > > > +static void > > +xfs_flush_inodes_worker( > > + struct work_struct *work) > > +{ > > + struct xfs_mount *mp = container_of(work, struct xfs_mount, > > + m_flush_inodes_work); > > + struct super_block *sb = mp->m_super; > > + > > + if (down_read_trylock(&sb->s_umount)) { > > + sync_inodes_sb(sb); > > + up_read(&sb->s_umount); > > + } > > +} > > + > > /* > > * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK > > * or a page lock. We use sync_inodes_sb() here to ensure we block while waiting > > @@ -526,15 +540,15 @@ void > > xfs_flush_inodes( > > struct xfs_mount *mp) > > { > > - struct super_block *sb = mp->m_super; > > - > > - if (!__ratelimit(&mp->m_flush_inodes_ratelimit)) > > + /* > > + * If flush_work() returns true then that means we waited for a flush > > + * which was already in progress. Don't bother running another scan. > > + */ > > + if (flush_work(&mp->m_flush_inodes_work)) > > return; > > > > - if (down_read_trylock(&sb->s_umount)) { > > - sync_inodes_sb(sb); > > - up_read(&sb->s_umount); > > - } > > + queue_work(mp->m_sync_workqueue, &mp->m_flush_inodes_work); > > + flush_work(&mp->m_flush_inodes_work); > > } > > > > /* Catch misguided souls that try to use this interface on XFS */ > > @@ -1369,17 +1383,6 @@ xfs_fc_fill_super( > > if (error) > > goto out_free_names; > > > > - /* > > - * Cap the number of invocations of xfs_flush_inodes to 16 for every > > - * quarter of a second. The magic numbers here were determined by > > - * observation neither to cause stalls in writeback when there are a > > - * lot of IO threads and the fs is near ENOSPC, nor cause any fstest > > - * regressions. YMMV. > > - */ > > - ratelimit_state_init(&mp->m_flush_inodes_ratelimit, HZ / 4, 16); > > - ratelimit_set_flags(&mp->m_flush_inodes_ratelimit, > > - RATELIMIT_MSG_ON_RELEASE); > > - > > error = xfs_init_mount_workqueues(mp); > > if (error) > > goto out_close_devices; > > @@ -1752,6 +1755,7 @@ static int xfs_init_fs_context( > > spin_lock_init(&mp->m_perag_lock); > > mutex_init(&mp->m_growlock); > > atomic_set(&mp->m_active_trans, 0); > > + INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker); > > INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); > > INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); > > INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker); > > >