On Thu, Jun 04, 2015 at 12:03:39PM +1000, Dave Chinner wrote:
> Fixing this requires a tweak to the algorithm in
> __xfs_buf_delwri_submit() so that we don't lock an entire list of
> thousands of IOs before starting submission. In the mean time,
> reducing the number of AGs will reduce the impact of this because
> the delayed write submission code will skip buffers that are already
> locked or pinned in memory, and hence an AG under modification at
> the time submission occurs will be skipped by the delwri code.

You might like to try the patch below on a test machine to see if it
helps with the problem.

Cheers,

Dave.
-- 
Dave Chinner
david@xxxxxxxxxxxxx

xfs: reduce lock hold times in buffer writeback

From: Dave Chinner <dchinner@xxxxxxxxxx>

Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx>
---
 fs/xfs/xfs_buf.c | 80 ++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 61 insertions(+), 19 deletions(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index bbe4e9e..8d2cc36 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1768,15 +1768,63 @@ xfs_buf_cmp(
 	return 0;
 }
 
+static void
+xfs_buf_delwri_submit_buffers(
+	struct list_head	*buffer_list,
+	struct list_head	*io_list,
+	bool			wait)
+{
+	struct xfs_buf		*bp, *n;
+	struct blk_plug		plug;
+
+	blk_start_plug(&plug);
+	list_for_each_entry_safe(bp, n, buffer_list, b_list) {
+		bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC |
+				 XBF_WRITE_FAIL);
+		bp->b_flags |= XBF_WRITE | XBF_ASYNC;
+
+		/*
+		 * We do all IO submission async. This means if we need
+		 * to wait for IO completion we need to take an extra
+		 * reference so the buffer is still valid on the other
+		 * side. We need to move the buffer onto the io_list
+		 * at this point so the caller can still access it.
+		 */
+		if (wait) {
+			xfs_buf_hold(bp);
+			list_move_tail(&bp->b_list, io_list);
+		} else
+			list_del_init(&bp->b_list);
+
+		xfs_buf_submit(bp);
+	}
+	blk_finish_plug(&plug);
+}
+
+/*
+ * submit buffers for write.
+ *
+ * When we have a large buffer list, we do not want to hold all the buffers
+ * locked while we block on the request queue waiting for IO dispatch. To avoid
+ * this problem, we lock and submit buffers in groups of 50, thereby minimising
+ * the lock hold times for lists which may contain thousands of objects.
+ *
+ * To do this, we sort the buffer list before we walk the list to lock and
+ * submit buffers, and we plug and unplug around each group of buffers we
+ * submit.
+ */
 static int
 __xfs_buf_delwri_submit(
 	struct list_head	*buffer_list,
 	struct list_head	*io_list,
 	bool			wait)
 {
-	struct blk_plug		plug;
 	struct xfs_buf		*bp, *n;
+	LIST_HEAD		(submit_list);
 	int			pinned = 0;
+	int			count = 0;
+
+	list_sort(NULL, buffer_list, xfs_buf_cmp);
 
 	list_for_each_entry_safe(bp, n, buffer_list, b_list) {
 		if (!wait) {
@@ -1802,30 +1850,24 @@ __xfs_buf_delwri_submit(
 			continue;
 		}
 
-		list_move_tail(&bp->b_list, io_list);
+		list_move_tail(&bp->b_list, &submit_list);
 		trace_xfs_buf_delwri_split(bp, _RET_IP_);
-	}
-
-	list_sort(NULL, io_list, xfs_buf_cmp);
-
-	blk_start_plug(&plug);
-	list_for_each_entry_safe(bp, n, io_list, b_list) {
-		bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL);
-		bp->b_flags |= XBF_WRITE | XBF_ASYNC;
 
 		/*
-		 * we do all Io submission async. This means if we need to wait
-		 * for IO completion we need to take an extra reference so the
-		 * buffer is still valid on the other side.
+		 * We do small batches of IO submission to minimise lock hold
+		 * time and unnecessary writeback of buffers that are hot and
+		 * would otherwise be relogged and hence not require immediate
+		 * writeback.
 		 */
-		if (wait)
-			xfs_buf_hold(bp);
-		else
-			list_del_init(&bp->b_list);
+		if (count++ < 50)
+			continue;
 
-		xfs_buf_submit(bp);
+		xfs_buf_delwri_submit_buffers(&submit_list, io_list, wait);
+		count = 0;
 	}
-	blk_finish_plug(&plug);
+
+	if (!list_empty(&submit_list))
+		xfs_buf_delwri_submit_buffers(&submit_list, io_list, wait);
 
 	return pinned;
 }
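[Editorial addendum, not part of the original mail: for readers who want to
see the shape of the change outside the kernel tree, below is a minimal,
self-contained user-space sketch of the same batched lock-and-submit
pattern. It is not XFS code: the item list, submit_batch() and BATCH_SIZE
are invented for illustration, and the real buffer locking and reference
counting are elided.]

/*
 * Sketch of the batching pattern: instead of moving every object off a
 * long list before any IO is issued, flush a bounded batch at a time.
 */
#include <stdio.h>
#include <stdlib.h>

#define BATCH_SIZE	50

struct item {
	int		id;
	struct item	*next;
};

/* Stand-in for "issue the IO for every buffer in this batch". */
static void
submit_batch(struct item *batch)
{
	while (batch) {
		struct item *next = batch->next;

		printf("submitting item %d\n", batch->id);
		free(batch);
		batch = next;
	}
}

int
main(void)
{
	struct item	*list = NULL, *batch = NULL;
	int		i, count = 0;

	/* Build a work list longer than one batch. */
	for (i = 0; i < 120; i++) {
		struct item *it = malloc(sizeof(*it));

		if (!it)
			return 1;
		it->id = i;
		it->next = list;
		list = it;
	}

	/*
	 * Walk the list, moving each item onto a private batch list and
	 * flushing every BATCH_SIZE items. This is the analogue of moving
	 * buffers onto submit_list and calling
	 * xfs_buf_delwri_submit_buffers() every 50 buffers, so no item
	 * sits held while the whole list is processed.
	 */
	while (list) {
		struct item *it = list;

		list = it->next;
		it->next = batch;
		batch = it;

		if (++count < BATCH_SIZE)
			continue;

		submit_batch(batch);
		batch = NULL;
		count = 0;
	}

	/* Flush the remainder, as the patch does after its loop. */
	if (batch)
		submit_batch(batch);
	return 0;
}

The batch size is a trade-off: small enough that no buffer stays locked
while thousands of others wait on the request queue, but large enough that
the block-layer plug (blk_start_plug()/blk_finish_plug() in the real patch)
can still merge adjacent IOs from the sorted list.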