On Sun, Jul 17, 2011 at 06:05:01PM +0200, Jörn Engel wrote: > xfs: > ==== ..... > seqwr 1 2 4 8 16 32 64 128 > 16384 39956 39695 39971 39913 37042 37538 36591 32179 > 8192 67934 66073 30963 29038 29852 25210 23983 28272 > 4096 89250 81417 28671 18685 12917 14870 22643 22237 > 2048 140272 120588 140665 140012 137516 139183 131330 129684 > 1024 217473 147899 210350 218526 219867 220120 219758 215166 > 512 328260 181197 211131 263533 294009 298203 301698 298013 OK, I can explain the pattern here where throughput drops off at 2-4 threads. It's not as simple as the seqrd case, but it's related to the fact that this workload is an append write workload. See the patch description below for why that matters. As it is, the numbers I get for 16k seqwr on my hardware are as follows: seqwr 1 2 4 8 16 vanilla 3072 2734 2506 not tested... patched 2984 4156 4922 5175 5120 Looks like my hardware is topping out at ~5-6kiops no matter the block size here. Which, no matter how you look at it, is a significant improvement. ;) Cheers, Dave. -- Dave Chinner david@xxxxxxxxxxxxx xfs: don't serialise adjacent concurrent direct IO appending writes For append write workloads, extending the file requires a certain amount of exclusive locking to be done up front to ensure sanity in things like ensuring that we've zeroed any allocated regions between the old EOF and the start of the new IO. For single threads, this typically isn't a problem, and for large IOs we don't serialise enough for it to be a problem for two threads on really fast block devices. However for smaller IO and larger thread counts we have a problem. Take 4 concurrent sequential, single block sized and aligned IOs. After the first IO is submitted but before it completes, we end up with this state: IO 1 IO 2 IO 3 IO 4 +-------+-------+-------+-------+ ^ ^ | | | | | | | \- ip->i_new_size \- ip->i_size And the IO is done without exclusive locking because offset <= ip->i_size. 
When we submit IO 2, we see offset > ip->i_size, and grab the IO lock exclusive, because there is a chance we need to do EOF zeroing. However, there is already an IO in progress that avoids the need for EOF zeroing because offset <= ip->i_new_size. Hence we could avoid holding the IO lock exclusive for this. Hence after submission of the second IO, we'd end up in this state: IO 1 IO 2 IO 3 IO 4 +-------+-------+-------+-------+ ^ ^ | | | | | | | \- ip->i_new_size \- ip->i_size There is no need to grab the i_mutex or the IO lock in exclusive mode if we don't need to invalidate the page cache. Taking these locks on every direct IO effectively serialises them as taking the IO lock in exclusive mode has to wait for all shared holders to drop the lock. That only happens when IO is complete, so effectively it prevents dispatch of concurrent direct IO writes to the same inode. And so you can see that for the third concurrent IO, we'd avoid exclusive locking for the same reason we avoided the exclusive lock for the second IO. Fixing this is a bit more complex than that, because we need to hold a write-submission local value of ip->i_new_size so that clearing the value is only done if no other thread has updated it before our IO completes..... Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx> --- fs/xfs/linux-2.6/xfs_aops.c | 7 ++++ fs/xfs/linux-2.6/xfs_file.c | 69 ++++++++++++++++++++++++++++++++++--------- 2 files changed, 62 insertions(+), 14 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 63e971e..dda9a9e 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -176,6 +176,13 @@ xfs_setfilesize( if (unlikely(ioend->io_error)) return 0; + /* + * If the IO is clearly not beyond the on-disk inode size, + * return before we take locks. 
+ */ + if (ioend->io_offset + ioend->io_size <= ip->i_d.di_size) + return 0; + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) return EAGAIN; diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 16a4bf0..5b6703a 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -422,11 +422,13 @@ xfs_aio_write_isize_update( */ STATIC void xfs_aio_write_newsize_update( - struct xfs_inode *ip) + struct xfs_inode *ip, + xfs_fsize_t new_size) { - if (ip->i_new_size) { + if (new_size == ip->i_new_size) { xfs_rw_ilock(ip, XFS_ILOCK_EXCL); - ip->i_new_size = 0; + if (new_size == ip->i_new_size) + ip->i_new_size = 0; if (ip->i_d.di_size > ip->i_size) ip->i_d.di_size = ip->i_size; xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); @@ -478,7 +480,7 @@ xfs_file_splice_write( count, flags); xfs_aio_write_isize_update(inode, ppos, ret); - xfs_aio_write_newsize_update(ip); + xfs_aio_write_newsize_update(ip, new_size); xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } @@ -675,6 +677,7 @@ xfs_file_aio_write_checks( struct file *file, loff_t *pos, size_t *count, + xfs_fsize_t *new_sizep, int *iolock) { struct inode *inode = file->f_mapping->host; @@ -682,6 +685,8 @@ xfs_file_aio_write_checks( xfs_fsize_t new_size; int error = 0; +restart: + *new_sizep = 0; error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); if (error) { xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); @@ -689,9 +694,18 @@ xfs_file_aio_write_checks( return error; } + /* + * if we are writing beyond the current EOF, only update the + * ip->i_new_size if it is larger than any other concurrent write beyond + * EOF. Regardless of whether we update ip->i_new_size, return the + * updated new_size to the caller. 
+ */ new_size = *pos + *count; - if (new_size > ip->i_size) - ip->i_new_size = new_size; + if (new_size > ip->i_size) { + if (new_size > ip->i_new_size) + ip->i_new_size = new_size; + *new_sizep = new_size; + } if (likely(!(file->f_mode & FMODE_NOCMTIME))) file_update_time(file); @@ -699,10 +713,22 @@ xfs_file_aio_write_checks( /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this - * write. + * write. Don't issue zeroing if this IO is adjacent to an IO already in + * flight. If we are currently holding the iolock shared, we need to + * update it to exclusive which involves dropping all locks and + * relocking to maintain correct locking order. If we do this, restart + * the function to ensure all checks and values are still valid. */ - if (*pos > ip->i_size) + if ((ip->i_new_size && *pos > ip->i_new_size) || + (!ip->i_new_size && *pos > ip->i_size)) { + if (*iolock == XFS_IOLOCK_SHARED) { + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); + goto restart; + } error = -xfs_zero_eof(ip, *pos, ip->i_size); + } xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); if (error) @@ -749,6 +775,7 @@ xfs_file_dio_aio_write( unsigned long nr_segs, loff_t pos, size_t ocount, + xfs_fsize_t *new_size, int *iolock) { struct file *file = iocb->ki_filp; @@ -769,13 +796,25 @@ xfs_file_dio_aio_write( if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) unaligned_io = 1; - if (unaligned_io || mapping->nrpages || pos > ip->i_size) + /* + * Tricky locking alert: if we are doing multiple concurrent sequential + * writes (e.g. via aio), we don't need to do EOF zeroing if the current + * IO is adjacent to an in-flight IO. That means for such IO we can + * avoid taking the IOLOCK exclusively. Hence we avoid checking for + * writes beyond EOF at this point when deciding what lock to take. 
+ * We will take the IOLOCK exclusive later if necessary. + * + * This, however, means that we need a local copy of the ip->i_new_size + * value from this IO if we change it so that we can determine if we can + * clear the value from the inode when this IO completes. + */ + if (unaligned_io || mapping->nrpages) *iolock = XFS_IOLOCK_EXCL; else *iolock = XFS_IOLOCK_SHARED; xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); - ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); + ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); if (ret) return ret; @@ -814,6 +853,7 @@ xfs_file_buffered_aio_write( unsigned long nr_segs, loff_t pos, size_t ocount, + xfs_fsize_t *new_size, int *iolock) { struct file *file = iocb->ki_filp; @@ -827,7 +867,7 @@ xfs_file_buffered_aio_write( *iolock = XFS_IOLOCK_EXCL; xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); - ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); + ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); if (ret) return ret; @@ -867,6 +907,7 @@ xfs_file_aio_write( ssize_t ret; int iolock; size_t ocount = 0; + xfs_fsize_t new_size = 0; XFS_STATS_INC(xs_write_calls); @@ -886,10 +927,10 @@ xfs_file_aio_write( if (unlikely(file->f_flags & O_DIRECT)) ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, - ocount, &iolock); + ocount, &new_size, &iolock); else ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, - ocount, &iolock); + ocount, &new_size, &iolock); xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret); @@ -914,7 +955,7 @@ xfs_file_aio_write( } out_unlock: - xfs_aio_write_newsize_update(ip); + xfs_aio_write_newsize_update(ip, new_size); xfs_rw_iunlock(ip, iolock); return ret; } -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html