From: Dave Chinner <dchinner@xxxxxxxxxx> A long standing problem for streaming writeÑ through the NFS server has been that the NFS server opens and closes file descriptors on an inode for every write. The result of this behaviour is that the ->release() function is called on every close and that results in XFS truncating speculative preallocation beyond the EOF. This has an adverse effect on file layout when multiple files are being written at the same time - they interleave their extents and can result in severe fragmentation. To avoid this problem, keep a count of the number of ->release calls made on an inode. For most cases, an inode is only going to be opened once for writing and then closed again during it's lifetime in cache. Hence if there are multiple ->release calls, there is a good chance that the inode is being accessed by the NFS server. Hence count up every time ->release is called while there are delalloc blocks still outstanding on the inode. If this count is non-zero when ->release is next called, then do no truncate away the speculative preallocation - leave it there so that subsequent writes do not need to reallocate the delalloc space. This will prevent interleaving of extents of different inodes written concurrently to the same AG. If we get this wrong, it is not a big deal as we truncate speculative allocation beyond EOF anyway in xfs_inactive() when the inode is thrown out of the cache. The new counter in the struct xfs_inode fits into a hole in the structure on 64 bit machines, so does not grow the size of the inode at all. Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx> --- fs/xfs/xfs_bmap.c | 9 +++++- fs/xfs/xfs_dfrag.c | 13 ++++++++++ fs/xfs/xfs_iget.c | 1 + fs/xfs/xfs_inode.h | 1 + fs/xfs/xfs_vnodeops.c | 61 ++++++++++++++++++++++++++++++++----------------- 5 files changed, 62 insertions(+), 23 deletions(-) diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 8abd12e..7764a4f 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -5471,8 +5471,13 @@ xfs_getbmap( if (error) goto out_unlock_iolock; } - - ASSERT(ip->i_delayed_blks == 0); + /* + * even after flushing the inode, there can still be delalloc + * blocks on the inode beyond EOF due to speculative + * preallocation. These are not removed until the release + * function is called or the inode is inactivated. Hence we + * cannot assert here that ip->i_delayed_blks == 0. + */ } lock = xfs_ilock_map_shared(ip); diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 3b9582c..e60490b 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -377,6 +377,19 @@ xfs_swap_extents( ip->i_d.di_format = tip->i_d.di_format; tip->i_d.di_format = tmp; + /* + * The extents in the source inode could still contain speculative + * preallocation beyond EOF (e.g. the file is open but not modified + * while defrag is in progress). In that case, we need to copy over the + * number of delalloc blocks the data fork in the source inode is + * tracking beyond EOF so that when the fork is truncated away when the + * temporary inode is unlinked we don't underrun the i_delayed_blks + * counter on that inode. + */ + ASSERT(tip->i_delayed_blks == 0); + tip->i_delayed_blks = ip->i_delayed_blks; + ip->i_delayed_blks = 0; + ilf_fields = XFS_ILOG_CORE; switch(ip->i_d.di_format) { diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 0cdd269..18991a9 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -84,6 +84,7 @@ xfs_inode_alloc( memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); ip->i_size = 0; ip->i_new_size = 0; + ip->i_dirty_releases = 0; /* prevent anyone from using this yet */ VFS_I(ip)->i_state = I_NEW; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index fb2ca2e..ea2f34e 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -260,6 +260,7 @@ typedef struct xfs_inode { xfs_fsize_t i_size; /* in-memory size */ xfs_fsize_t i_new_size; /* size when write completes */ atomic_t i_iocount; /* outstanding I/O count */ + int i_dirty_releases; /* dirty ->release calls */ /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 8e4a63c..49f3a5a 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -964,29 +964,48 @@ xfs_release( xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE); } - if (ip->i_d.di_nlink != 0) { - if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && - ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || - ip->i_delayed_blks > 0)) && - (ip->i_df.if_flags & XFS_IFEXTENTS)) && - (!(ip->i_d.di_flags & - (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { + if (ip->i_d.di_nlink == 0) + return 0; - /* - * If we can't get the iolock just skip truncating - * the blocks past EOF because we could deadlock - * with the mmap_sem otherwise. We'll get another - * chance to drop them once the last reference to - * the inode is dropped, so we'll never leak blocks - * permanently. - */ - error = xfs_free_eofblocks(mp, ip, - XFS_FREE_EOF_TRYLOCK); - if (error) - return error; - } - } + if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && + ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || + ip->i_delayed_blks > 0)) && + (ip->i_df.if_flags & XFS_IFEXTENTS)) && + (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { + /* + * If we can't get the iolock just skip truncating the blocks + * past EOF because we could deadlock with the mmap_sem + * otherwise. We'll get another chance to drop them once the + * last reference to the inode is dropped, so we'll never leak + * blocks permanently. + * + * Further, count the number of times we get here in the life + * of this inode. If the inode is being opened, written and + * closed frequently and we have delayed allocation blocks + * oustanding (e.g. streaming writes from the NFS server), + * truncating the blocks past EOF will cause fragmentation to + * occur. + * + * In this case don't do the truncation, either, but we have to + * be careful how we detect this case. Blocks beyond EOF show + * up as i_delayed_blks even when the inode is clean, so we + * need to truncate them away first before checking for a dirty + * release. Hence on the first couple of dirty closes,we will + * still remove the speculative allocation, but then we will + * leave it in place. + */ + if (ip->i_dirty_releases > 1) + return 0; + error = xfs_free_eofblocks(mp, ip, + XFS_FREE_EOF_TRYLOCK); + if (error) + return error; + + /* delalloc blocks after truncation means it really is dirty */ + if (ip->i_delayed_blks) + ip->i_dirty_releases++; + } return 0; } -- 1.7.2.3 _______________________________________________ xfs mailing list xfs@xxxxxxxxxxx http://oss.sgi.com/mailman/listinfo/xfs