[PATCH 5/5] xfs: log file size updates at I/O completion time

Christoph Hellwig <hch@xxxxxxxxxxxxx> · Tue, 08 Nov 2011 03:56:19 -0500

Do not use unlogged metadata updates and the VFS dirty bit for updating
the file size after writeback.  In addition to causing various problems
with updates getting delayed for far too log this also drags in the
unscalable VFS dirty tracking, and is one of the few remaining unlogged
metadata updates.

Note that we allocate a new transaction from the I/O completion handler.
While this sounds fairly dangerous it isn't an issue in practice given
that any appending write alreay had to start a transaction in writepages
to allocate blocks, and we'll start throtteling there if we run low on
log space or memory.

We could still occasionally stall in the completion handler, but given
that we have per-filesystems workqueues for the I/O completions,
and completions that do not have to either convert unwritten extents
or update the file size are processed from interrupt context we do not
have to worry about this stalling a system to death.

In addition to that implementing log reservations from ->writepage that
are only released by a different thread requires a lot of work, and even
with that wasn't quite doable in a deadlock-free manner.

Signed-off-by: Christoph Hellwig <hch@xxxxxx>

---
 fs/xfs/xfs_aops.c |   49 ++++++++++++++++++++++++++++---------------------
 fs/xfs/xfs_file.c |   36 +++++++++++++++++++++++++++++++++++-
 2 files changed, 63 insertions(+), 22 deletions(-)

Index: linux-2.6/fs/xfs/xfs_aops.c
===================================================================

--- linux-2.6.orig/fs/xfs/xfs_aops.c	2011-11-07 12:19:32.935315100 +0100
+++ linux-2.6/fs/xfs/xfs_aops.c	2011-11-07 12:19:35.625815654 +0100
@@ -26,6 +26,7 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
+#include "xfs_inode_item.h"
 #include "xfs_alloc.h"
 #include "xfs_error.h"
 #include "xfs_rw.h"
@@ -114,22 +115,39 @@ static inline bool xfs_ioend_is_append(s
  * not extend all the way to the valid file size then restrict this update to
  * the end of the write.
  */
-STATIC void
+STATIC int
 xfs_setfilesize(
 	struct xfs_ioend	*ioend)
 {
 	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
 	xfs_fsize_t		isize;
+	int			error = 0;
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
-	if (isize) {
-		trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
-		ip->i_d.di_size = isize;
-		xfs_mark_inode_dirty(ip);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	if (!isize)
+		return 0;
+
+	trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+	error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		return error;
 	}
 
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+	ip->i_d.di_size = isize;
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	return xfs_trans_commit(tp, 0);
 }
 
 /*
@@ -183,12 +201,10 @@ xfs_end_io(
 			ioend->io_error = -error;
 			goto done;
 		}
-	} else {
-		/*
-		 * We might have to update the on-disk file size after
-		 * extending writes.
-		 */
-		xfs_setfilesize(ioend);
+	} else if (xfs_ioend_is_append(ioend)) {
+		error = xfs_setfilesize(ioend);
+		if (error)
+			ioend->io_error = error;
 	}
 
 done:
@@ -345,19 +361,10 @@ xfs_submit_ioend_bio(
 	xfs_ioend_t		*ioend,
 	struct bio		*bio)
 {
-	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
-
 	atomic_inc(&ioend->io_remaining);
 	bio->bi_private = ioend;
 	bio->bi_end_io = xfs_end_bio;
 
-	/*
-	 * If the I/O is beyond EOF we mark the inode dirty immediately
-	 * but don't update the inode size until I/O completion.
-	 */
-	if (xfs_new_eof(ip, ioend->io_offset + ioend->io_size))
-		xfs_mark_inode_dirty(ip);
-
 	submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
 }
 
Index: linux-2.6/fs/xfs/xfs_file.c
===================================================================
--- linux-2.6.orig/fs/xfs/xfs_file.c	2011-11-07 12:19:20.490314774 +0100
+++ linux-2.6/fs/xfs/xfs_file.c	2011-11-07 12:58:28.834315624 +0100
@@ -436,6 +436,36 @@ xfs_aio_write_isize_update(
 	}
 }
 
+STATIC int
+xfs_aio_write_isize_reset(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	int			error = 0;
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+	error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+	if (ip->i_d.di_size <= ip->i_size) {
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		xfs_trans_cancel(tp, 0);
+		return 0;
+	}
+
+	ip->i_d.di_size = ip->i_size;
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	return xfs_trans_commit(tp, 0);
+}
+
 /*
  * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
  * part of the I/O may have been written to disk before the error occurred.  In
@@ -447,14 +477,18 @@ xfs_aio_write_newsize_update(
 	struct xfs_inode	*ip,
 	xfs_fsize_t		new_size)
 {
+	bool			reset = false;
 	if (new_size == ip->i_new_size) {
 		xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
 		if (new_size == ip->i_new_size)
 			ip->i_new_size = 0;
 		if (ip->i_d.di_size > ip->i_size)
-			ip->i_d.di_size = ip->i_size;
+			reset = true;
 		xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
 	}
+
+	if (reset)
+		xfs_aio_write_isize_reset(ip);
 }
 
 /*

_______________________________________________
xfs mailing list
xfs@xxxxxxxxxxx
http://oss.sgi.com/mailman/listinfo/xfs