[PATCH 2/5] xfs: separate out inode buffer recovery a bit more

Dave Chinner <david@xxxxxxxxxxxxx> · Tue, 19 Mar 2024 13:15:21 +1100

From: Dave Chinner <dchinner@xxxxxxxxxx>

It really is a unique snowflake, so peal off from normal buffer
recovery earlier and shuffle all the unique bits into the inode
buffer recovery function.

Also, it looks like the handling of mismatched inode cluster buffer
sizes is wrong - we have to write the recovered buffer -before- we
mark it stale as we're not supposed to write stale buffers. I don't
think we check that anywhere in the buffer IO path, but lets do it
the right way anyway.

Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx>
---
 fs/xfs/xfs_buf_item_recover.c | 99 ++++++++++++++++++++++-------------
 1 file changed, 63 insertions(+), 36 deletions(-)

diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index dba57ee6fa6d..f994a303ad0a 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -229,7 +229,7 @@ xlog_recover_validate_buf_type(
 	 * just avoid the verification stage for non-crc filesystems
 	 */
 	if (!xfs_has_crc(mp))
-		return;
+		return 0;
 
 	magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
 	magic16 = be16_to_cpu(*(__be16*)bp->b_addr);
@@ -407,7 +407,7 @@ xlog_recover_validate_buf_type(
 	 * skipped.
 	 */
 	if (current_lsn == NULLCOMMITLSN)
-		return 0;;
+		return 0;
 
 	if (warnmsg) {
 		xfs_warn(mp, warnmsg);
@@ -567,18 +567,22 @@ xlog_recover_this_dquot_buffer(
 }
 
 /*
- * Perform recovery for a buffer full of inodes.  In these buffers, the only
- * data which should be recovered is that which corresponds to the
- * di_next_unlinked pointers in the on disk inode structures.  The rest of the
- * data for the inodes is always logged through the inodes themselves rather
- * than the inode buffer and is recovered in xlog_recover_inode_pass2().
+ * Perform recovery for a buffer full of inodes. We don't have inode cluster
+ * buffer specific LSNs, so we always recover inode buffers if they contain
+ * inodes.
+ *
+ * In these buffers, the only inode data which should be recovered is that which
+ * corresponds to the di_next_unlinked pointers in the on disk inode structures.
+ * The rest of the data for the inodes is always logged through the inodes
+ * themselves rather than the inode buffer and is recovered in
+ * xlog_recover_inode_pass2().
  *
  * The only time when buffers full of inodes are fully recovered is when the
- * buffer is full of newly allocated inodes.  In this case the buffer will
- * not be marked as an inode buffer and so will be sent to
- * xlog_recover_do_reg_buffer() below during recovery.
+ * buffer is full of newly allocated inodes.  In this case the buffer will not
+ * be marked as an inode buffer and so xlog_recover_do_reg_buffer() will be used
+ * instead.
  */
-STATIC int
+static int
 xlog_recover_do_inode_buffer(
 	struct xfs_mount		*mp,
 	struct xlog_recover_item	*item,
@@ -598,6 +602,13 @@ xlog_recover_do_inode_buffer(
 
 	trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
 
+	/*
+	 * If the magic number doesn't match, something has gone wrong. Don't
+	 * recover the buffer.
+	 */
+	if (cpu_to_be16(XFS_DINODE_MAGIC) != *((__be16 *)bp->b_addr))
+		return -EFSCORRUPTED;
+
 	/*
 	 * Post recovery validation only works properly on CRC enabled
 	 * filesystems.
@@ -677,6 +688,31 @@ xlog_recover_do_inode_buffer(
 
 	}
 
+	/*
+	 * Make sure that only inode buffers with good sizes remain valid after
+	 * recovering this buffer item.
+	 *
+	 * The kernel moves inodes in buffers of 1 block or inode_cluster_size
+	 * bytes, whichever is bigger.  The inode buffers in the log can be a
+	 * different size if the log was generated by an older kernel using
+	 * unclustered inode buffers or a newer kernel running with a different
+	 * inode cluster size.  Regardless, if the inode buffer size isn't
+	 * max(blocksize, inode_cluster_size) for *our* value of
+	 * inode_cluster_size, then we need to keep the buffer out of the buffer
+	 * cache so that the buffer won't overlap with future reads of those
+	 * inodes.
+	 *
+	 * To acheive this, we write the buffer ito recover the inodes then mark
+	 * it stale so that it won't be found on overlapping buffer lookups and
+	 * caller knows not to queue it for delayed write.
+	 */
+	if (BBTOB(bp->b_length) != M_IGEO(mp)->inode_cluster_size) {
+		int error;
+
+		error = xfs_bwrite(bp);
+		xfs_buf_stale(bp);
+		return error;
+	}
 	return 0;
 }
 
@@ -840,7 +876,6 @@ xlog_recover_get_buf_lsn(
 	magic16 = be16_to_cpu(*(__be16 *)blk);
 	switch (magic16) {
 	case XFS_DQUOT_MAGIC:
-	case XFS_DINODE_MAGIC:
 		goto recover_immediately;
 	default:
 		break;
@@ -910,6 +945,17 @@ xlog_recover_buf_commit_pass2(
 	if (error)
 		return error;
 
+	/*
+	 * Inode buffer recovery is quite unique, so go out separate ways here
+	 * to simplify the rest of the code.
+	 */
+	if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
+		error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
+		if (error || (bp->b_flags & XBF_STALE))
+			goto out_release;
+		goto out_write;
+	}
+
 	/*
 	 * Recover the buffer only if we get an LSN from it and it's less than
 	 * the lsn of the transaction we are replaying.
@@ -946,9 +992,7 @@ xlog_recover_buf_commit_pass2(
 		goto out_release;
 	}
 
-	if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
-		error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
-	} else if (buf_f->blf_flags &
+	if (buf_f->blf_flags &
 		  (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
 		if (!xlog_recover_this_dquot_buffer(mp, log, item, bp, buf_f))
 			goto out_release;
@@ -965,28 +1009,11 @@ xlog_recover_buf_commit_pass2(
 	/*
 	 * Perform delayed write on the buffer.  Asynchronous writes will be
 	 * slower when taking into account all the buffers to be flushed.
-	 *
-	 * Also make sure that only inode buffers with good sizes stay in
-	 * the buffer cache.  The kernel moves inodes in buffers of 1 block
-	 * or inode_cluster_size bytes, whichever is bigger.  The inode
-	 * buffers in the log can be a different size if the log was generated
-	 * by an older kernel using unclustered inode buffers or a newer kernel
-	 * running with a different inode cluster size.  Regardless, if
-	 * the inode buffer size isn't max(blocksize, inode_cluster_size)
-	 * for *our* value of inode_cluster_size, then we need to keep
-	 * the buffer out of the buffer cache so that the buffer won't
-	 * overlap with future reads of those inodes.
 	 */
-	if (XFS_DINODE_MAGIC ==
-	    be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
-	    (BBTOB(bp->b_length) != M_IGEO(log->l_mp)->inode_cluster_size)) {
-		xfs_buf_stale(bp);
-		error = xfs_bwrite(bp);
-	} else {
-		ASSERT(bp->b_mount == mp);
-		bp->b_flags |= _XBF_LOGRECOVERY;
-		xfs_buf_delwri_queue(bp, buffer_list);
-	}
+out_write:
+	ASSERT(bp->b_mount == mp);
+	bp->b_flags |= _XBF_LOGRECOVERY;
+	xfs_buf_delwri_queue(bp, buffer_list);
 
 out_release:
 	xfs_buf_relse(bp);
-- 
2.43.0