From: Darrick J. Wong <djwong@xxxxxxxxxx>

Sometimes, the last bulkstat record returned by the first xfrog_bulkstat
call in bulkstat_for_inumbers will contain an inumber less than the
highest allocated inode mentioned in the inumbers record.  This happens
either because the inodes have been freed, or because the kernel
encountered a corrupt inode during bulkstat and stopped filling up the
array.

In both cases, we can call bulkstat again to try to fill up the rest of
the array.  If there are newly allocated inodes, they'll be returned; if
we've truly hit the end of the filesystem, the kernel will return zero
records; and if the first allocated inode is indeed corrupt, the kernel
will return EFSCORRUPTED.

As an optimization to avoid the single-step code, call bulkstat with an
increasing ino parameter until the bulkstat array is full or the kernel
tells us there are no bulkstat records to return.  This speeds things up
a bit in cases where the allocmask is all ones and only the second inode
is corrupt.

Signed-off-by: "Darrick J. Wong" <djwong@xxxxxxxxxx>
---
 libfrog/bitmask.h |    6 +++
 scrub/inodes.c    |  110 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 115 insertions(+), 1 deletion(-)

diff --git a/libfrog/bitmask.h b/libfrog/bitmask.h
index 719a6bfd29db38..47e39a1e09d002 100644
--- a/libfrog/bitmask.h
+++ b/libfrog/bitmask.h
@@ -42,4 +42,10 @@ static inline int test_and_set_bit(int nr, volatile unsigned long *addr)
 	return 0;
 }
 
+/* Get high bit set out of 64-bit argument, -1 if none set */
+static inline int xfrog_highbit64(uint64_t v)
+{
+	return fls64(v) - 1;
+}
+
 #endif /* __LIBFROG_BITMASK_H_ */
diff --git a/scrub/inodes.c b/scrub/inodes.c
index 24a1dcab94c22d..2f3c87be79f783 100644
--- a/scrub/inodes.c
+++ b/scrub/inodes.c
@@ -20,6 +20,8 @@
 #include "libfrog/fsgeom.h"
 #include "libfrog/bulkstat.h"
 #include "libfrog/handle_priv.h"
+#include "bitops.h"
+#include "libfrog/bitmask.h"
 
 /*
  * Iterate a range of inodes.
@@ -56,6 +58,15 @@
  * avoid scanning inodes that are not in the inumber chunk.  In case (3) we
  * conclude that there were no inodes left to scan and terminate.
+ *
+ * In case (2) and (4) we don't know why bulkstat returned fewer than C
+ * elements.  We might have found the end of the filesystem, or the kernel
+ * might have found a corrupt inode and stopped.  This we must investigate by
+ * trying to fill out the rest of the bstat array starting with the next
+ * inumber after the last bstat array element filled, and continuing until S'
+ * is beyond S0 + C, or the array is full.  Each time we succeed in loading
+ * new records, the kernel increases S' for us; if instead we encounter case
+ * (4), we can increment S' ourselves.
  *
  * Inodes that are set in the allocmask but not set in the seen mask are the
  * corrupt inodes.  For each of these cases, we try to populate the bulkstat
  * array one inode at a time.  If the kernel returns a matching record we can
@@ -105,6 +116,87 @@ seen_mask_from_bulkstat(
 	return ret;
 }
 
+/*
+ * Try to fill the rest of orig_breq with bulkstat data by re-running bulkstat
+ * with increasing start_ino until we either hit the end of the inumbers info
+ * or fill up the bstat array with something.  Returns a bitmask of the inodes
+ * within inums that were filled by the bulkstat requests.
+ */
+static __u64
+bulkstat_the_rest(
+	struct scrub_ctx	*ctx,
+	const struct xfs_inumbers *inums,
+	struct xfs_bulkstat_req	*orig_breq,
+	int			orig_error)
+{
+	struct xfs_bulkstat_req	*new_breq;
+	struct xfs_bulkstat	*old_bstat =
+		&orig_breq->bulkstat[orig_breq->hdr.ocount];
+	const __u64		limit_ino =
+		inums->xi_startino + LIBFROG_BULKSTAT_CHUNKSIZE;
+	__u64			start_ino = orig_breq->hdr.ino;
+	__u64			seen_mask = 0;
+	int			error;
+
+	assert(orig_breq->hdr.ocount < orig_breq->hdr.icount);
+
+	/*
+	 * If the first bulkstat returned a corruption error, that means
+	 * start_ino is corrupt.  Restart instead at the next inumber.
+	 */
+	if (orig_error == EFSCORRUPTED)
+		start_ino++;
+	if (start_ino >= limit_ino)
+		return 0;
+
+	error = -xfrog_bulkstat_alloc_req(
+			orig_breq->hdr.icount - orig_breq->hdr.ocount,
+			start_ino, &new_breq);
+	if (error)
+		return error;
+	new_breq->hdr.flags = orig_breq->hdr.flags;
+
+	do {
+		/*
+		 * Fill the new bulkstat request with stat data starting at
+		 * start_ino.
+		 */
+		error = -xfrog_bulkstat(&ctx->mnt, new_breq);
+		if (error == EFSCORRUPTED) {
+			/*
+			 * start_ino is corrupt, increment and try the next
+			 * inode.
+			 */
+			start_ino++;
+			new_breq->hdr.ino = start_ino;
+			continue;
+		}
+		if (error) {
+			/*
+			 * Any other error means the caller falls back to
+			 * single stepping.
+			 */
+			break;
+		}
+		if (new_breq->hdr.ocount == 0)
+			break;
+
+		/* Copy new results to the original bstat buffer */
+		memcpy(old_bstat, new_breq->bulkstat,
+				new_breq->hdr.ocount * sizeof(struct xfs_bulkstat));
+		orig_breq->hdr.ocount += new_breq->hdr.ocount;
+		old_bstat += new_breq->hdr.ocount;
+		seen_mask |= seen_mask_from_bulkstat(inums, start_ino,
+				new_breq);
+
+		new_breq->hdr.icount -= new_breq->hdr.ocount;
+		start_ino = new_breq->hdr.ino;
+	} while (new_breq->hdr.icount > 0 && new_breq->hdr.ino < limit_ino);
+
+	free(new_breq);
+	return seen_mask;
+}
+
 #define cmp_int(l, r) ((l > r) - (l < r))
 
 /* Compare two bulkstat records by inumber. */
@@ -200,6 +292,12 @@ bulkstat_single_step(
 			sizeof(struct xfs_bulkstat), compare_bstat);
 }
 
+/* Return the inumber of the highest allocated inode in the inumbers data. */
+static inline uint64_t last_allocmask_ino(const struct xfs_inumbers *i)
+{
+	return i->xi_startino + xfrog_highbit64(i->xi_allocmask);
+}
+
 /*
  * Run bulkstat on an entire inode allocation group, then check that we got
  * exactly the inodes we expected.  If not, load them one at a time (or fake
@@ -229,6 +327,16 @@ bulkstat_for_inumbers(
 				inumbers->xi_startino, breq);
 	}
 
+	/*
+	 * If the last allocated inode as reported by inumbers is higher than
+	 * the last inode reported by bulkstat, two things could have happened.
+	 * Either all the inodes at the high end of the cluster were freed
+	 * since the inumbers call; or bulkstat encountered a corrupt inode and
+	 * returned early.  Try to bulkstat the rest of the array.
+	 */
+	if (last_allocmask_ino(inumbers) > last_bstat_ino(breq))
+		seen_mask |= bulkstat_the_rest(ctx, inumbers, breq, error);
+
 	/*
 	 * Bulkstat might return inodes beyond xi_startino + CHUNKSIZE.  Reduce
 	 * ocount to ignore inodes not described by the inumbers record.
@@ -241,7 +349,7 @@
 
 	/*
 	 * Fill in any missing inodes that are mentioned in the alloc mask but
-	 * weren't previously seen by bulkstat.
+	 * weren't previously seen by bulkstat.  These are the corrupt inodes.
 	 */
 	bulkstat_single_step(ctx, inumbers, seen_mask, breq);
 }
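
A note for readers less familiar with the libfrog bulkstat interface: the
retry pattern described in the commit message can be sketched without the
seen-mask bookkeeping that bulkstat_the_rest() also has to do.  The code
below is illustrative only and is not part of the patch -- the helper name
fill_rest_of_bulkstat(), the choice to allocate a fresh request on every
pass, the positive-errno return convention, and the exact include list are
assumptions layered on top of the libfrog calls the patch itself uses
(xfrog_bulkstat_alloc_req() and xfrog_bulkstat(), both of which return
negative errnos), assuming the usual xfsprogs build environment for
EFSCORRUPTED:

/*
 * Illustrative sketch only.  Keep asking the kernel for stat data at
 * increasing start inumbers until the caller's array is full, the kernel
 * runs out of records, or we walk past limit_ino.  Inumbers that come
 * back as EFSCORRUPTED are stepped over so that later inodes still load.
 */
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "libfrog/fsgeom.h"	/* struct xfs_fd */
#include "libfrog/bulkstat.h"	/* xfrog_bulkstat(), xfrog_bulkstat_alloc_req() */

static int
fill_rest_of_bulkstat(
	struct xfs_fd		*xfd,
	struct xfs_bulkstat_req	*breq,
	uint64_t		limit_ino)
{
	uint64_t		start_ino = breq->hdr.ino;
	int			error;

	while (breq->hdr.ocount < breq->hdr.icount && start_ino < limit_ino) {
		struct xfs_bulkstat_req	*tail;

		/* Ask only for as many records as we still have room for. */
		error = -xfrog_bulkstat_alloc_req(
				breq->hdr.icount - breq->hdr.ocount,
				start_ino, &tail);
		if (error)
			return error;
		tail->hdr.flags = breq->hdr.flags;

		error = -xfrog_bulkstat(xfd, tail);
		if (error == EFSCORRUPTED) {
			/* start_ino itself is corrupt; step past it. */
			start_ino++;
			free(tail);
			continue;
		}
		if (error || tail->hdr.ocount == 0) {
			/* Hard error, or no more inodes on the filesystem. */
			free(tail);
			return error;
		}

		/* Append the new records to the caller's array. */
		memcpy(&breq->bulkstat[breq->hdr.ocount], tail->bulkstat,
				tail->hdr.ocount * sizeof(struct xfs_bulkstat));
		breq->hdr.ocount += tail->hdr.ocount;

		/* The kernel advances hdr.ino past the last record returned. */
		start_ino = tail->hdr.ino;
		free(tail);
	}

	return 0;
}

The real bulkstat_the_rest() instead reuses one request, shrinks
hdr.icount as records arrive, and folds each batch into the seen mask so
that bulkstat_single_step() only has to visit the truly missing inodes.
Also, as the comment in bulkstat_for_inumbers() points out, bulkstat may
return inodes beyond the inumbers chunk; the sketch leaves trimming
hdr.ocount to its caller.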
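
One more note on the new libfrog helper: xfrog_highbit64() leans on
fls64() being 1-based ("find last set": fls64(1) == 1, fls64(0) == 0), so
subtracting one yields the 0-based index of the highest set bit, or -1
for an empty mask.  A tiny illustrative check (again, not part of the
patch; the include pair mirrors the one the patch adds to scrub/inodes.c):

#include <assert.h>
#include "bitops.h"		/* fls64() */
#include "libfrog/bitmask.h"	/* xfrog_highbit64() */

static void
check_highbit_example(void)
{
	/* Only the first 32 inodes of a 64-inode chunk are allocated. */
	assert(xfrog_highbit64(0x00000000ffffffffULL) == 31);

	/* Fully allocated chunk: last_allocmask_ino() is xi_startino + 63. */
	assert(xfrog_highbit64(~0ULL) == 63);

	/* Empty mask: no allocated inodes at all, so the -1 sentinel. */
	assert(xfrog_highbit64(0) == -1);
}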