From: Darrick J. Wong <djwong@xxxxxxxxxx>

Use the bounded workqueue functionality to spread the inode chunk scan
load across the CPUs more evenly.  First, we create per-AG workers to
walk each AG's inode btree to create inode batch work items for each
inobt record.  These items are added to a (second) bounded workqueue
that invokes BULKSTAT and invokes the caller's function on each
bulkstat record.

By splitting the work items into batches of 64 inodes instead of one
thread per AG, we keep the level of parallelism at a reasonably high
level almost all the way to the end of the inode scan if the inodes are
not evenly divided across AGs or if a few files have far more extent
records than average.

Signed-off-by: Darrick J. Wong <djwong@xxxxxxxxxx>
---
 scrub/inodes.c | 336 +++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 248 insertions(+), 88 deletions(-)

diff --git a/scrub/inodes.c b/scrub/inodes.c
index 80af8a74..41e5fdc7 100644
--- a/scrub/inodes.c
+++ b/scrub/inodes.c
@@ -16,6 +16,7 @@
 #include "xfs_scrub.h"
 #include "common.h"
 #include "inodes.h"
+#include "descr.h"
 #include "libfrog/fsgeom.h"
 #include "libfrog/bulkstat.h"
 
@@ -49,7 +50,7 @@
 static void
 bulkstat_for_inumbers(
 	struct scrub_ctx	*ctx,
-	const char		*descr,
+	struct descr		*dsc,
 	const struct xfs_inumbers *inumbers,
 	struct xfs_bulkstat_req	*breq)
 {
@@ -65,7 +66,7 @@ bulkstat_for_inumbers(
 	if (error) {
 		char	errbuf[DESCR_BUFSZ];
 
-		str_info(ctx, descr, "%s",
+		str_info(ctx, descr_render(dsc), "%s",
 				strerror_r(error, errbuf, DESCR_BUFSZ));
 	}
 
@@ -95,61 +96,206 @@
 
 /* BULKSTAT wrapper routines. */
 struct scan_inodes {
+	struct workqueue	wq_bulkstat;
 	scrub_inode_iter_fn	fn;
 	void			*arg;
+	unsigned int		nr_threads;
 	bool			aborted;
 };
 
 /*
- * Call into the filesystem for inode/bulkstat information and call our
- * iterator function. We'll try to fill the bulkstat information in batches,
- * but we also can detect iget failures.
+ * A single unit of inode scan work. This contains a pointer to the parent
+ * information, followed by an INUMBERS request structure, followed by a
+ * BULKSTAT request structure. The last two are VLAs, so we can't represent
+ * them here.
  */
-static void
-scan_ag_inodes(
-	struct workqueue	*wq,
-	xfs_agnumber_t		agno,
-	void			*arg)
+struct scan_ichunk {
+	struct scan_inodes	*si;
+};
+
+static inline struct xfs_inumbers_req *
+ichunk_to_inumbers(
+	struct scan_ichunk	*ichunk)
 {
-	struct xfs_handle	handle = { };
-	char			descr[DESCR_BUFSZ];
+	char			*p = (char *)ichunk;
+
+	return (struct xfs_inumbers_req *)(p + sizeof(struct scan_ichunk));
+}
+
+static inline struct xfs_bulkstat_req *
+ichunk_to_bulkstat(
+	struct scan_ichunk	*ichunk)
+{
+	char			*p = (char *)ichunk_to_inumbers(ichunk);
+
+	return (struct xfs_bulkstat_req *)(p + XFS_INUMBERS_REQ_SIZE(1));
+}
+
+static inline int
+alloc_ichunk(
+	struct scan_inodes	*si,
+	uint32_t		agno,
+	uint64_t		startino,
+	struct scan_ichunk	**ichunkp)
+{
+	struct scan_ichunk	*ichunk;
 	struct xfs_inumbers_req	*ireq;
 	struct xfs_bulkstat_req	*breq;
-	struct scan_inodes	*si = arg;
-	struct scrub_ctx	*ctx = (struct scrub_ctx *)wq->wq_ctx;
-	struct xfs_bulkstat	*bs;
-	struct xfs_inumbers	*inumbers;
-	uint64_t		nextino = cvt_agino_to_ino(&ctx->mnt, agno, 0);
-	int			i;
-	int			error;
-	int			stale_count = 0;
-
-	snprintf(descr, DESCR_BUFSZ, _("dev %d:%d AG %u inodes"),
+
+	ichunk = calloc(1, sizeof(struct scan_ichunk) +
+			   XFS_INUMBERS_REQ_SIZE(1) +
+			   XFS_BULKSTAT_REQ_SIZE(LIBFROG_BULKSTAT_CHUNKSIZE));
+	if (!ichunk)
+		return -errno;
+
+	ichunk->si = si;
+
+	ireq = ichunk_to_inumbers(ichunk);
+	ireq->hdr.icount = 1;
+	ireq->hdr.ino = startino;
+	ireq->hdr.agno = agno;
+	ireq->hdr.flags |= XFS_BULK_IREQ_AGNO;
+
+	breq = ichunk_to_bulkstat(ichunk);
+	breq->hdr.icount = LIBFROG_BULKSTAT_CHUNKSIZE;
+
+	*ichunkp = ichunk;
+	return 0;
+}
+
+int
+render_ino_from_bulkstat(
+	struct scrub_ctx	*ctx,
+	char			*buf,
+	size_t			buflen,
+	void			*data)
+{
+	struct xfs_bulkstat	*bstat = data;
+
+	return scrub_render_ino_descr(ctx, buf, buflen, bstat->bs_ino,
+			bstat->bs_gen, NULL);
+}
+
+static int
+render_inumbers_from_agno(
+	struct scrub_ctx	*ctx,
+	char			*buf,
+	size_t			buflen,
+	void			*data)
+{
+	xfs_agnumber_t		*agno = data;
+
+	return snprintf(buf, buflen, _("dev %d:%d AG %u inodes"),
 				major(ctx->fsinfo.fs_datadev),
 				minor(ctx->fsinfo.fs_datadev),
-				agno);
+				*agno);
+}
+
+/*
+ * Call BULKSTAT for information on a single chunk's worth of inodes and call
+ * our iterator function. We'll try to fill the bulkstat information in
+ * batches, but we also can detect iget failures.
+ */
+static void
+scan_ag_bulkstat(
+	struct workqueue	*wq,
+	xfs_agnumber_t		agno,
+	void			*arg)
+{
+	struct xfs_handle	handle = { };
+	struct scrub_ctx	*ctx = (struct scrub_ctx *)wq->wq_ctx;
+	struct scan_ichunk	*ichunk = arg;
+	struct xfs_inumbers_req	*ireq = ichunk_to_inumbers(ichunk);
+	struct xfs_bulkstat_req	*breq = ichunk_to_bulkstat(ichunk);
+	struct scan_inodes	*si = ichunk->si;
+	struct xfs_bulkstat	*bs;
+	struct xfs_inumbers	*inumbers = &ireq->inumbers[0];
+	int			i;
+	int			error;
+	int			stale_count = 0;
+	DEFINE_DESCR(dsc_bulkstat, ctx, render_ino_from_bulkstat);
+	DEFINE_DESCR(dsc_inumbers, ctx, render_inumbers_from_agno);
+
+	descr_set(&dsc_inumbers, &agno);
 
 	memcpy(&handle.ha_fsid, ctx->fshandle, sizeof(handle.ha_fsid));
 	handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
 			sizeof(handle.ha_fid.fid_len);
 	handle.ha_fid.fid_pad = 0;
 
-	error = -xfrog_bulkstat_alloc_req(LIBFROG_BULKSTAT_CHUNKSIZE, 0, &breq);
-	if (error) {
-		str_liberror(ctx, error, descr);
-		si->aborted = true;
-		return;
+retry:
+	bulkstat_for_inumbers(ctx, &dsc_inumbers, inumbers, breq);
+
+	/* Iterate all the inodes. */
+	bs = &breq->bulkstat[0];
+	for (i = 0; !si->aborted && i < inumbers->xi_alloccount; i++, bs++) {
+		descr_set(&dsc_bulkstat, bs);
+		handle.ha_fid.fid_ino = bs->bs_ino;
+		handle.ha_fid.fid_gen = bs->bs_gen;
+		error = si->fn(ctx, &handle, bs, si->arg);
+		switch (error) {
+		case 0:
+			break;
+		case ESTALE: {
+			stale_count++;
+			if (stale_count < 30) {
+				ireq->hdr.ino = inumbers->xi_startino;
+				error = -xfrog_inumbers(&ctx->mnt, ireq);
+				if (error)
+					goto err;
+				goto retry;
+			}
+			str_info(ctx, descr_render(&dsc_bulkstat),
+_("Changed too many times during scan; giving up."));
+			si->aborted = true;
+			goto out;
+		}
+		case ECANCELED:
+			error = 0;
+			fallthrough;
+		default:
+			goto err;
+		}
+		if (scrub_excessive_errors(ctx)) {
+			si->aborted = true;
+			goto out;
+		}
 	}
 
-	error = -xfrog_inumbers_alloc_req(1, 0, &ireq);
+err:
 	if (error) {
-		str_liberror(ctx, error, descr);
-		free(breq);
+		str_liberror(ctx, error, descr_render(&dsc_bulkstat));
 		si->aborted = true;
-		return;
 	}
-	inumbers = &ireq->inumbers[0];
-	xfrog_inumbers_set_ag(ireq, agno);
+out:
+	free(ichunk);
+}
+
+/*
+ * Call INUMBERS for information about inode chunks, then queue the inumbers
+ * responses in the bulkstat workqueue. This helps us maximize CPU parallelism
+ * if the filesystem AGs are not evenly loaded.
+ */
+static void
+scan_ag_inumbers(
+	struct workqueue	*wq,
+	xfs_agnumber_t		agno,
+	void			*arg)
+{
+	struct scan_ichunk	*ichunk = NULL;
+	struct scan_inodes	*si = arg;
+	struct scrub_ctx	*ctx = (struct scrub_ctx *)wq->wq_ctx;
+	struct xfs_inumbers_req	*ireq;
+	uint64_t		nextino = cvt_agino_to_ino(&ctx->mnt, agno, 0);
+	int			error;
+	DEFINE_DESCR(dsc, ctx, render_inumbers_from_agno);
+
+	descr_set(&dsc, &agno);
+
+	error = alloc_ichunk(si, agno, 0, &ichunk);
+	if (error)
+		goto err;
+	ireq = ichunk_to_inumbers(ichunk);
 
 	/* Find the inode chunk & alloc mask */
 	error = -xfrog_inumbers(&ctx->mnt, ireq);
@@ -158,8 +304,8 @@ scan_ag_inodes(
 		 * Make sure that we always make forward progress while we
 		 * scan the inode btree.
 		 */
-		if (nextino > inumbers->xi_startino) {
-			str_corrupt(ctx, descr,
+		if (nextino > ireq->inumbers[0].xi_startino) {
+			str_corrupt(ctx, descr_render(&dsc),
 _("AG %u inode btree is corrupt near agino %lu, got %lu"),
 					agno, cvt_ino_to_agino(&ctx->mnt, nextino),
 					cvt_ino_to_agino(&ctx->mnt,
@@ -169,64 +315,53 @@ scan_ag_inodes(
 		}
 		nextino = ireq->hdr.ino;
 
-		/*
-		 * We can have totally empty inode chunks on filesystems where
-		 * there are more than 64 inodes per block. Skip these.
-		 */
-		if (inumbers->xi_alloccount == 0)
-			goto igrp_retry;
-
-		bulkstat_for_inumbers(ctx, descr, inumbers, breq);
-
-		/* Iterate all the inodes. */
-		for (i = 0, bs = breq->bulkstat;
-		     !si->aborted && i < inumbers->xi_alloccount;
-		     i++, bs++) {
-			handle.ha_fid.fid_ino = bs->bs_ino;
-			handle.ha_fid.fid_gen = bs->bs_gen;
-			error = si->fn(ctx, &handle, bs, si->arg);
-			switch (error) {
-			case 0:
-				break;
-			case ESTALE: {
-				char	idescr[DESCR_BUFSZ];
-
-				stale_count++;
-				if (stale_count < 30) {
-					ireq->hdr.ino = inumbers->xi_startino;
-					goto igrp_retry;
-				}
-				scrub_render_ino_descr(ctx, idescr, DESCR_BUFSZ,
-						bs->bs_ino, bs->bs_gen, NULL);
-				str_info(ctx, idescr,
-_("Changed too many times during scan; giving up."));
-				break;
-			}
-			case ECANCELED:
-				error = 0;
-				fallthrough;
-			default:
-				goto err;
-			}
-			if (scrub_excessive_errors(ctx)) {
+		if (ireq->inumbers[0].xi_alloccount == 0) {
+			/*
+			 * We can have totally empty inode chunks on
+			 * filesystems where there are more than 64 inodes per
+			 * block. Skip these.
+			 */
+			;
+		} else if (si->nr_threads > 0) {
+			/* Queue this inode chunk on the bulkstat workqueue. */
+			error = -workqueue_add(&si->wq_bulkstat,
+					scan_ag_bulkstat, agno, ichunk);
+			if (error) {
 				si->aborted = true;
+				str_liberror(ctx, error,
+						_("queueing bulkstat work"));
 				goto out;
 			}
+			ichunk = NULL;
+		} else {
+			/*
+			 * Only one thread, call bulkstat directly. Remember,
+			 * ichunk is freed by the worker before returning.
+			 */
+			scan_ag_bulkstat(wq, agno, ichunk);
+			ichunk = NULL;
+			if (si->aborted)
+				break;
 		}
-		stale_count = 0;
-igrp_retry:
+		if (!ichunk) {
+			error = alloc_ichunk(si, agno, nextino, &ichunk);
+			if (error)
+				goto err;
+		}
+		ireq = ichunk_to_inumbers(ichunk);
+
 		error = -xfrog_inumbers(&ctx->mnt, ireq);
 	}
 
 err:
 	if (error) {
-		str_liberror(ctx, error, descr);
+		str_liberror(ctx, error, descr_render(&dsc));
 		si->aborted = true;
 	}
 out:
-	free(ireq);
-	free(breq);
+	if (ichunk)
+		free(ichunk);
 }
 
 /*
@@ -242,33 +377,58 @@ scrub_scan_all_inodes(
 	struct scan_inodes	si = {
 		.fn		= fn,
 		.arg		= arg,
+		.nr_threads	= scrub_nproc_workqueue(ctx),
 	};
 	xfs_agnumber_t		agno;
-	struct workqueue	wq;
+	struct workqueue	wq_inumbers;
+	unsigned int		max_bulkstat;
 	int			ret;
 
-	ret = -workqueue_create(&wq, (struct xfs_mount *)ctx,
-			scrub_nproc_workqueue(ctx));
+	/*
+	 * The bulkstat workqueue should queue at most one inobt block's worth
+	 * of inode chunk records per worker thread. If we're running in
+	 * single thread mode (nr_threads==0) then we skip the workqueues.
+	 */
+	max_bulkstat = si.nr_threads * (ctx->mnt.fsgeom.blocksize / 16);
+
+	ret = -workqueue_create_bound(&si.wq_bulkstat, (struct xfs_mount *)ctx,
+			si.nr_threads, max_bulkstat);
 	if (ret) {
 		str_liberror(ctx, ret, _("creating bulkstat workqueue"));
 		return -1;
 	}
 
+	ret = -workqueue_create(&wq_inumbers, (struct xfs_mount *)ctx,
+			si.nr_threads);
+	if (ret) {
+		str_liberror(ctx, ret, _("creating inumbers workqueue"));
+		si.aborted = true;
+		goto kill_bulkstat;
+	}
+
 	for (agno = 0; agno < ctx->mnt.fsgeom.agcount; agno++) {
-		ret = -workqueue_add(&wq, scan_ag_inodes, agno, &si);
+		ret = -workqueue_add(&wq_inumbers, scan_ag_inumbers, agno, &si);
 		if (ret) {
 			si.aborted = true;
-			str_liberror(ctx, ret, _("queueing bulkstat work"));
+			str_liberror(ctx, ret, _("queueing inumbers work"));
			break;
 		}
 	}
 
-	ret = -workqueue_terminate(&wq);
+	ret = -workqueue_terminate(&wq_inumbers);
+	if (ret) {
+		si.aborted = true;
+		str_liberror(ctx, ret, _("finishing inumbers work"));
+	}
+	workqueue_destroy(&wq_inumbers);
+
+kill_bulkstat:
+	ret = -workqueue_terminate(&si.wq_bulkstat);
 	if (ret) {
 		si.aborted = true;
 		str_liberror(ctx, ret, _("finishing bulkstat work"));
 	}
-	workqueue_destroy(&wq);
+	workqueue_destroy(&si.wq_bulkstat);
 
 	return si.aborted ? -1 : 0;
 }
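
For readers unfamiliar with the producer/consumer shape the commit message
describes, here is a minimal standalone sketch of the same idea: one producer
per AG emits fixed-size "inode chunk" work items onto a single bounded queue,
and a fixed pool of workers drains it, so parallelism stays high even when the
AGs are unevenly filled.  This is illustration only, not part of the patch; it
uses plain pthreads rather than libfrog's workqueue, and all names in it are
hypothetical.

#include <pthread.h>
#include <stdio.h>

#define QUEUE_CAP	16	/* bounded: producers block while the queue is full */
#define NR_AGS		4	/* pretend allocation group count */
#define CHUNKS_PER_AG	32	/* pretend inode chunk records per AG */
#define NR_WORKERS	4	/* size of the consumer pool */

struct chunk {
	unsigned int	agno;
	unsigned int	chunkno;
};

static struct chunk	queue[QUEUE_CAP];
static unsigned int	q_head, q_tail, q_len;
static int		q_done;
static pthread_mutex_t	q_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t	q_not_full = PTHREAD_COND_INITIALIZER;
static pthread_cond_t	q_not_empty = PTHREAD_COND_INITIALIZER;

/* Producer side: queue one chunk work item, waiting while the queue is full. */
static void
queue_chunk(struct chunk c)
{
	pthread_mutex_lock(&q_lock);
	while (q_len == QUEUE_CAP)
		pthread_cond_wait(&q_not_full, &q_lock);
	queue[q_tail] = c;
	q_tail = (q_tail + 1) % QUEUE_CAP;
	q_len++;
	pthread_cond_signal(&q_not_empty);
	pthread_mutex_unlock(&q_lock);
}

/* Consumer side: pop one chunk; returns 0 once the whole scan is finished. */
static int
pop_chunk(struct chunk *c)
{
	int	ret = 0;

	pthread_mutex_lock(&q_lock);
	while (q_len == 0 && !q_done)
		pthread_cond_wait(&q_not_empty, &q_lock);
	if (q_len > 0) {
		*c = queue[q_head];
		q_head = (q_head + 1) % QUEUE_CAP;
		q_len--;
		pthread_cond_signal(&q_not_full);
		ret = 1;
	}
	pthread_mutex_unlock(&q_lock);
	return ret;
}

/* One producer per AG: walk a pretend inobt and emit chunk work items. */
static void *
ag_producer(void *arg)
{
	unsigned int	agno = *(unsigned int *)arg;
	unsigned int	i;

	for (i = 0; i < CHUNKS_PER_AG; i++)
		queue_chunk((struct chunk){ .agno = agno, .chunkno = i });
	return NULL;
}

/* Consumer pool: each worker "bulkstats" whatever chunk comes off the queue. */
static void *
chunk_worker(void *arg)
{
	struct chunk	c;

	while (pop_chunk(&c))
		printf("scanned AG %u chunk %u\n", c.agno, c.chunkno);
	return NULL;
}

int
main(void)
{
	pthread_t	prod[NR_AGS], cons[NR_WORKERS];
	unsigned int	agno[NR_AGS];
	int		i;

	for (i = 0; i < NR_WORKERS; i++)
		pthread_create(&cons[i], NULL, chunk_worker, NULL);
	for (i = 0; i < NR_AGS; i++) {
		agno[i] = i;
		pthread_create(&prod[i], NULL, ag_producer, &agno[i]);
	}
	for (i = 0; i < NR_AGS; i++)
		pthread_join(prod[i], NULL);

	/* Producers are done; wake idle workers so they drain the queue and exit. */
	pthread_mutex_lock(&q_lock);
	q_done = 1;
	pthread_cond_broadcast(&q_not_empty);
	pthread_mutex_unlock(&q_lock);
	for (i = 0; i < NR_WORKERS; i++)
		pthread_join(cons[i], NULL);
	return 0;
}

The bounded queue is what keeps one badly skewed AG from piling up pending
work: producers stall once QUEUE_CAP items are outstanding, which is roughly
the role the max_bulkstat cap passed to workqueue_create_bound() plays in the
patch above.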