On Sun, Jun 24, 2018 at 12:23:54PM -0700, Darrick J. Wong wrote:
From: Darrick J. Wong <darrick.wong@xxxxxxxxxx>
Regenerate the AGF and AGFL from the rmap data.
Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx>
[...]
+/* Information for finding AGF-rooted btrees */
+enum {
+ REPAIR_AGF_BNOBT = 0,
+ REPAIR_AGF_CNTBT,
+ REPAIR_AGF_RMAPBT,
+ REPAIR_AGF_REFCOUNTBT,
+ REPAIR_AGF_END,
+ REPAIR_AGF_MAX
+};
Why can't you just use XFS_BTNUM_* for these btree type descriptors?
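i.e. something like this (untested sketch - if the XFS_BTNUM indexes
leave holes in the table then the array walk would need to terminate
on something other than a NULL buf_ops):

	static const struct xfs_repair_find_ag_btree repair_agf[] = {
		[XFS_BTNUM_BNOi] = {
			.rmap_owner = XFS_RMAP_OWN_AG,
			.buf_ops = &xfs_allocbt_buf_ops,
			.magic = XFS_ABTB_CRC_MAGIC,
		},
		/* ...same pattern for CNTi, RMAPi and REFCi... */
	};

and the lookups below become fab[XFS_BTNUM_BNOi].root, etc.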
+
+static const struct xfs_repair_find_ag_btree repair_agf[] = {
+ [REPAIR_AGF_BNOBT] = {
+ .rmap_owner = XFS_RMAP_OWN_AG,
+ .buf_ops = &xfs_allocbt_buf_ops,
+ .magic = XFS_ABTB_CRC_MAGIC,
+ },
+ [REPAIR_AGF_CNTBT] = {
+ .rmap_owner = XFS_RMAP_OWN_AG,
+ .buf_ops = &xfs_allocbt_buf_ops,
+ .magic = XFS_ABTC_CRC_MAGIC,
+ },
I had to stop and think about why this only supports the v5 types.
i.e. we're rebuilding from rmap info, so this will never run on v4
filesystems, hence we only care about v5 types (i.e. *CRC_MAGIC).
Perhaps a one-line comment to remind readers of this?
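Something like this would do:

	/* Rebuilding from the rmapbt implies a v5 filesystem, so CRC magics only. */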
+ [REPAIR_AGF_RMAPBT] = {
+ .rmap_owner = XFS_RMAP_OWN_AG,
+ .buf_ops = &xfs_rmapbt_buf_ops,
+ .magic = XFS_RMAP_CRC_MAGIC,
+ },
+ [REPAIR_AGF_REFCOUNTBT] = {
+ .rmap_owner = XFS_RMAP_OWN_REFC,
+ .buf_ops = &xfs_refcountbt_buf_ops,
+ .magic = XFS_REFC_CRC_MAGIC,
+ },
+ [REPAIR_AGF_END] = {
+ .buf_ops = NULL,
+ },
+};
+
+/*
+ * Find the btree roots. This is /also/ a chicken and egg problem because we
+ * have to use the rmapbt (rooted in the AGF) to find the btrees rooted in the
+ * AGF. We also have no idea if the btrees make any sense. If we hit obvious
+ * corruptions in those btrees we'll bail out.
+ */
+STATIC int
+xfs_repair_agf_find_btrees(
+ struct xfs_scrub_context *sc,
+ struct xfs_buf *agf_bp,
+ struct xfs_repair_find_ag_btree *fab,
+ struct xfs_buf *agfl_bp)
+{
+ struct xfs_agf *old_agf = XFS_BUF_TO_AGF(agf_bp);
+ int error;
+
+ /* Go find the root data. */
+ memcpy(fab, repair_agf, sizeof(repair_agf));
Why are we initialising fab here, instead of in the caller where it
is declared and passed to various functions? Given there is only a
single declaration of this structure, why do we need a global static
const table initialiser just to copy it here - why isn't it
initialised at the declaration point?
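i.e. (sketch only) declare it in xfs_repair_agf() as

	struct xfs_repair_find_ag_btree fab[REPAIR_AGF_MAX] = {
		[REPAIR_AGF_BNOBT] = {
			.rmap_owner = XFS_RMAP_OWN_AG,
			.buf_ops = &xfs_allocbt_buf_ops,
			.magic = XFS_ABTB_CRC_MAGIC,
		},
		/* ...and likewise for CNTBT, RMAPBT and REFCOUNTBT... */
	};

and then both the global table and the memcpy() here go away.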
+ error = xfs_repair_find_ag_btree_roots(sc, agf_bp, fab, agfl_bp);
+ if (error)
+ return error;
+
+ /* We must find the bnobt, cntbt, and rmapbt roots. */
+ if (fab[REPAIR_AGF_BNOBT].root == NULLAGBLOCK ||
+ fab[REPAIR_AGF_BNOBT].height > XFS_BTREE_MAXLEVELS ||
+ fab[REPAIR_AGF_CNTBT].root == NULLAGBLOCK ||
+ fab[REPAIR_AGF_CNTBT].height > XFS_BTREE_MAXLEVELS ||
+ fab[REPAIR_AGF_RMAPBT].root == NULLAGBLOCK ||
+ fab[REPAIR_AGF_RMAPBT].height > XFS_BTREE_MAXLEVELS)
+ return -EFSCORRUPTED;
+
+ /*
+ * We relied on the rmapbt to reconstruct the AGF. If we get a
+ * different root then something's seriously wrong.
+ */
+ if (fab[REPAIR_AGF_RMAPBT].root !=
+ be32_to_cpu(old_agf->agf_roots[XFS_BTNUM_RMAPi]))
+ return -EFSCORRUPTED;
+
+ /* We must find the refcountbt root if that feature is enabled. */
+ if (xfs_sb_version_hasreflink(&sc->mp->m_sb) &&
+ (fab[REPAIR_AGF_REFCOUNTBT].root == NULLAGBLOCK ||
+ fab[REPAIR_AGF_REFCOUNTBT].height > XFS_BTREE_MAXLEVELS))
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+
+/* Set btree root information in an AGF. */
+STATIC void
+xfs_repair_agf_set_roots(
+ struct xfs_scrub_context *sc,
+ struct xfs_agf *agf,
+ struct xfs_repair_find_ag_btree *fab)
+{
+ agf->agf_roots[XFS_BTNUM_BNOi] =
+ cpu_to_be32(fab[REPAIR_AGF_BNOBT].root);
+ agf->agf_levels[XFS_BTNUM_BNOi] =
+ cpu_to_be32(fab[REPAIR_AGF_BNOBT].height);
+
+ agf->agf_roots[XFS_BTNUM_CNTi] =
+ cpu_to_be32(fab[REPAIR_AGF_CNTBT].root);
+ agf->agf_levels[XFS_BTNUM_CNTi] =
+ cpu_to_be32(fab[REPAIR_AGF_CNTBT].height);
+
+ agf->agf_roots[XFS_BTNUM_RMAPi] =
+ cpu_to_be32(fab[REPAIR_AGF_RMAPBT].root);
+ agf->agf_levels[XFS_BTNUM_RMAPi] =
+ cpu_to_be32(fab[REPAIR_AGF_RMAPBT].height);
+
+ if (xfs_sb_version_hasreflink(&sc->mp->m_sb)) {
+ agf->agf_refcount_root =
+ cpu_to_be32(fab[REPAIR_AGF_REFCOUNTBT].root);
+ agf->agf_refcount_level =
+ cpu_to_be32(fab[REPAIR_AGF_REFCOUNTBT].height);
+ }
+}
+
+/*
+ * Reinitialize the AGF header, making an in-core copy of the old contents so
+ * that we know which in-core state needs to be reinitialized.
+ */
+STATIC void
+xfs_repair_agf_init_header(
+ struct xfs_scrub_context *sc,
+ struct xfs_buf *agf_bp,
+ struct xfs_agf *old_agf)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(agf_bp);
+
+ memcpy(old_agf, agf, sizeof(*old_agf));
+ memset(agf, 0, BBTOB(agf_bp->b_length));
+ agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
+ agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
+ agf->agf_seqno = cpu_to_be32(sc->sa.agno);
+ agf->agf_length = cpu_to_be32(xfs_ag_block_count(mp, sc->sa.agno));
+ agf->agf_flfirst = old_agf->agf_flfirst;
+ agf->agf_fllast = old_agf->agf_fllast;
+ agf->agf_flcount = old_agf->agf_flcount;
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid);
+}
Do we need to clear pag->pagf_init here so that it gets
re-initialised next time someone reads the AGF?
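i.e. something like this at the end of the function (untested):

	/* The old in-core AGF state is stale now, so force a re-read. */
	sc->sa.pag->pagf_init = 0;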
+
+/* Update the AGF btree counters by walking the btrees. */
+STATIC int
+xfs_repair_agf_update_btree_counters(
+ struct xfs_scrub_context *sc,
+ struct xfs_buf *agf_bp)
+{
+ struct xfs_repair_agf_allocbt raa = { .sc = sc };
+ struct xfs_btree_cur *cur = NULL;
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(agf_bp);
+ struct xfs_mount *mp = sc->mp;
+ xfs_agblock_t btreeblks;
+ xfs_agblock_t blocks;
+ int error;
+
+ /* Update the AGF counters from the bnobt. */
+ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno,
+ XFS_BTNUM_BNO);
+ error = xfs_alloc_query_all(cur, xfs_repair_agf_walk_allocbt, &raa);
+ if (error)
+ goto err;
+ error = xfs_btree_count_blocks(cur, &blocks);
+ if (error)
+ goto err;
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ btreeblks = blocks - 1;
+ agf->agf_freeblks = cpu_to_be32(raa.freeblks);
+ agf->agf_longest = cpu_to_be32(raa.longest);
This function updates more than the AGF btree counters. :P
+
+ /* Update the AGF counters from the cntbt. */
+ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno,
+ XFS_BTNUM_CNT);
+ error = xfs_btree_count_blocks(cur, &blocks);
+ if (error)
+ goto err;
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ btreeblks += blocks - 1;
+
+ /* Update the AGF counters from the rmapbt. */
+ cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno);
+ error = xfs_btree_count_blocks(cur, &blocks);
+ if (error)
+ goto err;
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ agf->agf_rmap_blocks = cpu_to_be32(blocks);
+ btreeblks += blocks - 1;
+
+ agf->agf_btreeblks = cpu_to_be32(btreeblks);
+
+ /* Update the AGF counters from the refcountbt. */
+ if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ cur = xfs_refcountbt_init_cursor(mp, sc->tp, agf_bp,
+ sc->sa.agno, NULL);
+ error = xfs_btree_count_blocks(cur, &blocks);
+ if (error)
+ goto err;
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ agf->agf_refcount_blocks = cpu_to_be32(blocks);
+ }
+
+ return 0;
+err:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/* Trigger reinitialization of the in-core data. */
+STATIC int
+xfs_repair_agf_reinit_incore(
+ struct xfs_scrub_context *sc,
+ struct xfs_agf *agf,
+ const struct xfs_agf *old_agf)
+{
+ struct xfs_perag *pag;
+
+ /* XXX: trigger fdblocks recalculation */
+
+ /* Now reinitialize the in-core counters if necessary. */
+ pag = sc->sa.pag;
+ if (!pag->pagf_init)
+ return 0;
+
+ pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
+ pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
+ pag->pagf_longest = be32_to_cpu(agf->agf_longest);
+ pag->pagf_levels[XFS_BTNUM_BNOi] =
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
+ pag->pagf_levels[XFS_BTNUM_CNTi] =
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
+ pag->pagf_levels[XFS_BTNUM_RMAPi] =
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
+ pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
Ok, so we reinit the pagf bits here, but....
+
+ return 0;
+}
+
+/* Repair the AGF. */
+int
+xfs_repair_agf(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_repair_find_ag_btree fab[REPAIR_AGF_MAX];
+ struct xfs_agf old_agf;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *agf_bp;
+ struct xfs_buf *agfl_bp;
+ struct xfs_agf *agf;
+ int error;
+
+ /* We require the rmapbt to rebuild anything. */
+ if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ return -EOPNOTSUPP;
+
+ xfs_scrub_perag_get(sc->mp, &sc->sa);
+ error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, sc->sa.agno, XFS_AGF_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0, &agf_bp, NULL);
+ if (error)
+ return error;
+ agf_bp->b_ops = &xfs_agf_buf_ops;
+ agf = XFS_BUF_TO_AGF(agf_bp);
+
+ /*
+ * Load the AGFL so that we can screen out OWN_AG blocks that are on
+ * the AGFL now; these blocks might have once been part of the
+ * bno/cnt/rmap btrees but are not now. This is a chicken and egg
+ * problem: the AGF is corrupt, so we have to trust the AGFL contents
+ * because we can't do any serious cross-referencing with any of the
+ * btrees rooted in the AGF. If the AGFL contents are obviously bad
+ * then we'll bail out.
+ */
+ error = xfs_alloc_read_agfl(mp, sc->tp, sc->sa.agno, &agfl_bp);
+ if (error)
+ return error;
+
+ /*
+ * Spot-check the AGFL blocks; if they're obviously corrupt then
+ * there's nothing we can do but bail out.
+ */
+ error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(agf_bp), agfl_bp,
+ xfs_repair_agf_check_agfl_block, sc);
+ if (error)
+ return error;
+
+ /*
+ * Find the AGF btree roots. See the comment for this function for
+ * more information about the limitations of this repairer; this is
+ * also a chicken-and-egg situation.
+ */
+ error = xfs_repair_agf_find_btrees(sc, agf_bp, fab, agfl_bp);
+ if (error)
+ return error;
Comment could be better written.
/*
* Find the AGF btree roots. This is also a chicken-and-egg
* situation - see xfs_repair_agf_find_btrees() for details.
*/
+
+ /* Start rewriting the header and implant the btrees we found. */
+ xfs_repair_agf_init_header(sc, agf_bp, &old_agf);
+ xfs_repair_agf_set_roots(sc, agf, fab);
+ error = xfs_repair_agf_update_btree_counters(sc, agf_bp);
+ if (error)
+ goto out_revert;
If we fail here, the pagf information is invalid, hence I think we
really do need to clear pagf_init before we start rebuilding the new
AGF. Yes, I can see we revert the AGF info, but this seems like a
landmine waiting to be tripped over.
+ /* Reinitialize in-core state. */
+ error = xfs_repair_agf_reinit_incore(sc, agf, &old_agf);
+ if (error)
+ goto out_revert;
+
+ /* Write this to disk. */
+ xfs_trans_buf_set_type(sc->tp, agf_bp, XFS_BLFT_AGF_BUF);
+ xfs_trans_log_buf(sc->tp, agf_bp, 0, BBTOB(agf_bp->b_length) - 1);
+ return 0;
+
+out_revert:
+ memcpy(agf, &old_agf, sizeof(old_agf));
+ return error;
+}
+
+/* AGFL */
+
+struct xfs_repair_agfl {
+ struct xfs_repair_extent_list agmeta_list;
+ struct xfs_repair_extent_list *freesp_list;
+ struct xfs_scrub_context *sc;
+};
+
+/* Record all freespace information. */
+STATIC int
+xfs_repair_agfl_rmap_fn(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xfs_repair_agfl *ra = priv;
+ xfs_fsblock_t fsb;
+ int error = 0;
+
+ if (xfs_scrub_should_terminate(ra->sc, &error))
+ return error;
+
+ /* Record all the OWN_AG blocks. */
+ if (rec->rm_owner == XFS_RMAP_OWN_AG) {
+ fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno,
+ rec->rm_startblock);
+ error = xfs_repair_collect_btree_extent(ra->sc,
+ ra->freesp_list, fsb, rec->rm_blockcount);
+ if (error)
+ return error;
+ }
+
+ return xfs_repair_collect_btree_cur_blocks(ra->sc, cur,
+ xfs_repair_collect_btree_cur_blocks_in_extent_list,
Urk. The function name lengths are getting out of hand. I'm very
tempted to suggest we should shorten the namespace of all this
like s/xfs_repair_/xr_/ and s/xfs_scrub_/xs_/, etc just to make them
shorter and easier to read.
Oh, wait, did I say that out loud? :P
Something to think about, anyway.
+ &ra->agmeta_list);
+}
+
+/* Add a btree block to the agmeta list. */
+STATIC int
+xfs_repair_agfl_visit_btblock(
I find the name a bit confusing - AGFLs don't have btree blocks.
Yes, I know that it's an xfs_btree_visit_blocks() callback, but I
think s/visit/collect/ makes more sense. i.e. it tells us what we
are doing with the btree block, rather than making it sound like we
are walking AGFL btree blocks...
+/*
+ * Map out all the non-AGFL OWN_AG space in this AG so that we can deduce
+ * which blocks belong to the AGFL.
+ */
+STATIC int
+xfs_repair_agfl_find_extents(
Same here - xr_agfl_collect_free_extents()?
+ struct xfs_scrub_context *sc,
+ struct xfs_buf *agf_bp,
+ struct xfs_repair_extent_list *agfl_extents,
+ xfs_agblock_t *flcount)
+{
+ struct xfs_repair_agfl ra;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_btree_cur *cur;
+ struct xfs_repair_extent *rae;
+ int error;
+
+ ra.sc = sc;
+ ra.freesp_list = agfl_extents;
+ xfs_repair_init_extent_list(&ra.agmeta_list);
+
+ /* Find all space used by the free space btrees & rmapbt. */
+ cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno);
+ error = xfs_rmap_query_all(cur, xfs_repair_agfl_rmap_fn, &ra);
+ if (error)
+ goto err;
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+
+ /* Find all space used by bnobt. */
Needs clarification.
/* Find all the in use bnobt blocks */
+ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno,
+ XFS_BTNUM_BNO);
+ error = xfs_btree_visit_blocks(cur, xfs_repair_agfl_visit_btblock, &ra);
+ if (error)
+ goto err;
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+
+ /* Find all space used by cntbt. */
/* Find all the in use cntbt blocks */
+ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno,
+ XFS_BTNUM_CNT);
+ error = xfs_btree_visit_blocks(cur, xfs_repair_agfl_visit_btblock, &ra);
+ if (error)
+ goto err;
+
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+
+ /*
+ * Drop the freesp meta blocks that are in use by btrees.
+ * The remaining blocks /should/ be AGFL blocks.
+ */
+ error = xfs_repair_subtract_extents(sc, agfl_extents, &ra.agmeta_list);
+ xfs_repair_cancel_btree_extents(sc, &ra.agmeta_list);
+ if (error)
+ return error;
+
+ /* Calculate the new AGFL size. */
+ *flcount = 0;
+ for_each_xfs_repair_extent(rae, agfl_extents) {
+ *flcount += rae->len;
+ if (*flcount > xfs_agfl_size(mp))
+ break;
+ }
+ if (*flcount > xfs_agfl_size(mp))
+ *flcount = xfs_agfl_size(mp);
Ok, so flcount is clamped here. What happens to all the remaining
agfl_extents beyond flcount?
+ return 0;
+
+err:
Ok, what cleans up all the extents we've recorded in ra on error?
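The caller only cancels agfl_extents on failure, so I think the
agmeta_list we've collected here leaks. Something like this before
tearing down the cursor (untested):

	xfs_repair_cancel_btree_extents(sc, &ra.agmeta_list);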
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/* Update the AGF and reset the in-core state. */
+STATIC int
+xfs_repair_agfl_update_agf(
+ struct xfs_scrub_context *sc,
+ struct xfs_buf *agf_bp,
+ xfs_agblock_t flcount)
+{
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(agf_bp);
+
ASSERT(flcount <= xfs_agfl_size(sc->mp));
+ /* XXX: trigger fdblocks recalculation */
+
+ /* Update the AGF counters. */
+ if (sc->sa.pag->pagf_init)
+ sc->sa.pag->pagf_flcount = flcount;
+ agf->agf_flfirst = cpu_to_be32(0);
+ agf->agf_flcount = cpu_to_be32(flcount);
+ agf->agf_fllast = cpu_to_be32(flcount - 1);
+
+ xfs_alloc_log_agf(sc->tp, agf_bp,
+ XFS_AGF_FLFIRST | XFS_AGF_FLLAST | XFS_AGF_FLCOUNT);
+ return 0;
+}
+
+/* Write out a totally new AGFL. */
+STATIC void
+xfs_repair_agfl_init_header(
+ struct xfs_scrub_context *sc,
+ struct xfs_buf *agfl_bp,
+ struct xfs_repair_extent_list *agfl_extents,
+ xfs_agblock_t flcount)
+{
+ struct xfs_mount *mp = sc->mp;
+ __be32 *agfl_bno;
+ struct xfs_repair_extent *rae;
+ struct xfs_repair_extent *n;
+ struct xfs_agfl *agfl;
+ xfs_agblock_t agbno;
+ unsigned int fl_off;
+
ASSERT(flcount <= xfs_agfl_size(mp));
+ /* Start rewriting the header. */
+ agfl = XFS_BUF_TO_AGFL(agfl_bp);
+ memset(agfl, 0xFF, BBTOB(agfl_bp->b_length));
+ agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
+ agfl->agfl_seqno = cpu_to_be32(sc->sa.agno);
+ uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid);
+
+ /* Fill the AGFL with the remaining blocks. */
+ fl_off = 0;
+ agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agfl_bp);
+ for_each_xfs_repair_extent_safe(rae, n, agfl_extents) {
+ agbno = XFS_FSB_TO_AGBNO(mp, rae->fsbno);
+
+ trace_xfs_repair_agfl_insert(mp, sc->sa.agno, agbno, rae->len);
+
+ while (rae->len > 0 && fl_off < flcount) {
+ agfl_bno[fl_off] = cpu_to_be32(agbno);
+ fl_off++;
+ agbno++;
+ rae->fsbno++;
+ rae->len--;
+ }
This only works correctly if flcount <= xfs_agfl_size, which is why
I'm suggesting some asserts.
+
+ if (rae->len)
+ break;
+ list_del(&rae->list);
+ kmem_free(rae);
+ }
+
+ /* Write AGF and AGFL to disk. */
+ xfs_trans_buf_set_type(sc->tp, agfl_bp, XFS_BLFT_AGFL_BUF);
+ xfs_trans_log_buf(sc->tp, agfl_bp, 0, BBTOB(agfl_bp->b_length) - 1);
+}
+
+/* Repair the AGFL. */
+int
+xfs_repair_agfl(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_owner_info oinfo;
+ struct xfs_repair_extent_list agfl_extents;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *agf_bp;
+ struct xfs_buf *agfl_bp;
+ xfs_agblock_t flcount;
+ int error;
+
+ /* We require the rmapbt to rebuild anything. */
+ if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ return -EOPNOTSUPP;
+
+ xfs_scrub_perag_get(sc->mp, &sc->sa);
+ xfs_repair_init_extent_list(&agfl_extents);
+
+ /*
+ * Read the AGF so that we can query the rmapbt. We hope that there's
+ * nothing wrong with the AGF, but all the AG header repair functions
+ * have this chicken-and-egg problem.
+ */
+ error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.agno, 0, &agf_bp);
+ if (error)
+ return error;
+ if (!agf_bp)
+ return -ENOMEM;
+
+ error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, sc->sa.agno, XFS_AGFL_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0, &agfl_bp, NULL);
+ if (error)
+ return error;
+ agfl_bp->b_ops = &xfs_agfl_buf_ops;
+
+ /*
+ * Compute the set of old AGFL blocks by subtracting from the list of
+ * OWN_AG blocks the list of blocks owned by all other OWN_AG metadata
+ * (bnobt, cntbt, rmapbt). These are the old AGFL blocks, so return
+ * that list and the number of blocks we're actually going to put back
+ * on the AGFL.
+ */
That comment belongs on the function, not here. All we need here is
something like:
/* Gather all the extents we're going to put on the new AGFL. */
+ error = xfs_repair_agfl_find_extents(sc, agf_bp, &agfl_extents,
+ &flcount);
+ if (error)
+ goto err;
+
+ /*
+ * Update AGF and AGFL. We reset the global free block counter when
+ * we adjust the AGF flcount (which can fail) so avoid updating any
+ * bufers until we know that part works.
buffers
+ */
+ error = xfs_repair_agfl_update_agf(sc, agf_bp, flcount);
+ if (error)
+ goto err;
+ xfs_repair_agfl_init_header(sc, agfl_bp, &agfl_extents, flcount);
+
+ /*
+ * Ok, the AGFL should be ready to go now. Roll the transaction so
+ * that we can free any AGFL overflow.
+ */
Why does rolling the transaction allow us to free the overflow?
Shouldn't the comment say something like "Roll the transaction to
make the new AGFL permanent before we start using it to return
the residual AGFL freespace overflow back to the AGF freespace
btrees."
+ sc->sa.agf_bp = agf_bp;
+ sc->sa.agfl_bp = agfl_bp;
+ error = xfs_repair_roll_ag_trans(sc);
+ if (error)
+ goto err;
+
+ /* Dump any AGFL overflow. */
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
+ return xfs_repair_reap_btree_extents(sc, &agfl_extents, &oinfo,
+ XFS_AG_RESV_AGFL);
+err:
+ xfs_repair_cancel_btree_extents(sc, &agfl_extents);
+ return error;
+}
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 326be4e8b71e..bcdaa8df18f6 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -127,9 +127,12 @@ xfs_repair_roll_ag_trans(
int error;
/* Keep the AG header buffers locked so we can keep going. */
- xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
- xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
- xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
+ if (sc->sa.agi_bp)
+ xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
+ if (sc->sa.agf_bp)
+ xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
+ if (sc->sa.agfl_bp)
+ xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
/* Roll the transaction. */
error = xfs_trans_roll(&sc->tp);
@@ -137,9 +140,12 @@ xfs_repair_roll_ag_trans(
goto out_release;
/* Join AG headers to the new transaction. */
- xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
- xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
- xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
+ if (sc->sa.agi_bp)
+ xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
+ if (sc->sa.agf_bp)
+ xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
+ if (sc->sa.agfl_bp)
+ xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
return 0;
@@ -149,9 +155,12 @@ xfs_repair_roll_ag_trans(
* buffers will be released during teardown on our way out
* of the kernel.
*/
- xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
- xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
- xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp);
+ if (sc->sa.agi_bp)
+ xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
+ if (sc->sa.agf_bp)
+ xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
+ if (sc->sa.agfl_bp)
+ xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp);
return error;
}
@@ -408,6 +417,85 @@ xfs_repair_collect_btree_extent(
return 0;
}
+/*
+ * Help record all btree blocks seen while iterating all records of a btree.
+ *
+ * We know that the btree query_all function starts at the left edge and walks
+ * towards the right edge of the tree. Therefore, we know that we can walk up
+ * the btree cursor towards the root; if the pointer for a given level points
+ * to the first record/key in that block, we haven't seen this block before;
+ * and therefore we need to remember that we saw this block in the btree.
+ *
+ * So if our btree is:
+ *
+ * 4
+ * / | \
+ * 1 2 3
+ *
+ * Pretend for this example that each leaf block has 100 btree records. For
+ * the first btree record, we'll observe that bc_ptrs[0] == 1, so we record
+ * that we saw block 1. Then we observe that bc_ptrs[1] == 1, so we record
+ * block 4. The list is [1, 4].
+ *
+ * For the second btree record, we see that bc_ptrs[0] == 2, so we exit the
+ * loop. The list remains [1, 4].
+ *
+ * For the 101st btree record, we've moved onto leaf block 2. Now
+ * bc_ptrs[0] == 1 again, so we record that we saw block 2. We see that
+ * bc_ptrs[1] == 2, so we exit the loop. The list is now [1, 4, 2].
+ *
+ * For the 102nd record, bc_ptrs[0] == 2, so we continue.
+ *
+ * For the 201st record, we've moved on to leaf block 3. bc_ptrs[0] == 1, so
+ * we add 3 to the list. Now it is [1, 4, 2, 3].
+ *
+ * For the 300th record we just exit, with the list being [1, 4, 2, 3].
+ *
+ * The *iter_fn can return XFS_BTREE_QUERY_RANGE_ABORT to stop, 0 to keep
+ * iterating, or the usual negative error code.
+ */
+int
+xfs_repair_collect_btree_cur_blocks(
+ struct xfs_scrub_context *sc,
+ struct xfs_btree_cur *cur,
+ int (*iter_fn)(struct xfs_scrub_context *sc,
+ xfs_fsblock_t fsbno,
+ xfs_fsblock_t len,
+ void *priv),
+ void *priv)
+{
+ struct xfs_buf *bp;
+ xfs_fsblock_t fsb;
+ int i;
+ int error;
+
+ for (i = 0; i < cur->bc_nlevels && cur->bc_ptrs[i] == 1; i++) {
+ xfs_btree_get_block(cur, i, &bp);
+ if (!bp)
+ continue;
+ fsb = XFS_DADDR_TO_FSB(cur->bc_mp, bp->b_bn);
+ error = iter_fn(sc, fsb, 1, priv);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Simple adapter to connect xfs_repair_collect_btree_extent to
+ * xfs_repair_collect_btree_cur_blocks.
+ */
+int
+xfs_repair_collect_btree_cur_blocks_in_extent_list(
+ struct xfs_scrub_context *sc,
+ xfs_fsblock_t fsbno,
+ xfs_fsblock_t len,
+ void *priv)
+{
+ return xfs_repair_collect_btree_extent(sc, priv, fsbno, len);
+}
+
/*
* An error happened during the rebuild so the transaction will be cancelled.
* The fs will shut down, and the administrator has to unmount and run repair.
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index ef47826b6725..f2af5923aa75 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -48,9 +48,20 @@ xfs_repair_init_extent_list(
#define for_each_xfs_repair_extent_safe(rbe, n, exlist) \
list_for_each_entry_safe((rbe), (n), &(exlist)->list, list)
+#define for_each_xfs_repair_extent(rbe, exlist) \
+ list_for_each_entry((rbe), &(exlist)->list, list)
int xfs_repair_collect_btree_extent(struct xfs_scrub_context *sc,
struct xfs_repair_extent_list *btlist, xfs_fsblock_t fsbno,
xfs_extlen_t len);
+int xfs_repair_collect_btree_cur_blocks(struct xfs_scrub_context *sc,
+ struct xfs_btree_cur *cur,
+ int (*iter_fn)(struct xfs_scrub_context *sc,
+ xfs_fsblock_t fsbno, xfs_fsblock_t len,
+ void *priv),
+ void *priv);
+int xfs_repair_collect_btree_cur_blocks_in_extent_list(
+ struct xfs_scrub_context *sc, xfs_fsblock_t fsbno,
+ xfs_fsblock_t len, void *priv);
void xfs_repair_cancel_btree_extents(struct xfs_scrub_context *sc,
struct xfs_repair_extent_list *btlist);
int xfs_repair_subtract_extents(struct xfs_scrub_context *sc,
@@ -89,6 +100,8 @@ int xfs_repair_ino_dqattach(struct xfs_scrub_context *sc);
int xfs_repair_probe(struct xfs_scrub_context *sc);
int xfs_repair_superblock(struct xfs_scrub_context *sc);
+int xfs_repair_agf(struct xfs_scrub_context *sc);
+int xfs_repair_agfl(struct xfs_scrub_context *sc);
#else
@@ -112,6 +125,8 @@ xfs_repair_calc_ag_resblks(
#define xfs_repair_probe xfs_repair_notsupported
#define xfs_repair_superblock xfs_repair_notsupported
+#define xfs_repair_agf xfs_repair_notsupported
+#define xfs_repair_agfl xfs_repair_notsupported
#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 58ae76b3a421..8e11c3c699fb 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -208,13 +208,13 @@ static const struct xfs_scrub_meta_ops meta_scrub_ops[] = {
.type = ST_PERAG,
.setup = xfs_scrub_setup_fs,
.scrub = xfs_scrub_agf,
- .repair = xfs_repair_notsupported,
+ .repair = xfs_repair_agf,
},
[XFS_SCRUB_TYPE_AGFL]= { /* agfl */
.type = ST_PERAG,
.setup = xfs_scrub_setup_fs,
.scrub = xfs_scrub_agfl,
- .repair = xfs_repair_notsupported,
+ .repair = xfs_repair_agfl,
},
[XFS_SCRUB_TYPE_AGI] = { /* agi */
.type = ST_PERAG,
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 524f543c5b82..c08785cf83a9 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -126,6 +126,60 @@ xfs_trans_dup(
return ntp;
}
+/*
+ * Try to reserve more blocks for a transaction. The single use case we
+ * support is for online repair -- use a transaction to gather data without
+ * fear of btree cycle deadlocks; calculate how many blocks we really need
+ * from that data; and only then start modifying data. This can fail due to
+ * ENOSPC, so we have to be able to cancel the transaction.
+ */
+int
+xfs_trans_reserve_more(
+ struct xfs_trans *tp,
+ uint blocks,
+ uint rtextents)
This isn't used in this patch - seems out of place here. Committed
to the wrong patch?
Cheers,
Dave.